Don't templatize based on X86 traits

With the classes previously duplicated into X8632- and X8664-specific
versions, TargetX8632Traits and TargetX8664Traits no longer need to be
template arguments and can be used as free-standing classes.
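
An illustrative sketch of the shape of the change (paraphrased, not code
from this CL):

    // Before: a single template, instantiated once per target.
    template <typename TraitsType>
    class AssemblerX86Base : public Assembler {
      using Traits = TraitsType;
      AssemblerX86Base() : Assembler(Traits::AsmKind) {}
    };

    // After: a free-standing class per target, as in the diff below.
    class AssemblerX8632 : public Assembler {
      using Traits = TargetX8632Traits;
      AssemblerX8632() : Assembler(Asm_X8632) {}
    };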

Bug: b/192890685
Change-Id: Ic1a613376c0fc9375a6e87f380a97a7895ae4fac
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/55408
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Sean Risser <srisser@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/third_party/subzero/src/IceAssemblerX8632.h b/third_party/subzero/src/IceAssemblerX8632.h
index a61b661..81a3c89 100644
--- a/third_party/subzero/src/IceAssemblerX8632.h
+++ b/third_party/subzero/src/IceAssemblerX8632.h
@@ -39,7 +39,7 @@
   AssemblerX8632 &operator=(const AssemblerX8632 &) = delete;
 
 protected:
-  explicit AssemblerX8632() : Assembler(Traits::AsmKind) {}
+  explicit AssemblerX8632() : Assembler(Asm_X8632) {}
 
 public:
   using Traits = TargetX8632Traits;
@@ -54,7 +54,7 @@
   static constexpr int MAX_NOP_SIZE = 8;
 
   static bool classof(const Assembler *Asm) {
-    return Asm->getKind() == Traits::AsmKind;
+    return Asm->getKind() == Asm_X8632;
   }
 
   class Immediate {
diff --git a/third_party/subzero/src/IceAssemblerX8664.h b/third_party/subzero/src/IceAssemblerX8664.h
index 82c2ddd..5d1219d 100644
--- a/third_party/subzero/src/IceAssemblerX8664.h
+++ b/third_party/subzero/src/IceAssemblerX8664.h
@@ -39,7 +39,7 @@
   AssemblerX8664 &operator=(const AssemblerX8664 &) = delete;
 
 protected:
-  explicit AssemblerX8664() : Assembler(Traits::AsmKind) {}
+  explicit AssemblerX8664() : Assembler(Asm_X8664) {}
 
 public:
   using Traits = TargetX8664Traits;
@@ -54,7 +54,7 @@
   static constexpr int MAX_NOP_SIZE = 8;
 
   static bool classof(const Assembler *Asm) {
-    return Asm->getKind() == Traits::AsmKind;
+    return Asm->getKind() == Asm_X8664;
   }
 
   class Immediate {
diff --git a/third_party/subzero/src/IceInstX8632.cpp b/third_party/subzero/src/IceInstX8632.cpp
index 87b8b0d..3b8fb22 100644
--- a/third_party/subzero/src/IceInstX8632.cpp
+++ b/third_party/subzero/src/IceInstX8632.cpp
@@ -15,21 +15,2891 @@
 /// file also defines X8632 operand specific methods (dump and emit.)
 ///
 //===----------------------------------------------------------------------===//
+
 #include "IceInstX8632.h"
 
 #include "IceAssemblerX8632.h"
 #include "IceCfg.h"
 #include "IceCfgNode.h"
 #include "IceConditionCodesX86.h"
+#include "IceDefs.h"
 #include "IceInst.h"
 #include "IceOperand.h"
 #include "IceRegistersX8632.h"
+#include "IceTargetLowering.h"
 #include "IceTargetLoweringX8632.h"
 
 namespace Ice {
-
 namespace X8632 {
 
+const char *InstX86Base::getWidthString(Type Ty) {
+  return Traits::TypeAttributes[Ty].WidthString;
+}
+
+const char *InstX86Base::getFldString(Type Ty) {
+  return Traits::TypeAttributes[Ty].FldString;
+}
+
+Cond::BrCond InstX86Base::getOppositeCondition(BrCond Cond) {
+  return Traits::InstBrAttributes[Cond].Opposite;
+}
+
+InstX86FakeRMW::InstX86FakeRMW(Cfg *Func, Operand *Data, Operand *Addr,
+                               InstArithmetic::OpKind Op, Variable *Beacon)
+    : InstX86Base(Func, InstX86Base::FakeRMW, 3, nullptr), Op(Op) {
+  this->addSource(Data);
+  this->addSource(Addr);
+  this->addSource(Beacon);
+}
+
+InstX86Mul::InstX86Mul(Cfg *Func, Variable *Dest, Variable *Source1,
+                       Operand *Source2)
+    : InstX86Base(Func, InstX86Base::Mul, 2, Dest) {
+  this->addSource(Source1);
+  this->addSource(Source2);
+}
+
+InstX86Shld::InstX86Shld(Cfg *Func, Variable *Dest, Variable *Source1,
+                         Operand *Source2)
+    : InstX86Base(Func, InstX86Base::Shld, 3, Dest) {
+  this->addSource(Dest);
+  this->addSource(Source1);
+  this->addSource(Source2);
+}
+
+InstX86Shrd::InstX86Shrd(Cfg *Func, Variable *Dest, Variable *Source1,
+                         Operand *Source2)
+    : InstX86Base(Func, InstX86Base::Shrd, 3, Dest) {
+  this->addSource(Dest);
+  this->addSource(Source1);
+  this->addSource(Source2);
+}
+
+InstX86Label::InstX86Label(Cfg *Func, TargetLowering *Target)
+    : InstX86Base(Func, InstX86Base::Label, 0, nullptr),
+      LabelNumber(Target->makeNextLabelNumber()) {
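+  // In dump-enabled builds, synthesize a readable function-local label name;
+  // otherwise the label gets a placeholder GlobalString with no name attached.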
+  if (BuildDefs::dump()) {
+    Name = GlobalString::createWithString(
+        Func->getContext(), ".L" + Func->getFunctionName() + "$local$__" +
+                                std::to_string(LabelNumber));
+  } else {
+    Name = GlobalString::createWithoutString(Func->getContext());
+  }
+}
+
+InstX86Br::InstX86Br(Cfg *Func, const CfgNode *TargetTrue,
+                     const CfgNode *TargetFalse, const InstX86Label *Label,
+                     BrCond Condition, Mode Kind)
+    : InstX86Base(Func, InstX86Base::Br, 0, nullptr), Condition(Condition),
+      TargetTrue(TargetTrue), TargetFalse(TargetFalse), Label(Label),
+      Kind(Kind) {}
+
+bool InstX86Br::optimizeBranch(const CfgNode *NextNode) {
+  // If there is no next block, then there can be no fallthrough to optimize.
+  if (NextNode == nullptr)
+    return false;
+  // Intra-block conditional branches can't be optimized.
+  if (Label)
+    return false;
+  // If there is no fallthrough node, such as a non-default case label for a
+  // switch instruction, then there is no opportunity to optimize.
+  if (getTargetFalse() == nullptr)
+    return false;
+
+  // Unconditional branch to the next node can be removed.
+  if (Condition == Cond::Br_None && getTargetFalse() == NextNode) {
+    assert(getTargetTrue() == nullptr);
+    this->setDeleted();
+    return true;
+  }
+  // If the fallthrough is to the next node, set fallthrough to nullptr to
+  // indicate that no jump needs to be emitted for it.
+  if (getTargetFalse() == NextNode) {
+    TargetFalse = nullptr;
+    return true;
+  }
+  // If TargetTrue is the next node, and TargetFalse is not nullptr (which was
+  // already tested above), then invert the branch condition, swap the targets,
+  // and set new fallthrough to nullptr.
+  if (getTargetTrue() == NextNode) {
+    assert(Condition != Cond::Br_None);
+    Condition = this->getOppositeCondition(Condition);
+    TargetTrue = getTargetFalse();
+    TargetFalse = nullptr;
+    return true;
+  }
+  return false;
+}
+
+bool InstX86Br::repointEdges(CfgNode *OldNode, CfgNode *NewNode) {
+  bool Found = false;
+  if (TargetFalse == OldNode) {
+    TargetFalse = NewNode;
+    Found = true;
+  }
+  if (TargetTrue == OldNode) {
+    TargetTrue = NewNode;
+    Found = true;
+  }
+  return Found;
+}
+
+InstX86Jmp::InstX86Jmp(Cfg *Func, Operand *Target)
+    : InstX86Base(Func, InstX86Base::Jmp, 1, nullptr) {
+  this->addSource(Target);
+}
+
+InstX86Call::InstX86Call(Cfg *Func, Variable *Dest, Operand *CallTarget)
+    : InstX86Base(Func, InstX86Base::Call, 1, Dest) {
+  this->HasSideEffects = true;
+  this->addSource(CallTarget);
+}
+
+InstX86Movmsk::InstX86Movmsk(Cfg *Func, Variable *Dest, Operand *Source)
+    : InstX86Base(Func, InstX86Base::Movmsk, 1, Dest) {
+  this->addSource(Source);
+}
+
+InstX86Cmov::InstX86Cmov(Cfg *Func, Variable *Dest, Operand *Source,
+                         BrCond Condition)
+    : InstX86Base(Func, InstX86Base::Cmov, 2, Dest), Condition(Condition) {
+  // The final result is either the original Dest, or Source, so mark both as
+  // sources.
+  this->addSource(Dest);
+  this->addSource(Source);
+}
+
+InstX86Cmpps::InstX86Cmpps(Cfg *Func, Variable *Dest, Operand *Source,
+                           CmppsCond Condition)
+    : InstX86Base(Func, InstX86Base::Cmpps, 2, Dest), Condition(Condition) {
+  this->addSource(Dest);
+  this->addSource(Source);
+}
+
+InstX86Cmpxchg::InstX86Cmpxchg(Cfg *Func, Operand *DestOrAddr, Variable *Eax,
+                               Variable *Desired, bool Locked)
+    : InstX86BaseLockable(Func, InstX86Base::Cmpxchg, 3,
+                          llvm::dyn_cast<Variable>(DestOrAddr), Locked) {
+  constexpr uint16_t Encoded_rAX = 0;
+  (void)Encoded_rAX;
+  assert(Traits::getEncodedGPR(Eax->getRegNum()) == Encoded_rAX);
+  this->addSource(DestOrAddr);
+  this->addSource(Eax);
+  this->addSource(Desired);
+}
+
+InstX86Cmpxchg8b::InstX86Cmpxchg8b(Cfg *Func, X86OperandMem *Addr,
+                                   Variable *Edx, Variable *Eax, Variable *Ecx,
+                                   Variable *Ebx, bool Locked)
+    : InstX86BaseLockable(Func, InstX86Base::Cmpxchg8b, 5, nullptr, Locked) {
+  assert(Edx->getRegNum() == RegisterSet::Reg_edx);
+  assert(Eax->getRegNum() == RegisterSet::Reg_eax);
+  assert(Ecx->getRegNum() == RegisterSet::Reg_ecx);
+  assert(Ebx->getRegNum() == RegisterSet::Reg_ebx);
+  this->addSource(Addr);
+  this->addSource(Edx);
+  this->addSource(Eax);
+  this->addSource(Ecx);
+  this->addSource(Ebx);
+}
+
+InstX86Cvt::InstX86Cvt(Cfg *Func, Variable *Dest, Operand *Source,
+                       CvtVariant Variant)
+    : InstX86Base(Func, InstX86Base::Cvt, 1, Dest), Variant(Variant) {
+  this->addSource(Source);
+}
+
+InstX86Icmp::InstX86Icmp(Cfg *Func, Operand *Src0, Operand *Src1)
+    : InstX86Base(Func, InstX86Base::Icmp, 2, nullptr) {
+  this->addSource(Src0);
+  this->addSource(Src1);
+}
+
+InstX86Ucomiss::InstX86Ucomiss(Cfg *Func, Operand *Src0, Operand *Src1)
+    : InstX86Base(Func, InstX86Base::Ucomiss, 2, nullptr) {
+  this->addSource(Src0);
+  this->addSource(Src1);
+}
+
+InstX86UD2::InstX86UD2(Cfg *Func)
+    : InstX86Base(Func, InstX86Base::UD2, 0, nullptr) {}
+
+InstX86Int3::InstX86Int3(Cfg *Func)
+    : InstX86Base(Func, InstX86Base::Int3, 0, nullptr) {}
+
+InstX86Test::InstX86Test(Cfg *Func, Operand *Src1, Operand *Src2)
+    : InstX86Base(Func, InstX86Base::Test, 2, nullptr) {
+  this->addSource(Src1);
+  this->addSource(Src2);
+}
+
+InstX86Mfence::InstX86Mfence(Cfg *Func)
+    : InstX86Base(Func, InstX86Base::Mfence, 0, nullptr) {
+  this->HasSideEffects = true;
+}
+
+InstX86Store::InstX86Store(Cfg *Func, Operand *Value, X86Operand *Mem)
+    : InstX86Base(Func, InstX86Base::Store, 2, nullptr) {
+  this->addSource(Value);
+  this->addSource(Mem);
+}
+
+InstX86StoreP::InstX86StoreP(Cfg *Func, Variable *Value, X86OperandMem *Mem)
+    : InstX86Base(Func, InstX86Base::StoreP, 2, nullptr) {
+  this->addSource(Value);
+  this->addSource(Mem);
+}
+
+InstX86StoreQ::InstX86StoreQ(Cfg *Func, Operand *Value, X86OperandMem *Mem)
+    : InstX86Base(Func, InstX86Base::StoreQ, 2, nullptr) {
+  this->addSource(Value);
+  this->addSource(Mem);
+}
+
+InstX86StoreD::InstX86StoreD(Cfg *Func, Operand *Value, X86OperandMem *Mem)
+    : InstX86Base(Func, InstX86Base::StoreD, 2, nullptr) {
+  this->addSource(Value);
+  this->addSource(Mem);
+}
+
+InstX86Nop::InstX86Nop(Cfg *Func, NopVariant Variant)
+    : InstX86Base(Func, InstX86Base::Nop, 0, nullptr), Variant(Variant) {}
+
+InstX86Fld::InstX86Fld(Cfg *Func, Operand *Src)
+    : InstX86Base(Func, InstX86Base::Fld, 1, nullptr) {
+  this->addSource(Src);
+}
+
+InstX86Fstp::InstX86Fstp(Cfg *Func, Variable *Dest)
+    : InstX86Base(Func, InstX86Base::Fstp, 0, Dest) {}
+
+InstX86Pop::InstX86Pop(Cfg *Func, Variable *Dest)
+    : InstX86Base(Func, InstX86Base::Pop, 0, Dest) {
+  // A pop instruction affects the stack pointer and so it should not be
+  // allowed to be automatically dead-code eliminated. (The corresponding push
+  // instruction doesn't need this treatment because it has no dest variable
+  // and therefore won't be dead-code eliminated.) This is needed for
+  // late-stage liveness analysis (e.g. asm-verbose mode).
+  this->HasSideEffects = true;
+}
+
+InstX86Push::InstX86Push(Cfg *Func, Operand *Source)
+    : InstX86Base(Func, InstX86Base::Push, 1, nullptr) {
+  this->addSource(Source);
+}
+
+InstX86Ret::InstX86Ret(Cfg *Func, Variable *Source)
+    : InstX86Base(Func, InstX86Base::Ret, Source ? 1 : 0, nullptr) {
+  if (Source)
+    this->addSource(Source);
+}
+
+InstX86Setcc::InstX86Setcc(Cfg *Func, Variable *Dest, BrCond Cond)
+    : InstX86Base(Func, InstX86Base::Setcc, 0, Dest), Condition(Cond) {}
+
+InstX86Xadd::InstX86Xadd(Cfg *Func, Operand *Dest, Variable *Source,
+                         bool Locked)
+    : InstX86BaseLockable(Func, InstX86Base::Xadd, 2,
+                          llvm::dyn_cast<Variable>(Dest), Locked) {
+  this->addSource(Dest);
+  this->addSource(Source);
+}
+
+InstX86Xchg::InstX86Xchg(Cfg *Func, Operand *Dest, Variable *Source)
+    : InstX86Base(Func, InstX86Base::Xchg, 2, llvm::dyn_cast<Variable>(Dest)) {
+  this->addSource(Dest);
+  this->addSource(Source);
+}
+
+InstX86IacaStart::InstX86IacaStart(Cfg *Func)
+    : InstX86Base(Func, InstX86Base::IacaStart, 0, nullptr) {
+  assert(getFlags().getAllowIacaMarks());
+}
+
+InstX86IacaEnd::InstX86IacaEnd(Cfg *Func)
+    : InstX86Base(Func, InstX86Base::IacaEnd, 0, nullptr) {
+  assert(getFlags().getAllowIacaMarks());
+}
+
+// ======================== Dump routines ======================== //
+
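+// Each instruction class provides up to three output routines: emit() prints
+// textual assembly, emitIAS() encodes machine code through the integrated
+// assembler, and dump() prints the instruction in IR form for debugging.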
+void InstX86Base::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "[" << Traits::TargetName << "] ";
+  Inst::dump(Func);
+}
+
+void InstX86FakeRMW::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Type Ty = getData()->getType();
+  Str << "rmw " << InstArithmetic::getOpName(getOp()) << " " << Ty << " *";
+  getAddr()->dump(Func);
+  Str << ", ";
+  getData()->dump(Func);
+  Str << ", beacon=";
+  getBeacon()->dump(Func);
+}
+
+void InstX86Label::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << getLabelName() << ":";
+}
+
+void InstX86Label::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Asm->bindLocalLabel(LabelNumber);
+  if (OffsetReloc != nullptr) {
+    Asm->bindRelocOffset(OffsetReloc);
+  }
+}
+
+void InstX86Label::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << getLabelName() << ":";
+}
+
+void InstX86Br::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t";
+
+  if (Condition == Cond::Br_None) {
+    Str << "jmp";
+  } else {
+    Str << Traits::InstBrAttributes[Condition].EmitString;
+  }
+
+  if (Label) {
+    Str << "\t" << Label->getLabelName();
+  } else {
+    if (Condition == Cond::Br_None) {
+      Str << "\t" << getTargetFalse()->getAsmName();
+    } else {
+      Str << "\t" << getTargetTrue()->getAsmName();
+      if (getTargetFalse()) {
+        Str << "\n\t"
+               "jmp\t"
+            << getTargetFalse()->getAsmName();
+      }
+    }
+  }
+}
+
+void InstX86Br::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  if (Label) {
+    auto *L = Asm->getOrCreateLocalLabel(Label->getLabelNumber());
+    if (Condition == Cond::Br_None) {
+      Asm->jmp(L, isNear());
+    } else {
+      Asm->j(Condition, L, isNear());
+    }
+  } else {
+    if (Condition == Cond::Br_None) {
+      auto *L = Asm->getOrCreateCfgNodeLabel(getTargetFalse()->getIndex());
+      assert(!getTargetTrue());
+      Asm->jmp(L, isNear());
+    } else {
+      auto *L = Asm->getOrCreateCfgNodeLabel(getTargetTrue()->getIndex());
+      Asm->j(Condition, L, isNear());
+      if (getTargetFalse()) {
+        auto *L2 = Asm->getOrCreateCfgNodeLabel(getTargetFalse()->getIndex());
+        Asm->jmp(L2, isNear());
+      }
+    }
+  }
+}
+
+void InstX86Br::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "br ";
+
+  if (Condition == Cond::Br_None) {
+    if (Label) {
+      Str << "label %" << Label->getLabelName();
+    } else {
+      Str << "label %" << getTargetFalse()->getName();
+    }
+    return;
+  }
+
+  Str << Traits::InstBrAttributes[Condition].DisplayString;
+  if (Label) {
+    Str << ", label %" << Label->getLabelName();
+  } else {
+    Str << ", label %" << getTargetTrue()->getName();
+    if (getTargetFalse()) {
+      Str << ", label %" << getTargetFalse()->getName();
+    }
+  }
+
+  Str << " // (" << (isNear() ? "near" : "far") << " jump)";
+}
+
+void InstX86Jmp::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  const Operand *Src = this->getSrc(0);
+  if (Traits::Is64Bit) {
+    if (const auto *CR = llvm::dyn_cast<ConstantRelocatable>(Src)) {
+      Str << "\t"
+             "jmp"
+             "\t"
+          << CR->getName();
+      return;
+    }
+  }
+  Str << "\t"
+         "jmp"
+         "\t*";
+  getJmpTarget()->emit(Func);
+}
+
+void InstX86Jmp::emitIAS(const Cfg *Func) const {
+  // Note: Adapted (mostly copied) from InstX86Call::emitIAS().
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Operand *Target = getJmpTarget();
+  if (const auto *Var = llvm::dyn_cast<Variable>(Target)) {
+    if (Var->hasReg()) {
+      Asm->jmp(Traits::getEncodedGPR(Var->getRegNum()));
+    } else {
+      // The jmp instruction with a memory operand should be possible to
+      // encode, but it isn't a valid sandboxed instruction, and there
+      // shouldn't be a register allocation issue to jump through a scratch
+      // register, so we don't really need to bother implementing it.
+      llvm::report_fatal_error("Assembler can't jmp to memory operand");
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Target)) {
+    (void)Mem;
+    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+    llvm::report_fatal_error("Assembler can't jmp to memory operand");
+  } else if (const auto *CR = llvm::dyn_cast<ConstantRelocatable>(Target)) {
+    Asm->jmp(CR);
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Target)) {
+    // NaCl trampoline calls refer to an address within the sandbox directly.
+    // This is usually only needed for non-IRT builds and otherwise not very
+    // portable or stable. Usually this is only done for "calls" and not jumps.
+    Asm->jmp(AssemblerImmediate(Imm->getValue()));
+  } else {
+    llvm::report_fatal_error("Unexpected operand type");
+  }
+}
+
+void InstX86Jmp::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "jmp ";
+  getJmpTarget()->dump(Func);
+}
+
+void InstX86Call::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  Str << "\t"
+         "call\t";
+  Operand *CallTarget = getCallTarget();
+  auto *Target = InstX86Base::getTarget(Func);
+  if (const auto *CI = llvm::dyn_cast<ConstantInteger32>(CallTarget)) {
+    // Emit without a leading '$'.
+    Str << CI->getValue();
+  } else if (const auto DirectCallTarget =
+                 llvm::dyn_cast<ConstantRelocatable>(CallTarget)) {
+    DirectCallTarget->emitWithoutPrefix(Target);
+  } else {
+    Str << "*";
+    CallTarget->emit(Func);
+  }
+}
+
+void InstX86Call::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Operand *CallTarget = getCallTarget();
+  auto *Target = InstX86Base::getTarget(Func);
+  if (const auto *Var = llvm::dyn_cast<Variable>(CallTarget)) {
+    if (Var->hasReg()) {
+      Asm->call(Traits::getEncodedGPR(Var->getRegNum()));
+    } else {
+      Asm->call(Target->stackVarToAsmOperand(Var));
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(CallTarget)) {
+    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+    Asm->call(Mem->toAsmAddress(Asm, Target));
+  } else if (const auto *CR = llvm::dyn_cast<ConstantRelocatable>(CallTarget)) {
+    Asm->call(CR);
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(CallTarget)) {
+    Asm->call(AssemblerImmediate(Imm->getValue()));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void InstX86Call::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  if (this->getDest()) {
+    this->dumpDest(Func);
+    Str << " = ";
+  }
+  Str << "call ";
+  getCallTarget()->dump(Func);
+}
+
+// The this->Opcode parameter needs to be char* and not std::string; the
+// constraint dates from when these classes were still templates.
+
+void InstX86Base::emitTwoAddress(const Cfg *Func, const char *Opcode,
+                                 const char *Suffix) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 2);
+  Operand *Dest = getDest();
+  if (Dest == nullptr)
+    Dest = getSrc(0);
+  assert(Dest == getSrc(0));
+  Operand *Src1 = getSrc(1);
+  Str << "\t" << Opcode << Suffix
+      << InstX86Base::getWidthString(Dest->getType()) << "\t";
+  Src1->emit(Func);
+  Str << ", ";
+  Dest->emit(Func);
+}
+
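+// The emitIAS* helpers below share a dispatch pattern: an "Emitter" struct
+// bundles Assembler member-function pointers (one per addressing form), and
+// the helper invokes the right one depending on whether the operand is a
+// register, a stack slot, a memory operand, or an immediate.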
+void emitIASOpTyGPR(const Cfg *Func, Type Ty, const Operand *Op,
+                    const GPREmitterOneOp &Emitter) {
+  auto *Target = InstX86Base::getTarget(Func);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  if (const auto *Var = llvm::dyn_cast<Variable>(Op)) {
+    if (Var->hasReg()) {
+      // We cheat a little and use GPRRegister even for byte operations.
+      GPRRegister VarReg = Traits::getEncodedGPR(Var->getRegNum());
+      (Asm->*(Emitter.Reg))(Ty, VarReg);
+    } else {
+      Address StackAddr(Target->stackVarToAsmOperand(Var));
+      (Asm->*(Emitter.Addr))(Ty, StackAddr);
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Op)) {
+    Mem->emitSegmentOverride(Asm);
+    (Asm->*(Emitter.Addr))(Ty, Mem->toAsmAddress(Asm, Target));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+template <bool VarCanBeByte, bool SrcCanBeByte>
+void emitIASRegOpTyGPR(const Cfg *Func, bool IsLea, Type Ty,
+                       const Variable *Var, const Operand *Src,
+                       const GPREmitterRegOp &Emitter) {
+  auto *Target = InstX86Base::getTarget(Func);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(Var->hasReg());
+  // We cheat a little and use GPRRegister even for byte operations. Both
+  // arms of the former VarCanBeByte/SrcCanBeByte selections now encode
+  // identically, so they reduce to plain getEncodedGPR calls.
+  GPRRegister VarReg = Traits::getEncodedGPR(Var->getRegNum());
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    if (SrcVar->hasReg()) {
+      GPRRegister SrcReg = Traits::getEncodedGPR(SrcVar->getRegNum());
+      (Asm->*(Emitter.GPRGPR))(Ty, VarReg, SrcReg);
+    } else {
+      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
+      (Asm->*(Emitter.GPRAddr))(Ty, VarReg, SrcStackAddr);
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
+    Mem->emitSegmentOverride(Asm);
+    (Asm->*(Emitter.GPRAddr))(Ty, VarReg,
+                              Mem->toAsmAddress(Asm, Target, IsLea));
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
+    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger64>(Src)) {
+    assert(Traits::Is64Bit);
+    assert(Utils::IsInt(32, Imm->getValue()));
+    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
+  } else if (const auto *Reloc = llvm::dyn_cast<ConstantRelocatable>(Src)) {
+    const auto FixupKind = (Reloc->getName().hasStdString() &&
+                            Reloc->getName().toString() == GlobalOffsetTable)
+                               ? Traits::FK_GotPC
+                               : Traits::TargetLowering::getAbsFixup();
+    AssemblerFixup *Fixup = Asm->createFixup(FixupKind, Reloc);
+    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Fixup));
+  } else if (const auto *Split = llvm::dyn_cast<VariableSplit>(Src)) {
+    (Asm->*(Emitter.GPRAddr))(Ty, VarReg, Split->toAsmAddress(Func));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void emitIASAddrOpTyGPR(const Cfg *Func, Type Ty, const Address &Addr,
+                        const Operand *Src, const GPREmitterAddrOp &Emitter) {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  // Src can only be Reg or AssemblerImmediate.
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    assert(SrcVar->hasReg());
+    GPRRegister SrcReg = Traits::getEncodedGPR(SrcVar->getRegNum());
+    (Asm->*(Emitter.AddrGPR))(Ty, Addr, SrcReg);
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
+    (Asm->*(Emitter.AddrImm))(Ty, Addr, AssemblerImmediate(Imm->getValue()));
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger64>(Src)) {
+    assert(Traits::Is64Bit);
+    assert(Utils::IsInt(32, Imm->getValue()));
+    (Asm->*(Emitter.AddrImm))(Ty, Addr, AssemblerImmediate(Imm->getValue()));
+  } else if (const auto *Reloc = llvm::dyn_cast<ConstantRelocatable>(Src)) {
+    const auto FixupKind = (Reloc->getName().hasStdString() &&
+                            Reloc->getName().toString() == GlobalOffsetTable)
+                               ? Traits::FK_GotPC
+                               : Traits::TargetLowering::getAbsFixup();
+    AssemblerFixup *Fixup = Asm->createFixup(FixupKind, Reloc);
+    (Asm->*(Emitter.AddrImm))(Ty, Addr, AssemblerImmediate(Fixup));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void emitIASAsAddrOpTyGPR(const Cfg *Func, Type Ty, const Operand *Op0,
+                          const Operand *Op1, const GPREmitterAddrOp &Emitter) {
+  auto *Target = InstX86Base::getTarget(Func);
+  if (const auto *Op0Var = llvm::dyn_cast<Variable>(Op0)) {
+    assert(!Op0Var->hasReg());
+    Address StackAddr(Target->stackVarToAsmOperand(Op0Var));
+    emitIASAddrOpTyGPR(Func, Ty, StackAddr, Op1, Emitter);
+  } else if (const auto *Op0Mem = llvm::dyn_cast<X86OperandMem>(Op0)) {
+    Assembler *Asm = Func->getAssembler<Assembler>();
+    Op0Mem->emitSegmentOverride(Asm);
+    emitIASAddrOpTyGPR(Func, Ty, Op0Mem->toAsmAddress(Asm, Target), Op1,
+                       Emitter);
+  } else if (const auto *Split = llvm::dyn_cast<VariableSplit>(Op0)) {
+    emitIASAddrOpTyGPR(Func, Ty, Split->toAsmAddress(Func), Op1, Emitter);
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void emitIASGPRShift(const Cfg *Func, Type Ty, const Variable *Var,
+                     const Operand *Src, const GPREmitterShiftOp &Emitter) {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  // Technically, the Dest Var can be mem as well, but we only use Reg. We can
+  // extend this to check Dest if we decide to use that form.
+  assert(Var->hasReg());
+  // We cheat a little and use GPRRegister even for byte operations.
+  GPRRegister VarReg = Traits::getEncodedGPR(Var->getRegNum());
+  // Src must be reg == ECX or an Imm8. This is asserted by the assembler.
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    assert(SrcVar->hasReg());
+    GPRRegister SrcReg = Traits::getEncodedGPR(SrcVar->getRegNum());
+    (Asm->*(Emitter.GPRGPR))(Ty, VarReg, SrcReg);
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
+    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger64>(Src)) {
+    assert(Traits::Is64Bit);
+    assert(Utils::IsInt(32, Imm->getValue()));
+    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void emitIASGPRShiftDouble(const Cfg *Func, const Variable *Dest,
+                           const Operand *Src1Op, const Operand *Src2Op,
+                           const GPREmitterShiftD &Emitter) {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  // Dest can be reg or mem, but we only use the reg variant.
+  assert(Dest->hasReg());
+  GPRRegister DestReg = Traits::getEncodedGPR(Dest->getRegNum());
+  // SrcVar1 must be reg.
+  const auto *SrcVar1 = llvm::cast<Variable>(Src1Op);
+  assert(SrcVar1->hasReg());
+  GPRRegister SrcReg = Traits::getEncodedGPR(SrcVar1->getRegNum());
+  Type Ty = SrcVar1->getType();
+  // Src2 can be the implicit CL register or an immediate.
+  if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src2Op)) {
+    (Asm->*(Emitter.GPRGPRImm))(Ty, DestReg, SrcReg,
+                                AssemblerImmediate(Imm->getValue()));
+  } else {
+    assert(llvm::cast<Variable>(Src2Op)->getRegNum() == RegisterSet::Reg_cl);
+    (Asm->*(Emitter.GPRGPR))(Ty, DestReg, SrcReg);
+  }
+}
+
+void emitIASXmmShift(const Cfg *Func, Type Ty, const Variable *Var,
+                     const Operand *Src, const XmmEmitterShiftOp &Emitter) {
+  auto *Target = InstX86Base::getTarget(Func);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(Var->hasReg());
+  XmmRegister VarReg = Traits::getEncodedXmm(Var->getRegNum());
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    if (SrcVar->hasReg()) {
+      XmmRegister SrcReg = Traits::getEncodedXmm(SrcVar->getRegNum());
+      (Asm->*(Emitter.XmmXmm))(Ty, VarReg, SrcReg);
+    } else {
+      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
+      (Asm->*(Emitter.XmmAddr))(Ty, VarReg, SrcStackAddr);
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
+    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+    (Asm->*(Emitter.XmmAddr))(Ty, VarReg, Mem->toAsmAddress(Asm, Target));
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
+    (Asm->*(Emitter.XmmImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void emitIASRegOpTyXMM(const Cfg *Func, Type Ty, const Variable *Var,
+                       const Operand *Src, const XmmEmitterRegOp &Emitter) {
+  auto *Target = InstX86Base::getTarget(Func);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(Var->hasReg());
+  XmmRegister VarReg = Traits::getEncodedXmm(Var->getRegNum());
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    if (SrcVar->hasReg()) {
+      XmmRegister SrcReg = Traits::getEncodedXmm(SrcVar->getRegNum());
+      (Asm->*(Emitter.XmmXmm))(Ty, VarReg, SrcReg);
+    } else {
+      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
+      (Asm->*(Emitter.XmmAddr))(Ty, VarReg, SrcStackAddr);
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
+    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+    (Asm->*(Emitter.XmmAddr))(Ty, VarReg, Mem->toAsmAddress(Asm, Target));
+  } else if (const auto *Imm = llvm::dyn_cast<Constant>(Src)) {
+    (Asm->*(Emitter.XmmAddr))(Ty, VarReg,
+                              Traits::Address::ofConstPool(Asm, Imm));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
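+// Cast-style operations can move values between register files (GPR and
+// XMM), so the destination and source register encoders are supplied as
+// template parameters rather than being fixed to a single register class.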
+template <typename DReg_t, typename SReg_t, DReg_t (*destEnc)(RegNumT),
+          SReg_t (*srcEnc)(RegNumT)>
+void emitIASCastRegOp(const Cfg *Func, Type DestTy, const Variable *Dest,
+                      Type SrcTy, const Operand *Src,
+                      const CastEmitterRegOp<DReg_t, SReg_t> &Emitter) {
+  auto *Target = InstX86Base::getTarget(Func);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(Dest->hasReg());
+  DReg_t DestReg = destEnc(Dest->getRegNum());
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    if (SrcVar->hasReg()) {
+      SReg_t SrcReg = srcEnc(SrcVar->getRegNum());
+      (Asm->*(Emitter.RegReg))(DestTy, DestReg, SrcTy, SrcReg);
+    } else {
+      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
+      (Asm->*(Emitter.RegAddr))(DestTy, DestReg, SrcTy, SrcStackAddr);
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
+    Mem->emitSegmentOverride(Asm);
+    (Asm->*(Emitter.RegAddr))(DestTy, DestReg, SrcTy,
+                              Mem->toAsmAddress(Asm, Target));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+template <typename DReg_t, typename SReg_t, DReg_t (*destEnc)(RegNumT),
+          SReg_t (*srcEnc)(RegNumT)>
+void emitIASThreeOpImmOps(const Cfg *Func, Type DispatchTy,
+                          const Variable *Dest, const Operand *Src0,
+                          const Operand *Src1,
+                          const ThreeOpImmEmitter<DReg_t, SReg_t> Emitter) {
+  auto *Target = InstX86Base::getTarget(Func);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  // This only handles Dest being a register, and Src1 being an immediate.
+  assert(Dest->hasReg());
+  DReg_t DestReg = destEnc(Dest->getRegNum());
+  AssemblerImmediate Imm(llvm::cast<ConstantInteger32>(Src1)->getValue());
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src0)) {
+    if (SrcVar->hasReg()) {
+      SReg_t SrcReg = srcEnc(SrcVar->getRegNum());
+      (Asm->*(Emitter.RegRegImm))(DispatchTy, DestReg, SrcReg, Imm);
+    } else {
+      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
+      (Asm->*(Emitter.RegAddrImm))(DispatchTy, DestReg, SrcStackAddr, Imm);
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src0)) {
+    Mem->emitSegmentOverride(Asm);
+    (Asm->*(Emitter.RegAddrImm))(DispatchTy, DestReg,
+                                 Mem->toAsmAddress(Asm, Target), Imm);
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void emitIASMovlikeXMM(const Cfg *Func, const Variable *Dest,
+                       const Operand *Src, const XmmEmitterMovOps Emitter) {
+  auto *Target = InstX86Base::getTarget(Func);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  if (Dest->hasReg()) {
+    XmmRegister DestReg = Traits::getEncodedXmm(Dest->getRegNum());
+    if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+      if (SrcVar->hasReg()) {
+        (Asm->*(Emitter.XmmXmm))(DestReg,
+                                 Traits::getEncodedXmm(SrcVar->getRegNum()));
+      } else {
+        Address StackAddr(Target->stackVarToAsmOperand(SrcVar));
+        (Asm->*(Emitter.XmmAddr))(DestReg, StackAddr);
+      }
+    } else if (const auto *SrcMem = llvm::dyn_cast<X86OperandMem>(Src)) {
+      assert(SrcMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+      (Asm->*(Emitter.XmmAddr))(DestReg, SrcMem->toAsmAddress(Asm, Target));
+    } else {
+      llvm_unreachable("Unexpected operand type");
+    }
+  } else {
+    Address StackAddr(Target->stackVarToAsmOperand(Dest));
+    // Src must be a register in this case.
+    const auto *SrcVar = llvm::cast<Variable>(Src);
+    assert(SrcVar->hasReg());
+    (Asm->*(Emitter.AddrXmm))(StackAddr,
+                              Traits::getEncodedXmm(SrcVar->getRegNum()));
+  }
+}
+
+void InstX86Movmsk::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  this->dumpDest(Func);
+  Str << " = movmsk." << this->getSrc(0)->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Movmsk::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  Type SrcTy = this->getSrc(0)->getType();
+  assert(isVectorType(SrcTy));
+  switch (SrcTy) {
+  case IceType_v16i8:
+    Str << "\t"
+           "pmovmskb"
+           "\t";
+    break;
+  case IceType_v4i32:
+  case IceType_v4f32:
+    Str << "\t"
+           "movmskps"
+           "\t";
+    break;
+  default:
+    llvm_unreachable("Unexpected operand type");
+  }
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Movmsk::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 1);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  const Variable *Dest = this->getDest();
+  const Variable *Src = llvm::cast<Variable>(this->getSrc(0));
+  const Type DestTy = Dest->getType();
+  (void)DestTy;
+  const Type SrcTy = Src->getType();
+  assert(isVectorType(SrcTy));
+  assert(isScalarIntegerType(DestTy));
+  if (Traits::Is64Bit) {
+    assert(DestTy == IceType_i32 || DestTy == IceType_i64);
+  } else {
+    assert(typeWidthInBytes(DestTy) <= 4);
+  }
+  XmmRegister SrcReg = Traits::getEncodedXmm(Src->getRegNum());
+  GPRRegister DestReg = Traits::getEncodedGPR(Dest->getRegNum());
+  Asm->movmsk(SrcTy, DestReg, SrcReg);
+}
+
+void InstX86Sqrt::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  Type Ty = this->getSrc(0)->getType();
+  assert(isScalarFloatingType(Ty));
+  Str << "\t"
+         "sqrt"
+      << Traits::TypeAttributes[Ty].SpSdString << "\t";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Div::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 3);
+  Operand *Src1 = this->getSrc(1);
+  Str << "\t" << this->Opcode << this->getWidthString(Src1->getType()) << "\t";
+  Src1->emit(Func);
+}
+
+void InstX86Div::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 3);
+  const Operand *Src = this->getSrc(1);
+  Type Ty = Src->getType();
+  static GPREmitterOneOp Emitter = {&Assembler::div, &Assembler::div};
+  emitIASOpTyGPR(Func, Ty, Src, Emitter);
+}
+
+void InstX86Idiv::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 3);
+  Operand *Src1 = this->getSrc(1);
+  Str << "\t" << this->Opcode << this->getWidthString(Src1->getType()) << "\t";
+  Src1->emit(Func);
+}
+
+void InstX86Idiv::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 3);
+  const Operand *Src = this->getSrc(1);
+  Type Ty = Src->getType();
+  static const GPREmitterOneOp Emitter = {&Assembler::idiv, &Assembler::idiv};
+  emitIASOpTyGPR(Func, Ty, Src, Emitter);
+}
+
+// pblendvb and blendvps take xmm0 as a final implicit argument.
+
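+// The selector operand (getSrc(2)) must be pinned to xmm0 by the register
+// allocator; the asserts below check this, and only the other operands are
+// named explicitly in the emitted instruction.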
+void emitVariableBlendInst(const char *Opcode, const Inst *Instr,
+                           const Cfg *Func) {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(Instr->getSrcSize() == 3);
+  assert(llvm::cast<Variable>(Instr->getSrc(2))->getRegNum() ==
+         RegisterSet::Reg_xmm0);
+  Str << "\t" << Opcode << "\t";
+  Instr->getSrc(1)->emit(Func);
+  Str << ", ";
+  Instr->getDest()->emit(Func);
+}
+
+void emitIASVariableBlendInst(const Inst *Instr, const Cfg *Func,
+                              const XmmEmitterRegOp &Emitter) {
+  assert(Instr->getSrcSize() == 3);
+  assert(llvm::cast<Variable>(Instr->getSrc(2))->getRegNum() ==
+         RegisterSet::Reg_xmm0);
+  const Variable *Dest = Instr->getDest();
+  const Operand *Src = Instr->getSrc(1);
+  emitIASRegOpTyXMM(Func, Dest->getType(), Dest, Src, Emitter);
+}
+
+void InstX86Blendvps::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  emitVariableBlendInst(this->Opcode, this, Func);
+}
+
+void InstX86Blendvps::emitIAS(const Cfg *Func) const {
+  static const XmmEmitterRegOp Emitter = {&Assembler::blendvps,
+                                          &Assembler::blendvps};
+  emitIASVariableBlendInst(this, Func, Emitter);
+}
+
+void InstX86Pblendvb::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  emitVariableBlendInst(this->Opcode, this, Func);
+}
+
+void InstX86Pblendvb::emitIAS(const Cfg *Func) const {
+  static const XmmEmitterRegOp Emitter = {&Assembler::pblendvb,
+                                          &Assembler::pblendvb};
+  emitIASVariableBlendInst(this, Func, Emitter);
+}
+
+void InstX86Imul::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  Variable *Dest = this->getDest();
+  if (isByteSizedArithType(Dest->getType())) {
+    // The 8-bit version of imul only allows the form "imul r/m8".
+    const auto *Src0Var = llvm::dyn_cast<Variable>(this->getSrc(0));
+    (void)Src0Var;
+    assert(Src0Var->getRegNum() == RegisterSet::Reg_al);
+    Str << "\t"
+           "imulb\t";
+    this->getSrc(1)->emit(Func);
+  } else if (llvm::isa<Constant>(this->getSrc(1))) {
+    Str << "\t"
+           "imul"
+        << this->getWidthString(Dest->getType()) << "\t";
+    this->getSrc(1)->emit(Func);
+    Str << ", ";
+    this->getSrc(0)->emit(Func);
+    Str << ", ";
+    Dest->emit(Func);
+  } else {
+    this->emitTwoAddress(Func, this->Opcode);
+  }
+}
+
+void InstX86Imul::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  const Variable *Var = this->getDest();
+  Type Ty = Var->getType();
+  const Operand *Src = this->getSrc(1);
+  if (isByteSizedArithType(Ty)) {
+    // The 8-bit version of imul only allows the form "imul r/m8".
+    const auto *Src0Var = llvm::dyn_cast<Variable>(this->getSrc(0));
+    (void)Src0Var;
+    assert(Src0Var->getRegNum() == RegisterSet::Reg_al);
+    static const GPREmitterOneOp Emitter = {&Assembler::imul, &Assembler::imul};
+    emitIASOpTyGPR(Func, Ty, this->getSrc(1), Emitter);
+  } else {
+    // For non-byte types, use the two-address form; the emitter handles
+    // register, memory, and immediate sources.
+    assert(Var == this->getSrc(0));
+    static const GPREmitterRegOp Emitter = {&Assembler::imul, &Assembler::imul,
+                                            &Assembler::imul};
+    constexpr bool NotLea = false;
+    emitIASRegOpTyGPR(Func, NotLea, Ty, Var, Src, Emitter);
+  }
+}
+
+void InstX86ImulImm::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  Variable *Dest = this->getDest();
+  assert(Dest->getType() == IceType_i16 || Dest->getType() == IceType_i32);
+  assert(llvm::isa<Constant>(this->getSrc(1)));
+  Str << "\t"
+         "imul"
+      << this->getWidthString(Dest->getType()) << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  Dest->emit(Func);
+}
+
+void InstX86ImulImm::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  const Variable *Dest = this->getDest();
+  Type Ty = Dest->getType();
+  assert(llvm::isa<Constant>(this->getSrc(1)));
+  static const ThreeOpImmEmitter<GPRRegister, GPRRegister> Emitter = {
+      &Assembler::imul, &Assembler::imul};
+  emitIASThreeOpImmOps<GPRRegister, GPRRegister, Traits::getEncodedGPR,
+                       Traits::getEncodedGPR>(Func, Ty, Dest, this->getSrc(0),
+                                              this->getSrc(1), Emitter);
+}
+
+void InstX86Insertps::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 3);
+  assert(InstX86Base::getTarget(Func)->getInstructionSet() >= SSE4_1);
+  const Variable *Dest = this->getDest();
+  assert(Dest == this->getSrc(0));
+  Type Ty = Dest->getType();
+  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
+      &Assembler::insertps, &Assembler::insertps};
+  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
+                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(1),
+                                              this->getSrc(2), Emitter);
+}
+
+void InstX86Cbwdq::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  Operand *Src0 = this->getSrc(0);
+  const auto DestReg = this->getDest()->getRegNum();
+  const auto SrcReg = llvm::cast<Variable>(Src0)->getRegNum();
+  (void)DestReg;
+  (void)SrcReg;
+  switch (Src0->getType()) {
+  default:
+    llvm_unreachable("unexpected source type!");
+    break;
+  case IceType_i8:
+    assert(SrcReg == RegisterSet::Reg_al);
+    assert(DestReg == RegisterSet::Reg_ax || DestReg == RegisterSet::Reg_ah);
+    Str << "\t"
+           "cbtw";
+    break;
+  case IceType_i16:
+    assert(SrcReg == RegisterSet::Reg_ax);
+    assert(DestReg == RegisterSet::Reg_dx);
+    Str << "\t"
+           "cwtd";
+    break;
+  case IceType_i32:
+    assert(SrcReg == RegisterSet::Reg_eax);
+    assert(DestReg == RegisterSet::Reg_edx);
+    Str << "\t"
+           "cltd";
+    break;
+  case IceType_i64:
+    assert(Traits::Is64Bit);
+    assert(SrcReg == Traits::getRaxOrDie());
+    assert(DestReg == Traits::getRdxOrDie());
+    Str << "\t"
+           "cqo";
+    break;
+  }
+}
+
+void InstX86Cbwdq::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(this->getSrcSize() == 1);
+  Operand *Src0 = this->getSrc(0);
+  const auto DestReg = this->getDest()->getRegNum();
+  const auto SrcReg = llvm::cast<Variable>(Src0)->getRegNum();
+  (void)DestReg;
+  (void)SrcReg;
+  switch (Src0->getType()) {
+  default:
+    llvm_unreachable("unexpected source type!");
+    break;
+  case IceType_i8:
+    assert(SrcReg == RegisterSet::Reg_al);
+    assert(DestReg == RegisterSet::Reg_ax || DestReg == RegisterSet::Reg_ah);
+    Asm->cbw();
+    break;
+  case IceType_i16:
+    assert(SrcReg == RegisterSet::Reg_ax);
+    assert(DestReg == RegisterSet::Reg_dx);
+    Asm->cwd();
+    break;
+  case IceType_i32:
+    assert(SrcReg == RegisterSet::Reg_eax);
+    assert(DestReg == RegisterSet::Reg_edx);
+    Asm->cdq();
+    break;
+  case IceType_i64:
+    assert(Traits::Is64Bit);
+    assert(SrcReg == Traits::getRaxOrDie());
+    assert(DestReg == Traits::getRdxOrDie());
+    Asm->cqo();
+    break;
+  }
+}
+
+void InstX86Mul::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  assert(llvm::isa<Variable>(this->getSrc(0)));
+  assert(llvm::cast<Variable>(this->getSrc(0))->getRegNum() ==
+         RegisterSet::Reg_eax);
+  // TODO: allow edx?
+  assert(this->getDest()->getRegNum() == RegisterSet::Reg_eax);
+  Str << "\t"
+         "mul"
+      << this->getWidthString(this->getDest()->getType()) << "\t";
+  this->getSrc(1)->emit(Func);
+}
+
+void InstX86Mul::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  assert(llvm::isa<Variable>(this->getSrc(0)));
+  assert(llvm::cast<Variable>(this->getSrc(0))->getRegNum() ==
+         RegisterSet::Reg_eax);
+  // TODO: allow edx?
+  assert(this->getDest()->getRegNum() == RegisterSet::Reg_eax);
+  const Operand *Src = this->getSrc(1);
+  Type Ty = Src->getType();
+  static const GPREmitterOneOp Emitter = {&Assembler::mul, &Assembler::mul};
+  emitIASOpTyGPR(Func, Ty, Src, Emitter);
+}
+
+void InstX86Mul::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  this->dumpDest(Func);
+  Str << " = mul." << this->getDest()->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Shld::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Variable *Dest = this->getDest();
+  assert(this->getSrcSize() == 3);
+  assert(Dest == this->getSrc(0));
+  Str << "\t"
+         "shld"
+      << this->getWidthString(Dest->getType()) << "\t";
+  this->getSrc(2)->emit(Func);
+  Str << ", ";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  Dest->emit(Func);
+}
+
+void InstX86Shld::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 3);
+  assert(this->getDest() == this->getSrc(0));
+  const Variable *Dest = this->getDest();
+  const Operand *Src1 = this->getSrc(1);
+  const Operand *Src2 = this->getSrc(2);
+  static const GPREmitterShiftD Emitter = {&Assembler::shld, &Assembler::shld};
+  emitIASGPRShiftDouble(Func, Dest, Src1, Src2, Emitter);
+}
+
+void InstX86Shld::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  this->dumpDest(Func);
+  Str << " = shld." << this->getDest()->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Shrd::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Variable *Dest = this->getDest();
+  assert(this->getSrcSize() == 3);
+  assert(Dest == this->getSrc(0));
+  Str << "\t"
+         "shrd"
+      << this->getWidthString(Dest->getType()) << "\t";
+  this->getSrc(2)->emit(Func);
+  Str << ", ";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  Dest->emit(Func);
+}
+
+void InstX86Shrd::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 3);
+  assert(this->getDest() == this->getSrc(0));
+  const Variable *Dest = this->getDest();
+  const Operand *Src1 = this->getSrc(1);
+  const Operand *Src2 = this->getSrc(2);
+  static const GPREmitterShiftD Emitter = {&Assembler::shrd, &Assembler::shrd};
+  emitIASGPRShiftDouble(Func, Dest, Src1, Src2, Emitter);
+}
+
+void InstX86Shrd::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  this->dumpDest(Func);
+  Str << " = shrd." << this->getDest()->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Cmov::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Variable *Dest = this->getDest();
+  Str << "\t";
+  assert(Condition != Cond::Br_None);
+  assert(this->getDest()->hasReg());
+  Str << "cmov" << Traits::InstBrAttributes[Condition].DisplayString
+      << this->getWidthString(Dest->getType()) << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  Dest->emit(Func);
+}
+
+void InstX86Cmov::emitIAS(const Cfg *Func) const {
+  assert(Condition != Cond::Br_None);
+  assert(this->getDest()->hasReg());
+  assert(this->getSrcSize() == 2);
+  Operand *Src = this->getSrc(1);
+  Type SrcTy = Src->getType();
+  assert(SrcTy == IceType_i16 || SrcTy == IceType_i32 || (Traits::Is64Bit));
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  auto *Target = InstX86Base::getTarget(Func);
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    if (SrcVar->hasReg()) {
+      Asm->cmov(SrcTy, Condition,
+                Traits::getEncodedGPR(this->getDest()->getRegNum()),
+                Traits::getEncodedGPR(SrcVar->getRegNum()));
+    } else {
+      Asm->cmov(SrcTy, Condition,
+                Traits::getEncodedGPR(this->getDest()->getRegNum()),
+                Target->stackVarToAsmOperand(SrcVar));
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
+    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+    Asm->cmov(SrcTy, Condition,
+              Traits::getEncodedGPR(this->getDest()->getRegNum()),
+              Mem->toAsmAddress(Asm, Target));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void InstX86Cmov::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "cmov" << Traits::InstBrAttributes[Condition].DisplayString << ".";
+  Str << this->getDest()->getType() << " ";
+  this->dumpDest(Func);
+  Str << ", ";
+  this->dumpSources(Func);
+}
+
+void InstX86Cmpps::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  assert(Condition < Cond::Cmpps_Invalid);
+  Type DestTy = this->Dest->getType();
+  Str << "\t"
+         "cmp"
+      << Traits::InstCmppsAttributes[Condition].EmitString
+      << Traits::TypeAttributes[DestTy].PdPsString << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Cmpps::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(this->getSrcSize() == 2);
+  assert(Condition < Cond::Cmpps_Invalid);
+  // Assuming there isn't any load folding for cmpps, and that vector
+  // constants are not allowed in PNaCl.
+  assert(llvm::isa<Variable>(this->getSrc(1)));
+  auto *Target = InstX86Base::getTarget(Func);
+  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(1));
+  if (SrcVar->hasReg()) {
+    Asm->cmpps(this->getDest()->getType(),
+               Traits::getEncodedXmm(this->getDest()->getRegNum()),
+               Traits::getEncodedXmm(SrcVar->getRegNum()), Condition);
+  } else {
+    Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
+    Asm->cmpps(this->getDest()->getType(),
+               Traits::getEncodedXmm(this->getDest()->getRegNum()),
+               SrcStackAddr, Condition);
+  }
+}
+
+void InstX86Cmpps::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  assert(Condition < Cond::Cmpps_Invalid);
+  this->dumpDest(Func);
+  Str << " = cmp" << Traits::InstCmppsAttributes[Condition].EmitString
+      << "ps"
+         "\t";
+  this->dumpSources(Func);
+}
+
+void InstX86Cmpxchg::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 3);
+  if (this->Locked) {
+    Str << "\t"
+           "lock";
+  }
+  Str << "\t"
+         "cmpxchg"
+      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
+  this->getSrc(2)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+}
+
+void InstX86Cmpxchg::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 3);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Type Ty = this->getSrc(0)->getType();
+  auto *Target = InstX86Base::getTarget(Func);
+  const auto Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
+  assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+  const Address Addr = Mem->toAsmAddress(Asm, Target);
+  const auto *VarReg = llvm::cast<Variable>(this->getSrc(2));
+  assert(VarReg->hasReg());
+  const GPRRegister Reg = Traits::getEncodedGPR(VarReg->getRegNum());
+  Asm->cmpxchg(Ty, Addr, Reg, this->Locked);
+}
+
+void InstX86Cmpxchg::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  if (this->Locked) {
+    Str << "lock ";
+  }
+  Str << "cmpxchg." << this->getSrc(0)->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Cmpxchg8b::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 5);
+  if (this->Locked) {
+    Str << "\t"
+           "lock";
+  }
+  Str << "\t"
+         "cmpxchg8b\t";
+  this->getSrc(0)->emit(Func);
+}
+
+void InstX86Cmpxchg8b::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 5);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  const auto Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
+  assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+  auto *Target = InstX86Base::getTarget(Func);
+  const Address Addr = Mem->toAsmAddress(Asm, Target);
+  Asm->cmpxchg8b(Addr, this->Locked);
+}
+
+void InstX86Cmpxchg8b::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  if (this->Locked) {
+    Str << "lock ";
+  }
+  Str << "cmpxchg8b ";
+  this->dumpSources(Func);
+}
+
+void InstX86Cvt::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  Str << "\t"
+         "cvt";
+  if (isTruncating())
+    Str << "t";
+  Str << Traits::TypeAttributes[this->getSrc(0)->getType()].CvtString << "2"
+      << Traits::TypeAttributes[this->getDest()->getType()].CvtString << "\t";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Cvt::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 1);
+  const Variable *Dest = this->getDest();
+  const Operand *Src = this->getSrc(0);
+  Type DestTy = Dest->getType();
+  Type SrcTy = Src->getType();
+  switch (Variant) {
+  case Si2ss: {
+    assert(isScalarIntegerType(SrcTy));
+    if (!Traits::Is64Bit) {
+      assert(typeWidthInBytes(SrcTy) <= 4);
+    } else {
+      assert(SrcTy == IceType_i32 || SrcTy == IceType_i64);
+    }
+    assert(isScalarFloatingType(DestTy));
+    static const CastEmitterRegOp<XmmRegister, GPRRegister> Emitter = {
+        &Assembler::cvtsi2ss, &Assembler::cvtsi2ss};
+    emitIASCastRegOp<XmmRegister, GPRRegister, Traits::getEncodedXmm,
+                     Traits::getEncodedGPR>(Func, DestTy, Dest, SrcTy, Src,
+                                            Emitter);
+    return;
+  }
+  case Tss2si: {
+    assert(isScalarFloatingType(SrcTy));
+    assert(isScalarIntegerType(DestTy));
+    if (Traits::Is64Bit) {
+      assert(DestTy == IceType_i32 || DestTy == IceType_i64);
+    } else {
+      assert(typeWidthInBytes(DestTy) <= 4);
+    }
+    static const CastEmitterRegOp<GPRRegister, XmmRegister> Emitter = {
+        &Assembler::cvttss2si, &Assembler::cvttss2si};
+    emitIASCastRegOp<GPRRegister, XmmRegister, Traits::getEncodedGPR,
+                     Traits::getEncodedXmm>(Func, DestTy, Dest, SrcTy, Src,
+                                            Emitter);
+    return;
+  }
+  case Ss2si: {
+    assert(isScalarFloatingType(SrcTy));
+    assert(isScalarIntegerType(DestTy));
+    if (Traits::Is64Bit) {
+      assert(DestTy == IceType_i32 || DestTy == IceType_i64);
+    } else {
+      assert(typeWidthInBytes(DestTy) <= 4);
+    }
+    static const CastEmitterRegOp<GPRRegister, XmmRegister> Emitter = {
+        &Assembler::cvtss2si, &Assembler::cvtss2si};
+    emitIASCastRegOp<GPRRegister, XmmRegister, Traits::getEncodedGPR,
+                     Traits::getEncodedXmm>(Func, DestTy, Dest, SrcTy, Src,
+                                            Emitter);
+    return;
+  }
+  case Float2float: {
+    assert(isScalarFloatingType(SrcTy));
+    assert(isScalarFloatingType(DestTy));
+    assert(DestTy != SrcTy);
+    static const XmmEmitterRegOp Emitter = {&Assembler::cvtfloat2float,
+                                            &Assembler::cvtfloat2float};
+    emitIASRegOpTyXMM(Func, SrcTy, Dest, Src, Emitter);
+    return;
+  }
+  case Dq2ps: {
+    assert(isVectorIntegerType(SrcTy));
+    assert(isVectorFloatingType(DestTy));
+    static const XmmEmitterRegOp Emitter = {&Assembler::cvtdq2ps,
+                                            &Assembler::cvtdq2ps};
+    emitIASRegOpTyXMM(Func, DestTy, Dest, Src, Emitter);
+    return;
+  }
+  case Tps2dq: {
+    assert(isVectorFloatingType(SrcTy));
+    assert(isVectorIntegerType(DestTy));
+    static const XmmEmitterRegOp Emitter = {&Assembler::cvttps2dq,
+                                            &Assembler::cvttps2dq};
+    emitIASRegOpTyXMM(Func, DestTy, Dest, Src, Emitter);
+    return;
+  }
+  case Ps2dq: {
+    assert(isVectorFloatingType(SrcTy));
+    assert(isVectorIntegerType(DestTy));
+    static const XmmEmitterRegOp Emitter = {&Assembler::cvtps2dq,
+                                            &Assembler::cvtps2dq};
+    emitIASRegOpTyXMM(Func, DestTy, Dest, Src, Emitter);
+    return;
+  }
+  }
+}
+
+void InstX86Cvt::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  this->dumpDest(Func);
+  Str << " = cvt";
+  if (isTruncating())
+    Str << "t";
+  Str << Traits::TypeAttributes[this->getSrc(0)->getType()].CvtString << "2"
+      << Traits::TypeAttributes[this->getDest()->getType()].CvtString << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Round::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  Str << "\t" << this->Opcode
+      << Traits::TypeAttributes[this->getDest()->getType()].SpSdString << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Round::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  assert(InstX86Base::getTarget(Func)->getInstructionSet() >= SSE4_1);
+  const Variable *Dest = this->getDest();
+  Type Ty = Dest->getType();
+  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
+      &Assembler::round, &Assembler::round};
+  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
+                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(0),
+                                              this->getSrc(1), Emitter);
+}
+
+void InstX86Icmp::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  Str << "\t"
+         "cmp"
+      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+}
+
+void InstX86Icmp::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  const Operand *Src0 = this->getSrc(0);
+  const Operand *Src1 = this->getSrc(1);
+  Type Ty = Src0->getType();
+  static const GPREmitterRegOp RegEmitter = {&Assembler::cmp, &Assembler::cmp,
+                                             &Assembler::cmp};
+  static const GPREmitterAddrOp AddrEmitter = {&Assembler::cmp,
+                                               &Assembler::cmp};
+  if (const auto *SrcVar0 = llvm::dyn_cast<Variable>(Src0)) {
+    if (SrcVar0->hasReg()) {
+      constexpr bool NotLea = false;
+      emitIASRegOpTyGPR(Func, NotLea, Ty, SrcVar0, Src1, RegEmitter);
+      return;
+    }
+  }
+  emitIASAsAddrOpTyGPR(Func, Ty, Src0, Src1, AddrEmitter);
+}
+
+void InstX86Icmp::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "cmp." << this->getSrc(0)->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Ucomiss::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  Str << "\t"
+         "ucomi"
+      << Traits::TypeAttributes[this->getSrc(0)->getType()].SdSsString << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+}
+
+void InstX86Ucomiss::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  // Currently src0 is always a variable by convention, to avoid having two
+  // memory operands.
+  assert(llvm::isa<Variable>(this->getSrc(0)));
+  const auto *Src0Var = llvm::cast<Variable>(this->getSrc(0));
+  Type Ty = Src0Var->getType();
+  static const XmmEmitterRegOp Emitter = {&Assembler::ucomiss,
+                                          &Assembler::ucomiss};
+  emitIASRegOpTyXMM(Func, Ty, Src0Var, this->getSrc(1), Emitter);
+}
+
+void InstX86Ucomiss::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "ucomiss." << this->getSrc(0)->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86UD2::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 0);
+  Str << "\t"
+         "ud2";
+}
+
+void InstX86UD2::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Asm->ud2();
+}
+
+void InstX86UD2::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "ud2";
+}
+
+void InstX86Int3::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 0);
+  Str << "\t"
+         "int 3";
+}
+
+void InstX86Int3::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Asm->int3();
+}
+
+void InstX86Int3::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "int 3";
+}
+
+void InstX86Test::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  Str << "\t"
+         "test"
+      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+}
+
+void InstX86Test::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  const Operand *Src0 = this->getSrc(0);
+  const Operand *Src1 = this->getSrc(1);
+  Type Ty = Src0->getType();
+  // The Reg/Addr form of test is not encodeable.
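+  // (x86 encodes only "test r/m, r" and "test r/m, imm"; there is no form
+  // that reads memory into the first operand, hence the nullptr slot.)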
+  static const GPREmitterRegOp RegEmitter = {&Assembler::test, nullptr,
+                                             &Assembler::test};
+  static const GPREmitterAddrOp AddrEmitter = {&Assembler::test,
+                                               &Assembler::test};
+  if (const auto *SrcVar0 = llvm::dyn_cast<Variable>(Src0)) {
+    if (SrcVar0->hasReg()) {
+      constexpr bool NotLea = false;
+      emitIASRegOpTyGPR(Func, NotLea, Ty, SrcVar0, Src1, RegEmitter);
+      return;
+    }
+  }
+  emitIASAsAddrOpTyGPR(Func, Ty, Src0, Src1, AddrEmitter);
+}
+
+void InstX86Test::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "test." << this->getSrc(0)->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Mfence::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 0);
+  Str << "\t"
+         "mfence";
+}
+
+void InstX86Mfence::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Asm->mfence();
+}
+
+void InstX86Mfence::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "mfence";
+}
+
+void InstX86Store::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  Type Ty = this->getSrc(0)->getType();
+  Str << "\t"
+         "mov"
+      << this->getWidthString(Ty) << Traits::TypeAttributes[Ty].SdSsString
+      << "\t";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getSrc(1)->emit(Func);
+}
+
+void InstX86Store::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  const Operand *Dest = this->getSrc(1);
+  const Operand *Src = this->getSrc(0);
+  Type DestTy = Dest->getType();
+  if (isScalarFloatingType(DestTy)) {
+    // Src must be a register, since Dest is a Mem operand of some kind.
+    const auto *SrcVar = llvm::cast<Variable>(Src);
+    assert(SrcVar->hasReg());
+    XmmRegister SrcReg = Traits::getEncodedXmm(SrcVar->getRegNum());
+    Assembler *Asm = Func->getAssembler<Assembler>();
+    auto *Target = InstX86Base::getTarget(Func);
+    if (const auto *DestVar = llvm::dyn_cast<Variable>(Dest)) {
+      assert(!DestVar->hasReg());
+      Address StackAddr(Target->stackVarToAsmOperand(DestVar));
+      Asm->movss(DestTy, StackAddr, SrcReg);
+    } else {
+      const auto DestMem = llvm::cast<X86OperandMem>(Dest);
+      assert(DestMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+      Asm->movss(DestTy, DestMem->toAsmAddress(Asm, Target), SrcReg);
+    }
+    return;
+  } else {
+    assert(isScalarIntegerType(DestTy));
+    static const GPREmitterAddrOp GPRAddrEmitter = {&Assembler::mov,
+                                                    &Assembler::mov};
+    emitIASAsAddrOpTyGPR(Func, DestTy, Dest, Src, GPRAddrEmitter);
+  }
+}
+
+void InstX86Store::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "mov." << this->getSrc(0)->getType() << " ";
+  this->getSrc(1)->dump(Func);
+  Str << ", ";
+  this->getSrc(0)->dump(Func);
+}
+
+void InstX86StoreP::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  assert(isVectorType(this->getSrc(1)->getType()));
+  Str << "\t"
+         "movups\t";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getSrc(1)->emit(Func);
+}
+
+void InstX86StoreP::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(this->getSrcSize() == 2);
+  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(0));
+  const auto DestMem = llvm::cast<X86OperandMem>(this->getSrc(1));
+  assert(DestMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+  assert(SrcVar->hasReg());
+  auto *Target = InstX86Base::getTarget(Func);
+  Asm->movups(DestMem->toAsmAddress(Asm, Target),
+              Traits::getEncodedXmm(SrcVar->getRegNum()));
+}
+
+void InstX86StoreP::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "storep." << this->getSrc(0)->getType() << " ";
+  this->getSrc(1)->dump(Func);
+  Str << ", ";
+  this->getSrc(0)->dump(Func);
+}
+
+void InstX86StoreQ::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  assert(this->getSrc(1)->getType() == IceType_i64 ||
+         this->getSrc(1)->getType() == IceType_f64 ||
+         isVectorType(this->getSrc(1)->getType()));
+  Str << "\t"
+         "movq\t";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getSrc(1)->emit(Func);
+}
+
+void InstX86StoreQ::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(this->getSrcSize() == 2);
+  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(0));
+  const auto DestMem = llvm::cast<X86OperandMem>(this->getSrc(1));
+  assert(DestMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+  assert(SrcVar->hasReg());
+  auto *Target = InstX86Base::getTarget(Func);
+  Asm->movq(DestMem->toAsmAddress(Asm, Target),
+            Traits::getEncodedXmm(SrcVar->getRegNum()));
+}
+
+void InstX86StoreQ::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "storeq." << this->getSrc(0)->getType() << " ";
+  this->getSrc(1)->dump(Func);
+  Str << ", ";
+  this->getSrc(0)->dump(Func);
+}
+
+void InstX86StoreD::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  assert(this->getSrc(1)->getType() == IceType_i64 ||
+         this->getSrc(1)->getType() == IceType_f64 ||
+         isVectorType(this->getSrc(1)->getType()));
+  Str << "\t"
+         "movd\t";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getSrc(1)->emit(Func);
+}
+
+void InstX86StoreD::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(this->getSrcSize() == 2);
+  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(0));
+  const auto DestMem = llvm::cast<X86OperandMem>(this->getSrc(1));
+  assert(DestMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+  assert(SrcVar->hasReg());
+  auto *Target = InstX86Base::getTarget(Func);
+  Asm->movd(SrcVar->getType(), DestMem->toAsmAddress(Asm, Target),
+            Traits::getEncodedXmm(SrcVar->getRegNum()));
+}
+
+void InstX86StoreD::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "stored." << this->getSrc(0)->getType() << " ";
+  this->getSrc(1)->dump(Func);
+  Str << ", ";
+  this->getSrc(0)->dump(Func);
+}
+
+void InstX86Lea::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  if (auto *Add = this->deoptToAddOrNull(Func)) {
+    Add->emit(Func);
+    return;
+  }
+
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  assert(this->getDest()->hasReg());
+  Str << "\t"
+         "lea"
+      << this->getWidthString(this->getDest()->getType()) << "\t";
+  Operand *Src0 = this->getSrc(0);
+  if (const auto *Src0Var = llvm::dyn_cast<Variable>(Src0)) {
+    Type Ty = Src0Var->getType();
+    // lea on x86-32 doesn't accept mem128 operands, so cast Src0Var to an
+    // acceptable type.
+    Src0Var->asType(Func, isVectorType(Ty) ? IceType_i32 : Ty, RegNumT())
+        ->emit(Func);
+  } else {
+    Src0->emit(Func);
+  }
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Lea::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 1);
+  const Variable *Var = this->getDest();
+  Type Ty = Var->getType();
+  const Operand *Src = this->getSrc(0);
+  bool IsLea = true;
+
+  if (auto *Add = this->deoptToAddOrNull(Func)) {
+    Add->emitIAS(Func);
+    return;
+  }
+
+  emitIASRegOpTyGPR(Func, IsLea, Ty, Var, Src, Emitter);
+}
+
+Inst *InstX86Lea::deoptToAddOrNull(const Cfg *Func) const {
+  // Revert back to Add when the Lea is a 2-address instruction.
+  // Caller has to emit, this just produces the add instruction.
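+  // For example (a sketch): "lea 4(%eax), %eax" computes the same result as
+  // "add $4, %eax", so it can be rebuilt as an add on the same register.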
+  if (auto *MemOp = llvm::dyn_cast<X86OperandMem>(this->getSrc(0))) {
+    if (getFlags().getAggressiveLea() &&
+        MemOp->getBase()->getRegNum() == this->getDest()->getRegNum() &&
+        MemOp->getIndex() == nullptr && MemOp->getShift() == 0) {
+      auto *Add = InstX86Add::create(const_cast<Cfg *>(Func), this->getDest(),
+                                     MemOp->getOffset());
+      // TODO(manasijm): Remove const_cast by emitting code for add
+      // directly.
+      return Add;
+    }
+  }
+  return nullptr;
+}
+
+void InstX86Mov::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  Operand *Src = this->getSrc(0);
+  Type SrcTy = Src->getType();
+  Type DestTy = this->getDest()->getType();
+  if (Traits::Is64Bit && DestTy == IceType_i64 &&
+      llvm::isa<ConstantInteger64>(Src) &&
+      !Utils::IsInt(32, llvm::cast<ConstantInteger64>(Src)->getValue())) {
+    Str << "\t"
+           "movabs"
+           "\t";
+  } else {
+    Str << "\t"
+           "mov"
+        << (!isScalarFloatingType(DestTy)
+                ? this->getWidthString(DestTy)
+                : Traits::TypeAttributes[DestTy].SdSsString)
+        << "\t";
+  }
+  // For an integer truncation operation, src is wider than dest. In this case,
+  // we use a mov instruction whose data width matches the narrower dest.
+  // TODO: This assert disallows usages such as copying a floating
+  // point value between a vector and a scalar (which movss is used for). Clean
+  // this up.
+  assert(InstX86Base::getTarget(Func)->typeWidthInBytesOnStack(DestTy) ==
+         InstX86Base::getTarget(Func)->typeWidthInBytesOnStack(SrcTy));
+  const Operand *NewSrc = Src;
+  if (auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    RegNumT NewRegNum;
+    if (SrcVar->hasReg())
+      NewRegNum = Traits::getGprForType(DestTy, SrcVar->getRegNum());
+    if (SrcTy != DestTy)
+      NewSrc = SrcVar->asType(Func, DestTy, NewRegNum);
+  }
+  NewSrc->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Mov::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 1);
+  const Variable *Dest = this->getDest();
+  const Operand *Src = this->getSrc(0);
+  Type DestTy = Dest->getType();
+  Type SrcTy = Src->getType();
+  // Mov can be used for GPRs or XMM registers. Also, the type does not
+  // necessarily match (Mov can be used for bitcasts). However, when the type
+  // does not match, one of the operands must be a register. Thus, the strategy
+  // is to find out if Src or Dest is a register, then use that register's
+  // type to decide on which emitter set to use. The emitter set will include
+  // reg-reg movs, but that case should be unused when the types don't match.
+  static const XmmEmitterRegOp XmmRegEmitter = {&Assembler::movss,
+                                                &Assembler::movss};
+  static const GPREmitterRegOp GPRRegEmitter = {
+      &Assembler::mov, &Assembler::mov, &Assembler::mov};
+  static const GPREmitterAddrOp GPRAddrEmitter = {&Assembler::mov,
+                                                  &Assembler::mov};
+  // For an integer truncation operation, src is wider than dest. In this case,
+  // we use a mov instruction whose data width matches the narrower dest.
+  // TODO: This assert disallows usages such as copying a floating
+  // point value between a vector and a scalar (which movss is used for). Clean
+  // this up.
+  auto *Target = InstX86Base::getTarget(Func);
+  assert(Target->typeWidthInBytesOnStack(this->getDest()->getType()) ==
+         Target->typeWidthInBytesOnStack(Src->getType()));
+  if (Dest->hasReg()) {
+    if (isScalarFloatingType(DestTy)) {
+      emitIASRegOpTyXMM(Func, DestTy, Dest, Src, XmmRegEmitter);
+      return;
+    } else {
+      assert(isScalarIntegerType(DestTy));
+      // For an integer truncation, emit with DestTy so the mov's data width
+      // matches the narrower Dest (see above note).
+      constexpr bool NotLea = false;
+      emitIASRegOpTyGPR(Func, NotLea, DestTy, Dest, Src, GPRRegEmitter);
+      return;
+    }
+  } else {
+    // Dest must be Stack and Src *could* be a register. Use Src's type to
+    // decide on the emitters.
+    Address StackAddr(Target->stackVarToAsmOperand(Dest));
+    if (isScalarFloatingType(SrcTy)) {
+      // Src must be a register.
+      const auto *SrcVar = llvm::cast<Variable>(Src);
+      assert(SrcVar->hasReg());
+      Assembler *Asm = Func->getAssembler<Assembler>();
+      Asm->movss(SrcTy, StackAddr, Traits::getEncodedXmm(SrcVar->getRegNum()));
+      return;
+    } else if (isVectorType(SrcTy)) {
+      // Src must be a register
+      const auto *SrcVar = llvm::cast<Variable>(Src);
+      assert(SrcVar->hasReg());
+      Assembler *Asm = Func->getAssembler<Assembler>();
+      Asm->movups(StackAddr, Traits::getEncodedXmm(SrcVar->getRegNum()));
+    } else {
+      // Src can be a register or immediate.
+      assert(isScalarIntegerType(SrcTy));
+      emitIASAddrOpTyGPR(Func, SrcTy, StackAddr, Src, GPRAddrEmitter);
+      return;
+    }
+    return;
+  }
+}
+
+void InstX86Movd::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  assert(this->getSrcSize() == 1);
+  Variable *Dest = this->getDest();
+  Operand *Src = this->getSrc(0);
+
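+  // A 64-bit GPR <-> XMM bitcast (i64 <-> f64) needs movq; movd only
+  // transfers 32 bits.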
+  if (Dest->getType() == IceType_i64 || Src->getType() == IceType_i64) {
+    assert(Dest->getType() == IceType_f64 || Src->getType() == IceType_f64);
+    assert(Dest->getType() != Src->getType());
+    Ostream &Str = Func->getContext()->getStrEmit();
+    Str << "\t"
+           "movq"
+           "\t";
+    Src->emit(Func);
+    Str << ", ";
+    Dest->emit(Func);
+    return;
+  }
+
+  InstX86BaseUnaryopXmm<InstX86Base::Movd>::emit(Func);
+}
+
+void InstX86Movd::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(this->getSrcSize() == 1);
+  const Variable *Dest = this->getDest();
+  auto *Target = InstX86Base::getTarget(Func);
+  // For insert/extract element (one of Src/Dest is an Xmm vector and the other
+  // is an int type).
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(this->getSrc(0))) {
+    if (SrcVar->getType() == IceType_i32 ||
+        (Traits::Is64Bit && SrcVar->getType() == IceType_i64)) {
+      assert(isVectorType(Dest->getType()) ||
+             (isScalarFloatingType(Dest->getType()) &&
+              typeWidthInBytes(SrcVar->getType()) ==
+                  typeWidthInBytes(Dest->getType())));
+      assert(Dest->hasReg());
+      XmmRegister DestReg = Traits::getEncodedXmm(Dest->getRegNum());
+      if (SrcVar->hasReg()) {
+        Asm->movd(SrcVar->getType(), DestReg,
+                  Traits::getEncodedGPR(SrcVar->getRegNum()));
+      } else {
+        Address StackAddr(Target->stackVarToAsmOperand(SrcVar));
+        Asm->movd(SrcVar->getType(), DestReg, StackAddr);
+      }
+    } else {
+      assert(isVectorType(SrcVar->getType()) ||
+             (isScalarFloatingType(SrcVar->getType()) &&
+              typeWidthInBytes(SrcVar->getType()) ==
+                  typeWidthInBytes(Dest->getType())));
+      assert(SrcVar->hasReg());
+      assert(Dest->getType() == IceType_i32 ||
+             (Traits::Is64Bit && Dest->getType() == IceType_i64));
+      XmmRegister SrcReg = Traits::getEncodedXmm(SrcVar->getRegNum());
+      if (Dest->hasReg()) {
+        Asm->movd(Dest->getType(), Traits::getEncodedGPR(Dest->getRegNum()),
+                  SrcReg);
+      } else {
+        Address StackAddr(Target->stackVarToAsmOperand(Dest));
+        Asm->movd(Dest->getType(), StackAddr, SrcReg);
+      }
+    }
+  } else {
+    assert(Dest->hasReg());
+    XmmRegister DestReg = Traits::getEncodedXmm(Dest->getRegNum());
+    auto *Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
+    Asm->movd(Mem->getType(), DestReg, Mem->toAsmAddress(Asm, Target));
+  }
+}
+
+void InstX86Movp::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  // TODO(wala,stichnot): movups works with all vector operands, but there
+  // exist other instructions (movaps, movdqa, movdqu) that may perform better,
+  // depending on the data type and alignment of the operands.
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  Str << "\t"
+         "movups\t";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Movp::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 1);
+  assert(isVectorType(this->getDest()->getType()));
+  const Variable *Dest = this->getDest();
+  const Operand *Src = this->getSrc(0);
+  static const XmmEmitterMovOps Emitter = {
+      &Assembler::movups, &Assembler::movups, &Assembler::movups};
+  emitIASMovlikeXMM(Func, Dest, Src, Emitter);
+}
+
+void InstX86Movq::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  assert(this->getDest()->getType() == IceType_i64 ||
+         this->getDest()->getType() == IceType_f64);
+  Str << "\t"
+         "movq"
+         "\t";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Movq::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 1);
+  assert(this->getDest()->getType() == IceType_i64 ||
+         this->getDest()->getType() == IceType_f64 ||
+         isVectorType(this->getDest()->getType()));
+  const Variable *Dest = this->getDest();
+  const Operand *Src = this->getSrc(0);
+  static const XmmEmitterMovOps Emitter = {&Assembler::movq, &Assembler::movq,
+                                           &Assembler::movq};
+  emitIASMovlikeXMM(Func, Dest, Src, Emitter);
+}
+
+void InstX86MovssRegs::emitIAS(const Cfg *Func) const {
+  // This Binop variant is only intended to be used for reg-reg moves where
+  // part of the Dest register is untouched.
+  assert(this->getSrcSize() == 2);
+  const Variable *Dest = this->getDest();
+  assert(Dest == this->getSrc(0));
+  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(1));
+  assert(Dest->hasReg() && SrcVar->hasReg());
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Asm->movss(IceType_f32, Traits::getEncodedXmm(Dest->getRegNum()),
+             Traits::getEncodedXmm(SrcVar->getRegNum()));
+}
+
+void InstX86Movsx::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 1);
+  const Variable *Dest = this->getDest();
+  const Operand *Src = this->getSrc(0);
+  // Dest must be a > 8-bit register, but Src can be 8-bit. In practice we just
+  // use the full register for Dest to avoid having an OperandSizeOverride
+  // prefix. It also allows us to only dispatch on SrcTy.
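+  // For example (sketch): an i8 -> i16 movsx is emitted as "movsbl", sign
+  // extending into the full 32-bit register.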
+  Type SrcTy = Src->getType();
+  assert(typeWidthInBytes(Dest->getType()) > 1);
+  assert(typeWidthInBytes(Dest->getType()) > typeWidthInBytes(SrcTy));
+  constexpr bool NotLea = false;
+  emitIASRegOpTyGPR<false, true>(Func, NotLea, SrcTy, Dest, Src, this->Emitter);
+}
+
+bool InstX86Movzx::mayBeElided(const Variable *Dest,
+                               const Operand *SrcOpnd) const {
+  assert(Traits::Is64Bit);
+  const auto *Src = llvm::dyn_cast<Variable>(SrcOpnd);
+
+  // Src is not a Variable, so it does not have a register. Movzx can't be
+  // elided.
+  if (Src == nullptr)
+    return false;
+
+  // Movzx to/from memory can't be elided.
+  if (!Src->hasReg() || !Dest->hasReg())
+    return false;
+
+  // Reg/reg move with different source and dest can't be elided.
+  if (Traits::getEncodedGPR(Src->getRegNum()) !=
+      Traits::getEncodedGPR(Dest->getRegNum()))
+    return false;
+
+  // A 32- to 64-bit movzx must sometimes be kept for x86-64 sandboxing.
+  return !MustKeep;
+}
+
+void InstX86Movzx::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  if (Traits::Is64Bit) {
+    // There's no movzx %eXX, %rXX. To zero extend 32- to 64-bits, we emit a
+    // mov %eXX, %eXX. The processor will still do a movzx[bw]q.
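+    // (On x86-64 any 32-bit register write implicitly clears bits 63:32 of
+    // the full register, so the plain mov is a correct zero extension.)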
+    assert(this->getSrcSize() == 1);
+    const Operand *Src = this->getSrc(0);
+    const Variable *Dest = this->Dest;
+    if (Src->getType() == IceType_i32 && Dest->getType() == IceType_i64) {
+      Ostream &Str = Func->getContext()->getStrEmit();
+      if (mayBeElided(Dest, Src)) {
+        Str << "\t/* elided movzx */";
+      } else {
+        Str << "\t"
+               "mov"
+               "\t";
+        Src->emit(Func);
+        Str << ", ";
+        Dest->asType(Func, IceType_i32,
+                     Traits::getGprForType(IceType_i32, Dest->getRegNum()))
+            ->emit(Func);
+        Str << " /* movzx */";
+      }
+      return;
+    }
+  }
+  InstX86BaseUnaryopGPR<InstX86Base::Movzx>::emit(Func);
+}
+
+void InstX86Movzx::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 1);
+  const Variable *Dest = this->getDest();
+  const Operand *Src = this->getSrc(0);
+  Type SrcTy = Src->getType();
+  assert(typeWidthInBytes(Dest->getType()) > 1);
+  assert(typeWidthInBytes(Dest->getType()) > typeWidthInBytes(SrcTy));
+  if (Traits::Is64Bit) {
+    if (Src->getType() == IceType_i32 && Dest->getType() == IceType_i64 &&
+        mayBeElided(Dest, Src)) {
+      return;
+    }
+  }
+  constexpr bool NotLea = false;
+  emitIASRegOpTyGPR<false, true>(Func, NotLea, SrcTy, Dest, Src, this->Emitter);
+}
+
+void InstX86Nop::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  // TODO: Emit the right code for each variant.
+  Str << "\t"
+         "nop\t/* variant = "
+      << Variant << " */";
+}
+
+void InstX86Nop::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  // TODO: Emit the right code for the variant.
+  Asm->nop();
+}
+
+void InstX86Nop::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "nop (variant = " << Variant << ")";
+}
+
+void InstX86Fld::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  Type Ty = this->getSrc(0)->getType();
+  const auto *Var = llvm::dyn_cast<Variable>(this->getSrc(0));
+  if (Var && Var->hasReg()) {
+    // This is a physical xmm register, so we need to spill it to a temporary
+    // stack slot.  Function prolog emission guarantees that there is sufficient
+    // space to do this.
+    Str << "\t"
+           "mov"
+        << Traits::TypeAttributes[Ty].SdSsString << "\t";
+    Var->emit(Func);
+    Str << ", (%esp)\n"
+           "\t"
+           "fld"
+        << this->getFldString(Ty)
+        << "\t"
+           "(%esp)";
+    return;
+  }
+  Str << "\t"
+         "fld"
+      << this->getFldString(Ty) << "\t";
+  this->getSrc(0)->emit(Func);
+}
+
+void InstX86Fld::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(this->getSrcSize() == 1);
+  const Operand *Src = this->getSrc(0);
+  auto *Target = InstX86Base::getTarget(Func);
+  Type Ty = Src->getType();
+  if (const auto *Var = llvm::dyn_cast<Variable>(Src)) {
+    if (Var->hasReg()) {
+      // This is a physical xmm register, so we need to spill it to a temporary
+      // stack slot.  Function prolog emission guarantees that there is
+      // sufficient space to do this.
+      Address StackSlot =
+          Address(RegisterSet::Encoded_Reg_esp, 0, AssemblerFixup::NoFixup);
+      Asm->movss(Ty, StackSlot, Traits::getEncodedXmm(Var->getRegNum()));
+      Asm->fld(Ty, StackSlot);
+    } else {
+      Address StackAddr(Target->stackVarToAsmOperand(Var));
+      Asm->fld(Ty, StackAddr);
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
+    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+    Asm->fld(Ty, Mem->toAsmAddress(Asm, Target));
+  } else if (const auto *Imm = llvm::dyn_cast<Constant>(Src)) {
+    Asm->fld(Ty, Traits::Address::ofConstPool(Asm, Imm));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void InstX86Fld::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "fld." << this->getSrc(0)->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Fstp::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 0);
+  // TODO(jvoung,stichnot): Utilize this by setting Dest to nullptr to
+  // "partially" delete the fstp if the Dest is unused. Even if Dest is unused,
+  // the fstp should be kept for the SideEffects of popping the stack.
+  if (!this->getDest()) {
+    Str << "\t"
+           "fstp\t"
+           "st(0)";
+    return;
+  }
+  Type Ty = this->getDest()->getType();
+  if (!this->getDest()->hasReg()) {
+    Str << "\t"
+           "fstp"
+        << this->getFldString(Ty) << "\t";
+    this->getDest()->emit(Func);
+    return;
+  }
+  // Dest is a physical (xmm) register, so st(0) needs to go through memory.
+  // Hack this by using caller-reserved memory at the top of stack, spilling
+  // st(0) there, and loading it into the xmm register.
+  Str << "\t"
+         "fstp"
+      << this->getFldString(Ty)
+      << "\t"
+         "(%esp)\n";
+  Str << "\t"
+         "mov"
+      << Traits::TypeAttributes[Ty].SdSsString
+      << "\t"
+         "(%esp), ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Fstp::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(this->getSrcSize() == 0);
+  const Variable *Dest = this->getDest();
+  // TODO(jvoung,stichnot): Utilize this by setting Dest to nullptr to
+  // "partially" delete the fstp if the Dest is unused. Even if Dest is unused,
+  // the fstp should be kept for the SideEffects of popping the stack.
+  if (!Dest) {
+    Asm->fstp(RegisterSet::getEncodedSTReg(0));
+    return;
+  }
+  auto *Target = InstX86Base::getTarget(Func);
+  Type Ty = Dest->getType();
+  if (!Dest->hasReg()) {
+    Address StackAddr(Target->stackVarToAsmOperand(Dest));
+    Asm->fstp(Ty, StackAddr);
+  } else {
+    // Dest is a physical (xmm) register, so st(0) needs to go through memory.
+    // Hack this by using caller-reserved memory at the top of stack, spilling
+    // st(0) there, and loading it into the xmm register.
+    Address StackSlot =
+        Address(RegisterSet::Encoded_Reg_esp, 0, AssemblerFixup::NoFixup);
+    Asm->fstp(Ty, StackSlot);
+    Asm->movss(Ty, Traits::getEncodedXmm(Dest->getRegNum()), StackSlot);
+  }
+}
+
+void InstX86Fstp::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  this->dumpDest(Func);
+  Str << " = fstp." << this->getDest()->getType() << ", st(0)";
+}
+
+void InstX86Pextr::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  // pextrb and pextrd are SSE4.1 instructions.
+  Str << "\t" << this->Opcode
+      << Traits::TypeAttributes[this->getSrc(0)->getType()].IntegralString
+      << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  Variable *Dest = this->getDest();
+  // pextrw must take a register dest. There is an SSE4.1 version that takes a
+  // memory dest, but we aren't using it. For uniformity, just restrict them
+  // all to have a register dest for now.
+  assert(Dest->hasReg());
+  Dest->asType(Func, IceType_i32, Dest->getRegNum())->emit(Func);
+}
+
+void InstX86Pextr::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  // pextrb and pextrd are SSE4.1 instructions.
+  const Variable *Dest = this->getDest();
+  Type DispatchTy = Traits::getInVectorElementType(this->getSrc(0)->getType());
+  // pextrw must take a register dest. There is an SSE4.1 version that takes a
+  // memory dest, but we aren't using it. For uniformity, just restrict them
+  // all to have a register dest for now.
+  assert(Dest->hasReg());
+  // pextrw's Src(0) must be a register (both SSE4.1 and SSE2).
+  assert(llvm::cast<Variable>(this->getSrc(0))->hasReg());
+  static const ThreeOpImmEmitter<GPRRegister, XmmRegister> Emitter = {
+      &Assembler::pextr, nullptr};
+  emitIASThreeOpImmOps<GPRRegister, XmmRegister, Traits::getEncodedGPR,
+                       Traits::getEncodedXmm>(
+      Func, DispatchTy, Dest, this->getSrc(0), this->getSrc(1), Emitter);
+}
+
+void InstX86Pinsr::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 3);
+  Str << "\t" << this->Opcode
+      << Traits::TypeAttributes[this->getDest()->getType()].IntegralString
+      << "\t";
+  this->getSrc(2)->emit(Func);
+  Str << ", ";
+  Operand *Src1 = this->getSrc(1);
+  if (const auto *Src1Var = llvm::dyn_cast<Variable>(Src1)) {
+    // If src1 is a register, it should always be r32.
+    if (Src1Var->hasReg()) {
+      const auto NewRegNum = Traits::getBaseReg(Src1Var->getRegNum());
+      const Variable *NewSrc = Src1Var->asType(Func, IceType_i32, NewRegNum);
+      NewSrc->emit(Func);
+    } else {
+      Src1Var->emit(Func);
+    }
+  } else {
+    Src1->emit(Func);
+  }
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Pinsr::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 3);
+  assert(this->getDest() == this->getSrc(0));
+  // pinsrb and pinsrd are SSE4.1 instructions.
+  const Operand *Src0 = this->getSrc(1);
+  Type DispatchTy = Src0->getType();
+  // If src1 is a register, it should always be r32 (this should fall out from
+  // the encodings for ByteRegs overlapping the encodings for r32), but we have
+  // to make sure the register allocator didn't choose an 8-bit high register
+  // like "ah".
+  if (BuildDefs::asserts()) {
+    if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0)) {
+      if (Src0Var->hasReg()) {
+        const auto RegNum = Src0Var->getRegNum();
+        const auto BaseRegNum = Traits::getBaseReg(RegNum);
+        (void)BaseRegNum;
+        assert(Traits::getEncodedGPR(RegNum) ==
+               Traits::getEncodedGPR(BaseRegNum));
+      }
+    }
+  }
+  static const ThreeOpImmEmitter<XmmRegister, GPRRegister> Emitter = {
+      &Assembler::pinsr, &Assembler::pinsr};
+  emitIASThreeOpImmOps<XmmRegister, GPRRegister, Traits::getEncodedXmm,
+                       Traits::getEncodedGPR>(Func, DispatchTy, this->getDest(),
+                                              Src0, this->getSrc(2), Emitter);
+}
+
+void InstX86Pshufd::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  const Variable *Dest = this->getDest();
+  Type Ty = Dest->getType();
+  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
+      &Assembler::pshufd, &Assembler::pshufd};
+  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
+                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(0),
+                                              this->getSrc(1), Emitter);
+}
+
+void InstX86Shufps::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 3);
+  const Variable *Dest = this->getDest();
+  assert(Dest == this->getSrc(0));
+  Type Ty = Dest->getType();
+  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
+      &Assembler::shufps, &Assembler::shufps};
+  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
+                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(1),
+                                              this->getSrc(2), Emitter);
+}
+
+void InstX86Pop::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 0);
+  Str << "\t"
+         "pop\t";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Pop::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 0);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  if (this->getDest()->hasReg()) {
+    Asm->popl(Traits::getEncodedGPR(this->getDest()->getRegNum()));
+  } else {
+    auto *Target = InstX86Base::getTarget(Func);
+    Asm->popl(Target->stackVarToAsmOperand(this->getDest()));
+  }
+}
+
+void InstX86Pop::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  this->dumpDest(Func);
+  Str << " = pop." << this->getDest()->getType() << " ";
+}
+
+void InstX86Push::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t"
+         "push"
+         "\t";
+  assert(this->getSrcSize() == 1);
+  const Operand *Src = this->getSrc(0);
+  Src->emit(Func);
+}
+
+void InstX86Push::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+
+  assert(this->getSrcSize() == 1);
+  const Operand *Src = this->getSrc(0);
+
+  if (const auto *Var = llvm::dyn_cast<Variable>(Src)) {
+    Asm->pushl(Traits::getEncodedGPR(Var->getRegNum()));
+  } else if (const auto *Const32 = llvm::dyn_cast<ConstantInteger32>(Src)) {
+    Asm->pushl(AssemblerImmediate(Const32->getValue()));
+  } else if (auto *CR = llvm::dyn_cast<ConstantRelocatable>(Src)) {
+    Asm->pushl(CR);
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void InstX86Push::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "push." << this->getSrc(0)->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Ret::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t"
+         "ret";
+}
+
+void InstX86Ret::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Asm->ret();
+}
+
+void InstX86Ret::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Type Ty =
+      (this->getSrcSize() == 0 ? IceType_void : this->getSrc(0)->getType());
+  Str << "ret." << Ty << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Setcc::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t"
+         "set"
+      << Traits::InstBrAttributes[Condition].DisplayString << "\t";
+  this->Dest->emit(Func);
+}
+
+void InstX86Setcc::emitIAS(const Cfg *Func) const {
+  assert(Condition != Cond::Br_None);
+  assert(this->getDest()->getType() == IceType_i1);
+  assert(this->getSrcSize() == 0);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  auto *Target = InstX86Base::getTarget(Func);
+  if (this->getDest()->hasReg())
+    Asm->setcc(Condition,
+               Traits::getEncodedByteReg(this->getDest()->getRegNum()));
+  else
+    Asm->setcc(Condition, Target->stackVarToAsmOperand(this->getDest()));
+}
+
+void InstX86Setcc::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "setcc." << Traits::InstBrAttributes[Condition].DisplayString << " ";
+  this->dumpDest(Func);
+}
+
+void InstX86Xadd::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  if (this->Locked) {
+    Str << "\t"
+           "lock";
+  }
+  Str << "\t"
+         "xadd"
+      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+}
+
+void InstX86Xadd::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Type Ty = this->getSrc(0)->getType();
+  const auto Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
+  assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+  auto *Target = InstX86Base::getTarget(Func);
+  const Address Addr = Mem->toAsmAddress(Asm, Target);
+  const auto *VarReg = llvm::cast<Variable>(this->getSrc(1));
+  assert(VarReg->hasReg());
+  const GPRRegister Reg = Traits::getEncodedGPR(VarReg->getRegNum());
+  Asm->xadd(Ty, Addr, Reg, this->Locked);
+}
+
+void InstX86Xadd::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  if (this->Locked) {
+    Str << "lock ";
+  }
+  Type Ty = this->getSrc(0)->getType();
+  Str << "xadd." << Ty << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Xchg::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t"
+         "xchg"
+      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+}
+
+void InstX86Xchg::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Type Ty = this->getSrc(0)->getType();
+  const auto *VarReg1 = llvm::cast<Variable>(this->getSrc(1));
+  assert(VarReg1->hasReg());
+  const GPRRegister Reg1 = Traits::getEncodedGPR(VarReg1->getRegNum());
+
+  if (const auto *VarReg0 = llvm::dyn_cast<Variable>(this->getSrc(0))) {
+    assert(VarReg0->hasReg());
+    const GPRRegister Reg0 = Traits::getEncodedGPR(VarReg0->getRegNum());
+    Asm->xchg(Ty, Reg0, Reg1);
+    return;
+  }
+
+  const auto *Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
+  assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+  auto *Target = InstX86Base::getTarget(Func);
+  const Address Addr = Mem->toAsmAddress(Asm, Target);
+  Asm->xchg(Ty, Addr, Reg1);
+}
+
+void InstX86Xchg::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Type Ty = this->getSrc(0)->getType();
+  Str << "xchg." << Ty << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86IacaStart::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t# IACA_START\n"
+         "\t.byte 0x0F, 0x0B\n"
+         "\t"
+         "movl\t$111, %ebx\n"
+         "\t.byte 0x64, 0x67, 0x90";
+}
+
+void InstX86IacaStart::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Asm->iaca_start();
+}
+
+void InstX86IacaStart::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "IACA_START";
+}
+
+void InstX86IacaEnd::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t# IACA_END\n"
+         "\t"
+         "movl\t$222, %ebx\n"
+         "\t.byte 0x64, 0x67, 0x90\n"
+         "\t.byte 0x0F, 0x0B";
+}
+
+void InstX86IacaEnd::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Asm->iaca_end();
+}
+
+void InstX86IacaEnd::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "IACA_END";
+}
+
 const TargetX8632Traits::InstBrAttributesType
     TargetX8632Traits::InstBrAttributes[] = {
 #define X(val, encode, opp, dump, emit) {CondX86::opp, dump, emit},
diff --git a/third_party/subzero/src/IceInstX8632.h b/third_party/subzero/src/IceInstX8632.h
index 98cf27f..f0117c9 100644
--- a/third_party/subzero/src/IceInstX8632.h
+++ b/third_party/subzero/src/IceInstX8632.h
@@ -1,4 +1,4 @@
-//===- subzero/src/IceInstX8632.h - x86-32 machine instructions -*- C++ -*-===//
+//===- subzero/src/IceInstX8632.h - Generic x86 instructions ----*- C++ -*-===//
 //
 //                        The Subzero Code Generator
 //
@@ -8,30 +8,3512 @@
 //===----------------------------------------------------------------------===//
 ///
 /// \file
-/// \brief Used to house all the X8632 instructions.
+/// \brief This file defines the InstX86Base class, as well as the generic
+/// X86 instruction class hierarchy.
 ///
-/// Subzero has been modified to use templates for X86 instructions, so all
-/// those definitions are are in IceInstX86Base.h
-///
-/// When interacting with the X8632 target (which should only happen in the
-/// X8632 TargetLowering) clients have should use the Ice::X8632::Traits::Insts
-/// traits, which hides all the template verboseness behind a type alias.
-///
-/// For example, to create an X8632 MOV Instruction, clients should do
-///
-/// ::Ice::X8632::Traits::Insts::Mov::create
+/// Only X86 instructions common across all/most X86 targets should be defined
+/// here, with target-specific instructions declared in the target's traits.
 ///
 //===----------------------------------------------------------------------===//
 
 #ifndef SUBZERO_SRC_ICEINSTX8632_H
 #define SUBZERO_SRC_ICEINSTX8632_H
 
-#include "IceDefs.h"
-#include "IceInst.h"
-#include "IceInstX8632Base.h"
-#include "IceOperand.h"
 #include "IceTargetLoweringX8632Traits.h"
 
-X86INSTS_DEFINE_STATIC_DATA(X8632::Traits)
+#include "IceAssemblerX8632.h"
+#include "IceDefs.h"
+#include "IceInst.h"
+#include "IceOperand.h"
+#include "IceTargetLoweringX86.h"
+
+namespace Ice {
+
+namespace X8632 {
+
+using Traits = TargetX8632Traits;
+using Assembler = typename Traits::Assembler;
+using AssemblerImmediate = typename Assembler::Immediate;
+using TargetLowering = typename Traits::TargetLowering;
+using Address = typename Traits::Address;
+using X86Operand = typename Traits::X86Operand;
+using X86OperandMem = typename Traits::X86OperandMem;
+using VariableSplit = typename Traits::VariableSplit;
+
+using GPRRegister = typename Traits::RegisterSet::GPRRegister;
+using RegisterSet = typename Traits::RegisterSet;
+using XmmRegister = typename Traits::RegisterSet::XmmRegister;
+
+using Cond = CondX86;
+using BrCond = Cond::BrCond;
+using CmppsCond = Cond::CmppsCond;
+
+template <typename SReg_t, typename DReg_t>
+using CastEmitterRegOp =
+    typename Traits::Assembler::template CastEmitterRegOp<SReg_t, DReg_t>;
+template <typename SReg_t, typename DReg_t>
+using ThreeOpImmEmitter =
+    typename Traits::Assembler::template ThreeOpImmEmitter<SReg_t, DReg_t>;
+using GPREmitterAddrOp = typename Traits::Assembler::GPREmitterAddrOp;
+using GPREmitterRegOp = typename Traits::Assembler::GPREmitterRegOp;
+using GPREmitterShiftD = typename Traits::Assembler::GPREmitterShiftD;
+using GPREmitterShiftOp = typename Traits::Assembler::GPREmitterShiftOp;
+using GPREmitterOneOp = typename Traits::Assembler::GPREmitterOneOp;
+using XmmEmitterRegOp = typename Traits::Assembler::XmmEmitterRegOp;
+using XmmEmitterShiftOp = typename Traits::Assembler::XmmEmitterShiftOp;
+using XmmEmitterMovOps = typename Traits::Assembler::XmmEmitterMovOps;
+
+class InstX86Base : public InstTarget {
+  InstX86Base() = delete;
+  InstX86Base(const InstX86Base &) = delete;
+  InstX86Base &operator=(const InstX86Base &) = delete;
+
+public:
+  enum InstKindX86 {
+    k__Start = Inst::Target,
+    Adc,
+    AdcRMW,
+    Add,
+    AddRMW,
+    Addps,
+    Addss,
+    And,
+    Andnps,
+    Andps,
+    AndRMW,
+    Blendvps,
+    Br,
+    Bsf,
+    Bsr,
+    Bswap,
+    Call,
+    Cbwdq,
+    Cmov,
+    Cmpps,
+    Cmpxchg,
+    Cmpxchg8b,
+    Cvt,
+    Div,
+    Divps,
+    Divss,
+    FakeRMW,
+    Fld,
+    Fstp,
+    Icmp,
+    Idiv,
+    Imul,
+    ImulImm,
+    Insertps,
+    Int3,
+    Jmp,
+    Label,
+    Lea,
+    Load,
+    Mfence,
+    Minps,
+    Maxps,
+    Minss,
+    Maxss,
+    Mov,
+    Movd,
+    Movmsk,
+    Movp,
+    Movq,
+    MovssRegs,
+    Movsx,
+    Movzx,
+    Mul,
+    Mulps,
+    Mulss,
+    Neg,
+    Nop,
+    Or,
+    Orps,
+    OrRMW,
+    Padd,
+    Padds,
+    Paddus,
+    Pand,
+    Pandn,
+    Pblendvb,
+    Pcmpeq,
+    Pcmpgt,
+    Pextr,
+    Pinsr,
+    Pmull,
+    Pmulhw,
+    Pmulhuw,
+    Pmaddwd,
+    Pmuludq,
+    Pop,
+    Por,
+    Pshufb,
+    Pshufd,
+    Punpckl,
+    Punpckh,
+    Packss,
+    Packus,
+    Psll,
+    Psra,
+    Psrl,
+    Psub,
+    Psubs,
+    Psubus,
+    Push,
+    Pxor,
+    Ret,
+    Rol,
+    Round,
+    Sar,
+    Sbb,
+    SbbRMW,
+    Setcc,
+    Shl,
+    Shld,
+    Shr,
+    Shrd,
+    Shufps,
+    Sqrt,
+    Store,
+    StoreP,
+    StoreQ,
+    StoreD,
+    Sub,
+    SubRMW,
+    Subps,
+    Subss,
+    Test,
+    Ucomiss,
+    UD2,
+    Xadd,
+    Xchg,
+    Xor,
+    Xorps,
+    XorRMW,
+
+    /// Intel Architecture Code Analyzer markers. These are not executable so
+    /// must only be used for analysis.
+    IacaStart,
+    IacaEnd
+  };
+
+  enum SseSuffix { None, Packed, Unpack, Scalar, Integral, Pack };
+
+  static const char *getWidthString(Type Ty);
+  static const char *getFldString(Type Ty);
+  static BrCond getOppositeCondition(BrCond Cond);
+  void dump(const Cfg *Func) const override;
+
+  // Shared emit routines for common forms of instructions.
+  void emitTwoAddress(const Cfg *Func, const char *Opcode,
+                      const char *Suffix = "") const;
+
+  static TargetLowering *getTarget(const Cfg *Func) {
+    return reinterpret_cast<TargetLowering *>(Func->getTarget());
+  }
+
+protected:
+  InstX86Base(Cfg *Func, InstKindX86 Kind, SizeT Maxsrcs, Variable *Dest)
+      : InstTarget(Func, static_cast<InstKind>(Kind), Maxsrcs, Dest) {}
+
+  static bool isClassof(const Inst *Instr, InstKindX86 MyKind) {
+    return Instr->getKind() == static_cast<InstKind>(MyKind);
+  }
+  // Most instructions that operate on vector arguments require vector memory
+  // operands to be fully aligned (16-byte alignment for PNaCl vector types).
+  // The stack frame layout and call ABI ensure proper alignment for stack
+  // operands, but memory operands (originating from load/store bitcode
+  // instructions) only have element-size alignment guarantees. This function
+  // validates that none of the operands is a memory operand of vector type,
+  // calling report_fatal_error() if one is found. This function should be
+  // called during emission, and maybe also in the ctor (as long as that fits
+  // the lowering style).
+  void validateVectorAddrMode() const {
+    if (this->getDest())
+      this->validateVectorAddrModeOpnd(this->getDest());
+    for (SizeT i = 0; i < this->getSrcSize(); ++i) {
+      this->validateVectorAddrModeOpnd(this->getSrc(i));
+    }
+  }
+
+private:
+  static void validateVectorAddrModeOpnd(const Operand *Opnd) {
+    if (llvm::isa<X86OperandMem>(Opnd) && isVectorType(Opnd->getType())) {
+      llvm::report_fatal_error("Possible misaligned vector memory operation");
+    }
+  }
+};
+
+/// InstX86FakeRMW represents a non-atomic read-modify-write operation on a
+/// memory location. An InstX86FakeRMW is a "fake" instruction in that it
+/// still needs to be lowered to some actual RMW instruction.
+///
+/// If A is some memory address, D is some data value to apply, and OP is an
+/// arithmetic operator, the instruction operates as: (*A) = (*A) OP D
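+///
+/// For example, with OP = Add the eventual lowering is conceptually:
+///     mov t, [A]
+///     add t, D
+///     mov [A], t
+/// (a sketch; the actual lowering picks registers and may fold operands).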
+class InstX86FakeRMW final : public InstX86Base {
+  InstX86FakeRMW() = delete;
+  InstX86FakeRMW(const InstX86FakeRMW &) = delete;
+  InstX86FakeRMW &operator=(const InstX86FakeRMW &) = delete;
+
+public:
+  static InstX86FakeRMW *create(Cfg *Func, Operand *Data, Operand *Addr,
+                                Variable *Beacon, InstArithmetic::OpKind Op,
+                                uint32_t Align = 1) {
+    // TODO(stichnot): Stop ignoring alignment specification.
+    (void)Align;
+    return new (Func->allocate<InstX86FakeRMW>())
+        InstX86FakeRMW(Func, Data, Addr, Op, Beacon);
+  }
+  Operand *getAddr() const { return this->getSrc(1); }
+  Operand *getData() const { return this->getSrc(0); }
+  InstArithmetic::OpKind getOp() const { return Op; }
+  Variable *getBeacon() const { return llvm::cast<Variable>(this->getSrc(2)); }
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::FakeRMW);
+  }
+
+private:
+  InstArithmetic::OpKind Op;
+  InstX86FakeRMW(Cfg *Func, Operand *Data, Operand *Addr,
+                 InstArithmetic::OpKind Op, Variable *Beacon);
+};
+
+/// InstX86Label represents an intra-block label that is the target of an
+/// intra-block branch. The offset between the label and the branch must fit
+/// into one byte (considered "near"). These are used for lowering i1
+/// calculations, Select instructions, and 64-bit compares on a 32-bit
+/// architecture, without basic block splitting. Basic block splitting is not
+/// so desirable for several reasons, one of which is the impact on decisions
+/// based on whether a variable's live range spans multiple basic blocks.
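+/// (In encoding terms, "near" corresponds to the short rel8 jump forms; see
+/// the Near/Far Mode on InstX86Br below.)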
+///
+/// Intra-block control flow must be used with caution. Consider the sequence
+/// for "c = (a >= b ? x : y)".
+///     cmp a, b
+///     br lt, L1
+///     mov c, x
+///     jmp L2
+///   L1:
+///     mov c, y
+///   L2:
+///
+/// Labels L1 and L2 are intra-block labels. Without knowledge of the
+/// intra-block control flow, liveness analysis will determine the "mov c, x"
+/// instruction to be dead. One way to prevent this is to insert a
+/// "FakeUse(c)" instruction anywhere between the two "mov c, ..."
+/// instructions, e.g.:
+///
+///     cmp a, b
+///     br lt, L1
+///     mov c, x
+///     jmp L2
+///     FakeUse(c)
+///   L1:
+///     mov c, y
+///   L2:
+///
+/// The down-side is that "mov c, x" can never be dead-code eliminated even if
+/// there are no uses of c. As unlikely as this situation is, it can be
+/// avoided by running dead-code elimination before lowering.
+class InstX86Label final : public InstX86Base {
+  InstX86Label() = delete;
+  InstX86Label(const InstX86Label &) = delete;
+  InstX86Label &operator=(const InstX86Label &) = delete;
+
+public:
+  static InstX86Label *create(Cfg *Func, TargetLowering *Target) {
+    return new (Func->allocate<InstX86Label>()) InstX86Label(Func, Target);
+  }
+  uint32_t getEmitInstCount() const override { return 0; }
+  GlobalString getLabelName() const { return Name; }
+  SizeT getLabelNumber() const { return LabelNumber; }
+  bool isLabel() const override { return true; }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  void setRelocOffset(RelocOffset *Value) { OffsetReloc = Value; }
+
+private:
+  InstX86Label(Cfg *Func, TargetLowering *Target);
+
+  SizeT LabelNumber; // used for unique label generation.
+  RelocOffset *OffsetReloc = nullptr;
+  GlobalString Name;
+};
+
+/// Conditional and unconditional branch instruction.
+class InstX86Br final : public InstX86Base {
+  InstX86Br() = delete;
+  InstX86Br(const InstX86Br &) = delete;
+  InstX86Br &operator=(const InstX86Br &) = delete;
+
+public:
+  enum Mode { Near, Far };
+
+  /// Create a conditional branch to a node.
+  static InstX86Br *create(Cfg *Func, CfgNode *TargetTrue, CfgNode *TargetFalse,
+                           BrCond Condition, Mode Kind) {
+    assert(Condition != Cond::Br_None);
+    constexpr InstX86Label *NoLabel = nullptr;
+    return new (Func->allocate<InstX86Br>())
+        InstX86Br(Func, TargetTrue, TargetFalse, NoLabel, Condition, Kind);
+  }
+  /// Create an unconditional branch to a node.
+  static InstX86Br *create(Cfg *Func, CfgNode *Target, Mode Kind) {
+    constexpr CfgNode *NoCondTarget = nullptr;
+    constexpr InstX86Label *NoLabel = nullptr;
+    return new (Func->allocate<InstX86Br>())
+        InstX86Br(Func, NoCondTarget, Target, NoLabel, Cond::Br_None, Kind);
+  }
+  /// Create a non-terminator conditional branch to a node, with a fallthrough
+  /// to the next instruction in the current node. This is used for switch
+  /// lowering.
+  static InstX86Br *create(Cfg *Func, CfgNode *Target, BrCond Condition,
+                           Mode Kind) {
+    assert(Condition != Cond::Br_None);
+    constexpr CfgNode *NoUncondTarget = nullptr;
+    constexpr InstX86Label *NoLabel = nullptr;
+    return new (Func->allocate<InstX86Br>())
+        InstX86Br(Func, Target, NoUncondTarget, NoLabel, Condition, Kind);
+  }
+  /// Create a conditional intra-block branch (or unconditional, if
+  /// Condition==Br_None) to a label in the current block.
+  static InstX86Br *create(Cfg *Func, InstX86Label *Label, BrCond Condition,
+                           Mode Kind) {
+    constexpr CfgNode *NoCondTarget = nullptr;
+    constexpr CfgNode *NoUncondTarget = nullptr;
+    return new (Func->allocate<InstX86Br>())
+        InstX86Br(Func, NoCondTarget, NoUncondTarget, Label, Condition, Kind);
+  }
+  const CfgNode *getTargetTrue() const { return TargetTrue; }
+  const CfgNode *getTargetFalse() const { return TargetFalse; }
+  bool isNear() const { return Kind == Near; }
+  bool optimizeBranch(const CfgNode *NextNode);
+  uint32_t getEmitInstCount() const override {
+    uint32_t Sum = 0;
+    if (Label)
+      ++Sum;
+    if (getTargetTrue())
+      ++Sum;
+    if (getTargetFalse())
+      ++Sum;
+    return Sum;
+  }
+  bool isUnconditionalBranch() const override {
+    return !Label && Condition == Cond::Br_None;
+  }
+  const Inst *getIntraBlockBranchTarget() const override { return Label; }
+  bool repointEdges(CfgNode *OldNode, CfgNode *NewNode) override;
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Br);
+  }
+
+private:
+  InstX86Br(Cfg *Func, const CfgNode *TargetTrue, const CfgNode *TargetFalse,
+            const InstX86Label *Label, BrCond Condition, Mode Kind);
+
+  BrCond Condition;
+  const CfgNode *TargetTrue;
+  const CfgNode *TargetFalse;
+  const InstX86Label *Label; // Intra-block branch target
+  const Mode Kind;
+};
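+
+// Lowering sketch (hypothetical nodes and conditions): a two-way conditional
+// branch is typically created as
+//   InstX86Br::create(Func, TrueNode, FalseNode, Cond::Br_ne, InstX86Br::Far);
+// while switch lowering chains the non-terminator form,
+//   InstX86Br::create(Func, CaseNode, Cond::Br_e, InstX86Br::Far);
+// falling through to the next compare when the condition is false.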
+
+/// Jump to a target outside this function, such as tailcall, nacljump,
+/// naclret, unreachable. This is different from a Branch instruction in that
+/// there is no intra-function control flow to represent.
+class InstX86Jmp final : public InstX86Base {
+  InstX86Jmp() = delete;
+  InstX86Jmp(const InstX86Jmp &) = delete;
+  InstX86Jmp &operator=(const InstX86Jmp &) = delete;
+
+public:
+  static InstX86Jmp *create(Cfg *Func, Operand *Target) {
+    return new (Func->allocate<InstX86Jmp>()) InstX86Jmp(Func, Target);
+  }
+  Operand *getJmpTarget() const { return this->getSrc(0); }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Jmp);
+  }
+
+private:
+  InstX86Jmp(Cfg *Func, Operand *Target);
+};
+
+/// Call instruction. Arguments should have already been pushed.
+class InstX86Call final : public InstX86Base {
+  InstX86Call() = delete;
+  InstX86Call(const InstX86Call &) = delete;
+  InstX86Call &operator=(const InstX86Call &) = delete;
+
+public:
+  static InstX86Call *create(Cfg *Func, Variable *Dest, Operand *CallTarget) {
+    return new (Func->allocate<InstX86Call>())
+        InstX86Call(Func, Dest, CallTarget);
+  }
+  Operand *getCallTarget() const { return this->getSrc(0); }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Call);
+  }
+
+private:
+  InstX86Call(Cfg *Func, Variable *Dest, Operand *CallTarget);
+};
+
+/// Emit a one-operand (GPR) instruction.
+void emitIASOpTyGPR(const Cfg *Func, Type Ty, const Operand *Var,
+                    const GPREmitterOneOp &Emitter);
+
+void emitIASAsAddrOpTyGPR(const Cfg *Func, Type Ty, const Operand *Op0,
+                          const Operand *Op1, const GPREmitterAddrOp &Emitter);
+
+void emitIASGPRShift(const Cfg *Func, Type Ty, const Variable *Var,
+                     const Operand *Src, const GPREmitterShiftOp &Emitter);
+
+void emitIASAddrOpTyGPR(const Cfg *Func, Type Ty, const Address &Addr,
+                        const Operand *Src, const GPREmitterAddrOp &Emitter);
+
+void emitIASRegOpTyXMM(const Cfg *Func, Type Ty, const Variable *Var,
+                       const Operand *Src, const XmmEmitterRegOp &Emitter);
+
+void emitIASGPRShiftDouble(const Cfg *Func, const Variable *Dest,
+                           const Operand *Src1Op, const Operand *Src2Op,
+                           const GPREmitterShiftD &Emitter);
+
+template <typename DReg_t, typename SReg_t, DReg_t (*destEnc)(RegNumT),
+          SReg_t (*srcEnc)(RegNumT)>
+void emitIASCastRegOp(const Cfg *Func, Type DestTy, const Variable *Dest,
+                      Type SrcTy, const Operand *Src,
+                      const CastEmitterRegOp<DReg_t, SReg_t> &Emitter);
+
+template <typename DReg_t, typename SReg_t, DReg_t (*destEnc)(RegNumT),
+          SReg_t (*srcEnc)(RegNumT)>
+void emitIASThreeOpImmOps(const Cfg *Func, Type DispatchTy,
+                          const Variable *Dest, const Operand *Src0,
+                          const Operand *Src1,
+                          const ThreeOpImmEmitter<DReg_t, SReg_t> Emitter);
+
+void emitIASMovlikeXMM(const Cfg *Func, const Variable *Dest,
+                       const Operand *Src, const XmmEmitterMovOps Emitter);
+
+void emitVariableBlendInst(const char *Opcode, const Inst *Instr,
+                           const Cfg *Func);
+
+void emitIASVariableBlendInst(const Inst *Instr, const Cfg *Func,
+                              const XmmEmitterRegOp &Emitter);
+
+void emitIASXmmShift(const Cfg *Func, Type Ty, const Variable *Var,
+                     const Operand *Src, const XmmEmitterShiftOp &Emitter);
+
+/// Emit a two-operand (GPR) instruction, where the dest operand is a Variable
+/// that's guaranteed to be a register.
+template <bool VarCanBeByte = true, bool SrcCanBeByte = true>
+void emitIASRegOpTyGPR(const Cfg *Func, bool IsLea, Type Ty,
+                       const Variable *Dst, const Operand *Src,
+                       const GPREmitterRegOp &Emitter);
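+//
+// (The VarCanBeByte/SrcCanBeByte parameters reflect that on x86-32 only
+// eax/ebx/ecx/edx have 8-bit sub-registers, so callers can state whether the
+// destination or source may be encoded as a byte register.)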
+
+/// Instructions of the form x := op(x).
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseInplaceopGPR : public InstX86Base {
+  InstX86BaseInplaceopGPR() = delete;
+  InstX86BaseInplaceopGPR(const InstX86BaseInplaceopGPR &) = delete;
+  InstX86BaseInplaceopGPR &operator=(const InstX86BaseInplaceopGPR &) = delete;
+
+public:
+  using Base = InstX86BaseInplaceopGPR<K>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrEmit();
+    assert(this->getSrcSize() == 1);
+    Str << "\t" << Opcode << "\t";
+    this->getSrc(0)->emit(Func);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    assert(this->getSrcSize() == 1);
+    const Variable *Var = this->getDest();
+    Type Ty = Var->getType();
+    emitIASOpTyGPR(Func, Ty, Var, Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseInplaceopGPR(Cfg *Func, Operand *SrcDest)
+      : InstX86Base(Func, K, 1, llvm::dyn_cast<Variable>(SrcDest)) {
+    this->addSource(SrcDest);
+  }
+
+private:
+  static const char *const Opcode;
+  static const GPREmitterOneOp Emitter;
+};
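+
+// Note: each concrete instruction supplies the static Opcode and Emitter
+// members out of line, along the lines of this sketch:
+//   template <> const char *InstX86Bswap::Base::Opcode = "bswap";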
+
+/// Instructions of the form x := op(y).
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseUnaryopGPR : public InstX86Base {
+  InstX86BaseUnaryopGPR() = delete;
+  InstX86BaseUnaryopGPR(const InstX86BaseUnaryopGPR &) = delete;
+  InstX86BaseUnaryopGPR &operator=(const InstX86BaseUnaryopGPR &) = delete;
+
+public:
+  using Base = InstX86BaseUnaryopGPR<K>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrEmit();
+    assert(this->getSrcSize() == 1);
+    Type SrcTy = this->getSrc(0)->getType();
+    Type DestTy = this->getDest()->getType();
+    Str << "\t" << Opcode << this->getWidthString(SrcTy);
+    // Movsx and movzx need both the source and dest type width letter to
+    // define the operation. The other unary operations have the same source
+    // and dest type and as a result need only one letter.
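+    // For example (GNU as syntax, assuming Opcode == "movs"), an i8-to-i32
+    // movsx emits "movsbl", while a same-type i32 bsf emits just "bsfl".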
+    if (SrcTy != DestTy)
+      Str << this->getWidthString(DestTy);
+    Str << "\t";
+    this->getSrc(0)->emit(Func);
+    Str << ", ";
+    this->getDest()->emit(Func);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    assert(this->getSrcSize() == 1 && K != InstX86Base::Lea);
+    const Variable *Var = this->getDest();
+    Type Ty = Var->getType();
+    const Operand *Src = this->getSrc(0);
+    bool IsLea = false;
+    emitIASRegOpTyGPR(Func, IsLea, Ty, Var, Src, Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getSrc(0)->getType() << " ";
+    this->dumpSources(Func);
+  }
+
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseUnaryopGPR(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86Base(Func, K, 1, Dest) {
+    this->addSource(Src);
+  }
+
+  static const char *const Opcode;
+  static const GPREmitterRegOp Emitter;
+};
+
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseUnaryopXmm : public InstX86Base {
+  InstX86BaseUnaryopXmm() = delete;
+  InstX86BaseUnaryopXmm(const InstX86BaseUnaryopXmm &) = delete;
+  InstX86BaseUnaryopXmm &operator=(const InstX86BaseUnaryopXmm &) = delete;
+
+public:
+  using Base = InstX86BaseUnaryopXmm<K>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrEmit();
+    assert(this->getSrcSize() == 1);
+    Str << "\t" << Opcode << "\t";
+    this->getSrc(0)->emit(Func);
+    Str << ", ";
+    this->getDest()->emit(Func);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    Type Ty = this->getDest()->getType();
+    assert(this->getSrcSize() == 1);
+    emitIASRegOpTyXMM(Func, Ty, this->getDest(), this->getSrc(0), Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseUnaryopXmm(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86Base(Func, K, 1, Dest) {
+    this->addSource(Src);
+  }
+
+  static const char *const Opcode;
+  static const XmmEmitterRegOp Emitter;
+};
+
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseBinopGPRShift : public InstX86Base {
+  InstX86BaseBinopGPRShift() = delete;
+  InstX86BaseBinopGPRShift(const InstX86BaseBinopGPRShift &) = delete;
+  InstX86BaseBinopGPRShift &
+  operator=(const InstX86BaseBinopGPRShift &) = delete;
+
+public:
+  using Base = InstX86BaseBinopGPRShift<K>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    this->emitTwoAddress(Func, Opcode);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    Type Ty = this->getDest()->getType();
+    assert(this->getSrcSize() == 2);
+    emitIASGPRShift(Func, Ty, this->getDest(), this->getSrc(1), Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseBinopGPRShift(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86Base(Func, K, 2, Dest) {
+    this->addSource(Dest);
+    this->addSource(Source);
+  }
+
+  static const char *const Opcode;
+  static const GPREmitterShiftOp Emitter;
+};
+
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseBinopGPR : public InstX86Base {
+  InstX86BaseBinopGPR() = delete;
+  InstX86BaseBinopGPR(const InstX86BaseBinopGPR &) = delete;
+  InstX86BaseBinopGPR &operator=(const InstX86BaseBinopGPR &) = delete;
+
+public:
+  using Base = InstX86BaseBinopGPR<K>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    this->emitTwoAddress(Func, Opcode);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    Type Ty = this->getDest()->getType();
+    assert(this->getSrcSize() == 2);
+    constexpr bool ThisIsLEA = K == InstX86Base::Lea;
+    static_assert(!ThisIsLEA, "Lea should be a unaryop.");
+    emitIASRegOpTyGPR(Func, !ThisIsLEA, Ty, this->getDest(), this->getSrc(1),
+                      Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseBinopGPR(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86Base(Func, K, 2, Dest) {
+    this->addSource(Dest);
+    this->addSource(Source);
+  }
+
+  static const char *const Opcode;
+  static const GPREmitterRegOp Emitter;
+};
+
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseBinopRMW : public InstX86Base {
+  InstX86BaseBinopRMW() = delete;
+  InstX86BaseBinopRMW(const InstX86BaseBinopRMW &) = delete;
+  InstX86BaseBinopRMW &operator=(const InstX86BaseBinopRMW &) = delete;
+
+public:
+  using Base = InstX86BaseBinopRMW<K>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    this->emitTwoAddress(Func, Opcode);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    Type Ty = this->getSrc(0)->getType();
+    assert(this->getSrcSize() == 2);
+    emitIASAsAddrOpTyGPR(Func, Ty, this->getSrc(0), this->getSrc(1), Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    Str << Opcode << "." << this->getSrc(0)->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseBinopRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
+      : InstX86Base(Func, K, 2, nullptr) {
+    this->addSource(DestSrc0);
+    this->addSource(Src1);
+  }
+
+  static const char *const Opcode;
+  static const GPREmitterAddrOp Emitter;
+};
+
+template <typename InstX86Base::InstKindX86 K, bool NeedsElementType,
+          typename InstX86Base::SseSuffix Suffix>
+class InstX86BaseBinopXmm : public InstX86Base {
+  InstX86BaseBinopXmm() = delete;
+  InstX86BaseBinopXmm(const InstX86BaseBinopXmm &) = delete;
+  InstX86BaseBinopXmm &operator=(const InstX86BaseBinopXmm &) = delete;
+
+public:
+  using Base = InstX86BaseBinopXmm<K, NeedsElementType, Suffix>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    this->validateVectorAddrMode();
+    const Type DestTy = ArithmeticTypeOverride == IceType_void
+                            ? this->getDest()->getType()
+                            : ArithmeticTypeOverride;
+    const char *SuffixString = "";
+    switch (Suffix) {
+    case InstX86Base::SseSuffix::None:
+      break;
+    case InstX86Base::SseSuffix::Packed:
+      SuffixString = Traits::TypeAttributes[DestTy].PdPsString;
+      break;
+    case InstX86Base::SseSuffix::Unpack:
+      SuffixString = Traits::TypeAttributes[DestTy].UnpackString;
+      break;
+    case InstX86Base::SseSuffix::Scalar:
+      SuffixString = Traits::TypeAttributes[DestTy].SdSsString;
+      break;
+    case InstX86Base::SseSuffix::Integral:
+      SuffixString = Traits::TypeAttributes[DestTy].IntegralString;
+      break;
+    case InstX86Base::SseSuffix::Pack:
+      SuffixString = Traits::TypeAttributes[DestTy].PackString;
+      break;
+    }
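+    // For example, assuming Opcode == "padd", Suffix == Integral with a
+    // v4i32 dest yields "paddd"; assuming Opcode == "add", Suffix == Packed
+    // with a v4f32 dest yields "addps".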
+    this->emitTwoAddress(Func, Opcode, SuffixString);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    this->validateVectorAddrMode();
+    Type Ty = this->getDest()->getType();
+    if (NeedsElementType)
+      Ty = typeElementType(Ty);
+    assert(this->getSrcSize() == 2);
+    emitIASRegOpTyXMM(Func, Ty, this->getDest(), this->getSrc(1), Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseBinopXmm(Cfg *Func, Variable *Dest, Operand *Source,
+                      Type ArithmeticTypeOverride = IceType_void)
+      : InstX86Base(Func, K, 2, Dest),
+        ArithmeticTypeOverride(ArithmeticTypeOverride) {
+    this->addSource(Dest);
+    this->addSource(Source);
+  }
+
+  const Type ArithmeticTypeOverride;
+  static const char *const Opcode;
+  static const XmmEmitterRegOp Emitter;
+};
+
+template <typename InstX86Base::InstKindX86 K, bool AllowAllTypes = false>
+class InstX86BaseBinopXmmShift : public InstX86Base {
+  InstX86BaseBinopXmmShift() = delete;
+  InstX86BaseBinopXmmShift(const InstX86BaseBinopXmmShift &) = delete;
+  InstX86BaseBinopXmmShift &
+  operator=(const InstX86BaseBinopXmmShift &) = delete;
+
+public:
+  using Base = InstX86BaseBinopXmmShift<K, AllowAllTypes>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    this->validateVectorAddrMode();
+    // Shift operations are always integral, and hence always need a suffix.
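+    // E.g., assuming Opcode == "psll" and a v8i16 dest, this emits "psllw".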
+    const Type DestTy = this->getDest()->getType();
+    this->emitTwoAddress(Func, this->Opcode,
+                         Traits::TypeAttributes[DestTy].IntegralString);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    this->validateVectorAddrMode();
+    Type Ty = this->getDest()->getType();
+    assert(AllowAllTypes || isVectorType(Ty));
+    Type ElementTy = typeElementType(Ty);
+    assert(this->getSrcSize() == 2);
+    emitIASXmmShift(Func, ElementTy, this->getDest(), this->getSrc(1), Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseBinopXmmShift(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86Base(Func, K, 2, Dest) {
+    this->addSource(Dest);
+    this->addSource(Source);
+  }
+
+  static const char *const Opcode;
+  static const XmmEmitterShiftOp Emitter;
+};
+
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseTernop : public InstX86Base {
+  InstX86BaseTernop() = delete;
+  InstX86BaseTernop(const InstX86BaseTernop &) = delete;
+  InstX86BaseTernop &operator=(const InstX86BaseTernop &) = delete;
+
+public:
+  using Base = InstX86BaseTernop<K>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrEmit();
+    assert(this->getSrcSize() == 3);
+    Str << "\t" << Opcode << "\t";
+    this->getSrc(2)->emit(Func);
+    Str << ", ";
+    this->getSrc(1)->emit(Func);
+    Str << ", ";
+    this->getDest()->emit(Func);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseTernop(Cfg *Func, Variable *Dest, Operand *Source1,
+                    Operand *Source2)
+      : InstX86Base(Func, K, 3, Dest) {
+    this->addSource(Dest);
+    this->addSource(Source1);
+    this->addSource(Source2);
+  }
+
+  static const char *const Opcode;
+};
+
+/// Instructions of the form x := y op z.
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseThreeAddressop : public InstX86Base {
+  InstX86BaseThreeAddressop() = delete;
+  InstX86BaseThreeAddressop(const InstX86BaseThreeAddressop &) = delete;
+  InstX86BaseThreeAddressop &
+  operator=(const InstX86BaseThreeAddressop &) = delete;
+
+public:
+  using Base = InstX86BaseThreeAddressop<K>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrEmit();
+    assert(this->getSrcSize() == 2);
+    Str << "\t" << Opcode << "\t";
+    this->getSrc(1)->emit(Func);
+    Str << ", ";
+    this->getSrc(0)->emit(Func);
+    Str << ", ";
+    this->getDest()->emit(Func);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseThreeAddressop(Cfg *Func, Variable *Dest, Operand *Source0,
+                            Operand *Source1)
+      : InstX86Base(Func, K, 2, Dest) {
+    this->addSource(Source0);
+    this->addSource(Source1);
+  }
+
+  static const char *const Opcode;
+};
+
+/// Base class for assignment instructions
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseMovlike : public InstX86Base {
+  InstX86BaseMovlike() = delete;
+  InstX86BaseMovlike(const InstX86BaseMovlike &) = delete;
+  InstX86BaseMovlike &operator=(const InstX86BaseMovlike &) = delete;
+
+public:
+  using Base = InstX86BaseMovlike<K>;
+
+  bool isRedundantAssign() const override {
+    if (const auto *SrcVar = llvm::dyn_cast<const Variable>(this->getSrc(0))) {
+      if (SrcVar->hasReg() && this->Dest->hasReg()) {
+        // An assignment between physical registers is considered redundant if
+        // they have the same base register and the same encoding. E.g.:
+        //   mov cl, ecx ==> redundant
+        //   mov ch, ecx ==> not redundant due to different encodings
+        //   mov ch, ebp ==> not redundant due to different base registers
+        //   mov ecx, ecx ==> redundant, and dangerous in x86-64. i64 zexting
+        //                    is handled by InstX86Zext.
+        const auto SrcReg = SrcVar->getRegNum();
+        const auto DestReg = this->Dest->getRegNum();
+        return (Traits::getEncoding(SrcReg) == Traits::getEncoding(DestReg)) &&
+               (Traits::getBaseReg(SrcReg) == Traits::getBaseReg(DestReg));
+      }
+    }
+    return checkForRedundantAssign(this->getDest(), this->getSrc(0));
+  }
+  bool isVarAssign() const override {
+    return llvm::isa<Variable>(this->getSrc(0));
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    Str << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpDest(Func);
+    Str << ", ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseMovlike(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86Base(Func, K, 1, Dest) {
+    this->addSource(Source);
+    // For an integer assignment, make sure it's either a same-type assignment
+    // or a truncation.
+    assert(!isScalarIntegerType(Dest->getType()) ||
+           (typeWidthInBytes(Dest->getType()) <=
+            typeWidthInBytes(Source->getType())));
+  }
+
+  static const char *const Opcode;
+};
+
+class InstX86Bswap : public InstX86BaseInplaceopGPR<InstX86Base::Bswap> {
+public:
+  static InstX86Bswap *create(Cfg *Func, Operand *SrcDest) {
+    return new (Func->allocate<InstX86Bswap>()) InstX86Bswap(Func, SrcDest);
+  }
+
+private:
+  InstX86Bswap(Cfg *Func, Operand *SrcDest)
+      : InstX86BaseInplaceopGPR<InstX86Base::Bswap>(Func, SrcDest) {}
+};
+
+class InstX86Neg : public InstX86BaseInplaceopGPR<InstX86Base::Neg> {
+public:
+  static InstX86Neg *create(Cfg *Func, Operand *SrcDest) {
+    return new (Func->allocate<InstX86Neg>()) InstX86Neg(Func, SrcDest);
+  }
+
+private:
+  InstX86Neg(Cfg *Func, Operand *SrcDest)
+      : InstX86BaseInplaceopGPR<InstX86Base::Neg>(Func, SrcDest) {}
+};
+
+class InstX86Bsf : public InstX86BaseUnaryopGPR<InstX86Base::Bsf> {
+public:
+  static InstX86Bsf *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    return new (Func->allocate<InstX86Bsf>()) InstX86Bsf(Func, Dest, Src);
+  }
+
+private:
+  InstX86Bsf(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86BaseUnaryopGPR<InstX86Base::Bsf>(Func, Dest, Src) {}
+};
+
+class InstX86Bsr : public InstX86BaseUnaryopGPR<InstX86Base::Bsr> {
+public:
+  static InstX86Bsr *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    return new (Func->allocate<InstX86Bsr>()) InstX86Bsr(Func, Dest, Src);
+  }
+
+private:
+  InstX86Bsr(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86BaseUnaryopGPR<InstX86Base::Bsr>(Func, Dest, Src) {}
+};
+
+class InstX86Lea : public InstX86BaseUnaryopGPR<InstX86Base::Lea> {
+public:
+  static InstX86Lea *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    return new (Func->allocate<InstX86Lea>()) InstX86Lea(Func, Dest, Src);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Lea(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86BaseUnaryopGPR<InstX86Base::Lea>(Func, Dest, Src) {}
+
+  Inst *deoptToAddOrNull(const Cfg *Func) const;
+};
+
+/// Cbwdq instruction - wrapper for cbw, cwd, and cdq.
+class InstX86Cbwdq : public InstX86BaseUnaryopGPR<InstX86Base::Cbwdq> {
+public:
+  static InstX86Cbwdq *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    return new (Func->allocate<InstX86Cbwdq>()) InstX86Cbwdq(Func, Dest, Src);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Cbwdq(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86BaseUnaryopGPR<InstX86Base::Cbwdq>(Func, Dest, Src) {}
+};
+
+class InstX86Movsx : public InstX86BaseUnaryopGPR<InstX86Base::Movsx> {
+public:
+  static InstX86Movsx *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    assert(typeWidthInBytes(Dest->getType()) >
+           typeWidthInBytes(Src->getType()));
+    return new (Func->allocate<InstX86Movsx>()) InstX86Movsx(Func, Dest, Src);
+  }
+
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Movsx(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86BaseUnaryopGPR<InstX86Base::Movsx>(Func, Dest, Src) {}
+};
+
+class InstX86Movzx : public InstX86BaseUnaryopGPR<InstX86Base::Movzx> {
+public:
+  static InstX86Movzx *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    assert(typeWidthInBytes(Dest->getType()) >
+           typeWidthInBytes(Src->getType()));
+    return new (Func->allocate<InstX86Movzx>()) InstX86Movzx(Func, Dest, Src);
+  }
+
+  void emit(const Cfg *Func) const override;
+
+  void emitIAS(const Cfg *Func) const override;
+
+  void setMustKeep() { MustKeep = true; }
+
+private:
+  bool MustKeep = false;
+
+  InstX86Movzx(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86BaseUnaryopGPR<InstX86Base::Movzx>(Func, Dest, Src) {}
+
+  bool mayBeElided(const Variable *Dest, const Operand *Src) const;
+};
+
+class InstX86Movd : public InstX86BaseUnaryopXmm<InstX86Base::Movd> {
+public:
+  static InstX86Movd *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    return new (Func->allocate<InstX86Movd>()) InstX86Movd(Func, Dest, Src);
+  }
+
+  void emit(const Cfg *Func) const override;
+
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Movd(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86BaseUnaryopXmm<InstX86Base::Movd>(Func, Dest, Src) {}
+};
+
+class InstX86Movmsk final : public InstX86Base {
+  InstX86Movmsk() = delete;
+  InstX86Movmsk(const InstX86Movmsk &) = delete;
+  InstX86Movmsk &operator=(const InstX86Movmsk &) = delete;
+
+public:
+  static InstX86Movmsk *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Movmsk>())
+        InstX86Movmsk(Func, Dest, Source);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Movmsk);
+  }
+
+private:
+  InstX86Movmsk(Cfg *Func, Variable *Dest, Operand *Source);
+};
+
+class InstX86Sqrt : public InstX86BaseUnaryopXmm<InstX86Base::Sqrt> {
+public:
+  static InstX86Sqrt *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    return new (Func->allocate<InstX86Sqrt>()) InstX86Sqrt(Func, Dest, Src);
+  }
+
+  void emit(const Cfg *Func) const override;
+
+private:
+  InstX86Sqrt(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86BaseUnaryopXmm<InstX86Base::Sqrt>(Func, Dest, Src) {}
+};
+
+/// Move/assignment instruction - wrapper for mov/movss/movsd.
+class InstX86Mov : public InstX86BaseMovlike<InstX86Base::Mov> {
+public:
+  static InstX86Mov *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    assert(!isScalarIntegerType(Dest->getType()) ||
+           (typeWidthInBytes(Dest->getType()) <=
+            typeWidthInBytes(Source->getType())));
+    return new (Func->allocate<InstX86Mov>()) InstX86Mov(Func, Dest, Source);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Mov(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseMovlike<InstX86Base::Mov>(Func, Dest, Source) {}
+};
+
+/// Move packed - copy 128-bit values between XMM registers, or between mem128
+/// and XMM registers.
+class InstX86Movp : public InstX86BaseMovlike<InstX86Base::Movp> {
+public:
+  static InstX86Movp *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Movp>()) InstX86Movp(Func, Dest, Source);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Movp(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseMovlike<InstX86Base::Movp>(Func, Dest, Source) {}
+};
+
+/// Movq - copy between XMM registers, or mem64 and XMM registers.
+class InstX86Movq : public InstX86BaseMovlike<InstX86Base::Movq> {
+public:
+  static InstX86Movq *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Movq>()) InstX86Movq(Func, Dest, Source);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Movq(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseMovlike<InstX86Base::Movq>(Func, Dest, Source) {}
+};
+
+class InstX86Add : public InstX86BaseBinopGPR<InstX86Base::Add> {
+public:
+  static InstX86Add *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Add>()) InstX86Add(Func, Dest, Source);
+  }
+
+private:
+  InstX86Add(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPR<InstX86Base::Add>(Func, Dest, Source) {}
+};
+
+class InstX86AddRMW : public InstX86BaseBinopRMW<InstX86Base::AddRMW> {
+public:
+  static InstX86AddRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
+                               Operand *Src1) {
+    return new (Func->allocate<InstX86AddRMW>())
+        InstX86AddRMW(Func, DestSrc0, Src1);
+  }
+
+private:
+  InstX86AddRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
+      : InstX86BaseBinopRMW<InstX86Base::AddRMW>(Func, DestSrc0, Src1) {}
+};
+
+class InstX86Addps
+    : public InstX86BaseBinopXmm<InstX86Base::Addps, true,
+                                 InstX86Base::SseSuffix::Packed> {
+public:
+  static InstX86Addps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Addps>())
+        InstX86Addps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Addps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Addps, true,
+                            InstX86Base::SseSuffix::Packed>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Adc : public InstX86BaseBinopGPR<InstX86Base::Adc> {
+public:
+  static InstX86Adc *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Adc>()) InstX86Adc(Func, Dest, Source);
+  }
+
+private:
+  InstX86Adc(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPR<InstX86Base::Adc>(Func, Dest, Source) {}
+};
+
+class InstX86AdcRMW : public InstX86BaseBinopRMW<InstX86Base::AdcRMW> {
+public:
+  static InstX86AdcRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
+                               Operand *Src1) {
+    return new (Func->allocate<InstX86AdcRMW>())
+        InstX86AdcRMW(Func, DestSrc0, Src1);
+  }
+
+private:
+  InstX86AdcRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
+      : InstX86BaseBinopRMW<InstX86Base::AdcRMW>(Func, DestSrc0, Src1) {}
+};
+
+class InstX86Addss
+    : public InstX86BaseBinopXmm<InstX86Base::Addss, false,
+                                 InstX86Base::SseSuffix::Scalar> {
+public:
+  static InstX86Addss *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Addss>())
+        InstX86Addss(Func, Dest, Source);
+  }
+
+private:
+  InstX86Addss(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Addss, false,
+                            InstX86Base::SseSuffix::Scalar>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Padd
+    : public InstX86BaseBinopXmm<InstX86Base::Padd, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Padd *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Padd>()) InstX86Padd(Func, Dest, Source);
+  }
+
+private:
+  InstX86Padd(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Padd, true,
+                            InstX86Base::SseSuffix::Integral>(Func, Dest,
+                                                              Source) {}
+};
+
+class InstX86Padds
+    : public InstX86BaseBinopXmm<InstX86Base::Padds, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Padds *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Padds>())
+        InstX86Padds(Func, Dest, Source);
+  }
+
+private:
+  InstX86Padds(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Padds, true,
+                            InstX86Base::SseSuffix::Integral>(Func, Dest,
+                                                              Source) {}
+};
+
+class InstX86Paddus
+    : public InstX86BaseBinopXmm<InstX86Base::Paddus, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Paddus *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Paddus>())
+        InstX86Paddus(Func, Dest, Source);
+  }
+
+private:
+  InstX86Paddus(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Paddus, true,
+                            InstX86Base::SseSuffix::Integral>(Func, Dest,
+                                                              Source) {}
+};
+
+class InstX86Sub : public InstX86BaseBinopGPR<InstX86Base::Sub> {
+public:
+  static InstX86Sub *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Sub>()) InstX86Sub(Func, Dest, Source);
+  }
+
+private:
+  InstX86Sub(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPR<InstX86Base::Sub>(Func, Dest, Source) {}
+};
+
+class InstX86SubRMW : public InstX86BaseBinopRMW<InstX86Base::SubRMW> {
+public:
+  static InstX86SubRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
+                               Operand *Src1) {
+    return new (Func->allocate<InstX86SubRMW>())
+        InstX86SubRMW(Func, DestSrc0, Src1);
+  }
+
+private:
+  InstX86SubRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
+      : InstX86BaseBinopRMW<InstX86Base::SubRMW>(Func, DestSrc0, Src1) {}
+};
+
+class InstX86Subps
+    : public InstX86BaseBinopXmm<InstX86Base::Subps, true,
+                                 InstX86Base::SseSuffix::Packed> {
+public:
+  static InstX86Subps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Subps>())
+        InstX86Subps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Subps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Subps, true,
+                            InstX86Base::SseSuffix::Packed>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Subss
+    : public InstX86BaseBinopXmm<InstX86Base::Subss, false,
+                                 InstX86Base::SseSuffix::Scalar> {
+public:
+  static InstX86Subss *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Subss>())
+        InstX86Subss(Func, Dest, Source);
+  }
+
+private:
+  InstX86Subss(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Subss, false,
+                            InstX86Base::SseSuffix::Scalar>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Sbb : public InstX86BaseBinopGPR<InstX86Base::Sbb> {
+public:
+  static InstX86Sbb *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Sbb>()) InstX86Sbb(Func, Dest, Source);
+  }
+
+private:
+  InstX86Sbb(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPR<InstX86Base::Sbb>(Func, Dest, Source) {}
+};
+
+class InstX86SbbRMW : public InstX86BaseBinopRMW<InstX86Base::SbbRMW> {
+public:
+  static InstX86SbbRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
+                               Operand *Src1) {
+    return new (Func->allocate<InstX86SbbRMW>())
+        InstX86SbbRMW(Func, DestSrc0, Src1);
+  }
+
+private:
+  InstX86SbbRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
+      : InstX86BaseBinopRMW<InstX86Base::SbbRMW>(Func, DestSrc0, Src1) {}
+};
+
+class InstX86Psub
+    : public InstX86BaseBinopXmm<InstX86Base::Psub, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Psub *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Psub>()) InstX86Psub(Func, Dest, Source);
+  }
+
+private:
+  InstX86Psub(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Psub, true,
+                            InstX86Base::SseSuffix::Integral>(Func, Dest,
+                                                              Source) {}
+};
+
+class InstX86Psubs
+    : public InstX86BaseBinopXmm<InstX86Base::Psubs, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Psubs *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Psubs>())
+        InstX86Psubs(Func, Dest, Source);
+  }
+
+private:
+  InstX86Psubs(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Psubs, true,
+                            InstX86Base::SseSuffix::Integral>(Func, Dest,
+                                                              Source) {}
+};
+
+class InstX86Psubus
+    : public InstX86BaseBinopXmm<InstX86Base::Psubus, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Psubus *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Psubus>())
+        InstX86Psubus(Func, Dest, Source);
+  }
+
+private:
+  InstX86Psubus(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Psubus, true,
+                            InstX86Base::SseSuffix::Integral>(Func, Dest,
+                                                              Source) {}
+};
+
+class InstX86And : public InstX86BaseBinopGPR<InstX86Base::And> {
+public:
+  static InstX86And *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86And>()) InstX86And(Func, Dest, Source);
+  }
+
+private:
+  InstX86And(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPR<InstX86Base::And>(Func, Dest, Source) {}
+};
+
+class InstX86Andnps
+    : public InstX86BaseBinopXmm<InstX86Base::Andnps, true,
+                                 InstX86Base::SseSuffix::Packed> {
+public:
+  static InstX86Andnps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Andnps>())
+        InstX86Andnps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Andnps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Andnps, true,
+                            InstX86Base::SseSuffix::Packed>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Andps
+    : public InstX86BaseBinopXmm<InstX86Base::Andps, true,
+                                 InstX86Base::SseSuffix::Packed> {
+public:
+  static InstX86Andps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Andps>())
+        InstX86Andps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Andps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Andps, true,
+                            InstX86Base::SseSuffix::Packed>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86AndRMW : public InstX86BaseBinopRMW<InstX86Base::AndRMW> {
+public:
+  static InstX86AndRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
+                               Operand *Src1) {
+    return new (Func->allocate<InstX86AndRMW>())
+        InstX86AndRMW(Func, DestSrc0, Src1);
+  }
+
+private:
+  InstX86AndRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
+      : InstX86BaseBinopRMW<InstX86Base::AndRMW>(Func, DestSrc0, Src1) {}
+};
+
+class InstX86Pand : public InstX86BaseBinopXmm<InstX86Base::Pand, false,
+                                               InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Pand *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Pand>()) InstX86Pand(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pand(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pand, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Pandn : public InstX86BaseBinopXmm<InstX86Base::Pandn, false,
+                                                InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Pandn *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Pandn>())
+        InstX86Pandn(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pandn(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pandn, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Maxss
+    : public InstX86BaseBinopXmm<InstX86Base::Maxss, true,
+                                 InstX86Base::SseSuffix::Scalar> {
+public:
+  static InstX86Maxss *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Maxss>())
+        InstX86Maxss(Func, Dest, Source);
+  }
+
+private:
+  InstX86Maxss(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Maxss, true,
+                            InstX86Base::SseSuffix::Scalar>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Minss
+    : public InstX86BaseBinopXmm<InstX86Base::Minss, true,
+                                 InstX86Base::SseSuffix::Scalar> {
+public:
+  static InstX86Minss *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Minss>())
+        InstX86Minss(Func, Dest, Source);
+  }
+
+private:
+  InstX86Minss(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Minss, true,
+                            InstX86Base::SseSuffix::Scalar>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Maxps : public InstX86BaseBinopXmm<InstX86Base::Maxps, true,
+                                                InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Maxps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Maxps>())
+        InstX86Maxps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Maxps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Maxps, true,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Minps : public InstX86BaseBinopXmm<InstX86Base::Minps, true,
+                                                InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Minps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Minps>())
+        InstX86Minps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Minps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Minps, true,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Or : public InstX86BaseBinopGPR<InstX86Base::Or> {
+public:
+  static InstX86Or *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Or>()) InstX86Or(Func, Dest, Source);
+  }
+
+private:
+  InstX86Or(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPR<InstX86Base::Or>(Func, Dest, Source) {}
+};
+
+class InstX86Orps : public InstX86BaseBinopXmm<InstX86Base::Orps, true,
+                                               InstX86Base::SseSuffix::Packed> {
+public:
+  static InstX86Orps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Orps>()) InstX86Orps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Orps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Orps, true,
+                            InstX86Base::SseSuffix::Packed>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86OrRMW : public InstX86BaseBinopRMW<InstX86Base::OrRMW> {
+public:
+  static InstX86OrRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
+                              Operand *Src1) {
+    return new (Func->allocate<InstX86OrRMW>())
+        InstX86OrRMW(Func, DestSrc0, Src1);
+  }
+
+private:
+  InstX86OrRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
+      : InstX86BaseBinopRMW<InstX86Base::OrRMW>(Func, DestSrc0, Src1) {}
+};
+
+class InstX86Por : public InstX86BaseBinopXmm<InstX86Base::Por, false,
+                                              InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Por *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Por>()) InstX86Por(Func, Dest, Source);
+  }
+
+private:
+  InstX86Por(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Por, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Xor : public InstX86BaseBinopGPR<InstX86Base::Xor> {
+public:
+  static InstX86Xor *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Xor>()) InstX86Xor(Func, Dest, Source);
+  }
+
+private:
+  InstX86Xor(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPR<InstX86Base::Xor>(Func, Dest, Source) {}
+};
+
+class InstX86Xorps
+    : public InstX86BaseBinopXmm<InstX86Base::Xorps, true,
+                                 InstX86Base::SseSuffix::Packed> {
+public:
+  static InstX86Xorps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Xorps>())
+        InstX86Xorps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Xorps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Xorps, true,
+                            InstX86Base::SseSuffix::Packed>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86XorRMW : public InstX86BaseBinopRMW<InstX86Base::XorRMW> {
+public:
+  static InstX86XorRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
+                               Operand *Src1) {
+    return new (Func->allocate<InstX86XorRMW>())
+        InstX86XorRMW(Func, DestSrc0, Src1);
+  }
+
+private:
+  InstX86XorRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
+      : InstX86BaseBinopRMW<InstX86Base::XorRMW>(Func, DestSrc0, Src1) {}
+};
+
+class InstX86Pxor : public InstX86BaseBinopXmm<InstX86Base::Pxor, false,
+                                               InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Pxor *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Pxor>()) InstX86Pxor(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pxor(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pxor, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Imul : public InstX86BaseBinopGPR<InstX86Base::Imul> {
+public:
+  static InstX86Imul *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Imul>()) InstX86Imul(Func, Dest, Source);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Imul(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPR<InstX86Base::Imul>(Func, Dest, Source) {}
+};
+
+class InstX86ImulImm : public InstX86BaseThreeAddressop<InstX86Base::ImulImm> {
+public:
+  static InstX86ImulImm *create(Cfg *Func, Variable *Dest, Operand *Source0,
+                                Operand *Source1) {
+    return new (Func->allocate<InstX86ImulImm>())
+        InstX86ImulImm(Func, Dest, Source0, Source1);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86ImulImm(Cfg *Func, Variable *Dest, Operand *Source0, Operand *Source1)
+      : InstX86BaseThreeAddressop<InstX86Base::ImulImm>(Func, Dest, Source0,
+                                                        Source1) {}
+};
+
+class InstX86Mulps
+    : public InstX86BaseBinopXmm<InstX86Base::Mulps, true,
+                                 InstX86Base::SseSuffix::Packed> {
+public:
+  static InstX86Mulps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Mulps>())
+        InstX86Mulps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Mulps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Mulps, true,
+                            InstX86Base::SseSuffix::Packed>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Mulss
+    : public InstX86BaseBinopXmm<InstX86Base::Mulss, false,
+                                 InstX86Base::SseSuffix::Scalar> {
+public:
+  static InstX86Mulss *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Mulss>())
+        InstX86Mulss(Func, Dest, Source);
+  }
+
+private:
+  InstX86Mulss(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Mulss, false,
+                            InstX86Base::SseSuffix::Scalar>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Pmull
+    : public InstX86BaseBinopXmm<InstX86Base::Pmull, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Pmull *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    bool TypesAreValid =
+        Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16;
+    bool InstructionSetIsValid =
+        Dest->getType() == IceType_v8i16 || getInstructionSet(Func) >= SSE4_1;
+    (void)TypesAreValid;
+    (void)InstructionSetIsValid;
+    assert(TypesAreValid);
+    assert(InstructionSetIsValid);
+    return new (Func->allocate<InstX86Pmull>())
+        InstX86Pmull(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pmull(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pmull, true,
+                            InstX86Base::SseSuffix::Integral>(Func, Dest,
+                                                              Source) {}
+};
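+
+// Note: the v8i16 case lowers to pmullw (SSE2), while the v4i32 case needs
+// pmulld, an SSE4.1 instruction; hence the InstructionSetIsValid assertion
+// in create() above.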
+
+class InstX86Pmulhw : public InstX86BaseBinopXmm<InstX86Base::Pmulhw, false,
+                                                 InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Pmulhw *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    assert(Dest->getType() == IceType_v8i16 &&
+           Source->getType() == IceType_v8i16);
+    return new (Func->allocate<InstX86Pmulhw>())
+        InstX86Pmulhw(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pmulhw(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pmulhw, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Pmulhuw
+    : public InstX86BaseBinopXmm<InstX86Base::Pmulhuw, false,
+                                 InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Pmulhuw *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    assert(Dest->getType() == IceType_v8i16 &&
+           Source->getType() == IceType_v8i16);
+    return new (Func->allocate<InstX86Pmulhuw>())
+        InstX86Pmulhuw(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pmulhuw(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pmulhuw, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Pmaddwd
+    : public InstX86BaseBinopXmm<InstX86Base::Pmaddwd, false,
+                                 InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Pmaddwd *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    assert(Dest->getType() == IceType_v8i16 &&
+           Source->getType() == IceType_v8i16);
+    return new (Func->allocate<InstX86Pmaddwd>())
+        InstX86Pmaddwd(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pmaddwd(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pmaddwd, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Pmuludq
+    : public InstX86BaseBinopXmm<InstX86Base::Pmuludq, false,
+                                 InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Pmuludq *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    assert(Dest->getType() == IceType_v4i32 &&
+           Source->getType() == IceType_v4i32);
+    return new (Func->allocate<InstX86Pmuludq>())
+        InstX86Pmuludq(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pmuludq(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pmuludq, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Divps
+    : public InstX86BaseBinopXmm<InstX86Base::Divps, true,
+                                 InstX86Base::SseSuffix::Packed> {
+public:
+  static InstX86Divps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Divps>())
+        InstX86Divps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Divps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Divps, true,
+                            InstX86Base::SseSuffix::Packed>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Divss
+    : public InstX86BaseBinopXmm<InstX86Base::Divss, false,
+                                 InstX86Base::SseSuffix::Scalar> {
+public:
+  static InstX86Divss *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Divss>())
+        InstX86Divss(Func, Dest, Source);
+  }
+
+private:
+  InstX86Divss(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Divss, false,
+                            InstX86Base::SseSuffix::Scalar>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Rol : public InstX86BaseBinopGPRShift<InstX86Base::Rol> {
+public:
+  static InstX86Rol *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Rol>()) InstX86Rol(Func, Dest, Source);
+  }
+
+private:
+  InstX86Rol(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPRShift<InstX86Base::Rol>(Func, Dest, Source) {}
+};
+
+class InstX86Shl : public InstX86BaseBinopGPRShift<InstX86Base::Shl> {
+public:
+  static InstX86Shl *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Shl>()) InstX86Shl(Func, Dest, Source);
+  }
+
+private:
+  InstX86Shl(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPRShift<InstX86Base::Shl>(Func, Dest, Source) {}
+};
+
+class InstX86Psll : public InstX86BaseBinopXmmShift<InstX86Base::Psll> {
+public:
+  static InstX86Psll *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    assert(Dest->getType() == IceType_v8i16 ||
+           Dest->getType() == IceType_v8i1 ||
+           Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v4i1);
+    return new (Func->allocate<InstX86Psll>()) InstX86Psll(Func, Dest, Source);
+  }
+
+private:
+  InstX86Psll(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmmShift<InstX86Base::Psll>(Func, Dest, Source) {}
+};
+
+class InstX86Psrl : public InstX86BaseBinopXmmShift<InstX86Base::Psrl, true> {
+public:
+  static InstX86Psrl *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Psrl>()) InstX86Psrl(Func, Dest, Source);
+  }
+
+private:
+  InstX86Psrl(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmmShift<InstX86Base::Psrl, true>(Func, Dest, Source) {}
+};
+
+class InstX86Shr : public InstX86BaseBinopGPRShift<InstX86Base::Shr> {
+public:
+  static InstX86Shr *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Shr>()) InstX86Shr(Func, Dest, Source);
+  }
+
+private:
+  InstX86Shr(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPRShift<InstX86Base::Shr>(Func, Dest, Source) {}
+};
+
+class InstX86Sar : public InstX86BaseBinopGPRShift<InstX86Base::Sar> {
+public:
+  static InstX86Sar *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Sar>()) InstX86Sar(Func, Dest, Source);
+  }
+
+private:
+  InstX86Sar(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPRShift<InstX86Base::Sar>(Func, Dest, Source) {}
+};
+
+class InstX86Psra : public InstX86BaseBinopXmmShift<InstX86Base::Psra> {
+public:
+  static InstX86Psra *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    assert(Dest->getType() == IceType_v8i16 ||
+           Dest->getType() == IceType_v8i1 ||
+           Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v4i1);
+    return new (Func->allocate<InstX86Psra>()) InstX86Psra(Func, Dest, Source);
+  }
+
+private:
+  InstX86Psra(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmmShift<InstX86Base::Psra>(Func, Dest, Source) {}
+};
+
+class InstX86Pcmpeq
+    : public InstX86BaseBinopXmm<InstX86Base::Pcmpeq, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Pcmpeq *create(Cfg *Func, Variable *Dest, Operand *Source,
+                               Type ArithmeticTypeOverride = IceType_void) {
+    const Type Ty = ArithmeticTypeOverride == IceType_void
+                        ? Dest->getType()
+                        : ArithmeticTypeOverride;
+    (void)Ty;
+    assert((Ty != IceType_f64 && Ty != IceType_i64) ||
+           getInstructionSet(Func) >= SSE4_1);
+    return new (Func->allocate<InstX86Pcmpeq>())
+        InstX86Pcmpeq(Func, Dest, Source, ArithmeticTypeOverride);
+  }
+
+private:
+  InstX86Pcmpeq(Cfg *Func, Variable *Dest, Operand *Source,
+                Type ArithmeticTypeOverride)
+      : InstX86BaseBinopXmm<InstX86Base::Pcmpeq, true,
+                            InstX86Base::SseSuffix::Integral>(
+            Func, Dest, Source, ArithmeticTypeOverride) {}
+};
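+
+// Note: the 64-bit element case lowers to pcmpeqq, an SSE4.1 instruction,
+// which is what the instruction-set assertion in create() above checks.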
+
+class InstX86Pcmpgt
+    : public InstX86BaseBinopXmm<InstX86Base::Pcmpgt, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Pcmpgt *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    assert(Dest->getType() != IceType_f64 || getInstructionSet(Func) >= SSE4_1);
+    return new (Func->allocate<InstX86Pcmpgt>())
+        InstX86Pcmpgt(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pcmpgt(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pcmpgt, true,
+                            InstX86Base::SseSuffix::Integral>(Func, Dest,
+                                                              Source) {}
+};
+
+/// movss is only a binary operation when the source and dest operands are
+/// both registers (the high bits of dest are left untouched). In other cases,
+/// it behaves like a copy (mov-like) operation (and the high bits of dest are
+/// cleared). InstX86MovssRegs will assert that both its source and dest
+/// operands are registers, so the lowering code should use _mov instead of
+/// _movss in cases where a copy operation is intended.
+class InstX86MovssRegs
+    : public InstX86BaseBinopXmm<InstX86Base::MovssRegs, false,
+                                 InstX86Base::SseSuffix::None> {
+public:
+  static InstX86MovssRegs *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86MovssRegs>())
+        InstX86MovssRegs(Func, Dest, Source);
+  }
+
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86MovssRegs(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::MovssRegs, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
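+
+// Semantics sketch for the register/register form modeled here:
+//   movss xmm0, xmm1    ; xmm0[31:0] <- xmm1[31:0], xmm0[127:32] unchanged
+// By contrast, the load form (a copy, not modeled by this class) zeroes the
+// upper bits:
+//   movss xmm0, [mem]   ; xmm0[31:0] <- [mem], xmm0[127:32] <- 0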
+
+class InstX86Idiv : public InstX86BaseTernop<InstX86Base::Idiv> {
+public:
+  static InstX86Idiv *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                             Operand *Source2) {
+    return new (Func->allocate<InstX86Idiv>())
+        InstX86Idiv(Func, Dest, Source1, Source2);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Idiv(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
+      : InstX86BaseTernop<InstX86Base::Idiv>(Func, Dest, Source1, Source2) {}
+};
+
+class InstX86Div : public InstX86BaseTernop<InstX86Base::Div> {
+public:
+  static InstX86Div *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                            Operand *Source2) {
+    return new (Func->allocate<InstX86Div>())
+        InstX86Div(Func, Dest, Source1, Source2);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Div(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
+      : InstX86BaseTernop<InstX86Base::Div>(Func, Dest, Source1, Source2) {}
+};
+
+class InstX86Insertps : public InstX86BaseTernop<InstX86Base::Insertps> {
+public:
+  static InstX86Insertps *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                                 Operand *Source2) {
+    return new (Func->allocate<InstX86Insertps>())
+        InstX86Insertps(Func, Dest, Source1, Source2);
+  }
+
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Insertps(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
+      : InstX86BaseTernop<InstX86Base::Insertps>(Func, Dest, Source1, Source2) {
+  }
+};
+
+class InstX86Pinsr : public InstX86BaseTernop<InstX86Base::Pinsr> {
+public:
+  static InstX86Pinsr *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                              Operand *Source2) {
+    // pinsrb and pinsrd are SSE4.1 instructions.
+    assert(Dest->getType() == IceType_v8i16 ||
+           Dest->getType() == IceType_v8i1 ||
+           getInstructionSet(Func) >= SSE4_1);
+    return new (Func->allocate<InstX86Pinsr>())
+        InstX86Pinsr(Func, Dest, Source1, Source2);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Pinsr(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
+      : InstX86BaseTernop<InstX86Base::Pinsr>(Func, Dest, Source1, Source2) {}
+};
+
+class InstX86Shufps : public InstX86BaseTernop<InstX86Base::Shufps> {
+public:
+  static InstX86Shufps *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                               Operand *Source2) {
+    return new (Func->allocate<InstX86Shufps>())
+        InstX86Shufps(Func, Dest, Source1, Source2);
+  }
+
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Shufps(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
+      : InstX86BaseTernop<InstX86Base::Shufps>(Func, Dest, Source1, Source2) {}
+};
+
+class InstX86Blendvps : public InstX86BaseTernop<InstX86Base::Blendvps> {
+public:
+  static InstX86Blendvps *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                                 Operand *Source2) {
+    assert(getInstructionSet(Func) >= SSE4_1);
+    return new (Func->allocate<InstX86Blendvps>())
+        InstX86Blendvps(Func, Dest, Source1, Source2);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Blendvps(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
+      : InstX86BaseTernop<InstX86Base::Blendvps>(Func, Dest, Source1, Source2) {
+  }
+};
+
+class InstX86Pblendvb : public InstX86BaseTernop<InstX86Base::Pblendvb> {
+public:
+  static InstX86Pblendvb *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                                 Operand *Source2) {
+    assert(getInstructionSet(Func) >= SSE4_1);
+    return new (Func->allocate<InstX86Pblendvb>())
+        InstX86Pblendvb(Func, Dest, Source1, Source2);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Pblendvb(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
+      : InstX86BaseTernop<InstX86Base::Pblendvb>(Func, Dest, Source1, Source2) {
+  }
+};
+
+class InstX86Pextr : public InstX86BaseThreeAddressop<InstX86Base::Pextr> {
+public:
+  static InstX86Pextr *create(Cfg *Func, Variable *Dest, Operand *Source0,
+                              Operand *Source1) {
+    assert(Source0->getType() == IceType_v8i16 ||
+           Source0->getType() == IceType_v8i1 ||
+           getInstructionSet(Func) >= SSE4_1);
+    return new (Func->allocate<InstX86Pextr>())
+        InstX86Pextr(Func, Dest, Source0, Source1);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Pextr(Cfg *Func, Variable *Dest, Operand *Source0, Operand *Source1)
+      : InstX86BaseThreeAddressop<InstX86Base::Pextr>(Func, Dest, Source0,
+                                                      Source1) {}
+};
+
+class InstX86Pshufd : public InstX86BaseThreeAddressop<InstX86Base::Pshufd> {
+public:
+  static InstX86Pshufd *create(Cfg *Func, Variable *Dest, Operand *Source0,
+                               Operand *Source1) {
+    return new (Func->allocate<InstX86Pshufd>())
+        InstX86Pshufd(Func, Dest, Source0, Source1);
+  }
+
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Pshufd(Cfg *Func, Variable *Dest, Operand *Source0, Operand *Source1)
+      : InstX86BaseThreeAddressop<InstX86Base::Pshufd>(Func, Dest, Source0,
+                                                       Source1) {}
+};
+
+/// Base class for a lockable x86-32 instruction (emits a lock prefix).
+class InstX86BaseLockable : public InstX86Base {
+  InstX86BaseLockable() = delete;
+  InstX86BaseLockable(const InstX86BaseLockable &) = delete;
+  InstX86BaseLockable &operator=(const InstX86BaseLockable &) = delete;
+
+protected:
+  bool Locked;
+
+  InstX86BaseLockable(Cfg *Func, typename InstX86Base::InstKindX86 Kind,
+                      SizeT Maxsrcs, Variable *Dest, bool Locked)
+      : InstX86Base(Func, Kind, Maxsrcs, Dest), Locked(Locked) {
+    // Assume that such instructions are used for Atomics and be careful with
+    // optimizations.
+    this->HasSideEffects = Locked;
+  }
+};
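+
+// Emission sketch: when Locked is set, the instruction is expected to be
+// printed with a lock prefix, e.g.:
+//   lock xadd dword ptr [eax], ecx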
+
+/// Mul instruction - unsigned multiply.
+class InstX86Mul final : public InstX86Base {
+  InstX86Mul() = delete;
+  InstX86Mul(const InstX86Mul &) = delete;
+  InstX86Mul &operator=(const InstX86Mul &) = delete;
+
+public:
+  static InstX86Mul *create(Cfg *Func, Variable *Dest, Variable *Source1,
+                            Operand *Source2) {
+    return new (Func->allocate<InstX86Mul>())
+        InstX86Mul(Func, Dest, Source1, Source2);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Mul);
+  }
+
+private:
+  InstX86Mul(Cfg *Func, Variable *Dest, Variable *Source1, Operand *Source2);
+};
+
+/// Shld instruction - shift across a pair of operands.
+class InstX86Shld final : public InstX86Base {
+  InstX86Shld() = delete;
+  InstX86Shld(const InstX86Shld &) = delete;
+  InstX86Shld &operator=(const InstX86Shld &) = delete;
+
+public:
+  static InstX86Shld *create(Cfg *Func, Variable *Dest, Variable *Source1,
+                             Operand *Source2) {
+    return new (Func->allocate<InstX86Shld>())
+        InstX86Shld(Func, Dest, Source1, Source2);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Shld);
+  }
+
+private:
+  InstX86Shld(Cfg *Func, Variable *Dest, Variable *Source1, Operand *Source2);
+};
+
+/// Shrd instruction - shift across a pair of operands.
+class InstX86Shrd final : public InstX86Base {
+  InstX86Shrd() = delete;
+  InstX86Shrd(const InstX86Shrd &) = delete;
+  InstX86Shrd &operator=(const InstX86Shrd &) = delete;
+
+public:
+  static InstX86Shrd *create(Cfg *Func, Variable *Dest, Variable *Source1,
+                             Operand *Source2) {
+    return new (Func->allocate<InstX86Shrd>())
+        InstX86Shrd(Func, Dest, Source1, Source2);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Shrd);
+  }
+
+private:
+  InstX86Shrd(Cfg *Func, Variable *Dest, Variable *Source1, Operand *Source2);
+};
+
+/// Conditional move instruction.
+class InstX86Cmov final : public InstX86Base {
+  InstX86Cmov() = delete;
+  InstX86Cmov(const InstX86Cmov &) = delete;
+  InstX86Cmov &operator=(const InstX86Cmov &) = delete;
+
+public:
+  static InstX86Cmov *create(Cfg *Func, Variable *Dest, Operand *Source,
+                             BrCond Cond) {
+    return new (Func->allocate<InstX86Cmov>())
+        InstX86Cmov(Func, Dest, Source, Cond);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Cmov);
+  }
+
+private:
+  InstX86Cmov(Cfg *Func, Variable *Dest, Operand *Source, BrCond Cond);
+
+  BrCond Condition;
+};
+
+/// Cmpps instruction - compare packed single-precision floating point values.
+class InstX86Cmpps final : public InstX86Base {
+  InstX86Cmpps() = delete;
+  InstX86Cmpps(const InstX86Cmpps &) = delete;
+  InstX86Cmpps &operator=(const InstX86Cmpps &) = delete;
+
+public:
+  static InstX86Cmpps *create(Cfg *Func, Variable *Dest, Operand *Source,
+                              CmppsCond Condition) {
+    return new (Func->allocate<InstX86Cmpps>())
+        InstX86Cmpps(Func, Dest, Source, Condition);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Cmpps);
+  }
+
+private:
+  InstX86Cmpps(Cfg *Func, Variable *Dest, Operand *Source, CmppsCond Cond);
+
+  CmppsCond Condition;
+};
+
+/// Cmpxchg instruction - cmpxchg <dest>, <desired> compares <dest> with eax.
+/// If they are equal, ZF is set and <desired> is stored in <dest>. If not,
+/// ZF is cleared and <dest> is copied to eax (or the appropriate
+/// subregister). <dest> can be a register or memory, while <desired> must be
+/// a register. It is the user's responsibility to mark eax with a FakeDef.
+class InstX86Cmpxchg final : public InstX86BaseLockable {
+  InstX86Cmpxchg() = delete;
+  InstX86Cmpxchg(const InstX86Cmpxchg &) = delete;
+  InstX86Cmpxchg &operator=(const InstX86Cmpxchg &) = delete;
+
+public:
+  static InstX86Cmpxchg *create(Cfg *Func, Operand *DestOrAddr, Variable *Eax,
+                                Variable *Desired, bool Locked) {
+    return new (Func->allocate<InstX86Cmpxchg>())
+        InstX86Cmpxchg(Func, DestOrAddr, Eax, Desired, Locked);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Cmpxchg);
+  }
+
+private:
+  InstX86Cmpxchg(Cfg *Func, Operand *DestOrAddr, Variable *Eax,
+                 Variable *Desired, bool Locked);
+};
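+
+// Semantics sketch (32-bit case, eax holding the expected value):
+//   lock cmpxchg [DestOrAddr], Desired
+//   ; if eax == [DestOrAddr]: ZF <- 1, [DestOrAddr] <- Desired
+//   ; else:                   ZF <- 0, eax <- [DestOrAddr]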
+
+/// Cmpxchg8b instruction - cmpxchg8b <m64> will compare if <m64> equals
+/// edx:eax. If so, the ZF is set and ecx:ebx is stored in <m64>. If not, ZF
+/// is cleared and <m64> is copied to edx:eax. The caller is responsible for
+/// inserting FakeDefs to mark edx and eax as modified. <m64> must be a memory
+/// operand.
+class InstX86Cmpxchg8b final : public InstX86BaseLockable {
+  InstX86Cmpxchg8b() = delete;
+  InstX86Cmpxchg8b(const InstX86Cmpxchg8b &) = delete;
+  InstX86Cmpxchg8b &operator=(const InstX86Cmpxchg8b &) = delete;
+
+public:
+  static InstX86Cmpxchg8b *create(Cfg *Func, X86OperandMem *Dest, Variable *Edx,
+                                  Variable *Eax, Variable *Ecx, Variable *Ebx,
+                                  bool Locked) {
+    return new (Func->allocate<InstX86Cmpxchg8b>())
+        InstX86Cmpxchg8b(Func, Dest, Edx, Eax, Ecx, Ebx, Locked);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Cmpxchg8b);
+  }
+
+private:
+  InstX86Cmpxchg8b(Cfg *Func, X86OperandMem *Dest, Variable *Edx, Variable *Eax,
+                   Variable *Ecx, Variable *Ebx, bool Locked);
+};
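+
+// Semantics sketch:
+//   lock cmpxchg8b [m64]
+//   ; if edx:eax == [m64]: ZF <- 1, [m64] <- ecx:ebx
+//   ; else:                ZF <- 0, edx:eax <- [m64]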
+
+/// Cvt instruction - wrapper for cvtsX2sY where X and Y are in {s,d,i} as
+/// appropriate.  s=float, d=double, i=int. X and Y are determined from
+/// dest/src types. Sign and zero extension on the integer operand needs to be
+/// done separately.
+class InstX86Cvt final : public InstX86Base {
+  InstX86Cvt() = delete;
+  InstX86Cvt(const InstX86Cvt &) = delete;
+  InstX86Cvt &operator=(const InstX86Cvt &) = delete;
+
+public:
+  enum CvtVariant { Si2ss, Tss2si, Ss2si, Float2float, Dq2ps, Tps2dq, Ps2dq };
+  static InstX86Cvt *create(Cfg *Func, Variable *Dest, Operand *Source,
+                            CvtVariant Variant) {
+    return new (Func->allocate<InstX86Cvt>())
+        InstX86Cvt(Func, Dest, Source, Variant);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Cvt);
+  }
+  bool isTruncating() const { return Variant == Tss2si || Variant == Tps2dq; }
+
+private:
+  CvtVariant Variant;
+  InstX86Cvt(Cfg *Func, Variable *Dest, Operand *Source, CvtVariant Variant);
+};
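+
+// Variant-to-mnemonic sketch (the s/d letters are chosen from the dest/src
+// types, as described above):
+//   Si2ss       -> cvtsi2ss / cvtsi2sd
+//   Tss2si      -> cvttss2si / cvttsd2si  (truncating)
+//   Ss2si       -> cvtss2si / cvtsd2si
+//   Float2float -> cvtss2sd / cvtsd2ss
+//   Dq2ps       -> cvtdq2ps
+//   Tps2dq      -> cvttps2dq              (truncating)
+//   Ps2dq       -> cvtps2dq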
+
+/// Round instruction
+class InstX86Round final
+    : public InstX86BaseThreeAddressop<InstX86Base::Round> {
+public:
+  static InstX86Round *create(Cfg *Func, Variable *Dest, Operand *Source,
+                              Operand *Imm) {
+    return new (Func->allocate<InstX86Round>())
+        InstX86Round(Func, Dest, Source, Imm);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Round(Cfg *Func, Variable *Dest, Operand *Source, Operand *Imm)
+      : InstX86BaseThreeAddressop<InstX86Base::Round>(Func, Dest, Source, Imm) {
+  }
+};
+
+/// cmp - Integer compare instruction.
+class InstX86Icmp final : public InstX86Base {
+  InstX86Icmp() = delete;
+  InstX86Icmp(const InstX86Icmp &) = delete;
+  InstX86Icmp &operator=(const InstX86Icmp &) = delete;
+
+public:
+  static InstX86Icmp *create(Cfg *Func, Operand *Src1, Operand *Src2) {
+    return new (Func->allocate<InstX86Icmp>()) InstX86Icmp(Func, Src1, Src2);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Icmp);
+  }
+
+private:
+  InstX86Icmp(Cfg *Func, Operand *Src1, Operand *Src2);
+};
+
+/// ucomiss/ucomisd - floating-point compare instruction.
+class InstX86Ucomiss final : public InstX86Base {
+  InstX86Ucomiss() = delete;
+  InstX86Ucomiss(const InstX86Ucomiss &) = delete;
+  InstX86Ucomiss &operator=(const InstX86Ucomiss &) = delete;
+
+public:
+  static InstX86Ucomiss *create(Cfg *Func, Operand *Src1, Operand *Src2) {
+    return new (Func->allocate<InstX86Ucomiss>())
+        InstX86Ucomiss(Func, Src1, Src2);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Ucomiss);
+  }
+
+private:
+  InstX86Ucomiss(Cfg *Func, Operand *Src1, Operand *Src2);
+};
+
+/// UD2 instruction.
+class InstX86UD2 final : public InstX86Base {
+  InstX86UD2() = delete;
+  InstX86UD2(const InstX86UD2 &) = delete;
+  InstX86UD2 &operator=(const InstX86UD2 &) = delete;
+
+public:
+  static InstX86UD2 *create(Cfg *Func) {
+    return new (Func->allocate<InstX86UD2>()) InstX86UD2(Func);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::UD2);
+  }
+
+private:
+  explicit InstX86UD2(Cfg *Func);
+};
+
+/// Int3 instruction.
+class InstX86Int3 final : public InstX86Base {
+  InstX86Int3() = delete;
+  InstX86Int3(const InstX86Int3 &) = delete;
+  InstX86Int3 &operator=(const InstX86Int3 &) = delete;
+
+public:
+  static InstX86Int3 *create(Cfg *Func) {
+    return new (Func->allocate<InstX86Int3>()) InstX86Int3(Func);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Int3);
+  }
+
+private:
+  explicit InstX86Int3(Cfg *Func);
+};
+
+/// Test instruction.
+class InstX86Test final : public InstX86Base {
+  InstX86Test() = delete;
+  InstX86Test(const InstX86Test &) = delete;
+  InstX86Test &operator=(const InstX86Test &) = delete;
+
+public:
+  static InstX86Test *create(Cfg *Func, Operand *Source1, Operand *Source2) {
+    return new (Func->allocate<InstX86Test>())
+        InstX86Test(Func, Source1, Source2);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Test);
+  }
+
+private:
+  InstX86Test(Cfg *Func, Operand *Source1, Operand *Source2);
+};
+
+/// Mfence instruction.
+class InstX86Mfence final : public InstX86Base {
+  InstX86Mfence() = delete;
+  InstX86Mfence(const InstX86Mfence &) = delete;
+  InstX86Mfence &operator=(const InstX86Mfence &) = delete;
+
+public:
+  static InstX86Mfence *create(Cfg *Func) {
+    return new (Func->allocate<InstX86Mfence>()) InstX86Mfence(Func);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Mfence);
+  }
+
+private:
+  explicit InstX86Mfence(Cfg *Func);
+};
+
+/// This is essentially a "mov" instruction with anX86OperandMem operand
+/// instead of Variable as the destination. It's important for liveness that
+/// there is no Dest operand.
+class InstX86Store final : public InstX86Base {
+  InstX86Store() = delete;
+  InstX86Store(const InstX86Store &) = delete;
+  InstX86Store &operator=(const InstX86Store &) = delete;
+
+public:
+  static InstX86Store *create(Cfg *Func, Operand *Value, X86Operand *Mem) {
+    return new (Func->allocate<InstX86Store>()) InstX86Store(Func, Value, Mem);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Store);
+  }
+
+private:
+  InstX86Store(Cfg *Func, Operand *Value, X86Operand *Mem);
+};
+
+/// This is essentially a vector "mov" instruction with an X86OperandMem
+/// operand instead of Variable as the destination. It's important for
+/// liveness that there is no Dest operand. The source must be an Xmm
+/// register, since Dest is mem.
+class InstX86StoreP final : public InstX86Base {
+  InstX86StoreP() = delete;
+  InstX86StoreP(const InstX86StoreP &) = delete;
+  InstX86StoreP &operator=(const InstX86StoreP &) = delete;
+
+public:
+  static InstX86StoreP *create(Cfg *Func, Variable *Value, X86OperandMem *Mem) {
+    return new (Func->allocate<InstX86StoreP>())
+        InstX86StoreP(Func, Value, Mem);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::StoreP);
+  }
+
+private:
+  InstX86StoreP(Cfg *Func, Variable *Value, X86OperandMem *Mem);
+};
+
+class InstX86StoreQ final : public InstX86Base {
+  InstX86StoreQ() = delete;
+  InstX86StoreQ(const InstX86StoreQ &) = delete;
+  InstX86StoreQ &operator=(const InstX86StoreQ &) = delete;
+
+public:
+  static InstX86StoreQ *create(Cfg *Func, Operand *Value, X86OperandMem *Mem) {
+    return new (Func->allocate<InstX86StoreQ>())
+        InstX86StoreQ(Func, Value, Mem);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::StoreQ);
+  }
+
+private:
+  InstX86StoreQ(Cfg *Func, Operand *Value, X86OperandMem *Mem);
+};
+
+class InstX86StoreD final : public InstX86Base {
+  InstX86StoreD() = delete;
+  InstX86StoreD(const InstX86StoreD &) = delete;
+  InstX86StoreD &operator=(const InstX86StoreD &) = delete;
+
+public:
+  static InstX86StoreD *create(Cfg *Func, Operand *Value, X86OperandMem *Mem) {
+    return new (Func->allocate<InstX86StoreD>())
+        InstX86StoreD(Func, Value, Mem);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::StoreD);
+  }
+
+private:
+  InstX86StoreD(Cfg *Func, Operand *Value, X86OperandMem *Mem);
+};
+
+/// Nop instructions of varying length
+class InstX86Nop final : public InstX86Base {
+  InstX86Nop() = delete;
+  InstX86Nop(const InstX86Nop &) = delete;
+  InstX86Nop &operator=(const InstX86Nop &) = delete;
+
+public:
+  // TODO: Replace with enum.
+  using NopVariant = unsigned;
+
+  static InstX86Nop *create(Cfg *Func, NopVariant Variant) {
+    return new (Func->allocate<InstX86Nop>()) InstX86Nop(Func, Variant);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Nop);
+  }
+
+private:
+  InstX86Nop(Cfg *Func, NopVariant Length);
+
+  NopVariant Variant;
+};
+
+/// Fld - load a value onto the x87 FP stack.
+class InstX86Fld final : public InstX86Base {
+  InstX86Fld() = delete;
+  InstX86Fld(const InstX86Fld &) = delete;
+  InstX86Fld &operator=(const InstX86Fld &) = delete;
+
+public:
+  static InstX86Fld *create(Cfg *Func, Operand *Src) {
+    return new (Func->allocate<InstX86Fld>()) InstX86Fld(Func, Src);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Fld);
+  }
+
+private:
+  InstX86Fld(Cfg *Func, Operand *Src);
+};
+
+/// Fstp - store x87 st(0) into memory and pop st(0).
+class InstX86Fstp final : public InstX86Base {
+  InstX86Fstp() = delete;
+  InstX86Fstp(const InstX86Fstp &) = delete;
+  InstX86Fstp &operator=(const InstX86Fstp &) = delete;
+
+public:
+  static InstX86Fstp *create(Cfg *Func, Variable *Dest) {
+    return new (Func->allocate<InstX86Fstp>()) InstX86Fstp(Func, Dest);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Fstp);
+  }
+
+private:
+  InstX86Fstp(Cfg *Func, Variable *Dest);
+};
+
+class InstX86Pop final : public InstX86Base {
+  InstX86Pop() = delete;
+  InstX86Pop(const InstX86Pop &) = delete;
+  InstX86Pop &operator=(const InstX86Pop &) = delete;
+
+public:
+  static InstX86Pop *create(Cfg *Func, Variable *Dest) {
+    return new (Func->allocate<InstX86Pop>()) InstX86Pop(Func, Dest);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Pop);
+  }
+
+private:
+  InstX86Pop(Cfg *Func, Variable *Dest);
+};
+
+class InstX86Push final : public InstX86Base {
+  InstX86Push() = delete;
+  InstX86Push(const InstX86Push &) = delete;
+  InstX86Push &operator=(const InstX86Push &) = delete;
+
+public:
+  static InstX86Push *create(Cfg *Func, Operand *Source) {
+    return new (Func->allocate<InstX86Push>()) InstX86Push(Func, Source);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Push);
+  }
+
+private:
+  InstX86Push(Cfg *Func, Operand *Source);
+};
+
+/// Ret instruction. Currently only supports the "ret" version that does not
+/// pop arguments. This instruction takes a Source operand (for non-void
+/// returning functions) for liveness analysis, though a FakeUse before the
+/// ret would do just as well.
+class InstX86Ret final : public InstX86Base {
+  InstX86Ret() = delete;
+  InstX86Ret(const InstX86Ret &) = delete;
+  InstX86Ret &operator=(const InstX86Ret &) = delete;
+
+public:
+  static InstX86Ret *create(Cfg *Func, Variable *Source = nullptr) {
+    return new (Func->allocate<InstX86Ret>()) InstX86Ret(Func, Source);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Ret);
+  }
+
+private:
+  InstX86Ret(Cfg *Func, Variable *Source);
+};
+
+/// Conditional set-byte instruction.
+class InstX86Setcc final : public InstX86Base {
+  InstX86Setcc() = delete;
+  InstX86Setcc(const InstX86Setcc &) = delete;
+  InstX86Setcc &operator=(const InstX86Setcc &) = delete;
+
+public:
+  static InstX86Setcc *create(Cfg *Func, Variable *Dest, BrCond Cond) {
+    return new (Func->allocate<InstX86Setcc>()) InstX86Setcc(Func, Dest, Cond);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Setcc);
+  }
+
+private:
+  InstX86Setcc(Cfg *Func, Variable *Dest, BrCond Cond);
+
+  const BrCond Condition;
+};
+
+/// Exchanging Add instruction. Exchanges the first operand (destination
+/// operand) with the second operand (source operand), then loads the sum of
+/// the two values into the destination operand. The destination may be a
+/// register or memory, while the source must be a register.
+///
+/// Both the dest and source are updated. The caller should then insert a
+/// FakeDef to reflect the second update.
+class InstX86Xadd final : public InstX86BaseLockable {
+  InstX86Xadd() = delete;
+  InstX86Xadd(const InstX86Xadd &) = delete;
+  InstX86Xadd &operator=(const InstX86Xadd &) = delete;
+
+public:
+  static InstX86Xadd *create(Cfg *Func, Operand *Dest, Variable *Source,
+                             bool Locked) {
+    return new (Func->allocate<InstX86Xadd>())
+        InstX86Xadd(Func, Dest, Source, Locked);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Xadd);
+  }
+
+private:
+  InstX86Xadd(Cfg *Func, Operand *Dest, Variable *Source, bool Locked);
+};
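+
+// Semantics sketch:
+//   lock xadd [Dest], Source
+//   ; tmp <- [Dest] + Source; Source <- [Dest]; [Dest] <- tmp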
+
+/// Exchange instruction. Exchanges the first operand (destination operand)
+/// with the second operand (source operand). At least one of the operands
+/// must be a register (and the other can be reg or mem). Both the Dest and
+/// Source are updated. If there is a memory operand, then the instruction is
+/// automatically "locked" without the need for a lock prefix.
+class InstX86Xchg final : public InstX86Base {
+  InstX86Xchg() = delete;
+  InstX86Xchg(const InstX86Xchg &) = delete;
+  InstX86Xchg &operator=(const InstX86Xchg &) = delete;
+
+public:
+  static InstX86Xchg *create(Cfg *Func, Operand *Dest, Variable *Source) {
+    return new (Func->allocate<InstX86Xchg>()) InstX86Xchg(Func, Dest, Source);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Xchg);
+  }
+
+private:
+  InstX86Xchg(Cfg *Func, Operand *Dest, Variable *Source);
+};
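+
+// Semantics sketch:
+//   xchg [Dest], Source
+//   ; both operands are swapped; when one operand is memory, the bus lock is
+//   ; implicit, so no explicit lock prefix is needed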
+
+/// Start marker for the Intel Architecture Code Analyzer. This is not an
+/// executable instruction and must only be used for analysis.
+class InstX86IacaStart final : public InstX86Base {
+  InstX86IacaStart() = delete;
+  InstX86IacaStart(const InstX86IacaStart &) = delete;
+  InstX86IacaStart &operator=(const InstX86IacaStart &) = delete;
+
+public:
+  static InstX86IacaStart *create(Cfg *Func) {
+    return new (Func->allocate<InstX86IacaStart>()) InstX86IacaStart(Func);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::IacaStart);
+  }
+
+private:
+  InstX86IacaStart(Cfg *Func);
+};
+
+/// End marker for the Intel Architecture Code Analyzer. This is not an
+/// executable instruction and must only be used for analysis.
+class InstX86IacaEnd final : public InstX86Base {
+  InstX86IacaEnd() = delete;
+  InstX86IacaEnd(const InstX86IacaEnd &) = delete;
+  InstX86IacaEnd &operator=(const InstX86IacaEnd &) = delete;
+
+public:
+  static InstX86IacaEnd *create(Cfg *Func) {
+    return new (Func->allocate<InstX86IacaEnd>()) InstX86IacaEnd(Func);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::IacaEnd);
+  }
+
+private:
+  InstX86IacaEnd(Cfg *Func);
+};
+
+class InstX86Pshufb : public InstX86BaseBinopXmm<InstX86Base::Pshufb, false,
+                                                 InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Pshufb *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Pshufb>())
+        InstX86Pshufb(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pshufb(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pshufb, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Punpckl
+    : public InstX86BaseBinopXmm<InstX86Base::Punpckl, false,
+                                 InstX86Base::SseSuffix::Unpack> {
+public:
+  static InstX86Punpckl *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Punpckl>())
+        InstX86Punpckl(Func, Dest, Source);
+  }
+
+private:
+  InstX86Punpckl(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Punpckl, false,
+                            InstX86Base::SseSuffix::Unpack>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Punpckh
+    : public InstX86BaseBinopXmm<InstX86Base::Punpckh, false,
+                                 InstX86Base::SseSuffix::Unpack> {
+public:
+  static InstX86Punpckh *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Punpckh>())
+        InstX86Punpckh(Func, Dest, Source);
+  }
+
+private:
+  InstX86Punpckh(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Punpckh, false,
+                            InstX86Base::SseSuffix::Unpack>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Packss : public InstX86BaseBinopXmm<InstX86Base::Packss, false,
+                                                 InstX86Base::SseSuffix::Pack> {
+public:
+  static InstX86Packss *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Packss>())
+        InstX86Packss(Func, Dest, Source);
+  }
+
+private:
+  InstX86Packss(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Packss, false,
+                            InstX86Base::SseSuffix::Pack>(Func, Dest, Source) {}
+};
+
+class InstX86Packus : public InstX86BaseBinopXmm<InstX86Base::Packus, false,
+                                                 InstX86Base::SseSuffix::Pack> {
+public:
+  static InstX86Packus *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Packus>())
+        InstX86Packus(Func, Dest, Source);
+  }
+
+private:
+  InstX86Packus(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Packus, false,
+                            InstX86Base::SseSuffix::Pack>(Func, Dest, Source) {}
+};
+
+/// struct Insts collects all the X86 instruction classes under short
+/// aliases, so that lowering code can refer to them uniformly with a simple
+///
+/// using Insts = ::Ice::X8632::Insts;
+struct Insts {
+  using FakeRMW = InstX86FakeRMW;
+  using Label = InstX86Label;
+
+  using Call = InstX86Call;
+
+  using Br = InstX86Br;
+  using Jmp = InstX86Jmp;
+  using Bswap = InstX86Bswap;
+  using Neg = InstX86Neg;
+  using Bsf = InstX86Bsf;
+  using Bsr = InstX86Bsr;
+  using Lea = InstX86Lea;
+  using Cbwdq = InstX86Cbwdq;
+  using Movsx = InstX86Movsx;
+  using Movzx = InstX86Movzx;
+  using Movd = InstX86Movd;
+  using Movmsk = InstX86Movmsk;
+  using Sqrt = InstX86Sqrt;
+  using Mov = InstX86Mov;
+  using Movp = InstX86Movp;
+  using Movq = InstX86Movq;
+  using Add = InstX86Add;
+  using AddRMW = InstX86AddRMW;
+  using Addps = InstX86Addps;
+  using Adc = InstX86Adc;
+  using AdcRMW = InstX86AdcRMW;
+  using Addss = InstX86Addss;
+  using Andnps = InstX86Andnps;
+  using Andps = InstX86Andps;
+  using Padd = InstX86Padd;
+  using Padds = InstX86Padds;
+  using Paddus = InstX86Paddus;
+  using Sub = InstX86Sub;
+  using SubRMW = InstX86SubRMW;
+  using Subps = InstX86Subps;
+  using Subss = InstX86Subss;
+  using Sbb = InstX86Sbb;
+  using SbbRMW = InstX86SbbRMW;
+  using Psub = InstX86Psub;
+  using Psubs = InstX86Psubs;
+  using Psubus = InstX86Psubus;
+  using And = InstX86And;
+  using AndRMW = InstX86AndRMW;
+  using Pand = InstX86Pand;
+  using Pandn = InstX86Pandn;
+  using Or = InstX86Or;
+  using Orps = InstX86Orps;
+  using OrRMW = InstX86OrRMW;
+  using Por = InstX86Por;
+  using Xor = InstX86Xor;
+  using Xorps = InstX86Xorps;
+  using XorRMW = InstX86XorRMW;
+  using Pxor = InstX86Pxor;
+  using Maxss = InstX86Maxss;
+  using Minss = InstX86Minss;
+  using Maxps = InstX86Maxps;
+  using Minps = InstX86Minps;
+  using Imul = InstX86Imul;
+  using ImulImm = InstX86ImulImm;
+  using Mulps = InstX86Mulps;
+  using Mulss = InstX86Mulss;
+  using Pmull = InstX86Pmull;
+  using Pmulhw = InstX86Pmulhw;
+  using Pmulhuw = InstX86Pmulhuw;
+  using Pmaddwd = InstX86Pmaddwd;
+  using Pmuludq = InstX86Pmuludq;
+  using Divps = InstX86Divps;
+  using Divss = InstX86Divss;
+  using Rol = InstX86Rol;
+  using Shl = InstX86Shl;
+  using Psll = InstX86Psll;
+  using Psrl = InstX86Psrl;
+  using Shr = InstX86Shr;
+  using Sar = InstX86Sar;
+  using Psra = InstX86Psra;
+  using Pcmpeq = InstX86Pcmpeq;
+  using Pcmpgt = InstX86Pcmpgt;
+  using MovssRegs = InstX86MovssRegs;
+  using Idiv = InstX86Idiv;
+  using Div = InstX86Div;
+  using Insertps = InstX86Insertps;
+  using Pinsr = InstX86Pinsr;
+  using Shufps = InstX86Shufps;
+  using Blendvps = InstX86Blendvps;
+  using Pblendvb = InstX86Pblendvb;
+  using Pextr = InstX86Pextr;
+  using Pshufd = InstX86Pshufd;
+  using Lockable = InstX86BaseLockable;
+  using Mul = InstX86Mul;
+  using Shld = InstX86Shld;
+  using Shrd = InstX86Shrd;
+  using Cmov = InstX86Cmov;
+  using Cmpps = InstX86Cmpps;
+  using Cmpxchg = InstX86Cmpxchg;
+  using Cmpxchg8b = InstX86Cmpxchg8b;
+  using Cvt = InstX86Cvt;
+  using Round = InstX86Round;
+  using Icmp = InstX86Icmp;
+  using Ucomiss = InstX86Ucomiss;
+  using UD2 = InstX86UD2;
+  using Int3 = InstX86Int3;
+  using Test = InstX86Test;
+  using Mfence = InstX86Mfence;
+  using Store = InstX86Store;
+  using StoreP = InstX86StoreP;
+  using StoreQ = InstX86StoreQ;
+  using StoreD = InstX86StoreD;
+  using Nop = InstX86Nop;
+  using Fld = InstX86Fld;
+  using Fstp = InstX86Fstp;
+  using Pop = InstX86Pop;
+  using Push = InstX86Push;
+  using Ret = InstX86Ret;
+  using Setcc = InstX86Setcc;
+  using Xadd = InstX86Xadd;
+  using Xchg = InstX86Xchg;
+
+  using IacaStart = InstX86IacaStart;
+  using IacaEnd = InstX86IacaEnd;
+
+  using Pshufb = InstX86Pshufb;
+  using Punpckl = InstX86Punpckl;
+  using Punpckh = InstX86Punpckh;
+  using Packss = InstX86Packss;
+  using Packus = InstX86Packus;
+};
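+
+// Usage sketch (hypothetical call site; Func, Dest and Src stand in for
+// values available during lowering): the aliases let lowering code stay
+// uniform across the X8632 and X8664 backends, e.g.
+//   Insts::Add::create(Func, Dest, Src);  // rather than InstX86Add::create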
+
+/// X86 instructions have static data (particularly, opcodes and instruction
+/// emitters). Each X86 target needs to define all of these, so the
+/// definitions below are kept together so that the X8632 and X8664 variants
+/// can easily be kept in sync.
+/* In-place ops */
+template <> constexpr const char *InstX86Bswap::Base::Opcode = "bswap";
+template <> constexpr const char *InstX86Neg::Base::Opcode = "neg";
+/* Unary ops */
+template <> constexpr const char *InstX86Bsf::Base::Opcode = "bsf";
+template <> constexpr const char *InstX86Bsr::Base::Opcode = "bsr";
+template <> constexpr const char *InstX86Lea::Base::Opcode = "lea";
+template <> constexpr const char *InstX86Movd::Base::Opcode = "movd";
+template <> constexpr const char *InstX86Movsx::Base::Opcode = "movs";
+template <> constexpr const char *InstX86Movzx::Base::Opcode = "movz";
+template <> constexpr const char *InstX86Sqrt::Base::Opcode = "sqrt";
+template <> constexpr const char *InstX86Cbwdq::Base::Opcode = "cbw/cwd/cdq";
+/* Mov-like ops */
+template <> constexpr const char *InstX86Mov::Base::Opcode = "mov";
+template <> constexpr const char *InstX86Movp::Base::Opcode = "movups";
+template <> constexpr const char *InstX86Movq::Base::Opcode = "movq";
+/* Binary ops */
+template <> constexpr const char *InstX86Add::Base::Opcode = "add";
+template <> constexpr const char *InstX86AddRMW::Base::Opcode = "add";
+template <> constexpr const char *InstX86Addps::Base::Opcode = "add";
+template <> constexpr const char *InstX86Adc::Base::Opcode = "adc";
+template <> constexpr const char *InstX86AdcRMW::Base::Opcode = "adc";
+template <> constexpr const char *InstX86Addss::Base::Opcode = "add";
+template <> constexpr const char *InstX86Andnps::Base::Opcode = "andn";
+template <> constexpr const char *InstX86Andps::Base::Opcode = "and";
+template <> constexpr const char *InstX86Maxss::Base::Opcode = "max";
+template <> constexpr const char *InstX86Minss::Base::Opcode = "min";
+template <> constexpr const char *InstX86Maxps::Base::Opcode = "max";
+template <> constexpr const char *InstX86Minps::Base::Opcode = "min";
+template <> constexpr const char *InstX86Padd::Base::Opcode = "padd";
+template <> constexpr const char *InstX86Padds::Base::Opcode = "padds";
+template <> constexpr const char *InstX86Paddus::Base::Opcode = "paddus";
+template <> constexpr const char *InstX86Sub::Base::Opcode = "sub";
+template <> constexpr const char *InstX86SubRMW::Base::Opcode = "sub";
+template <> constexpr const char *InstX86Subps::Base::Opcode = "sub";
+template <> constexpr const char *InstX86Subss::Base::Opcode = "sub";
+template <> constexpr const char *InstX86Sbb::Base::Opcode = "sbb";
+template <> constexpr const char *InstX86SbbRMW::Base::Opcode = "sbb";
+template <> constexpr const char *InstX86Psub::Base::Opcode = "psub";
+template <> constexpr const char *InstX86Psubs::Base::Opcode = "psubs";
+template <> constexpr const char *InstX86Psubus::Base::Opcode = "psubus";
+template <> constexpr const char *InstX86And::Base::Opcode = "and";
+template <> constexpr const char *InstX86AndRMW::Base::Opcode = "and";
+template <> constexpr const char *InstX86Pand::Base::Opcode = "pand";
+template <> constexpr const char *InstX86Pandn::Base::Opcode = "pandn";
+template <> constexpr const char *InstX86Or::Base::Opcode = "or";
+template <> constexpr const char *InstX86Orps::Base::Opcode = "or";
+template <> constexpr const char *InstX86OrRMW::Base::Opcode = "or";
+template <> constexpr const char *InstX86Por::Base::Opcode = "por";
+template <> constexpr const char *InstX86Xor::Base::Opcode = "xor";
+template <> constexpr const char *InstX86Xorps::Base::Opcode = "xor";
+template <> constexpr const char *InstX86XorRMW::Base::Opcode = "xor";
+template <> constexpr const char *InstX86Pxor::Base::Opcode = "pxor";
+template <> constexpr const char *InstX86Imul::Base::Opcode = "imul";
+template <> constexpr const char *InstX86ImulImm::Base::Opcode = "imul";
+template <> constexpr const char *InstX86Mulps::Base::Opcode = "mul";
+template <> constexpr const char *InstX86Mulss::Base::Opcode = "mul";
+template <> constexpr const char *InstX86Pmull::Base::Opcode = "pmull";
+template <> constexpr const char *InstX86Pmulhw::Base::Opcode = "pmulhw";
+template <> constexpr const char *InstX86Pmulhuw::Base::Opcode = "pmulhuw";
+template <> constexpr const char *InstX86Pmaddwd::Base::Opcode = "pmaddwd";
+template <> constexpr const char *InstX86Pmuludq::Base::Opcode = "pmuludq";
+template <> constexpr const char *InstX86Div::Base::Opcode = "div";
+template <> constexpr const char *InstX86Divps::Base::Opcode = "div";
+template <> constexpr const char *InstX86Divss::Base::Opcode = "div";
+template <> constexpr const char *InstX86Idiv::Base::Opcode = "idiv";
+template <> constexpr const char *InstX86Rol::Base::Opcode = "rol";
+template <> constexpr const char *InstX86Shl::Base::Opcode = "shl";
+template <> constexpr const char *InstX86Psll::Base::Opcode = "psll";
+template <> constexpr const char *InstX86Shr::Base::Opcode = "shr";
+template <> constexpr const char *InstX86Sar::Base::Opcode = "sar";
+template <> constexpr const char *InstX86Psra::Base::Opcode = "psra";
+template <> constexpr const char *InstX86Psrl::Base::Opcode = "psrl";
+template <> constexpr const char *InstX86Pcmpeq::Base::Opcode = "pcmpeq";
+template <> constexpr const char *InstX86Pcmpgt::Base::Opcode = "pcmpgt";
+template <> constexpr const char *InstX86MovssRegs::Base::Opcode = "movss";
+/* Ternary ops */
+template <> constexpr const char *InstX86Insertps::Base::Opcode = "insertps";
+template <> constexpr const char *InstX86Round::Base::Opcode = "round";
+template <> constexpr const char *InstX86Shufps::Base::Opcode = "shufps";
+template <> constexpr const char *InstX86Pinsr::Base::Opcode = "pinsr";
+template <> constexpr const char *InstX86Blendvps::Base::Opcode = "blendvps";
+template <> constexpr const char *InstX86Pblendvb::Base::Opcode = "pblendvb";
+/* Three address ops */
+template <> constexpr const char *InstX86Pextr::Base::Opcode = "pextr";
+template <> constexpr const char *InstX86Pshufd::Base::Opcode = "pshufd";
+template <> constexpr const char *InstX86Pshufb::Base::Opcode = "pshufb";
+template <> constexpr const char *InstX86Punpckl::Base::Opcode = "punpckl";
+template <> constexpr const char *InstX86Punpckh::Base::Opcode = "punpckh";
+template <> constexpr const char *InstX86Packss::Base::Opcode = "packss";
+template <> constexpr const char *InstX86Packus::Base::Opcode = "packus";
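+/* Note: for the BinopXmm-derived classes, the SseSuffix template argument
+   selects a type-dependent suffix that is appended to the base opcode at
+   emission time, e.g. "add" plus the packed suffix yields addps, and
+   "pcmpeq" plus an integral width suffix yields pcmpeqb/pcmpeqw/pcmpeqd. */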
+/* Inplace GPR ops */
+template <>
+constexpr const Assembler::GPREmitterOneOp InstX86Bswap::Base::Emitter = {
+    &Assembler::bswap, nullptr /* only a reg form exists */
+};
+template <>
+constexpr const Assembler::GPREmitterOneOp InstX86Neg::Base::Emitter = {
+    &Assembler::neg, &Assembler::neg};
+/* Unary GPR ops */
+/* uses specialized emitter. */
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Cbwdq::Base::Emitter = {
+    nullptr, nullptr, nullptr};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Bsf::Base::Emitter = {
+    &Assembler::bsf, &Assembler::bsf, nullptr};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Bsr::Base::Emitter = {
+    &Assembler::bsr, &Assembler::bsr, nullptr};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Lea::Base::Emitter = {
+    /* reg/reg and reg/imm are illegal */ nullptr, &Assembler::lea, nullptr};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Movsx::Base::Emitter = {
+    &Assembler::movsx, &Assembler::movsx, nullptr};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Movzx::Base::Emitter = {
+    &Assembler::movzx, &Assembler::movzx, nullptr};
+/* Unary XMM ops */
+/* uses specialized emitter. */
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Movd::Base::Emitter = {
+    nullptr, nullptr};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Sqrt::Base::Emitter = {
+    &Assembler::sqrt, &Assembler::sqrt};
+/* Binary GPR ops */
+/* uses specialized emitter. */
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Imul::Base::Emitter = {
+    nullptr, nullptr, nullptr};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Add::Base::Emitter = {
+    &Assembler::add, &Assembler::add, &Assembler::add};
+template <>
+constexpr const Assembler::GPREmitterAddrOp InstX86AddRMW::Base::Emitter = {
+    &Assembler::add, &Assembler::add};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Adc::Base::Emitter = {
+    &Assembler::adc, &Assembler::adc, &Assembler::adc};
+template <>
+constexpr const Assembler::GPREmitterAddrOp InstX86AdcRMW::Base::Emitter = {
+    &Assembler::adc, &Assembler::adc};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86And::Base::Emitter = {
+    &Assembler::And, &Assembler::And, &Assembler::And};
+template <>
+constexpr const Assembler::GPREmitterAddrOp InstX86AndRMW::Base::Emitter = {
+    &Assembler::And, &Assembler::And};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Or::Base::Emitter = {
+    &Assembler::Or, &Assembler::Or, &Assembler::Or};
+template <>
+constexpr const Assembler::GPREmitterAddrOp InstX86OrRMW::Base::Emitter = {
+    &Assembler::Or, &Assembler::Or};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Sbb::Base::Emitter = {
+    &Assembler::sbb, &Assembler::sbb, &Assembler::sbb};
+template <>
+constexpr const Assembler::GPREmitterAddrOp InstX86SbbRMW::Base::Emitter = {
+    &Assembler::sbb, &Assembler::sbb};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Sub::Base::Emitter = {
+    &Assembler::sub, &Assembler::sub, &Assembler::sub};
+template <>
+constexpr const Assembler::GPREmitterAddrOp InstX86SubRMW::Base::Emitter = {
+    &Assembler::sub, &Assembler::sub};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Xor::Base::Emitter = {
+    &Assembler::Xor, &Assembler::Xor, &Assembler::Xor};
+template <>
+constexpr const Assembler::GPREmitterAddrOp InstX86XorRMW::Base::Emitter = {
+    &Assembler::Xor, &Assembler::Xor};
+/* Binary Shift GPR ops */
+template <>
+constexpr const Assembler::GPREmitterShiftOp InstX86Rol::Base::Emitter = {
+    &Assembler::rol, &Assembler::rol};
+template <>
+constexpr const Assembler::GPREmitterShiftOp InstX86Sar::Base::Emitter = {
+    &Assembler::sar, &Assembler::sar};
+template <>
+constexpr const Assembler::GPREmitterShiftOp InstX86Shl::Base::Emitter = {
+    &Assembler::shl, &Assembler::shl};
+template <>
+constexpr const Assembler::GPREmitterShiftOp InstX86Shr::Base::Emitter = {
+    &Assembler::shr, &Assembler::shr};
+/* Binary XMM ops */
+/* uses specialized emitter. */
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86MovssRegs::Base::Emitter = {
+    nullptr, nullptr};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Addss::Base::Emitter = {
+    &Assembler::addss, &Assembler::addss};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Addps::Base::Emitter = {
+    &Assembler::addps, &Assembler::addps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Divss::Base::Emitter = {
+    &Assembler::divss, &Assembler::divss};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Divps::Base::Emitter = {
+    &Assembler::divps, &Assembler::divps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Mulss::Base::Emitter = {
+    &Assembler::mulss, &Assembler::mulss};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Mulps::Base::Emitter = {
+    &Assembler::mulps, &Assembler::mulps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Padd::Base::Emitter = {
+    &Assembler::padd, &Assembler::padd};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Padds::Base::Emitter = {
+    &Assembler::padds, &Assembler::padds};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Paddus::Base::Emitter = {
+    &Assembler::paddus, &Assembler::paddus};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pand::Base::Emitter = {
+    &Assembler::pand, &Assembler::pand};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pandn::Base::Emitter = {
+    &Assembler::pandn, &Assembler::pandn};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pcmpeq::Base::Emitter = {
+    &Assembler::pcmpeq, &Assembler::pcmpeq};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pcmpgt::Base::Emitter = {
+    &Assembler::pcmpgt, &Assembler::pcmpgt};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pmull::Base::Emitter = {
+    &Assembler::pmull, &Assembler::pmull};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pmulhw::Base::Emitter = {
+    &Assembler::pmulhw, &Assembler::pmulhw};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pmulhuw::Base::Emitter = {
+    &Assembler::pmulhuw, &Assembler::pmulhuw};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pmaddwd::Base::Emitter = {
+    &Assembler::pmaddwd, &Assembler::pmaddwd};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pmuludq::Base::Emitter = {
+    &Assembler::pmuludq, &Assembler::pmuludq};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Por::Base::Emitter = {
+    &Assembler::por, &Assembler::por};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Psub::Base::Emitter = {
+    &Assembler::psub, &Assembler::psub};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Psubs::Base::Emitter = {
+    &Assembler::psubs, &Assembler::psubs};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Psubus::Base::Emitter = {
+    &Assembler::psubus, &Assembler::psubus};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pxor::Base::Emitter = {
+    &Assembler::pxor, &Assembler::pxor};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Subss::Base::Emitter = {
+    &Assembler::subss, &Assembler::subss};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Subps::Base::Emitter = {
+    &Assembler::subps, &Assembler::subps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Andnps::Base::Emitter = {
+    &Assembler::andnps, &Assembler::andnps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Andps::Base::Emitter = {
+    &Assembler::andps, &Assembler::andps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Maxss::Base::Emitter = {
+    &Assembler::maxss, &Assembler::maxss};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Minss::Base::Emitter = {
+    &Assembler::minss, &Assembler::minss};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Maxps::Base::Emitter = {
+    &Assembler::maxps, &Assembler::maxps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Minps::Base::Emitter = {
+    &Assembler::minps, &Assembler::minps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Orps::Base::Emitter = {
+    &Assembler::orps, &Assembler::orps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Xorps::Base::Emitter = {
+    &Assembler::xorps, &Assembler::xorps};
+/* Binary XMM Shift ops */
+template <>
+constexpr const Assembler::XmmEmitterShiftOp InstX86Psll::Base::Emitter = {
+    &Assembler::psll, &Assembler::psll, &Assembler::psll};
+template <>
+constexpr const Assembler::XmmEmitterShiftOp InstX86Psra::Base::Emitter = {
+    &Assembler::psra, &Assembler::psra, &Assembler::psra};
+template <>
+constexpr const Assembler::XmmEmitterShiftOp InstX86Psrl::Base::Emitter = {
+    &Assembler::psrl, &Assembler::psrl, &Assembler::psrl};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pshufb::Base::Emitter = {
+    &Assembler::pshufb, &Assembler::pshufb};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Punpckl::Base::Emitter = {
+    &Assembler::punpckl, &Assembler::punpckl};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Punpckh::Base::Emitter = {
+    &Assembler::punpckh, &Assembler::punpckh};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Packss::Base::Emitter = {
+    &Assembler::packss, &Assembler::packss};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Packus::Base::Emitter = {
+    &Assembler::packus, &Assembler::packus};
+
+} // end of namespace X8632
+} // end of namespace Ice
 
 #endif // SUBZERO_SRC_ICEINSTX8632_H
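The Opcode strings and Emitter tables above are plain data consumed by the
shared emit/emitIAS routines: each table slot is a member-function pointer on
the assembler, one per legal operand form, with nullptr marking forms an
instruction does not support (compare the Lea table, which allows only the
reg/addr form). A minimal self-contained sketch of that dispatch pattern;
MiniAssembler, RegOps, AddOps, and emitRegOp are hypothetical stand-ins, not
Subzero's API:

// Hypothetical analogue of an emitter table and its dispatcher.
#include <cassert>
#include <cstdio>

class MiniAssembler {
public:
  void addRegReg(int Dst, int Src) { std::printf("add r%d, r%d\n", Dst, Src); }
  void addRegImm(int Dst, int Imm) { std::printf("add r%d, %d\n", Dst, Imm); }
};

// One member-function pointer per legal source-operand form; a nullptr slot
// means that form is illegal for the instruction.
struct RegOps {
  void (MiniAssembler::*RegReg)(int, int);
  void (MiniAssembler::*RegImm)(int, int);
};

constexpr RegOps AddOps = {&MiniAssembler::addRegReg,
                           &MiniAssembler::addRegImm};

void emitRegOp(MiniAssembler &Asm, const RegOps &Ops, int Dst, int Src,
               bool SrcIsImm) {
  auto Fn = SrcIsImm ? Ops.RegImm : Ops.RegReg;
  assert(Fn && "operand form not supported by this instruction");
  (Asm.*Fn)(Dst, Src);
}

int main() {
  MiniAssembler Asm;
  emitRegOp(Asm, AddOps, /*Dst=*/0, /*Src=*/1, /*SrcIsImm=*/false);
  emitRegOp(Asm, AddOps, /*Dst=*/0, /*Src=*/42, /*SrcIsImm=*/true);
}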
diff --git a/third_party/subzero/src/IceInstX8632Base.h b/third_party/subzero/src/IceInstX8632Base.h
deleted file mode 100644
index 3b3c462..0000000
--- a/third_party/subzero/src/IceInstX8632Base.h
+++ /dev/null
@@ -1,4100 +0,0 @@
-//===- subzero/src/IceInstX8632Base.h - Generic x86 instructions -*- C++ -*-===//
-//
-//                        The Subzero Code Generator
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// \brief This file defines the InstX86Base template class, as well as the
-/// generic X86 Instruction class hierarchy.
-///
-/// Only X86 instructions common across all/most X86 targets should be defined
-/// here, with target-specific instructions declared in the target's traits.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef SUBZERO_SRC_ICEINSTX8632BASE_H
-#define SUBZERO_SRC_ICEINSTX8632BASE_H
-
-#include "IceAssemblerX8632.h"
-#include "IceDefs.h"
-#include "IceInst.h"
-#include "IceOperand.h"
-
-namespace Ice {
-namespace X8632 {
-
-template <typename TraitsType> struct InstImpl {
-  using Traits = TraitsType;
-  using Assembler = AssemblerX8632;
-  using AssemblerLabel = typename Assembler::Label;
-  using AssemblerImmediate = typename Assembler::Immediate;
-  using TargetLowering = typename Traits::TargetLowering;
-  using Address = typename Traits::Address;
-  using X86Operand = typename Traits::X86Operand;
-  using X86OperandMem = typename Traits::X86OperandMem;
-  using VariableSplit = typename Traits::VariableSplit;
-
-  using GPRRegister = typename Traits::RegisterSet::GPRRegister;
-  using RegisterSet = typename Traits::RegisterSet;
-  using XmmRegister = typename Traits::RegisterSet::XmmRegister;
-
-  using Cond = CondX86;
-  using BrCond = Cond::BrCond;
-  using CmppsCond = Cond::CmppsCond;
-
-  template <typename SReg_t, typename DReg_t>
-  using CastEmitterRegOp =
-      typename Assembler::template CastEmitterRegOp<SReg_t, DReg_t>;
-  template <typename SReg_t, typename DReg_t>
-  using ThreeOpImmEmitter =
-      typename Assembler::template ThreeOpImmEmitter<SReg_t, DReg_t>;
-  using GPREmitterAddrOp = typename Assembler::GPREmitterAddrOp;
-  using GPREmitterRegOp = typename Assembler::GPREmitterRegOp;
-  using GPREmitterShiftD = typename Assembler::GPREmitterShiftD;
-  using GPREmitterShiftOp = typename Assembler::GPREmitterShiftOp;
-  using GPREmitterOneOp = typename Assembler::GPREmitterOneOp;
-  using XmmEmitterRegOp = typename Assembler::XmmEmitterRegOp;
-  using XmmEmitterShiftOp = typename Assembler::XmmEmitterShiftOp;
-  using XmmEmitterMovOps = typename Assembler::XmmEmitterMovOps;
-
-  class InstX86Base : public InstTarget {
-    InstX86Base() = delete;
-    InstX86Base(const InstX86Base &) = delete;
-    InstX86Base &operator=(const InstX86Base &) = delete;
-
-  public:
-    enum InstKindX86 {
-      k__Start = Inst::Target,
-      Adc,
-      AdcRMW,
-      Add,
-      AddRMW,
-      Addps,
-      Addss,
-      And,
-      Andnps,
-      Andps,
-      AndRMW,
-      Blendvps,
-      Br,
-      Bsf,
-      Bsr,
-      Bswap,
-      Call,
-      Cbwdq,
-      Cmov,
-      Cmpps,
-      Cmpxchg,
-      Cmpxchg8b,
-      Cvt,
-      Div,
-      Divps,
-      Divss,
-      FakeRMW,
-      Fld,
-      Fstp,
-      Icmp,
-      Idiv,
-      Imul,
-      ImulImm,
-      Insertps,
-      Int3,
-      Jmp,
-      Label,
-      Lea,
-      Load,
-      Mfence,
-      Minps,
-      Maxps,
-      Minss,
-      Maxss,
-      Mov,
-      Movd,
-      Movmsk,
-      Movp,
-      Movq,
-      MovssRegs,
-      Movsx,
-      Movzx,
-      Mul,
-      Mulps,
-      Mulss,
-      Neg,
-      Nop,
-      Or,
-      Orps,
-      OrRMW,
-      Padd,
-      Padds,
-      Paddus,
-      Pand,
-      Pandn,
-      Pblendvb,
-      Pcmpeq,
-      Pcmpgt,
-      Pextr,
-      Pinsr,
-      Pmull,
-      Pmulhw,
-      Pmulhuw,
-      Pmaddwd,
-      Pmuludq,
-      Pop,
-      Por,
-      Pshufb,
-      Pshufd,
-      Punpckl,
-      Punpckh,
-      Packss,
-      Packus,
-      Psll,
-      Psra,
-      Psrl,
-      Psub,
-      Psubs,
-      Psubus,
-      Push,
-      Pxor,
-      Ret,
-      Rol,
-      Round,
-      Sar,
-      Sbb,
-      SbbRMW,
-      Setcc,
-      Shl,
-      Shld,
-      Shr,
-      Shrd,
-      Shufps,
-      Sqrt,
-      Store,
-      StoreP,
-      StoreQ,
-      StoreD,
-      Sub,
-      SubRMW,
-      Subps,
-      Subss,
-      Test,
-      Ucomiss,
-      UD2,
-      Xadd,
-      Xchg,
-      Xor,
-      Xorps,
-      XorRMW,
-
-      /// Intel Architecture Code Analyzer markers. These are not executable so
-      /// must only be used for analysis.
-      IacaStart,
-      IacaEnd
-    };
-
-    enum SseSuffix { None, Packed, Unpack, Scalar, Integral, Pack };
-
-    static const char *getWidthString(Type Ty);
-    static const char *getFldString(Type Ty);
-    static BrCond getOppositeCondition(BrCond Cond);
-    void dump(const Cfg *Func) const override;
-
-    // Shared emit routines for common forms of instructions.
-    void emitTwoAddress(const Cfg *Func, const char *Opcode,
-                        const char *Suffix = "") const;
-
-    static TargetLowering *getTarget(const Cfg *Func) {
-      return static_cast<TargetLowering *>(Func->getTarget());
-    }
-
-  protected:
-    InstX86Base(Cfg *Func, InstKindX86 Kind, SizeT Maxsrcs, Variable *Dest)
-        : InstTarget(Func, static_cast<InstKind>(Kind), Maxsrcs, Dest) {}
-
-    static bool isClassof(const Inst *Instr, InstKindX86 MyKind) {
-      return Instr->getKind() == static_cast<InstKind>(MyKind);
-    }
-    // Most instructions that operate on vector arguments require vector memory
-    // operands to be fully aligned (16-byte alignment for PNaCl vector types).
-    // The stack frame layout and call ABI ensure proper alignment for stack
-    // operands, but memory operands (originating from load/store bitcode
-    // instructions) only have element-size alignment guarantees. This function
-    // validates that none of the operands is a memory operand of vector type,
-    // calling report_fatal_error() if one is found. This function should be
-    // called during emission, and maybe also in the ctor (as long as that fits
-    // the lowering style).
-    void validateVectorAddrMode() const {
-      if (this->getDest())
-        this->validateVectorAddrModeOpnd(this->getDest());
-      for (SizeT i = 0; i < this->getSrcSize(); ++i) {
-        this->validateVectorAddrModeOpnd(this->getSrc(i));
-      }
-    }
-
-  private:
-    static void validateVectorAddrModeOpnd(const Operand *Opnd) {
-      if (llvm::isa<X86OperandMem>(Opnd) && isVectorType(Opnd->getType())) {
-        llvm::report_fatal_error("Possible misaligned vector memory operation");
-      }
-    }
-  };
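For context on validateVectorAddrMode above: the SSE register-memory forms
these vector instructions emit assume 16-byte-aligned memory operands, while
load/store-derived operands only guarantee element-size alignment. A small
illustrative check, not Subzero code:

// Illustrative only: the 16-byte alignment property the comment above is
// guarding; element-size alignment (e.g. 4 bytes for float) does not imply it.
#include <cstdint>

bool isAligned16(const void *P) {
  return (reinterpret_cast<std::uintptr_t>(P) & 0xF) == 0;
}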
-
-  /// InstX86FakeRMW represents a non-atomic read-modify-write operation on a
-  /// memory location. An InstX86FakeRMW is a "fake" instruction in that it
-  /// still needs to be lowered to some actual RMW instruction.
-  ///
-  /// If A is some memory address, D is some data value to apply, and OP is an
-  /// arithmetic operator, the instruction operates as: (*A) = (*A) OP D
-  class InstX86FakeRMW final : public InstX86Base {
-    InstX86FakeRMW() = delete;
-    InstX86FakeRMW(const InstX86FakeRMW &) = delete;
-    InstX86FakeRMW &operator=(const InstX86FakeRMW &) = delete;
-
-  public:
-    static InstX86FakeRMW *create(Cfg *Func, Operand *Data, Operand *Addr,
-                                  Variable *Beacon, InstArithmetic::OpKind Op,
-                                  uint32_t Align = 1) {
-      // TODO(stichnot): Stop ignoring alignment specification.
-      (void)Align;
-      return new (Func->allocate<InstX86FakeRMW>())
-          InstX86FakeRMW(Func, Data, Addr, Op, Beacon);
-    }
-    Operand *getAddr() const { return this->getSrc(1); }
-    Operand *getData() const { return this->getSrc(0); }
-    InstArithmetic::OpKind getOp() const { return Op; }
-    Variable *getBeacon() const {
-      return llvm::cast<Variable>(this->getSrc(2));
-    }
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::FakeRMW);
-    }
-
-  private:
-    InstArithmetic::OpKind Op;
-    InstX86FakeRMW(Cfg *Func, Operand *Data, Operand *Addr,
-                   InstArithmetic::OpKind Op, Variable *Beacon);
-  };
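As a concrete reading of the (*A) = (*A) OP D semantics above, this is the
operation an InstX86FakeRMW stands for when OP is add, written as plain C++
(illustrative only; the fake instruction is later lowered to a real x86
memory-destination form such as add [A], D):

// What InstX86FakeRMW represents for OP == add.
void fakeRMWAdd(int *A, int D) {
  *A = *A + D; // (*A) = (*A) OP D
}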
-
-  /// InstX86Label represents an intra-block label that is the target of an
-  /// intra-block branch. The offset between the label and the branch must
-  /// fit into one byte (considered "near"). These are used for lowering i1
-  /// calculations, Select instructions, and 64-bit compares on a 32-bit
-  /// architecture, without basic block splitting. Basic block splitting is not
-  /// so desirable for several reasons, one of which is the impact on decisions
-  /// based on whether a variable's live range spans multiple basic blocks.
-  ///
-  /// Intra-block control flow must be used with caution. Consider the sequence
-  /// for "c = (a >= b ? x : y)".
-  ///     cmp a, b
-  ///     br lt, L1
-  ///     mov c, x
-  ///     jmp L2
-  ///   L1:
-  ///     mov c, y
-  ///   L2:
-  ///
-  /// Labels L1 and L2 are intra-block labels. Without knowledge of the
-  /// intra-block control flow, liveness analysis will determine the "mov c, x"
-  /// instruction to be dead. One way to prevent this is to insert a
-  /// "FakeUse(c)" instruction anywhere between the two "mov c, ..."
-  /// instructions, e.g.:
-  ///
-  ///     cmp a, b
-  ///     br lt, L1
-  ///     mov c, x
-  ///     jmp L2
-  ///     FakeUse(c)
-  ///   L1:
-  ///     mov c, y
-  ///   L2:
-  ///
-  /// The down-side is that "mov c, x" can never be dead-code eliminated even if
-  /// there are no uses of c. As unlikely as this situation is, it may be
-  /// prevented by running dead code elimination before lowering.
-  class InstX86Label final : public InstX86Base {
-    InstX86Label() = delete;
-    InstX86Label(const InstX86Label &) = delete;
-    InstX86Label &operator=(const InstX86Label &) = delete;
-
-  public:
-    static InstX86Label *create(Cfg *Func, TargetLowering *Target) {
-      return new (Func->allocate<InstX86Label>()) InstX86Label(Func, Target);
-    }
-    uint32_t getEmitInstCount() const override { return 0; }
-    GlobalString getLabelName() const { return Name; }
-    SizeT getLabelNumber() const { return LabelNumber; }
-    bool isLabel() const override { return true; }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    void setRelocOffset(RelocOffset *Value) { OffsetReloc = Value; }
-
-  private:
-    InstX86Label(Cfg *Func, TargetLowering *Target);
-
-    SizeT LabelNumber; // used for unique label generation.
-    RelocOffset *OffsetReloc = nullptr;
-    GlobalString Name;
-  };
-
-  /// Conditional and unconditional branch instruction.
-  class InstX86Br final : public InstX86Base {
-    InstX86Br() = delete;
-    InstX86Br(const InstX86Br &) = delete;
-    InstX86Br &operator=(const InstX86Br &) = delete;
-
-  public:
-    enum Mode { Near, Far };
-
-    /// Create a conditional branch to a node.
-    static InstX86Br *create(Cfg *Func, CfgNode *TargetTrue,
-                             CfgNode *TargetFalse, BrCond Condition,
-                             Mode Kind) {
-      assert(Condition != Cond::Br_None);
-      constexpr InstX86Label *NoLabel = nullptr;
-      return new (Func->allocate<InstX86Br>())
-          InstX86Br(Func, TargetTrue, TargetFalse, NoLabel, Condition, Kind);
-    }
-    /// Create an unconditional branch to a node.
-    static InstX86Br *create(Cfg *Func, CfgNode *Target, Mode Kind) {
-      constexpr CfgNode *NoCondTarget = nullptr;
-      constexpr InstX86Label *NoLabel = nullptr;
-      return new (Func->allocate<InstX86Br>())
-          InstX86Br(Func, NoCondTarget, Target, NoLabel, Cond::Br_None, Kind);
-    }
-    /// Create a non-terminator conditional branch to a node, with a fallthrough
-    /// to the next instruction in the current node. This is used for switch
-    /// lowering.
-    static InstX86Br *create(Cfg *Func, CfgNode *Target, BrCond Condition,
-                             Mode Kind) {
-      assert(Condition != Cond::Br_None);
-      constexpr CfgNode *NoUncondTarget = nullptr;
-      constexpr InstX86Label *NoLabel = nullptr;
-      return new (Func->allocate<InstX86Br>())
-          InstX86Br(Func, Target, NoUncondTarget, NoLabel, Condition, Kind);
-    }
-    /// Create a conditional intra-block branch (or unconditional, if
-    /// Condition==Br_None) to a label in the current block.
-    static InstX86Br *create(Cfg *Func, InstX86Label *Label, BrCond Condition,
-                             Mode Kind) {
-      constexpr CfgNode *NoCondTarget = nullptr;
-      constexpr CfgNode *NoUncondTarget = nullptr;
-      return new (Func->allocate<InstX86Br>())
-          InstX86Br(Func, NoCondTarget, NoUncondTarget, Label, Condition, Kind);
-    }
-    const CfgNode *getTargetTrue() const { return TargetTrue; }
-    const CfgNode *getTargetFalse() const { return TargetFalse; }
-    bool isNear() const { return Kind == Near; }
-    bool optimizeBranch(const CfgNode *NextNode);
-    uint32_t getEmitInstCount() const override {
-      uint32_t Sum = 0;
-      if (Label)
-        ++Sum;
-      if (getTargetTrue())
-        ++Sum;
-      if (getTargetFalse())
-        ++Sum;
-      return Sum;
-    }
-    bool isUnconditionalBranch() const override {
-      return !Label && Condition == Cond::Br_None;
-    }
-    const Inst *getIntraBlockBranchTarget() const override { return Label; }
-    bool repointEdges(CfgNode *OldNode, CfgNode *NewNode) override;
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Br);
-    }
-
-  private:
-    InstX86Br(Cfg *Func, const CfgNode *TargetTrue, const CfgNode *TargetFalse,
-              const InstX86Label *Label, BrCond Condition, Mode Kind);
-
-    BrCond Condition;
-    const CfgNode *TargetTrue;
-    const CfgNode *TargetFalse;
-    const InstX86Label *Label; // Intra-block branch target
-    const Mode Kind;
-  };
-
-  /// Jump to a target outside this function, such as tailcall, nacljump,
-  /// naclret, unreachable. This is different from a Branch instruction in that
-  /// there is no intra-function control flow to represent.
-  class InstX86Jmp final : public InstX86Base {
-    InstX86Jmp() = delete;
-    InstX86Jmp(const InstX86Jmp &) = delete;
-    InstX86Jmp &operator=(const InstX86Jmp &) = delete;
-
-  public:
-    static InstX86Jmp *create(Cfg *Func, Operand *Target) {
-      return new (Func->allocate<InstX86Jmp>()) InstX86Jmp(Func, Target);
-    }
-    Operand *getJmpTarget() const { return this->getSrc(0); }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Jmp);
-    }
-
-  private:
-    InstX86Jmp(Cfg *Func, Operand *Target);
-  };
-
-  /// Call instruction. Arguments should have already been pushed.
-  class InstX86Call final : public InstX86Base {
-    InstX86Call() = delete;
-    InstX86Call(const InstX86Call &) = delete;
-    InstX86Call &operator=(const InstX86Call &) = delete;
-
-  public:
-    static InstX86Call *create(Cfg *Func, Variable *Dest, Operand *CallTarget) {
-      return new (Func->allocate<InstX86Call>())
-          InstX86Call(Func, Dest, CallTarget);
-    }
-    Operand *getCallTarget() const { return this->getSrc(0); }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Call);
-    }
-
-  private:
-    InstX86Call(Cfg *Func, Variable *Dest, Operand *CallTarget);
-  };
-
-  /// Emit a one-operand (GPR) instruction.
-  static void emitIASOpTyGPR(const Cfg *Func, Type Ty, const Operand *Var,
-                             const GPREmitterOneOp &Emitter);
-
-  static void emitIASAsAddrOpTyGPR(const Cfg *Func, Type Ty, const Operand *Op0,
-                                   const Operand *Op1,
-                                   const GPREmitterAddrOp &Emitter);
-
-  static void emitIASGPRShift(const Cfg *Func, Type Ty, const Variable *Var,
-                              const Operand *Src,
-                              const GPREmitterShiftOp &Emitter);
-
-  static void emitIASAddrOpTyGPR(const Cfg *Func, Type Ty, const Address &Addr,
-                                 const Operand *Src,
-                                 const GPREmitterAddrOp &Emitter);
-
-  static void emitIASRegOpTyXMM(const Cfg *Func, Type Ty, const Variable *Var,
-                                const Operand *Src,
-                                const XmmEmitterRegOp &Emitter);
-
-  static void emitIASGPRShiftDouble(const Cfg *Func, const Variable *Dest,
-                                    const Operand *Src1Op,
-                                    const Operand *Src2Op,
-                                    const GPREmitterShiftD &Emitter);
-
-  template <typename DReg_t, typename SReg_t, DReg_t (*destEnc)(RegNumT),
-            SReg_t (*srcEnc)(RegNumT)>
-  static void emitIASCastRegOp(const Cfg *Func, Type DestTy,
-                               const Variable *Dest, Type SrcTy,
-                               const Operand *Src,
-                               const CastEmitterRegOp<DReg_t, SReg_t> &Emitter);
-
-  template <typename DReg_t, typename SReg_t, DReg_t (*destEnc)(RegNumT),
-            SReg_t (*srcEnc)(RegNumT)>
-  static void
-  emitIASThreeOpImmOps(const Cfg *Func, Type DispatchTy, const Variable *Dest,
-                       const Operand *Src0, const Operand *Src1,
-                       const ThreeOpImmEmitter<DReg_t, SReg_t> Emitter);
-
-  static void emitIASMovlikeXMM(const Cfg *Func, const Variable *Dest,
-                                const Operand *Src,
-                                const XmmEmitterMovOps Emitter);
-
-  static void emitVariableBlendInst(const char *Opcode, const Inst *Instr,
-                                    const Cfg *Func);
-
-  static void emitIASVariableBlendInst(const Inst *Instr, const Cfg *Func,
-                                       const XmmEmitterRegOp &Emitter);
-
-  static void emitIASXmmShift(const Cfg *Func, Type Ty, const Variable *Var,
-                              const Operand *Src,
-                              const XmmEmitterShiftOp &Emitter);
-
-  /// Emit a two-operand (GPR) instruction, where the dest operand is a Variable
-  /// that's guaranteed to be a register.
-  template <bool VarCanBeByte = true, bool SrcCanBeByte = true>
-  static void emitIASRegOpTyGPR(const Cfg *Func, bool IsLea, Type Ty,
-                                const Variable *Dst, const Operand *Src,
-                                const GPREmitterRegOp &Emitter);
-
-  /// Instructions of the form x := op(x).
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseInplaceopGPR : public InstX86Base {
-    InstX86BaseInplaceopGPR() = delete;
-    InstX86BaseInplaceopGPR(const InstX86BaseInplaceopGPR &) = delete;
-    InstX86BaseInplaceopGPR &
-    operator=(const InstX86BaseInplaceopGPR &) = delete;
-
-  public:
-    using Base = InstX86BaseInplaceopGPR<K>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrEmit();
-      assert(this->getSrcSize() == 1);
-      Str << "\t" << Opcode << "\t";
-      this->getSrc(0)->emit(Func);
-    }
-    void emitIAS(const Cfg *Func) const override {
-      assert(this->getSrcSize() == 1);
-      const Variable *Var = this->getDest();
-      Type Ty = Var->getType();
-      emitIASOpTyGPR(Func, Ty, Var, Emitter);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, K);
-    }
-
-  protected:
-    InstX86BaseInplaceopGPR(Cfg *Func, Operand *SrcDest)
-        : InstX86Base(Func, K, 1, llvm::dyn_cast<Variable>(SrcDest)) {
-      this->addSource(SrcDest);
-    }
-
-  private:
-    static const char *const Opcode;
-    static const GPREmitterOneOp Emitter;
-  };
-
-  /// Instructions of the form x := op(y).
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseUnaryopGPR : public InstX86Base {
-    InstX86BaseUnaryopGPR() = delete;
-    InstX86BaseUnaryopGPR(const InstX86BaseUnaryopGPR &) = delete;
-    InstX86BaseUnaryopGPR &operator=(const InstX86BaseUnaryopGPR &) = delete;
-
-  public:
-    using Base = InstX86BaseUnaryopGPR<K>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrEmit();
-      assert(this->getSrcSize() == 1);
-      Type SrcTy = this->getSrc(0)->getType();
-      Type DestTy = this->getDest()->getType();
-      Str << "\t" << Opcode << this->getWidthString(SrcTy);
-      // Movsx and movzx need both the source and dest type width letter to
-      // define the operation. The other unary operations have the same source
-      // and dest type and as a result need only one letter.
-      if (SrcTy != DestTy)
-        Str << this->getWidthString(DestTy);
-      Str << "\t";
-      this->getSrc(0)->emit(Func);
-      Str << ", ";
-      this->getDest()->emit(Func);
-    }
-    void emitIAS(const Cfg *Func) const override {
-      assert(this->getSrcSize() == 1);
-      const Variable *Var = this->getDest();
-      Type Ty = Var->getType();
-      const Operand *Src = this->getSrc(0);
-      constexpr bool IsLea = K == InstX86Base::Lea;
-
-      if (IsLea) {
-        if (auto *Add = deoptLeaToAddOrNull(Func)) {
-          Add->emitIAS(Func);
-          return;
-        }
-      }
-      emitIASRegOpTyGPR(Func, IsLea, Ty, Var, Src, Emitter);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getSrc(0)->getType() << " ";
-      this->dumpSources(Func);
-    }
-
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, K);
-    }
-
-  protected:
-    InstX86BaseUnaryopGPR(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86Base(Func, K, 1, Dest) {
-      this->addSource(Src);
-    }
-
-    Inst *deoptLeaToAddOrNull(const Cfg *Func) const {
-      // Revert back to Add when the Lea is a 2-address instruction.
-      // Caller has to emit, this just produces the add instruction.
-      if (auto *MemOp = llvm::dyn_cast<X86OperandMem>(this->getSrc(0))) {
-        if (getFlags().getAggressiveLea() &&
-            MemOp->getBase()->getRegNum() == this->getDest()->getRegNum() &&
-            MemOp->getIndex() == nullptr && MemOp->getShift() == 0) {
-          auto *Add = InstImpl<TraitsType>::InstX86Add::create(
-              const_cast<Cfg *>(Func), this->getDest(), MemOp->getOffset());
-          // TODO(manasijm): Remove const_cast by emitting code for add
-          // directly.
-          return Add;
-        }
-      }
-      return nullptr;
-    }
-
-    static const char *const Opcode;
-    static const GPREmitterRegOp Emitter;
-  };
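To make the condition in deoptLeaToAddOrNull above concrete: when the lea's
base register is also its destination and there is no index and no shift, the
lea only adds the offset to the destination, so the shorter two-address add is
equivalent (e.g. lea eax, [eax + 16] computes the same result as add eax, 16).
A hypothetical predicate capturing the structural part of that check, leaving
aside the AggressiveLea flag that gates it:

// Hypothetical stand-in for the pattern match in deoptLeaToAddOrNull.
bool leaIsPlainAdd(int BaseReg, int DestReg, bool HasIndex, int Shift) {
  return BaseReg == DestReg && !HasIndex && Shift == 0;
}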
-
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseUnaryopXmm : public InstX86Base {
-    InstX86BaseUnaryopXmm() = delete;
-    InstX86BaseUnaryopXmm(const InstX86BaseUnaryopXmm &) = delete;
-    InstX86BaseUnaryopXmm &operator=(const InstX86BaseUnaryopXmm &) = delete;
-
-  public:
-    using Base = InstX86BaseUnaryopXmm<K>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrEmit();
-      assert(this->getSrcSize() == 1);
-      Str << "\t" << Opcode << "\t";
-      this->getSrc(0)->emit(Func);
-      Str << ", ";
-      this->getDest()->emit(Func);
-    }
-    void emitIAS(const Cfg *Func) const override {
-      Type Ty = this->getDest()->getType();
-      assert(this->getSrcSize() == 1);
-      emitIASRegOpTyXMM(Func, Ty, this->getDest(), this->getSrc(0), Emitter);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, K);
-    }
-
-  protected:
-    InstX86BaseUnaryopXmm(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86Base(Func, K, 1, Dest) {
-      this->addSource(Src);
-    }
-
-    static const char *const Opcode;
-    static const XmmEmitterRegOp Emitter;
-  };
-
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseBinopGPRShift : public InstX86Base {
-    InstX86BaseBinopGPRShift() = delete;
-    InstX86BaseBinopGPRShift(const InstX86BaseBinopGPRShift &) = delete;
-    InstX86BaseBinopGPRShift &
-    operator=(const InstX86BaseBinopGPRShift &) = delete;
-
-  public:
-    using Base = InstX86BaseBinopGPRShift<K>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      this->emitTwoAddress(Func, Opcode);
-    }
-    void emitIAS(const Cfg *Func) const override {
-      Type Ty = this->getDest()->getType();
-      assert(this->getSrcSize() == 2);
-      emitIASGPRShift(Func, Ty, this->getDest(), this->getSrc(1), Emitter);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, K);
-    }
-
-  protected:
-    InstX86BaseBinopGPRShift(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86Base(Func, K, 2, Dest) {
-      this->addSource(Dest);
-      this->addSource(Source);
-    }
-
-    static const char *const Opcode;
-    static const GPREmitterShiftOp Emitter;
-  };
-
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseBinopGPR : public InstX86Base {
-    InstX86BaseBinopGPR() = delete;
-    InstX86BaseBinopGPR(const InstX86BaseBinopGPR &) = delete;
-    InstX86BaseBinopGPR &operator=(const InstX86BaseBinopGPR &) = delete;
-
-  public:
-    using Base = InstX86BaseBinopGPR<K>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      this->emitTwoAddress(Func, Opcode);
-    }
-    void emitIAS(const Cfg *Func) const override {
-      Type Ty = this->getDest()->getType();
-      assert(this->getSrcSize() == 2);
-      constexpr bool ThisIsLEA = K == InstX86Base::Lea;
-      static_assert(!ThisIsLEA, "Lea should be a unaryop.");
-      emitIASRegOpTyGPR(Func, !ThisIsLEA, Ty, this->getDest(), this->getSrc(1),
-                        Emitter);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::K);
-    }
-
-  protected:
-    InstX86BaseBinopGPR(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86Base(Func, K, 2, Dest) {
-      this->addSource(Dest);
-      this->addSource(Source);
-    }
-
-    static const char *const Opcode;
-    static const GPREmitterRegOp Emitter;
-  };
-
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseBinopRMW : public InstX86Base {
-    InstX86BaseBinopRMW() = delete;
-    InstX86BaseBinopRMW(const InstX86BaseBinopRMW &) = delete;
-    InstX86BaseBinopRMW &operator=(const InstX86BaseBinopRMW &) = delete;
-
-  public:
-    using Base = InstX86BaseBinopRMW<K>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      this->emitTwoAddress(Func, Opcode);
-    }
-    void emitIAS(const Cfg *Func) const override {
-      Type Ty = this->getSrc(0)->getType();
-      assert(this->getSrcSize() == 2);
-      emitIASAsAddrOpTyGPR(Func, Ty, this->getSrc(0), this->getSrc(1), Emitter);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      Str << Opcode << "." << this->getSrc(0)->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, K);
-    }
-
-  protected:
-    InstX86BaseBinopRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
-        : InstX86Base(Func, K, 2, nullptr) {
-      this->addSource(DestSrc0);
-      this->addSource(Src1);
-    }
-
-    static const char *const Opcode;
-    static const GPREmitterAddrOp Emitter;
-  };
-
-  template <typename InstX86Base::InstKindX86 K, bool NeedsElementType,
-            typename InstX86Base::SseSuffix Suffix>
-  class InstX86BaseBinopXmm : public InstX86Base {
-    InstX86BaseBinopXmm() = delete;
-    InstX86BaseBinopXmm(const InstX86BaseBinopXmm &) = delete;
-    InstX86BaseBinopXmm &operator=(const InstX86BaseBinopXmm &) = delete;
-
-  public:
-    using Base = InstX86BaseBinopXmm<K, NeedsElementType, Suffix>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      this->validateVectorAddrMode();
-      const Type DestTy = ArithmeticTypeOverride == IceType_void
-                              ? this->getDest()->getType()
-                              : ArithmeticTypeOverride;
-      const char *SuffixString = "";
-      switch (Suffix) {
-      case InstX86Base::SseSuffix::None:
-        break;
-      case InstX86Base::SseSuffix::Packed:
-        SuffixString = Traits::TypeAttributes[DestTy].PdPsString;
-        break;
-      case InstX86Base::SseSuffix::Unpack:
-        SuffixString = Traits::TypeAttributes[DestTy].UnpackString;
-        break;
-      case InstX86Base::SseSuffix::Scalar:
-        SuffixString = Traits::TypeAttributes[DestTy].SdSsString;
-        break;
-      case InstX86Base::SseSuffix::Integral:
-        SuffixString = Traits::TypeAttributes[DestTy].IntegralString;
-        break;
-      case InstX86Base::SseSuffix::Pack:
-        SuffixString = Traits::TypeAttributes[DestTy].PackString;
-        break;
-      }
-      this->emitTwoAddress(Func, Opcode, SuffixString);
-    }
-    void emitIAS(const Cfg *Func) const override {
-      this->validateVectorAddrMode();
-      Type Ty = this->getDest()->getType();
-      if (NeedsElementType)
-        Ty = typeElementType(Ty);
-      assert(this->getSrcSize() == 2);
-      emitIASRegOpTyXMM(Func, Ty, this->getDest(), this->getSrc(1), Emitter);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, K);
-    }
-
-  protected:
-    InstX86BaseBinopXmm(Cfg *Func, Variable *Dest, Operand *Source,
-                        Type ArithmeticTypeOverride = IceType_void)
-        : InstX86Base(Func, K, 2, Dest),
-          ArithmeticTypeOverride(ArithmeticTypeOverride) {
-      this->addSource(Dest);
-      this->addSource(Source);
-    }
-
-    const Type ArithmeticTypeOverride;
-
-    static const char *const Opcode;
-    static const XmmEmitterRegOp Emitter;
-  };
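The Suffix dispatch in emit() above appends a type-dependent string to the
base opcode, which is how a single table entry such as "padd" covers paddb,
paddw, and paddd for byte, word, and dword element types. A minimal sketch,
with a hypothetical TypeAttr standing in for Traits::TypeAttributes:

#include <string>

// Hypothetical attribute record; the real table carries several suffix
// families (packed, scalar, integral, ...), one string per type.
struct TypeAttr {
  const char *IntegralString; // "b", "w", "d", ... by element width
};

std::string mnemonic(const char *Opcode, const TypeAttr &Attr) {
  return std::string(Opcode) + Attr.IntegralString; // "padd" + "d" => "paddd"
}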
-
-  template <typename InstX86Base::InstKindX86 K, bool AllowAllTypes = false>
-  class InstX86BaseBinopXmmShift : public InstX86Base {
-    InstX86BaseBinopXmmShift() = delete;
-    InstX86BaseBinopXmmShift(const InstX86BaseBinopXmmShift &) = delete;
-    InstX86BaseBinopXmmShift &
-    operator=(const InstX86BaseBinopXmmShift &) = delete;
-
-  public:
-    using Base = InstX86BaseBinopXmmShift<K, AllowAllTypes>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      this->validateVectorAddrMode();
-      // Shift operations are always integral, and hence always need a suffix.
-      const Type DestTy = this->getDest()->getType();
-      this->emitTwoAddress(Func, this->Opcode,
-                           Traits::TypeAttributes[DestTy].IntegralString);
-    }
-    void emitIAS(const Cfg *Func) const override {
-      this->validateVectorAddrMode();
-      Type Ty = this->getDest()->getType();
-      assert(AllowAllTypes || isVectorType(Ty));
-      Type ElementTy = typeElementType(Ty);
-      assert(this->getSrcSize() == 2);
-      emitIASXmmShift(Func, ElementTy, this->getDest(), this->getSrc(1),
-                      Emitter);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, K);
-    }
-
-  protected:
-    InstX86BaseBinopXmmShift(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86Base(Func, K, 2, Dest) {
-      this->addSource(Dest);
-      this->addSource(Source);
-    }
-
-    static const char *const Opcode;
-    static const XmmEmitterShiftOp Emitter;
-  };
-
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseTernop : public InstX86Base {
-    InstX86BaseTernop() = delete;
-    InstX86BaseTernop(const InstX86BaseTernop &) = delete;
-    InstX86BaseTernop &operator=(const InstX86BaseTernop &) = delete;
-
-  public:
-    using Base = InstX86BaseTernop<K>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrEmit();
-      assert(this->getSrcSize() == 3);
-      Str << "\t" << Opcode << "\t";
-      this->getSrc(2)->emit(Func);
-      Str << ", ";
-      this->getSrc(1)->emit(Func);
-      Str << ", ";
-      this->getDest()->emit(Func);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, K);
-    }
-
-  protected:
-    InstX86BaseTernop(Cfg *Func, Variable *Dest, Operand *Source1,
-                      Operand *Source2)
-        : InstX86Base(Func, K, 3, Dest) {
-      this->addSource(Dest);
-      this->addSource(Source1);
-      this->addSource(Source2);
-    }
-
-    static const char *const Opcode;
-  };
-
-  // Instructions of the form x := y op z
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseThreeAddressop : public InstX86Base {
-    InstX86BaseThreeAddressop() = delete;
-    InstX86BaseThreeAddressop(const InstX86BaseThreeAddressop &) = delete;
-    InstX86BaseThreeAddressop &
-    operator=(const InstX86BaseThreeAddressop &) = delete;
-
-  public:
-    using Base = InstX86BaseThreeAddressop<K>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrEmit();
-      assert(this->getSrcSize() == 2);
-      Str << "\t" << Opcode << "\t";
-      this->getSrc(1)->emit(Func);
-      Str << ", ";
-      this->getSrc(0)->emit(Func);
-      Str << ", ";
-      this->getDest()->emit(Func);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, K);
-    }
-
-  protected:
-    InstX86BaseThreeAddressop(Cfg *Func, Variable *Dest, Operand *Source0,
-                              Operand *Source1)
-        : InstX86Base(Func, K, 2, Dest) {
-      this->addSource(Source0);
-      this->addSource(Source1);
-    }
-
-    static const char *const Opcode;
-  };
-
-  /// Base class for assignment instructions
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseMovlike : public InstX86Base {
-    InstX86BaseMovlike() = delete;
-    InstX86BaseMovlike(const InstX86BaseMovlike &) = delete;
-    InstX86BaseMovlike &operator=(const InstX86BaseMovlike &) = delete;
-
-  public:
-    using Base = InstX86BaseMovlike<K>;
-
-    bool isRedundantAssign() const override {
-      if (const auto *SrcVar =
-              llvm::dyn_cast<const Variable>(this->getSrc(0))) {
-        if (SrcVar->hasReg() && this->Dest->hasReg()) {
-          // An assignment between physical registers is considered redundant if
-          // they have the same base register and the same encoding. E.g.:
-          //   mov cl, ecx ==> redundant
-          //   mov ch, ecx ==> not redundant due to different encodings
-          //   mov ch, ebp ==> not redundant due to different base registers
-          //   mov ecx, ecx ==> redundant, and dangerous in x86-64. i64 zexting
-          //                    is handled by Inst86Zext.
-          const auto SrcReg = SrcVar->getRegNum();
-          const auto DestReg = this->Dest->getRegNum();
-          return (Traits::getEncoding(SrcReg) ==
-                  Traits::getEncoding(DestReg)) &&
-                 (Traits::getBaseReg(SrcReg) == Traits::getBaseReg(DestReg));
-        }
-      }
-      return checkForRedundantAssign(this->getDest(), this->getSrc(0));
-    }
-    bool isVarAssign() const override {
-      return llvm::isa<Variable>(this->getSrc(0));
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      Str << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpDest(Func);
-      Str << ", ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, K);
-    }
-
-  protected:
-    InstX86BaseMovlike(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86Base(Func, K, 1, Dest) {
-      this->addSource(Source);
-      // For an integer assignment, make sure it's either a same-type assignment
-      // or a truncation.
-      assert(!isScalarIntegerType(Dest->getType()) ||
-             (typeWidthInBytes(Dest->getType()) <=
-              typeWidthInBytes(Source->getType())));
-    }
-
-    static const char *const Opcode;
-  };
-
-  class InstX86Bswap : public InstX86BaseInplaceopGPR<InstX86Base::Bswap> {
-  public:
-    static InstX86Bswap *create(Cfg *Func, Operand *SrcDest) {
-      return new (Func->allocate<InstX86Bswap>()) InstX86Bswap(Func, SrcDest);
-    }
-
-  private:
-    InstX86Bswap(Cfg *Func, Operand *SrcDest)
-        : InstX86BaseInplaceopGPR<InstX86Base::Bswap>(Func, SrcDest) {}
-  };
-
-  class InstX86Neg : public InstX86BaseInplaceopGPR<InstX86Base::Neg> {
-  public:
-    static InstX86Neg *create(Cfg *Func, Operand *SrcDest) {
-      return new (Func->allocate<InstX86Neg>()) InstX86Neg(Func, SrcDest);
-    }
-
-  private:
-    InstX86Neg(Cfg *Func, Operand *SrcDest)
-        : InstX86BaseInplaceopGPR<InstX86Base::Neg>(Func, SrcDest) {}
-  };
-
-  class InstX86Bsf : public InstX86BaseUnaryopGPR<InstX86Base::Bsf> {
-  public:
-    static InstX86Bsf *create(Cfg *Func, Variable *Dest, Operand *Src) {
-      return new (Func->allocate<InstX86Bsf>()) InstX86Bsf(Func, Dest, Src);
-    }
-
-  private:
-    InstX86Bsf(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86BaseUnaryopGPR<InstX86Base::Bsf>(Func, Dest, Src) {}
-  };
-
-  class InstX86Bsr : public InstX86BaseUnaryopGPR<InstX86Base::Bsr> {
-  public:
-    static InstX86Bsr *create(Cfg *Func, Variable *Dest, Operand *Src) {
-      return new (Func->allocate<InstX86Bsr>()) InstX86Bsr(Func, Dest, Src);
-    }
-
-  private:
-    InstX86Bsr(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86BaseUnaryopGPR<InstX86Base::Bsr>(Func, Dest, Src) {}
-  };
-
-  class InstX86Lea : public InstX86BaseUnaryopGPR<InstX86Base::Lea> {
-  public:
-    static InstX86Lea *create(Cfg *Func, Variable *Dest, Operand *Src) {
-      return new (Func->allocate<InstX86Lea>()) InstX86Lea(Func, Dest, Src);
-    }
-
-    void emit(const Cfg *Func) const override;
-
-  private:
-    InstX86Lea(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86BaseUnaryopGPR<InstX86Base::Lea>(Func, Dest, Src) {}
-  };
-
-  // Cbwdq instruction - wrapper for cbw, cwd, and cdq
-  class InstX86Cbwdq : public InstX86BaseUnaryopGPR<InstX86Base::Cbwdq> {
-  public:
-    static InstX86Cbwdq *create(Cfg *Func, Variable *Dest, Operand *Src) {
-      return new (Func->allocate<InstX86Cbwdq>()) InstX86Cbwdq(Func, Dest, Src);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Cbwdq(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86BaseUnaryopGPR<InstX86Base::Cbwdq>(Func, Dest, Src) {}
-  };
-
-  class InstX86Movsx : public InstX86BaseUnaryopGPR<InstX86Base::Movsx> {
-  public:
-    static InstX86Movsx *create(Cfg *Func, Variable *Dest, Operand *Src) {
-      assert(typeWidthInBytes(Dest->getType()) >
-             typeWidthInBytes(Src->getType()));
-      return new (Func->allocate<InstX86Movsx>()) InstX86Movsx(Func, Dest, Src);
-    }
-
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Movsx(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86BaseUnaryopGPR<InstX86Base::Movsx>(Func, Dest, Src) {}
-  };
-
-  class InstX86Movzx : public InstX86BaseUnaryopGPR<InstX86Base::Movzx> {
-  public:
-    static InstX86Movzx *create(Cfg *Func, Variable *Dest, Operand *Src) {
-      assert(typeWidthInBytes(Dest->getType()) >
-             typeWidthInBytes(Src->getType()));
-      return new (Func->allocate<InstX86Movzx>()) InstX86Movzx(Func, Dest, Src);
-    }
-
-    void emit(const Cfg *Func) const override;
-
-    void emitIAS(const Cfg *Func) const override;
-
-    void setMustKeep() { MustKeep = true; }
-
-  private:
-    bool MustKeep = false;
-
-    InstX86Movzx(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86BaseUnaryopGPR<InstX86Base::Movzx>(Func, Dest, Src) {}
-
-    bool mayBeElided(const Variable *Dest, const Operand *Src) const;
-  };
-
-  class InstX86Movd : public InstX86BaseUnaryopXmm<InstX86Base::Movd> {
-  public:
-    static InstX86Movd *create(Cfg *Func, Variable *Dest, Operand *Src) {
-      return new (Func->allocate<InstX86Movd>()) InstX86Movd(Func, Dest, Src);
-    }
-
-    void emit(const Cfg *Func) const override;
-
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Movd(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86BaseUnaryopXmm<InstX86Base::Movd>(Func, Dest, Src) {}
-  };
-
-  class InstX86Movmsk final : public InstX86Base {
-    InstX86Movmsk() = delete;
-    InstX86Movmsk(const InstX86Movmsk &) = delete;
-    InstX86Movmsk &operator=(const InstX86Movmsk &) = delete;
-
-  public:
-    static InstX86Movmsk *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Movmsk>())
-          InstX86Movmsk(Func, Dest, Source);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Movmsk);
-    }
-
-  private:
-    InstX86Movmsk(Cfg *Func, Variable *Dest, Operand *Source);
-  };
-
-  class InstX86Sqrt : public InstX86BaseUnaryopXmm<InstX86Base::Sqrt> {
-  public:
-    static InstX86Sqrt *create(Cfg *Func, Variable *Dest, Operand *Src) {
-      return new (Func->allocate<InstX86Sqrt>()) InstX86Sqrt(Func, Dest, Src);
-    }
-
-    virtual void emit(const Cfg *Func) const override;
-
-  private:
-    InstX86Sqrt(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86BaseUnaryopXmm<InstX86Base::Sqrt>(Func, Dest, Src) {}
-  };
-
-  /// Move/assignment instruction - wrapper for mov/movss/movsd.
-  class InstX86Mov : public InstX86BaseMovlike<InstX86Base::Mov> {
-  public:
-    static InstX86Mov *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      assert(!isScalarIntegerType(Dest->getType()) ||
-             (typeWidthInBytes(Dest->getType()) <=
-              typeWidthInBytes(Source->getType())));
-      return new (Func->allocate<InstX86Mov>()) InstX86Mov(Func, Dest, Source);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Mov(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseMovlike<InstX86Base::Mov>(Func, Dest, Source) {}
-  };
-
-  /// Move packed - copy 128 bit values between XMM registers, or mem128 and XMM
-  /// registers.
-  class InstX86Movp : public InstX86BaseMovlike<InstX86Base::Movp> {
-  public:
-    static InstX86Movp *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Movp>())
-          InstX86Movp(Func, Dest, Source);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Movp(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseMovlike<InstX86Base::Movp>(Func, Dest, Source) {}
-  };
-
-  /// Movq - copy between XMM registers, or mem64 and XMM registers.
-  class InstX86Movq : public InstX86BaseMovlike<InstX86Base::Movq> {
-  public:
-    static InstX86Movq *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Movq>())
-          InstX86Movq(Func, Dest, Source);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Movq(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseMovlike<InstX86Base::Movq>(Func, Dest, Source) {}
-  };
-
-  class InstX86Add : public InstX86BaseBinopGPR<InstX86Base::Add> {
-  public:
-    static InstX86Add *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Add>()) InstX86Add(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Add(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPR<InstX86Base::Add>(Func, Dest, Source) {}
-  };
-
-  class InstX86AddRMW : public InstX86BaseBinopRMW<InstX86Base::AddRMW> {
-  public:
-    static InstX86AddRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
-                                 Operand *Src1) {
-      return new (Func->allocate<InstX86AddRMW>())
-          InstX86AddRMW(Func, DestSrc0, Src1);
-    }
-
-  private:
-    InstX86AddRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
-        : InstX86BaseBinopRMW<InstX86Base::AddRMW>(Func, DestSrc0, Src1) {}
-  };
-
-  class InstX86Addps
-      : public InstX86BaseBinopXmm<InstX86Base::Addps, true,
-                                   InstX86Base::SseSuffix::Packed> {
-  public:
-    static InstX86Addps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Addps>())
-          InstX86Addps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Addps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Addps, true,
-                              InstX86Base::SseSuffix::Packed>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Adc : public InstX86BaseBinopGPR<InstX86Base::Adc> {
-  public:
-    static InstX86Adc *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Adc>()) InstX86Adc(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Adc(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPR<InstX86Base::Adc>(Func, Dest, Source) {}
-  };
-
-  class InstX86AdcRMW : public InstX86BaseBinopRMW<InstX86Base::AdcRMW> {
-  public:
-    static InstX86AdcRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
-                                 Operand *Src1) {
-      return new (Func->allocate<InstX86AdcRMW>())
-          InstX86AdcRMW(Func, DestSrc0, Src1);
-    }
-
-  private:
-    InstX86AdcRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
-        : InstX86BaseBinopRMW<InstX86Base::AdcRMW>(Func, DestSrc0, Src1) {}
-  };
-
-  class InstX86Addss
-      : public InstX86BaseBinopXmm<InstX86Base::Addss, false,
-                                   InstX86Base::SseSuffix::Scalar> {
-  public:
-    static InstX86Addss *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Addss>())
-          InstX86Addss(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Addss(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Addss, false,
-                              InstX86Base::SseSuffix::Scalar>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Padd
-      : public InstX86BaseBinopXmm<InstX86Base::Padd, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Padd *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Padd>())
-          InstX86Padd(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Padd(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Padd, true,
-                              InstX86Base::SseSuffix::Integral>(Func, Dest,
-                                                                Source) {}
-  };
-
-  class InstX86Padds
-      : public InstX86BaseBinopXmm<InstX86Base::Padds, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Padds *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Padds>())
-          InstX86Padds(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Padds(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Padds, true,
-                              InstX86Base::SseSuffix::Integral>(Func, Dest,
-                                                                Source) {}
-  };
-
-  class InstX86Paddus
-      : public InstX86BaseBinopXmm<InstX86Base::Paddus, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Paddus *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Paddus>())
-          InstX86Paddus(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Paddus(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Paddus, true,
-                              InstX86Base::SseSuffix::Integral>(Func, Dest,
-                                                                Source) {}
-  };
-
-  class InstX86Sub : public InstX86BaseBinopGPR<InstX86Base::Sub> {
-  public:
-    static InstX86Sub *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Sub>()) InstX86Sub(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Sub(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPR<InstX86Base::Sub>(Func, Dest, Source) {}
-  };
-
-  class InstX86SubRMW : public InstX86BaseBinopRMW<InstX86Base::SubRMW> {
-  public:
-    static InstX86SubRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
-                                 Operand *Src1) {
-      return new (Func->allocate<InstX86SubRMW>())
-          InstX86SubRMW(Func, DestSrc0, Src1);
-    }
-
-  private:
-    InstX86SubRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
-        : InstX86BaseBinopRMW<InstX86Base::SubRMW>(Func, DestSrc0, Src1) {}
-  };
-
-  class InstX86Subps
-      : public InstX86BaseBinopXmm<InstX86Base::Subps, true,
-                                   InstX86Base::SseSuffix::Packed> {
-  public:
-    static InstX86Subps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Subps>())
-          InstX86Subps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Subps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Subps, true,
-                              InstX86Base::SseSuffix::Packed>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Subss
-      : public InstX86BaseBinopXmm<InstX86Base::Subss, false,
-                                   InstX86Base::SseSuffix::Scalar> {
-  public:
-    static InstX86Subss *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Subss>())
-          InstX86Subss(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Subss(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Subss, false,
-                              InstX86Base::SseSuffix::Scalar>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Sbb : public InstX86BaseBinopGPR<InstX86Base::Sbb> {
-  public:
-    static InstX86Sbb *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Sbb>()) InstX86Sbb(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Sbb(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPR<InstX86Base::Sbb>(Func, Dest, Source) {}
-  };
-
-  class InstX86SbbRMW : public InstX86BaseBinopRMW<InstX86Base::SbbRMW> {
-  public:
-    static InstX86SbbRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
-                                 Operand *Src1) {
-      return new (Func->allocate<InstX86SbbRMW>())
-          InstX86SbbRMW(Func, DestSrc0, Src1);
-    }
-
-  private:
-    InstX86SbbRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
-        : InstX86BaseBinopRMW<InstX86Base::SbbRMW>(Func, DestSrc0, Src1) {}
-  };
-
-  class InstX86Psub
-      : public InstX86BaseBinopXmm<InstX86Base::Psub, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Psub *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Psub>())
-          InstX86Psub(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Psub(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Psub, true,
-                              InstX86Base::SseSuffix::Integral>(Func, Dest,
-                                                                Source) {}
-  };
-
-  class InstX86Psubs
-      : public InstX86BaseBinopXmm<InstX86Base::Psubs, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Psubs *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Psubs>())
-          InstX86Psubs(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Psubs(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Psubs, true,
-                              InstX86Base::SseSuffix::Integral>(Func, Dest,
-                                                                Source) {}
-  };
-
-  class InstX86Psubus
-      : public InstX86BaseBinopXmm<InstX86Base::Psubus, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Psubus *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Psubus>())
-          InstX86Psubus(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Psubus(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Psubus, true,
-                              InstX86Base::SseSuffix::Integral>(Func, Dest,
-                                                                Source) {}
-  };
-
-  class InstX86And : public InstX86BaseBinopGPR<InstX86Base::And> {
-  public:
-    static InstX86And *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86And>()) InstX86And(Func, Dest, Source);
-    }
-
-  private:
-    InstX86And(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPR<InstX86Base::And>(Func, Dest, Source) {}
-  };
-
-  class InstX86Andnps
-      : public InstX86BaseBinopXmm<InstX86Base::Andnps, true,
-                                   InstX86Base::SseSuffix::Packed> {
-  public:
-    static InstX86Andnps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Andnps>())
-          InstX86Andnps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Andnps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Andnps, true,
-                              InstX86Base::SseSuffix::Packed>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Andps
-      : public InstX86BaseBinopXmm<InstX86Base::Andps, true,
-                                   InstX86Base::SseSuffix::Packed> {
-  public:
-    static InstX86Andps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Andps>())
-          InstX86Andps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Andps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Andps, true,
-                              InstX86Base::SseSuffix::Packed>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86AndRMW : public InstX86BaseBinopRMW<InstX86Base::AndRMW> {
-  public:
-    static InstX86AndRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
-                                 Operand *Src1) {
-      return new (Func->allocate<InstX86AndRMW>())
-          InstX86AndRMW(Func, DestSrc0, Src1);
-    }
-
-  private:
-    InstX86AndRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
-        : InstX86BaseBinopRMW<InstX86Base::AndRMW>(Func, DestSrc0, Src1) {}
-  };
-
-  class InstX86Pand : public InstX86BaseBinopXmm<InstX86Base::Pand, false,
-                                                 InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Pand *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Pand>())
-          InstX86Pand(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pand(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pand, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Pandn
-      : public InstX86BaseBinopXmm<InstX86Base::Pandn, false,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Pandn *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Pandn>())
-          InstX86Pandn(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pandn(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pandn, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Maxss
-      : public InstX86BaseBinopXmm<InstX86Base::Maxss, true,
-                                   InstX86Base::SseSuffix::Scalar> {
-  public:
-    static InstX86Maxss *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Maxss>())
-          InstX86Maxss(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Maxss(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Maxss, true,
-                              InstX86Base::SseSuffix::Scalar>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Minss
-      : public InstX86BaseBinopXmm<InstX86Base::Minss, true,
-                                   InstX86Base::SseSuffix::Scalar> {
-  public:
-    static InstX86Minss *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Minss>())
-          InstX86Minss(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Minss(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Minss, true,
-                              InstX86Base::SseSuffix::Scalar>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Maxps
-      : public InstX86BaseBinopXmm<InstX86Base::Maxps, true,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Maxps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Maxps>())
-          InstX86Maxps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Maxps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Maxps, true,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Minps
-      : public InstX86BaseBinopXmm<InstX86Base::Minps, true,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Minps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Minps>())
-          InstX86Minps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Minps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Minps, true,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Or : public InstX86BaseBinopGPR<InstX86Base::Or> {
-  public:
-    static InstX86Or *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Or>()) InstX86Or(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Or(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPR<InstX86Base::Or>(Func, Dest, Source) {}
-  };
-
-  class InstX86Orps
-      : public InstX86BaseBinopXmm<InstX86Base::Orps, true,
-                                   InstX86Base::SseSuffix::Packed> {
-  public:
-    static InstX86Orps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Orps>())
-          InstX86Orps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Orps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Orps, true,
-                              InstX86Base::SseSuffix::Packed>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86OrRMW : public InstX86BaseBinopRMW<InstX86Base::OrRMW> {
-  public:
-    static InstX86OrRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
-                                Operand *Src1) {
-      return new (Func->allocate<InstX86OrRMW>())
-          InstX86OrRMW(Func, DestSrc0, Src1);
-    }
-
-  private:
-    InstX86OrRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
-        : InstX86BaseBinopRMW<InstX86Base::OrRMW>(Func, DestSrc0, Src1) {}
-  };
-
-  class InstX86Por : public InstX86BaseBinopXmm<InstX86Base::Por, false,
-                                                InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Por *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Por>()) InstX86Por(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Por(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Por, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Xor : public InstX86BaseBinopGPR<InstX86Base::Xor> {
-  public:
-    static InstX86Xor *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Xor>()) InstX86Xor(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Xor(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPR<InstX86Base::Xor>(Func, Dest, Source) {}
-  };
-
-  class InstX86Xorps
-      : public InstX86BaseBinopXmm<InstX86Base::Xorps, true,
-                                   InstX86Base::SseSuffix::Packed> {
-  public:
-    static InstX86Xorps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Xorps>())
-          InstX86Xorps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Xorps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Xorps, true,
-                              InstX86Base::SseSuffix::Packed>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86XorRMW : public InstX86BaseBinopRMW<InstX86Base::XorRMW> {
-  public:
-    static InstX86XorRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
-                                 Operand *Src1) {
-      return new (Func->allocate<InstX86XorRMW>())
-          InstX86XorRMW(Func, DestSrc0, Src1);
-    }
-
-  private:
-    InstX86XorRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
-        : InstX86BaseBinopRMW<InstX86Base::XorRMW>(Func, DestSrc0, Src1) {}
-  };
-
-  class InstX86Pxor : public InstX86BaseBinopXmm<InstX86Base::Pxor, false,
-                                                 InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Pxor *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Pxor>())
-          InstX86Pxor(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pxor(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pxor, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Imul : public InstX86BaseBinopGPR<InstX86Base::Imul> {
-  public:
-    static InstX86Imul *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Imul>())
-          InstX86Imul(Func, Dest, Source);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Imul(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPR<InstX86Base::Imul>(Func, Dest, Source) {}
-  };
-
-  class InstX86ImulImm
-      : public InstX86BaseThreeAddressop<InstX86Base::ImulImm> {
-  public:
-    static InstX86ImulImm *create(Cfg *Func, Variable *Dest, Operand *Source0,
-                                  Operand *Source1) {
-      return new (Func->allocate<InstX86ImulImm>())
-          InstX86ImulImm(Func, Dest, Source0, Source1);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86ImulImm(Cfg *Func, Variable *Dest, Operand *Source0,
-                   Operand *Source1)
-        : InstX86BaseThreeAddressop<InstX86Base::ImulImm>(Func, Dest, Source0,
-                                                          Source1) {}
-  };
-
-  class InstX86Mulps
-      : public InstX86BaseBinopXmm<InstX86Base::Mulps, true,
-                                   InstX86Base::SseSuffix::Packed> {
-  public:
-    static InstX86Mulps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Mulps>())
-          InstX86Mulps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Mulps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Mulps, true,
-                              InstX86Base::SseSuffix::Packed>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Mulss
-      : public InstX86BaseBinopXmm<InstX86Base::Mulss, false,
-                                   InstX86Base::SseSuffix::Scalar> {
-  public:
-    static InstX86Mulss *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Mulss>())
-          InstX86Mulss(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Mulss(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Mulss, false,
-                              InstX86Base::SseSuffix::Scalar>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Pmull
-      : public InstX86BaseBinopXmm<InstX86Base::Pmull, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Pmull *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      bool TypesAreValid =
-          Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16;
-      auto *Target = InstX86Base::getTarget(Func);
-      bool InstructionSetIsValid =
-          Dest->getType() == IceType_v8i16 ||
-          Target->getInstructionSet() >= Traits::SSE4_1;
-      (void)TypesAreValid;
-      (void)InstructionSetIsValid;
-      assert(TypesAreValid);
-      assert(InstructionSetIsValid);
-      return new (Func->allocate<InstX86Pmull>())
-          InstX86Pmull(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pmull(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pmull, true,
-                              InstX86Base::SseSuffix::Integral>(Func, Dest,
-                                                                Source) {}
-  };
-
-  class InstX86Pmulhw
-      : public InstX86BaseBinopXmm<InstX86Base::Pmulhw, false,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Pmulhw *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      assert(Dest->getType() == IceType_v8i16 &&
-             Source->getType() == IceType_v8i16);
-      return new (Func->allocate<InstX86Pmulhw>())
-          InstX86Pmulhw(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pmulhw(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pmulhw, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Pmulhuw
-      : public InstX86BaseBinopXmm<InstX86Base::Pmulhuw, false,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Pmulhuw *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      assert(Dest->getType() == IceType_v8i16 &&
-             Source->getType() == IceType_v8i16);
-      return new (Func->allocate<InstX86Pmulhuw>())
-          InstX86Pmulhuw(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pmulhuw(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pmulhuw, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Pmaddwd
-      : public InstX86BaseBinopXmm<InstX86Base::Pmaddwd, false,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Pmaddwd *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      assert(Dest->getType() == IceType_v8i16 &&
-             Source->getType() == IceType_v8i16);
-      return new (Func->allocate<InstX86Pmaddwd>())
-          InstX86Pmaddwd(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pmaddwd(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pmaddwd, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Pmuludq
-      : public InstX86BaseBinopXmm<InstX86Base::Pmuludq, false,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Pmuludq *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      assert(Dest->getType() == IceType_v4i32 &&
-             Source->getType() == IceType_v4i32);
-      return new (Func->allocate<InstX86Pmuludq>())
-          InstX86Pmuludq(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pmuludq(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pmuludq, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Divps
-      : public InstX86BaseBinopXmm<InstX86Base::Divps, true,
-                                   InstX86Base::SseSuffix::Packed> {
-  public:
-    static InstX86Divps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Divps>())
-          InstX86Divps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Divps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Divps, true,
-                              InstX86Base::SseSuffix::Packed>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Divss
-      : public InstX86BaseBinopXmm<InstX86Base::Divss, false,
-                                   InstX86Base::SseSuffix::Scalar> {
-  public:
-    static InstX86Divss *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Divss>())
-          InstX86Divss(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Divss(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Divss, false,
-                              InstX86Base::SseSuffix::Scalar>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Rol : public InstX86BaseBinopGPRShift<InstX86Base::Rol> {
-  public:
-    static InstX86Rol *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Rol>()) InstX86Rol(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Rol(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPRShift<InstX86Base::Rol>(Func, Dest, Source) {}
-  };
-
-  class InstX86Shl : public InstX86BaseBinopGPRShift<InstX86Base::Shl> {
-  public:
-    static InstX86Shl *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Shl>()) InstX86Shl(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Shl(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPRShift<InstX86Base::Shl>(Func, Dest, Source) {}
-  };
-
-  class InstX86Psll : public InstX86BaseBinopXmmShift<InstX86Base::Psll> {
-  public:
-    static InstX86Psll *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      assert(
-          Dest->getType() == IceType_v8i16 || Dest->getType() == IceType_v8i1 ||
-          Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v4i1);
-      return new (Func->allocate<InstX86Psll>())
-          InstX86Psll(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Psll(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmmShift<InstX86Base::Psll>(Func, Dest, Source) {}
-  };
-
-  class InstX86Psrl : public InstX86BaseBinopXmmShift<InstX86Base::Psrl, true> {
-  public:
-    static InstX86Psrl *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Psrl>())
-          InstX86Psrl(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Psrl(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmmShift<InstX86Base::Psrl, true>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Shr : public InstX86BaseBinopGPRShift<InstX86Base::Shr> {
-  public:
-    static InstX86Shr *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Shr>()) InstX86Shr(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Shr(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPRShift<InstX86Base::Shr>(Func, Dest, Source) {}
-  };
-
-  class InstX86Sar : public InstX86BaseBinopGPRShift<InstX86Base::Sar> {
-  public:
-    static InstX86Sar *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Sar>()) InstX86Sar(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Sar(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPRShift<InstX86Base::Sar>(Func, Dest, Source) {}
-  };
-
-  class InstX86Psra : public InstX86BaseBinopXmmShift<InstX86Base::Psra> {
-  public:
-    static InstX86Psra *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      assert(
-          Dest->getType() == IceType_v8i16 || Dest->getType() == IceType_v8i1 ||
-          Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v4i1);
-      return new (Func->allocate<InstX86Psra>())
-          InstX86Psra(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Psra(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmmShift<InstX86Base::Psra>(Func, Dest, Source) {}
-  };
-
-  class InstX86Pcmpeq
-      : public InstX86BaseBinopXmm<InstX86Base::Pcmpeq, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Pcmpeq *create(Cfg *Func, Variable *Dest, Operand *Source,
-                                 Type ArithmeticTypeOverride = IceType_void) {
-      const Type Ty = ArithmeticTypeOverride == IceType_void
-                          ? Dest->getType()
-                          : ArithmeticTypeOverride;
-      (void)Ty;
-      assert((Ty != IceType_f64 && Ty != IceType_i64) ||
-             InstX86Base::getTarget(Func)->getInstructionSet() >=
-                 Traits::SSE4_1);
-      return new (Func->allocate<InstX86Pcmpeq>())
-          InstX86Pcmpeq(Func, Dest, Source, ArithmeticTypeOverride);
-    }
-
-  private:
-    InstX86Pcmpeq(Cfg *Func, Variable *Dest, Operand *Source,
-                  Type ArithmeticTypeOverride)
-        : InstX86BaseBinopXmm<InstX86Base::Pcmpeq, true,
-                              InstX86Base::SseSuffix::Integral>(
-              Func, Dest, Source, ArithmeticTypeOverride) {}
-  };
-
-  class InstX86Pcmpgt
-      : public InstX86BaseBinopXmm<InstX86Base::Pcmpgt, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Pcmpgt *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      assert(Dest->getType() != IceType_f64 ||
-             InstX86Base::getTarget(Func)->getInstructionSet() >=
-                 Traits::SSE4_1);
-      return new (Func->allocate<InstX86Pcmpgt>())
-          InstX86Pcmpgt(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pcmpgt(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pcmpgt, true,
-                              InstX86Base::SseSuffix::Integral>(Func, Dest,
-                                                                Source) {}
-  };
-
-  /// movss is only a binary operation when the source and dest operands are
-  /// both registers (the high bits of dest are left untouched). In other cases,
-  /// it behaves like a copy (mov-like) operation (and the high bits of dest are
-  /// cleared). InstX86Movss will assert that both its source and dest operands
-  /// are registers, so the lowering code should use _mov instead of _movss in
-  /// cases where a copy operation is intended.
-  class InstX86MovssRegs
-      : public InstX86BaseBinopXmm<InstX86Base::MovssRegs, false,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86MovssRegs *create(Cfg *Func, Variable *Dest,
-                                    Operand *Source) {
-      return new (Func->allocate<InstX86MovssRegs>())
-          InstX86MovssRegs(Func, Dest, Source);
-    }
-
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86MovssRegs(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::MovssRegs, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
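-
-  // Editor's note: an illustrative, standalone model of the movss semantics
-  // described above; not part of the original header. An xmm register is
-  // modeled as four 32-bit lanes (standard fixed-width integer types assumed
-  // in scope).
-  struct XmmModel {
-    uint32_t Lane[4];
-  };
-  // movss xmm, xmm: only lane 0 is copied; the high lanes of the destination
-  // are left untouched (the binary-operation case handled by MovssRegs).
-  inline void movssRegReg(XmmModel &Dest, const XmmModel &Src) {
-    Dest.Lane[0] = Src.Lane[0];
-  }
-  // movss xmm, m32: lane 0 is loaded and the high lanes are cleared (the
-  // mov-like copy case, for which the lowering should use _mov instead).
-  inline void movssRegMem(XmmModel &Dest, const uint32_t *Mem) {
-    Dest.Lane[0] = *Mem;
-    Dest.Lane[1] = Dest.Lane[2] = Dest.Lane[3] = 0;
-  }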
-
-  class InstX86Idiv : public InstX86BaseTernop<InstX86Base::Idiv> {
-  public:
-    static InstX86Idiv *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                               Operand *Source2) {
-      return new (Func->allocate<InstX86Idiv>())
-          InstX86Idiv(Func, Dest, Source1, Source2);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Idiv(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
-        : InstX86BaseTernop<InstX86Base::Idiv>(Func, Dest, Source1, Source2) {}
-  };
-
-  class InstX86Div : public InstX86BaseTernop<InstX86Base::Div> {
-  public:
-    static InstX86Div *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                              Operand *Source2) {
-      return new (Func->allocate<InstX86Div>())
-          InstX86Div(Func, Dest, Source1, Source2);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Div(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
-        : InstX86BaseTernop<InstX86Base::Div>(Func, Dest, Source1, Source2) {}
-  };
-
-  class InstX86Insertps : public InstX86BaseTernop<InstX86Base::Insertps> {
-  public:
-    static InstX86Insertps *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                                   Operand *Source2) {
-      return new (Func->allocate<InstX86Insertps>())
-          InstX86Insertps(Func, Dest, Source1, Source2);
-    }
-
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Insertps(Cfg *Func, Variable *Dest, Operand *Source1,
-                    Operand *Source2)
-        : InstX86BaseTernop<InstX86Base::Insertps>(Func, Dest, Source1,
-                                                   Source2) {}
-  };
-
-  class InstX86Pinsr : public InstX86BaseTernop<InstX86Base::Pinsr> {
-  public:
-    static InstX86Pinsr *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                                Operand *Source2) {
-      // pinsrb and pinsrd are SSE4.1 instructions.
-      assert(
-          Dest->getType() == IceType_v8i16 || Dest->getType() == IceType_v8i1 ||
-          InstX86Base::getTarget(Func)->getInstructionSet() >= Traits::SSE4_1);
-      return new (Func->allocate<InstX86Pinsr>())
-          InstX86Pinsr(Func, Dest, Source1, Source2);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Pinsr(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
-        : InstX86BaseTernop<InstX86Base::Pinsr>(Func, Dest, Source1, Source2) {}
-  };
-
-  class InstX86Shufps : public InstX86BaseTernop<InstX86Base::Shufps> {
-  public:
-    static InstX86Shufps *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                                 Operand *Source2) {
-      return new (Func->allocate<InstX86Shufps>())
-          InstX86Shufps(Func, Dest, Source1, Source2);
-    }
-
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Shufps(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
-        : InstX86BaseTernop<InstX86Base::Shufps>(Func, Dest, Source1, Source2) {
-    }
-  };
-
-  class InstX86Blendvps : public InstX86BaseTernop<InstX86Base::Blendvps> {
-  public:
-    static InstX86Blendvps *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                                   Operand *Source2) {
-      assert(InstX86Base::getTarget(Func)->getInstructionSet() >=
-             Traits::SSE4_1);
-      return new (Func->allocate<InstX86Blendvps>())
-          InstX86Blendvps(Func, Dest, Source1, Source2);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Blendvps(Cfg *Func, Variable *Dest, Operand *Source1,
-                    Operand *Source2)
-        : InstX86BaseTernop<InstX86Base::Blendvps>(Func, Dest, Source1,
-                                                   Source2) {}
-  };
-
-  class InstX86Pblendvb : public InstX86BaseTernop<InstX86Base::Pblendvb> {
-  public:
-    static InstX86Pblendvb *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                                   Operand *Source2) {
-      assert(InstX86Base::getTarget(Func)->getInstructionSet() >=
-             Traits::SSE4_1);
-      return new (Func->allocate<InstX86Pblendvb>())
-          InstX86Pblendvb(Func, Dest, Source1, Source2);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Pblendvb(Cfg *Func, Variable *Dest, Operand *Source1,
-                    Operand *Source2)
-        : InstX86BaseTernop<InstX86Base::Pblendvb>(Func, Dest, Source1,
-                                                   Source2) {}
-  };
-
-  class InstX86Pextr : public InstX86BaseThreeAddressop<InstX86Base::Pextr> {
-  public:
-    static InstX86Pextr *create(Cfg *Func, Variable *Dest, Operand *Source0,
-                                Operand *Source1) {
-      assert(Source0->getType() == IceType_v8i16 ||
-             Source0->getType() == IceType_v8i1 ||
-             InstX86Base::getTarget(Func)->getInstructionSet() >=
-                 Traits::SSE4_1);
-      return new (Func->allocate<InstX86Pextr>())
-          InstX86Pextr(Func, Dest, Source0, Source1);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Pextr(Cfg *Func, Variable *Dest, Operand *Source0, Operand *Source1)
-        : InstX86BaseThreeAddressop<InstX86Base::Pextr>(Func, Dest, Source0,
-                                                        Source1) {}
-  };
-
-  class InstX86Pshufd : public InstX86BaseThreeAddressop<InstX86Base::Pshufd> {
-  public:
-    static InstX86Pshufd *create(Cfg *Func, Variable *Dest, Operand *Source0,
-                                 Operand *Source1) {
-      return new (Func->allocate<InstX86Pshufd>())
-          InstX86Pshufd(Func, Dest, Source0, Source1);
-    }
-
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Pshufd(Cfg *Func, Variable *Dest, Operand *Source0, Operand *Source1)
-        : InstX86BaseThreeAddressop<InstX86Base::Pshufd>(Func, Dest, Source0,
-                                                         Source1) {}
-  };
-
-  /// Base class for a lockable x86-32 instruction (emits a lock prefix).
-  class InstX86BaseLockable : public InstX86Base {
-    InstX86BaseLockable() = delete;
-    InstX86BaseLockable(const InstX86BaseLockable &) = delete;
-    InstX86BaseLockable &operator=(const InstX86BaseLockable &) = delete;
-
-  protected:
-    bool Locked;
-
-    InstX86BaseLockable(Cfg *Func, typename InstX86Base::InstKindX86 Kind,
-                        SizeT Maxsrcs, Variable *Dest, bool Locked)
-        : InstX86Base(Func, Kind, Maxsrcs, Dest), Locked(Locked) {
-      // Assume that such instructions are used for atomics, so be careful
-      // with optimizations.
-      this->HasSideEffects = Locked;
-    }
-  };
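-
-  // Editor's note: a sketch of how an emitter of a lockable instruction
-  // might honor the Locked flag (hypothetical fragment, not the original
-  // emit code).
-  inline void emitLockPrefix(Ostream &Str, bool Locked) {
-    if (Locked)
-      Str << "\tlock";
-  }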
-
-  /// Mul instruction - unsigned multiply.
-  class InstX86Mul final : public InstX86Base {
-    InstX86Mul() = delete;
-    InstX86Mul(const InstX86Mul &) = delete;
-    InstX86Mul &operator=(const InstX86Mul &) = delete;
-
-  public:
-    static InstX86Mul *create(Cfg *Func, Variable *Dest, Variable *Source1,
-                              Operand *Source2) {
-      return new (Func->allocate<InstX86Mul>())
-          InstX86Mul(Func, Dest, Source1, Source2);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Mul);
-    }
-
-  private:
-    InstX86Mul(Cfg *Func, Variable *Dest, Variable *Source1, Operand *Source2);
-  };
-
-  /// Shld instruction - shift across a pair of operands.
-  class InstX86Shld final : public InstX86Base {
-    InstX86Shld() = delete;
-    InstX86Shld(const InstX86Shld &) = delete;
-    InstX86Shld &operator=(const InstX86Shld &) = delete;
-
-  public:
-    static InstX86Shld *create(Cfg *Func, Variable *Dest, Variable *Source1,
-                               Operand *Source2) {
-      return new (Func->allocate<InstX86Shld>())
-          InstX86Shld(Func, Dest, Source1, Source2);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Shld);
-    }
-
-  private:
-    InstX86Shld(Cfg *Func, Variable *Dest, Variable *Source1, Operand *Source2);
-  };
-
-  /// Shrd instruction - shift across a pair of operands.
-  class InstX86Shrd final : public InstX86Base {
-    InstX86Shrd() = delete;
-    InstX86Shrd(const InstX86Shrd &) = delete;
-    InstX86Shrd &operator=(const InstX86Shrd &) = delete;
-
-  public:
-    static InstX86Shrd *create(Cfg *Func, Variable *Dest, Variable *Source1,
-                               Operand *Source2) {
-      return new (Func->allocate<InstX86Shrd>())
-          InstX86Shrd(Func, Dest, Source1, Source2);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Shrd);
-    }
-
-  private:
-    InstX86Shrd(Cfg *Func, Variable *Dest, Variable *Source1, Operand *Source2);
-  };
-
-  /// Conditional move instruction.
-  class InstX86Cmov final : public InstX86Base {
-    InstX86Cmov() = delete;
-    InstX86Cmov(const InstX86Cmov &) = delete;
-    InstX86Cmov &operator=(const InstX86Cmov &) = delete;
-
-  public:
-    static InstX86Cmov *create(Cfg *Func, Variable *Dest, Operand *Source,
-                               BrCond Cond) {
-      return new (Func->allocate<InstX86Cmov>())
-          InstX86Cmov(Func, Dest, Source, Cond);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Cmov);
-    }
-
-  private:
-    InstX86Cmov(Cfg *Func, Variable *Dest, Operand *Source, BrCond Cond);
-
-    BrCond Condition;
-  };
-
-  /// Cmpps instruction - compare packed single-precision floating-point values.
-  class InstX86Cmpps final : public InstX86Base {
-    InstX86Cmpps() = delete;
-    InstX86Cmpps(const InstX86Cmpps &) = delete;
-    InstX86Cmpps &operator=(const InstX86Cmpps &) = delete;
-
-  public:
-    static InstX86Cmpps *create(Cfg *Func, Variable *Dest, Operand *Source,
-                                CmppsCond Condition) {
-      return new (Func->allocate<InstX86Cmpps>())
-          InstX86Cmpps(Func, Dest, Source, Condition);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Cmpps);
-    }
-
-  private:
-    InstX86Cmpps(Cfg *Func, Variable *Dest, Operand *Source, CmppsCond Cond);
-
-    CmppsCond Condition;
-  };
-
-  /// Cmpxchg instruction - cmpxchg <dest>, <desired> compares <dest> with
-  /// eax. If equal, the ZF is set and <desired> is stored in <dest>. If
-  /// not, ZF is cleared and <dest> is copied to eax (or subregister). <dest>
-  /// can be a register or memory, while <desired> must be a register. It is
-  /// the user's responsibility to mark eax with a FakeDef.
-  class InstX86Cmpxchg final : public InstX86BaseLockable {
-    InstX86Cmpxchg() = delete;
-    InstX86Cmpxchg(const InstX86Cmpxchg &) = delete;
-    InstX86Cmpxchg &operator=(const InstX86Cmpxchg &) = delete;
-
-  public:
-    static InstX86Cmpxchg *create(Cfg *Func, Operand *DestOrAddr, Variable *Eax,
-                                  Variable *Desired, bool Locked) {
-      return new (Func->allocate<InstX86Cmpxchg>())
-          InstX86Cmpxchg(Func, DestOrAddr, Eax, Desired, Locked);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Cmpxchg);
-    }
-
-  private:
-    InstX86Cmpxchg(Cfg *Func, Operand *DestOrAddr, Variable *Eax,
-                   Variable *Desired, bool Locked);
-  };
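-
-  // Editor's note: a minimal standalone model of the cmpxchg semantics
-  // documented above (32-bit case); not part of the original header. The
-  // return value models the resulting ZF.
-  inline bool cmpxchgModel(uint32_t &Dest, uint32_t &Eax, uint32_t Desired) {
-    if (Dest == Eax) {
-      Dest = Desired; // ZF is set and <desired> is stored in <dest>.
-      return true;
-    }
-    Eax = Dest; // ZF is cleared and <dest> is copied to eax.
-    return false;
-  }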
-
-  /// Cmpxchg8b instruction - cmpxchg8b <m64> compares <m64> with edx:eax.
-  /// If equal, the ZF is set and ecx:ebx is stored in <m64>. If not, ZF
-  /// is cleared and <m64> is copied to edx:eax. The caller is responsible for
-  /// inserting FakeDefs to mark edx and eax as modified. <m64> must be a memory
-  /// operand.
-  class InstX86Cmpxchg8b final : public InstX86BaseLockable {
-    InstX86Cmpxchg8b() = delete;
-    InstX86Cmpxchg8b(const InstX86Cmpxchg8b &) = delete;
-    InstX86Cmpxchg8b &operator=(const InstX86Cmpxchg8b &) = delete;
-
-  public:
-    static InstX86Cmpxchg8b *create(Cfg *Func, X86OperandMem *Dest,
-                                    Variable *Edx, Variable *Eax, Variable *Ecx,
-                                    Variable *Ebx, bool Locked) {
-      return new (Func->allocate<InstX86Cmpxchg8b>())
-          InstX86Cmpxchg8b(Func, Dest, Edx, Eax, Ecx, Ebx, Locked);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Cmpxchg8b);
-    }
-
-  private:
-    InstX86Cmpxchg8b(Cfg *Func, X86OperandMem *Dest, Variable *Edx,
-                     Variable *Eax, Variable *Ecx, Variable *Ebx, bool Locked);
-  };
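-
-  // Editor's note: a minimal standalone model of the cmpxchg8b semantics
-  // documented above; not part of the original header. The return value
-  // models the resulting ZF.
-  inline bool cmpxchg8bModel(uint64_t &Mem64, uint32_t &Edx, uint32_t &Eax,
-                             uint32_t Ecx, uint32_t Ebx) {
-    const uint64_t Expected = (uint64_t(Edx) << 32) | Eax;
-    if (Mem64 == Expected) {
-      Mem64 = (uint64_t(Ecx) << 32) | Ebx; // ZF is set.
-      return true;
-    }
-    Edx = uint32_t(Mem64 >> 32); // ZF is cleared; <m64> goes to edx:eax.
-    Eax = uint32_t(Mem64);
-    return false;
-  }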
-
-  /// Cvt instruction - wrapper for cvtsX2sY where X and Y are in {s,d,i} as
-  /// appropriate.  s=float, d=double, i=int. X and Y are determined from
-  /// dest/src types. Sign and zero extension on the integer operand need to be
-  /// done separately.
-  class InstX86Cvt final : public InstX86Base {
-    InstX86Cvt() = delete;
-    InstX86Cvt(const InstX86Cvt &) = delete;
-    InstX86Cvt &operator=(const InstX86Cvt &) = delete;
-
-  public:
-    enum CvtVariant { Si2ss, Tss2si, Ss2si, Float2float, Dq2ps, Tps2dq, Ps2dq };
-    static InstX86Cvt *create(Cfg *Func, Variable *Dest, Operand *Source,
-                              CvtVariant Variant) {
-      return new (Func->allocate<InstX86Cvt>())
-          InstX86Cvt(Func, Dest, Source, Variant);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Cvt);
-    }
-    bool isTruncating() const { return Variant == Tss2si || Variant == Tps2dq; }
-
-  private:
-    CvtVariant Variant;
-    InstX86Cvt(Cfg *Func, Variable *Dest, Operand *Source, CvtVariant Variant);
-  };
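-
-  // Editor's note: an illustrative sketch of which mnemonic family each
-  // CvtVariant wraps; a hypothetical helper, not part of the original
-  // header. The concrete s/d letters are chosen from the dest/src types at
-  // emit time, as the comment above describes.
-  inline const char *cvtVariantFamily(InstX86Cvt::CvtVariant V) {
-    switch (V) {
-    case InstX86Cvt::Si2ss:
-      return "cvtsi2ss / cvtsi2sd";
-    case InstX86Cvt::Tss2si:
-      return "cvttss2si / cvttsd2si";
-    case InstX86Cvt::Ss2si:
-      return "cvtss2si / cvtsd2si";
-    case InstX86Cvt::Float2float:
-      return "cvtss2sd / cvtsd2ss";
-    case InstX86Cvt::Dq2ps:
-      return "cvtdq2ps";
-    case InstX86Cvt::Tps2dq:
-      return "cvttps2dq";
-    case InstX86Cvt::Ps2dq:
-      return "cvtps2dq";
-    }
-    return "";
-  }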
-
-  /// Round instruction.
-  class InstX86Round final
-      : public InstX86BaseThreeAddressop<InstX86Base::Round> {
-  public:
-    static InstX86Round *create(Cfg *Func, Variable *Dest, Operand *Source,
-                                Operand *Imm) {
-      return new (Func->allocate<InstX86Round>())
-          InstX86Round(Func, Dest, Source, Imm);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Round(Cfg *Func, Variable *Dest, Operand *Source, Operand *Imm)
-        : InstX86BaseThreeAddressop<InstX86Base::Round>(Func, Dest, Source,
-                                                        Imm) {}
-  };
-
-  /// cmp - Integer compare instruction.
-  class InstX86Icmp final : public InstX86Base {
-    InstX86Icmp() = delete;
-    InstX86Icmp(const InstX86Icmp &) = delete;
-    InstX86Icmp &operator=(const InstX86Icmp &) = delete;
-
-  public:
-    static InstX86Icmp *create(Cfg *Func, Operand *Src1, Operand *Src2) {
-      return new (Func->allocate<InstX86Icmp>()) InstX86Icmp(Func, Src1, Src2);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Icmp);
-    }
-
-  private:
-    InstX86Icmp(Cfg *Func, Operand *Src1, Operand *Src2);
-  };
-
-  /// ucomiss/ucomisd - floating-point compare instruction.
-  class InstX86Ucomiss final : public InstX86Base {
-    InstX86Ucomiss() = delete;
-    InstX86Ucomiss(const InstX86Ucomiss &) = delete;
-    InstX86Ucomiss &operator=(const InstX86Ucomiss &) = delete;
-
-  public:
-    static InstX86Ucomiss *create(Cfg *Func, Operand *Src1, Operand *Src2) {
-      return new (Func->allocate<InstX86Ucomiss>())
-          InstX86Ucomiss(Func, Src1, Src2);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Ucomiss);
-    }
-
-  private:
-    InstX86Ucomiss(Cfg *Func, Operand *Src1, Operand *Src2);
-  };
-
-  /// UD2 instruction.
-  class InstX86UD2 final : public InstX86Base {
-    InstX86UD2() = delete;
-    InstX86UD2(const InstX86UD2 &) = delete;
-    InstX86UD2 &operator=(const InstX86UD2 &) = delete;
-
-  public:
-    static InstX86UD2 *create(Cfg *Func) {
-      return new (Func->allocate<InstX86UD2>()) InstX86UD2(Func);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::UD2);
-    }
-
-  private:
-    explicit InstX86UD2(Cfg *Func);
-  };
-
-  /// Int3 instruction.
-  class InstX86Int3 final : public InstX86Base {
-    InstX86Int3() = delete;
-    InstX86Int3(const InstX86Int3 &) = delete;
-    InstX86Int3 &operator=(const InstX86Int3 &) = delete;
-
-  public:
-    static InstX86Int3 *create(Cfg *Func) {
-      return new (Func->allocate<InstX86Int3>()) InstX86Int3(Func);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Int3);
-    }
-
-  private:
-    explicit InstX86Int3(Cfg *Func);
-  };
-
-  /// Test instruction.
-  class InstX86Test final : public InstX86Base {
-    InstX86Test() = delete;
-    InstX86Test(const InstX86Test &) = delete;
-    InstX86Test &operator=(const InstX86Test &) = delete;
-
-  public:
-    static InstX86Test *create(Cfg *Func, Operand *Source1, Operand *Source2) {
-      return new (Func->allocate<InstX86Test>())
-          InstX86Test(Func, Source1, Source2);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Test);
-    }
-
-  private:
-    InstX86Test(Cfg *Func, Operand *Source1, Operand *Source2);
-  };
-
-  /// Mfence instruction.
-  class InstX86Mfence final : public InstX86Base {
-    InstX86Mfence() = delete;
-    InstX86Mfence(const InstX86Mfence &) = delete;
-    InstX86Mfence &operator=(const InstX86Mfence &) = delete;
-
-  public:
-    static InstX86Mfence *create(Cfg *Func) {
-      return new (Func->allocate<InstX86Mfence>()) InstX86Mfence(Func);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Mfence);
-    }
-
-  private:
-    explicit InstX86Mfence(Cfg *Func);
-  };
-
-  /// This is essentially a "mov" instruction with an X86OperandMem operand
-  /// instead of Variable as the destination. It's important for liveness that
-  /// there is no Dest operand.
-  class InstX86Store final : public InstX86Base {
-    InstX86Store() = delete;
-    InstX86Store(const InstX86Store &) = delete;
-    InstX86Store &operator=(const InstX86Store &) = delete;
-
-  public:
-    static InstX86Store *create(Cfg *Func, Operand *Value, X86Operand *Mem) {
-      return new (Func->allocate<InstX86Store>())
-          InstX86Store(Func, Value, Mem);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Store);
-    }
-
-  private:
-    InstX86Store(Cfg *Func, Operand *Value, X86Operand *Mem);
-  };
-
-  /// This is essentially a vector "mov" instruction with an X86OperandMem
-  /// operand instead of Variable as the destination. It's
-  /// important for liveness that there is no Dest operand. The source must be
-  /// an Xmm register, since Dest is mem.
-  class InstX86StoreP final : public InstX86Base {
-    InstX86StoreP() = delete;
-    InstX86StoreP(const InstX86StoreP &) = delete;
-    InstX86StoreP &operator=(const InstX86StoreP &) = delete;
-
-  public:
-    static InstX86StoreP *create(Cfg *Func, Variable *Value,
-                                 X86OperandMem *Mem) {
-      return new (Func->allocate<InstX86StoreP>())
-          InstX86StoreP(Func, Value, Mem);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::StoreP);
-    }
-
-  private:
-    InstX86StoreP(Cfg *Func, Variable *Value, X86OperandMem *Mem);
-  };
-
-  class InstX86StoreQ final : public InstX86Base {
-    InstX86StoreQ() = delete;
-    InstX86StoreQ(const InstX86StoreQ &) = delete;
-    InstX86StoreQ &operator=(const InstX86StoreQ &) = delete;
-
-  public:
-    static InstX86StoreQ *create(Cfg *Func, Operand *Value,
-                                 X86OperandMem *Mem) {
-      return new (Func->allocate<InstX86StoreQ>())
-          InstX86StoreQ(Func, Value, Mem);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::StoreQ);
-    }
-
-  private:
-    InstX86StoreQ(Cfg *Func, Operand *Value, X86OperandMem *Mem);
-  };
-
-  class InstX86StoreD final : public InstX86Base {
-    InstX86StoreD() = delete;
-    InstX86StoreD(const InstX86StoreD &) = delete;
-    InstX86StoreD &operator=(const InstX86StoreD &) = delete;
-
-  public:
-    static InstX86StoreD *create(Cfg *Func, Operand *Value,
-                                 X86OperandMem *Mem) {
-      return new (Func->allocate<InstX86StoreD>())
-          InstX86StoreD(Func, Value, Mem);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::StoreD);
-    }
-
-  private:
-    InstX86StoreD(Cfg *Func, Operand *Value, X86OperandMem *Mem);
-  };
-
-  /// Nop instructions of varying length.
-  class InstX86Nop final : public InstX86Base {
-    InstX86Nop() = delete;
-    InstX86Nop(const InstX86Nop &) = delete;
-    InstX86Nop &operator=(const InstX86Nop &) = delete;
-
-  public:
-    // TODO: Replace with enum.
-    using NopVariant = unsigned;
-
-    static InstX86Nop *create(Cfg *Func, NopVariant Variant) {
-      return new (Func->allocate<InstX86Nop>()) InstX86Nop(Func, Variant);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Nop);
-    }
-
-  private:
-    InstX86Nop(Cfg *Func, NopVariant Variant);
-
-    NopVariant Variant;
-  };
-
-  /// Fld - load a value onto the x87 FP stack.
-  class InstX86Fld final : public InstX86Base {
-    InstX86Fld() = delete;
-    InstX86Fld(const InstX86Fld &) = delete;
-    InstX86Fld &operator=(const InstX86Fld &) = delete;
-
-  public:
-    static InstX86Fld *create(Cfg *Func, Operand *Src) {
-      return new (Func->allocate<InstX86Fld>()) InstX86Fld(Func, Src);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Fld);
-    }
-
-  private:
-    InstX86Fld(Cfg *Func, Operand *Src);
-  };
-
-  /// Fstp - store x87 st(0) into memory and pop st(0).
-  class InstX86Fstp final : public InstX86Base {
-    InstX86Fstp() = delete;
-    InstX86Fstp(const InstX86Fstp &) = delete;
-    InstX86Fstp &operator=(const InstX86Fstp &) = delete;
-
-  public:
-    static InstX86Fstp *create(Cfg *Func, Variable *Dest) {
-      return new (Func->allocate<InstX86Fstp>()) InstX86Fstp(Func, Dest);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Fstp);
-    }
-
-  private:
-    InstX86Fstp(Cfg *Func, Variable *Dest);
-  };
-
-  class InstX86Pop final : public InstX86Base {
-    InstX86Pop() = delete;
-    InstX86Pop(const InstX86Pop &) = delete;
-    InstX86Pop &operator=(const InstX86Pop &) = delete;
-
-  public:
-    static InstX86Pop *create(Cfg *Func, Variable *Dest) {
-      return new (Func->allocate<InstX86Pop>()) InstX86Pop(Func, Dest);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Pop);
-    }
-
-  private:
-    InstX86Pop(Cfg *Func, Variable *Dest);
-  };
-
-  class InstX86Push final : public InstX86Base {
-    InstX86Push() = delete;
-    InstX86Push(const InstX86Push &) = delete;
-    InstX86Push &operator=(const InstX86Push &) = delete;
-
-  public:
-    static InstX86Push *create(Cfg *Func, InstX86Label *Label) {
-      return new (Func->allocate<InstX86Push>()) InstX86Push(Func, Label);
-    }
-    static InstX86Push *create(Cfg *Func, Operand *Source) {
-      return new (Func->allocate<InstX86Push>()) InstX86Push(Func, Source);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Push);
-    }
-
-  private:
-    InstX86Label *Label = nullptr;
-
-    InstX86Push(Cfg *Func, Operand *Source);
-    InstX86Push(Cfg *Func, InstX86Label *Label);
-  };
-
-  /// Ret instruction. Currently only supports the "ret" version that does not
-  /// pop arguments. This instruction takes a Source operand (for non-void
-  /// returning functions) for liveness analysis, though a FakeUse before the
-  /// ret would do just as well.
-  class InstX86Ret final : public InstX86Base {
-    InstX86Ret() = delete;
-    InstX86Ret(const InstX86Ret &) = delete;
-    InstX86Ret &operator=(const InstX86Ret &) = delete;
-
-  public:
-    static InstX86Ret *create(Cfg *Func, Variable *Source = nullptr) {
-      return new (Func->allocate<InstX86Ret>()) InstX86Ret(Func, Source);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Ret);
-    }
-
-  private:
-    InstX86Ret(Cfg *Func, Variable *Source);
-  };
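A sketch of that liveness point (RetValue is an illustrative name and may be nullptr for a void function):

    // Sketch only: attaching the pre-colored return variable as a source
    // keeps it live up to the ret without a separate FakeUse.
    Context.insert(InstX86Ret::create(Func, RetValue));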
-
-  /// Conditional set-byte instruction.
-  class InstX86Setcc final : public InstX86Base {
-    InstX86Setcc() = delete;
-    InstX86Setcc(const InstX86Setcc &) = delete;
-    InstX86Setcc &operator=(const InstX86Setcc &) = delete;
-
-  public:
-    static InstX86Setcc *create(Cfg *Func, Variable *Dest, BrCond Cond) {
-      return new (Func->allocate<InstX86Setcc>())
-          InstX86Setcc(Func, Dest, Cond);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Setcc);
-    }
-
-  private:
-    InstX86Setcc(Cfg *Func, Variable *Dest, BrCond Cond);
-
-    const BrCond Condition;
-  };
-
-  /// Exchanging Add instruction. Exchanges the first operand (destination
-  /// operand) with the second operand (source operand), then loads the sum of
-  /// the two values into the destination operand. The destination may be a
-  /// register or memory, while the source must be a register.
-  ///
-  /// Both the dest and source are updated. The caller should then insert a
-  /// FakeDef to reflect the second update.
-  class InstX86Xadd final : public InstX86BaseLockable {
-    InstX86Xadd() = delete;
-    InstX86Xadd(const InstX86Xadd &) = delete;
-    InstX86Xadd &operator=(const InstX86Xadd &) = delete;
-
-  public:
-    static InstX86Xadd *create(Cfg *Func, Operand *Dest, Variable *Source,
-                               bool Locked) {
-      return new (Func->allocate<InstX86Xadd>())
-          InstX86Xadd(Func, Dest, Source, Locked);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Xadd);
-    }
-
-  private:
-    InstX86Xadd(Cfg *Func, Operand *Dest, Variable *Source, bool Locked);
-  };
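Following the comment's own advice, a hypothetical atomic fetch-add lowering (Addr and Contents are illustrative names) would pair the xadd with a FakeDef:

    // Sketch only: after the locked xadd, Contents holds the memory
    // location's previous value; the FakeDef makes that second update
    // visible to liveness.
    Context.insert(InstX86Xadd::create(Func, Addr, Contents, /*Locked=*/true));
    Context.insert(InstFakeDef::create(Func, Contents));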
-
-  /// Exchange instruction. Exchanges the first operand (destination operand)
-  /// with the second operand (source operand). At least one of the operands
-  /// must be a register (and the other can be reg or mem). Both the Dest and
-  /// Source are updated. If there is a memory operand, then the instruction is
-  /// automatically "locked" without the need for a lock prefix.
-  class InstX86Xchg final : public InstX86Base {
-    InstX86Xchg() = delete;
-    InstX86Xchg(const InstX86Xchg &) = delete;
-    InstX86Xchg &operator=(const InstX86Xchg &) = delete;
-
-  public:
-    static InstX86Xchg *create(Cfg *Func, Operand *Dest, Variable *Source) {
-      return new (Func->allocate<InstX86Xchg>())
-          InstX86Xchg(Func, Dest, Source);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Xchg);
-    }
-
-  private:
-    InstX86Xchg(Cfg *Func, Operand *Dest, Variable *Source);
-  };
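For example, a hypothetical atomic-swap lowering (illustrative names) can rely on that implicit locking and emit no prefix at all:

    // Sketch only: with a memory Dest, xchg is implicitly locked; both Addr's
    // contents and NewValue are updated by the exchange.
    Context.insert(InstX86Xchg::create(Func, Addr, NewValue));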
-
-  /// Start marker for the Intel Architecture Code Analyzer. This is not an
-  /// executable instruction and must only be used for analysis.
-  class InstX86IacaStart final : public InstX86Base {
-    InstX86IacaStart() = delete;
-    InstX86IacaStart(const InstX86IacaStart &) = delete;
-    InstX86IacaStart &operator=(const InstX86IacaStart &) = delete;
-
-  public:
-    static InstX86IacaStart *create(Cfg *Func) {
-      return new (Func->allocate<InstX86IacaStart>()) InstX86IacaStart(Func);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::IacaStart);
-    }
-
-  private:
-    explicit InstX86IacaStart(Cfg *Func);
-  };
-
-  /// End marker for the Intel Architecture Code Analyzer. This is not an
-  /// executable instruction and must only be used for analysis.
-  class InstX86IacaEnd final : public InstX86Base {
-    InstX86IacaEnd() = delete;
-    InstX86IacaEnd(const InstX86IacaEnd &) = delete;
-    InstX86IacaEnd &operator=(const InstX86IacaEnd &) = delete;
-
-  public:
-    static InstX86IacaEnd *create(Cfg *Func) {
-      return new (Func->allocate<InstX86IacaEnd>()) InstX86IacaEnd(Func);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::IacaEnd);
-    }
-
-  private:
-    explicit InstX86IacaEnd(Cfg *Func);
-  };
-
-  class InstX86Pshufb
-      : public InstX86BaseBinopXmm<InstX86Base::Pshufb, false,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Pshufb *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Pshufb>())
-          InstX86Pshufb(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pshufb(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pshufb, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Punpckl
-      : public InstX86BaseBinopXmm<InstX86Base::Punpckl, false,
-                                   InstX86Base::SseSuffix::Unpack> {
-  public:
-    static InstX86Punpckl *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Punpckl>())
-          InstX86Punpckl(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Punpckl(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Punpckl, false,
-                              InstX86Base::SseSuffix::Unpack>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Punpckh
-      : public InstX86BaseBinopXmm<InstX86Base::Punpckh, false,
-                                   InstX86Base::SseSuffix::Unpack> {
-  public:
-    static InstX86Punpckh *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Punpckh>())
-          InstX86Punpckh(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Punpckh(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Punpckh, false,
-                              InstX86Base::SseSuffix::Unpack>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Packss
-      : public InstX86BaseBinopXmm<InstX86Base::Packss, false,
-                                   InstX86Base::SseSuffix::Pack> {
-  public:
-    static InstX86Packss *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Packss>())
-          InstX86Packss(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Packss(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Packss, false,
-                              InstX86Base::SseSuffix::Pack>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Packus
-      : public InstX86BaseBinopXmm<InstX86Base::Packus, false,
-                                   InstX86Base::SseSuffix::Pack> {
-  public:
-    static InstX86Packus *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Packus>())
-          InstX86Packus(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Packus(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Packus, false,
-                              InstX86Base::SseSuffix::Pack>(Func, Dest,
-                                                            Source) {}
-  };
-
-}; // struct InstImpl
-
-/// struct Insts is a template that can be used to instantiate all the X86
-/// instructions for a target with a simple
-///
-/// using Insts = ::Ice::X8632::Insts<TraitsType>;
-template <typename TraitsType> struct Insts {
-  using FakeRMW = typename InstImpl<TraitsType>::InstX86FakeRMW;
-  using Label = typename InstImpl<TraitsType>::InstX86Label;
-
-  using Call = typename InstImpl<TraitsType>::InstX86Call;
-
-  using Br = typename InstImpl<TraitsType>::InstX86Br;
-  using Jmp = typename InstImpl<TraitsType>::InstX86Jmp;
-  using Bswap = typename InstImpl<TraitsType>::InstX86Bswap;
-  using Neg = typename InstImpl<TraitsType>::InstX86Neg;
-  using Bsf = typename InstImpl<TraitsType>::InstX86Bsf;
-  using Bsr = typename InstImpl<TraitsType>::InstX86Bsr;
-  using Lea = typename InstImpl<TraitsType>::InstX86Lea;
-  using Cbwdq = typename InstImpl<TraitsType>::InstX86Cbwdq;
-  using Movsx = typename InstImpl<TraitsType>::InstX86Movsx;
-  using Movzx = typename InstImpl<TraitsType>::InstX86Movzx;
-  using Movd = typename InstImpl<TraitsType>::InstX86Movd;
-  using Movmsk = typename InstImpl<TraitsType>::InstX86Movmsk;
-  using Sqrt = typename InstImpl<TraitsType>::InstX86Sqrt;
-  using Mov = typename InstImpl<TraitsType>::InstX86Mov;
-  using Movp = typename InstImpl<TraitsType>::InstX86Movp;
-  using Movq = typename InstImpl<TraitsType>::InstX86Movq;
-  using Add = typename InstImpl<TraitsType>::InstX86Add;
-  using AddRMW = typename InstImpl<TraitsType>::InstX86AddRMW;
-  using Addps = typename InstImpl<TraitsType>::InstX86Addps;
-  using Adc = typename InstImpl<TraitsType>::InstX86Adc;
-  using AdcRMW = typename InstImpl<TraitsType>::InstX86AdcRMW;
-  using Addss = typename InstImpl<TraitsType>::InstX86Addss;
-  using Andnps = typename InstImpl<TraitsType>::InstX86Andnps;
-  using Andps = typename InstImpl<TraitsType>::InstX86Andps;
-  using Padd = typename InstImpl<TraitsType>::InstX86Padd;
-  using Padds = typename InstImpl<TraitsType>::InstX86Padds;
-  using Paddus = typename InstImpl<TraitsType>::InstX86Paddus;
-  using Sub = typename InstImpl<TraitsType>::InstX86Sub;
-  using SubRMW = typename InstImpl<TraitsType>::InstX86SubRMW;
-  using Subps = typename InstImpl<TraitsType>::InstX86Subps;
-  using Subss = typename InstImpl<TraitsType>::InstX86Subss;
-  using Sbb = typename InstImpl<TraitsType>::InstX86Sbb;
-  using SbbRMW = typename InstImpl<TraitsType>::InstX86SbbRMW;
-  using Psub = typename InstImpl<TraitsType>::InstX86Psub;
-  using Psubs = typename InstImpl<TraitsType>::InstX86Psubs;
-  using Psubus = typename InstImpl<TraitsType>::InstX86Psubus;
-  using And = typename InstImpl<TraitsType>::InstX86And;
-  using AndRMW = typename InstImpl<TraitsType>::InstX86AndRMW;
-  using Pand = typename InstImpl<TraitsType>::InstX86Pand;
-  using Pandn = typename InstImpl<TraitsType>::InstX86Pandn;
-  using Or = typename InstImpl<TraitsType>::InstX86Or;
-  using Orps = typename InstImpl<TraitsType>::InstX86Orps;
-  using OrRMW = typename InstImpl<TraitsType>::InstX86OrRMW;
-  using Por = typename InstImpl<TraitsType>::InstX86Por;
-  using Xor = typename InstImpl<TraitsType>::InstX86Xor;
-  using Xorps = typename InstImpl<TraitsType>::InstX86Xorps;
-  using XorRMW = typename InstImpl<TraitsType>::InstX86XorRMW;
-  using Pxor = typename InstImpl<TraitsType>::InstX86Pxor;
-  using Maxss = typename InstImpl<TraitsType>::InstX86Maxss;
-  using Minss = typename InstImpl<TraitsType>::InstX86Minss;
-  using Maxps = typename InstImpl<TraitsType>::InstX86Maxps;
-  using Minps = typename InstImpl<TraitsType>::InstX86Minps;
-  using Imul = typename InstImpl<TraitsType>::InstX86Imul;
-  using ImulImm = typename InstImpl<TraitsType>::InstX86ImulImm;
-  using Mulps = typename InstImpl<TraitsType>::InstX86Mulps;
-  using Mulss = typename InstImpl<TraitsType>::InstX86Mulss;
-  using Pmull = typename InstImpl<TraitsType>::InstX86Pmull;
-  using Pmulhw = typename InstImpl<TraitsType>::InstX86Pmulhw;
-  using Pmulhuw = typename InstImpl<TraitsType>::InstX86Pmulhuw;
-  using Pmaddwd = typename InstImpl<TraitsType>::InstX86Pmaddwd;
-  using Pmuludq = typename InstImpl<TraitsType>::InstX86Pmuludq;
-  using Divps = typename InstImpl<TraitsType>::InstX86Divps;
-  using Divss = typename InstImpl<TraitsType>::InstX86Divss;
-  using Rol = typename InstImpl<TraitsType>::InstX86Rol;
-  using Shl = typename InstImpl<TraitsType>::InstX86Shl;
-  using Psll = typename InstImpl<TraitsType>::InstX86Psll;
-  using Psrl = typename InstImpl<TraitsType>::InstX86Psrl;
-  using Shr = typename InstImpl<TraitsType>::InstX86Shr;
-  using Sar = typename InstImpl<TraitsType>::InstX86Sar;
-  using Psra = typename InstImpl<TraitsType>::InstX86Psra;
-  using Pcmpeq = typename InstImpl<TraitsType>::InstX86Pcmpeq;
-  using Pcmpgt = typename InstImpl<TraitsType>::InstX86Pcmpgt;
-  using MovssRegs = typename InstImpl<TraitsType>::InstX86MovssRegs;
-  using Idiv = typename InstImpl<TraitsType>::InstX86Idiv;
-  using Div = typename InstImpl<TraitsType>::InstX86Div;
-  using Insertps = typename InstImpl<TraitsType>::InstX86Insertps;
-  using Pinsr = typename InstImpl<TraitsType>::InstX86Pinsr;
-  using Shufps = typename InstImpl<TraitsType>::InstX86Shufps;
-  using Blendvps = typename InstImpl<TraitsType>::InstX86Blendvps;
-  using Pblendvb = typename InstImpl<TraitsType>::InstX86Pblendvb;
-  using Pextr = typename InstImpl<TraitsType>::InstX86Pextr;
-  using Pshufd = typename InstImpl<TraitsType>::InstX86Pshufd;
-  using Lockable = typename InstImpl<TraitsType>::InstX86BaseLockable;
-  using Mul = typename InstImpl<TraitsType>::InstX86Mul;
-  using Shld = typename InstImpl<TraitsType>::InstX86Shld;
-  using Shrd = typename InstImpl<TraitsType>::InstX86Shrd;
-  using Cmov = typename InstImpl<TraitsType>::InstX86Cmov;
-  using Cmpps = typename InstImpl<TraitsType>::InstX86Cmpps;
-  using Cmpxchg = typename InstImpl<TraitsType>::InstX86Cmpxchg;
-  using Cmpxchg8b = typename InstImpl<TraitsType>::InstX86Cmpxchg8b;
-  using Cvt = typename InstImpl<TraitsType>::InstX86Cvt;
-  using Round = typename InstImpl<TraitsType>::InstX86Round;
-  using Icmp = typename InstImpl<TraitsType>::InstX86Icmp;
-  using Ucomiss = typename InstImpl<TraitsType>::InstX86Ucomiss;
-  using UD2 = typename InstImpl<TraitsType>::InstX86UD2;
-  using Int3 = typename InstImpl<TraitsType>::InstX86Int3;
-  using Test = typename InstImpl<TraitsType>::InstX86Test;
-  using Mfence = typename InstImpl<TraitsType>::InstX86Mfence;
-  using Store = typename InstImpl<TraitsType>::InstX86Store;
-  using StoreP = typename InstImpl<TraitsType>::InstX86StoreP;
-  using StoreQ = typename InstImpl<TraitsType>::InstX86StoreQ;
-  using StoreD = typename InstImpl<TraitsType>::InstX86StoreD;
-  using Nop = typename InstImpl<TraitsType>::InstX86Nop;
-  using Fld = typename InstImpl<TraitsType>::InstX86Fld;
-  using Fstp = typename InstImpl<TraitsType>::InstX86Fstp;
-  using Pop = typename InstImpl<TraitsType>::InstX86Pop;
-  using Push = typename InstImpl<TraitsType>::InstX86Push;
-  using Ret = typename InstImpl<TraitsType>::InstX86Ret;
-  using Setcc = typename InstImpl<TraitsType>::InstX86Setcc;
-  using Xadd = typename InstImpl<TraitsType>::InstX86Xadd;
-  using Xchg = typename InstImpl<TraitsType>::InstX86Xchg;
-
-  using IacaStart = typename InstImpl<TraitsType>::InstX86IacaStart;
-  using IacaEnd = typename InstImpl<TraitsType>::InstX86IacaEnd;
-
-  using Pshufb = typename InstImpl<TraitsType>::InstX86Pshufb;
-  using Punpckl = typename InstImpl<TraitsType>::InstX86Punpckl;
-  using Punpckh = typename InstImpl<TraitsType>::InstX86Punpckh;
-  using Packss = typename InstImpl<TraitsType>::InstX86Packss;
-  using Packus = typename InstImpl<TraitsType>::InstX86Packus;
-};
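A target would then presumably reach every instruction through these aliases, e.g. (illustrative operands):

    using Insts = ::Ice::X8632::Insts<TargetX8632Traits>;
    // Sketch only: create and insert a return through the alias.
    Context.insert(Insts::Ret::create(Func, RetValue));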
-
-/// X86 instructions have static data (particularly, opcodes and instruction
-/// emitters). Each X86 target needs to declare and define all of these, so
-/// the macros below are provided; if something changes, all X86 targets are
-/// updated automatically.
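A hypothetical expansion site, once per target translation unit (the macro itself follows):

    // Sketch only: expanding the macro with a target's traits type defines
    // all of its Opcode strings and Emitter tables in one place.
    X86INSTS_DEFINE_STATIC_DATA(TargetX8632Traits)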
-#define X86INSTS_DEFINE_STATIC_DATA(TraitsType)                                \
-  namespace Ice {                                                              \
-  namespace X8632 {                                                            \
-  /* In-place ops */                                                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Bswap::Base::Opcode =     \
-      "bswap";                                                                 \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Neg::Base::Opcode =       \
-      "neg";                                                                   \
-  /* Unary ops */                                                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Bsf::Base::Opcode =       \
-      "bsf";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Bsr::Base::Opcode =       \
-      "bsr";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Lea::Base::Opcode =       \
-      "lea";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Movd::Base::Opcode =      \
-      "movd";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Movsx::Base::Opcode =     \
-      "movs";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Movzx::Base::Opcode =     \
-      "movz";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Sqrt::Base::Opcode =      \
-      "sqrt";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Cbwdq::Base::Opcode =     \
-      "cbw/cwd/cdq";                                                           \
-  /* Mov-like ops */                                                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Mov::Base::Opcode =       \
-      "mov";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Movp::Base::Opcode =      \
-      "movups";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Movq::Base::Opcode =      \
-      "movq";                                                                  \
-  /* Binary ops */                                                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Add::Base::Opcode =       \
-      "add";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86AddRMW::Base::Opcode =    \
-      "add";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Addps::Base::Opcode =     \
-      "add";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Adc::Base::Opcode =       \
-      "adc";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86AdcRMW::Base::Opcode =    \
-      "adc";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Addss::Base::Opcode =     \
-      "add";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Andnps::Base::Opcode =    \
-      "andn";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Andps::Base::Opcode =     \
-      "and";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Maxss::Base::Opcode =     \
-      "max";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Minss::Base::Opcode =     \
-      "min";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Maxps::Base::Opcode =     \
-      "max";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Minps::Base::Opcode =     \
-      "min";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Padd::Base::Opcode =      \
-      "padd";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Padds::Base::Opcode =     \
-      "padds";                                                                 \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Paddus::Base::Opcode =    \
-      "paddus";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Sub::Base::Opcode =       \
-      "sub";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86SubRMW::Base::Opcode =    \
-      "sub";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Subps::Base::Opcode =     \
-      "sub";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Subss::Base::Opcode =     \
-      "sub";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Sbb::Base::Opcode =       \
-      "sbb";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86SbbRMW::Base::Opcode =    \
-      "sbb";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Psub::Base::Opcode =      \
-      "psub";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Psubs::Base::Opcode =     \
-      "psubs";                                                                 \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Psubus::Base::Opcode =    \
-      "psubus";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86And::Base::Opcode =       \
-      "and";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86AndRMW::Base::Opcode =    \
-      "and";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pand::Base::Opcode =      \
-      "pand";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pandn::Base::Opcode =     \
-      "pandn";                                                                 \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Or::Base::Opcode = "or";  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Orps::Base::Opcode =      \
-      "or";                                                                    \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86OrRMW::Base::Opcode =     \
-      "or";                                                                    \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Por::Base::Opcode =       \
-      "por";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Xor::Base::Opcode =       \
-      "xor";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Xorps::Base::Opcode =     \
-      "xor";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86XorRMW::Base::Opcode =    \
-      "xor";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pxor::Base::Opcode =      \
-      "pxor";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Imul::Base::Opcode =      \
-      "imul";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86ImulImm::Base::Opcode =   \
-      "imul";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Mulps::Base::Opcode =     \
-      "mul";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Mulss::Base::Opcode =     \
-      "mul";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pmull::Base::Opcode =     \
-      "pmull";                                                                 \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pmulhw::Base::Opcode =    \
-      "pmulhw";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pmulhuw::Base::Opcode =   \
-      "pmulhuw";                                                               \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pmaddwd::Base::Opcode =   \
-      "pmaddwd";                                                               \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pmuludq::Base::Opcode =   \
-      "pmuludq";                                                               \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Div::Base::Opcode =       \
-      "div";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Divps::Base::Opcode =     \
-      "div";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Divss::Base::Opcode =     \
-      "div";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Idiv::Base::Opcode =      \
-      "idiv";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Rol::Base::Opcode =       \
-      "rol";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Shl::Base::Opcode =       \
-      "shl";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Psll::Base::Opcode =      \
-      "psll";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Shr::Base::Opcode =       \
-      "shr";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Sar::Base::Opcode =       \
-      "sar";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Psra::Base::Opcode =      \
-      "psra";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Psrl::Base::Opcode =      \
-      "psrl";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pcmpeq::Base::Opcode =    \
-      "pcmpeq";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pcmpgt::Base::Opcode =    \
-      "pcmpgt";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86MovssRegs::Base::Opcode = \
-      "movss";                                                                 \
-  /* Ternary ops */                                                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Insertps::Base::Opcode =  \
-      "insertps";                                                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Round::Base::Opcode =     \
-      "round";                                                                 \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Shufps::Base::Opcode =    \
-      "shufps";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pinsr::Base::Opcode =     \
-      "pinsr";                                                                 \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Blendvps::Base::Opcode =  \
-      "blendvps";                                                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pblendvb::Base::Opcode =  \
-      "pblendvb";                                                              \
-  /* Three address ops */                                                      \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pextr::Base::Opcode =     \
-      "pextr";                                                                 \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pshufd::Base::Opcode =    \
-      "pshufd";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pshufb::Base::Opcode =    \
-      "pshufb";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Punpckl::Base::Opcode =   \
-      "punpckl";                                                               \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Punpckh::Base::Opcode =   \
-      "punpckh";                                                               \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Packss::Base::Opcode =    \
-      "packss";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Packus::Base::Opcode =    \
-      "packus";                                                                \
-  /* In-place GPR ops */                                                       \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterOneOp                   \
-      InstImpl<TraitsType>::InstX86Bswap::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::bswap,                             \
-          nullptr /* only a reg form exists */                                 \
-  };                                                                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterOneOp                   \
-      InstImpl<TraitsType>::InstX86Neg::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::neg,                               \
-          &InstImpl<TraitsType>::Assembler::neg};                              \
-                                                                               \
-  /* Unary GPR ops */                                                          \
-  template <>                                                                  \
-  template <> /* uses specialized emitter. */                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Cbwdq::Base::Emitter = {nullptr, nullptr,   \
-                                                           nullptr};           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Bsf::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::bsf,                               \
-          &InstImpl<TraitsType>::Assembler::bsf, nullptr};                     \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Bsr::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::bsr,                               \
-          &InstImpl<TraitsType>::Assembler::bsr, nullptr};                     \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Lea::Base::Emitter = {                      \
-          /* reg/reg and reg/imm are illegal */ nullptr,                       \
-          &InstImpl<TraitsType>::Assembler::lea, nullptr};                     \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Movsx::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::movsx,                             \
-          &InstImpl<TraitsType>::Assembler::movsx, nullptr};                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Movzx::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::movzx,                             \
-          &InstImpl<TraitsType>::Assembler::movzx, nullptr};                   \
-                                                                               \
-  /* Unary XMM ops */                                                          \
-  template <>                                                                  \
-  template <> /* uses specialized emitter. */                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Movd::Base::Emitter = {nullptr, nullptr};   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Sqrt::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::sqrt,                              \
-          &InstImpl<TraitsType>::Assembler::sqrt};                             \
-                                                                               \
-  /* Binary GPR ops */                                                         \
-  template <>                                                                  \
-  template <> /* uses specialized emitter. */                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Imul::Base::Emitter = {nullptr, nullptr,    \
-                                                          nullptr};            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Add::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::add,                               \
-          &InstImpl<TraitsType>::Assembler::add,                               \
-          &InstImpl<TraitsType>::Assembler::add};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterAddrOp                  \
-      InstImpl<TraitsType>::InstX86AddRMW::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::add,                               \
-          &InstImpl<TraitsType>::Assembler::add};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Adc::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::adc,                               \
-          &InstImpl<TraitsType>::Assembler::adc,                               \
-          &InstImpl<TraitsType>::Assembler::adc};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterAddrOp                  \
-      InstImpl<TraitsType>::InstX86AdcRMW::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::adc,                               \
-          &InstImpl<TraitsType>::Assembler::adc};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86And::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::And,                               \
-          &InstImpl<TraitsType>::Assembler::And,                               \
-          &InstImpl<TraitsType>::Assembler::And};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterAddrOp                  \
-      InstImpl<TraitsType>::InstX86AndRMW::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::And,                               \
-          &InstImpl<TraitsType>::Assembler::And};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Or::Base::Emitter = {                       \
-          &InstImpl<TraitsType>::Assembler::Or,                                \
-          &InstImpl<TraitsType>::Assembler::Or,                                \
-          &InstImpl<TraitsType>::Assembler::Or};                               \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterAddrOp                  \
-      InstImpl<TraitsType>::InstX86OrRMW::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::Or,                                \
-          &InstImpl<TraitsType>::Assembler::Or};                               \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Sbb::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::sbb,                               \
-          &InstImpl<TraitsType>::Assembler::sbb,                               \
-          &InstImpl<TraitsType>::Assembler::sbb};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterAddrOp                  \
-      InstImpl<TraitsType>::InstX86SbbRMW::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::sbb,                               \
-          &InstImpl<TraitsType>::Assembler::sbb};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Sub::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::sub,                               \
-          &InstImpl<TraitsType>::Assembler::sub,                               \
-          &InstImpl<TraitsType>::Assembler::sub};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterAddrOp                  \
-      InstImpl<TraitsType>::InstX86SubRMW::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::sub,                               \
-          &InstImpl<TraitsType>::Assembler::sub};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Xor::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::Xor,                               \
-          &InstImpl<TraitsType>::Assembler::Xor,                               \
-          &InstImpl<TraitsType>::Assembler::Xor};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterAddrOp                  \
-      InstImpl<TraitsType>::InstX86XorRMW::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::Xor,                               \
-          &InstImpl<TraitsType>::Assembler::Xor};                              \
-                                                                               \
-  /* Binary Shift GPR ops */                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterShiftOp                 \
-      InstImpl<TraitsType>::InstX86Rol::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::rol,                               \
-          &InstImpl<TraitsType>::Assembler::rol};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterShiftOp                 \
-      InstImpl<TraitsType>::InstX86Sar::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::sar,                               \
-          &InstImpl<TraitsType>::Assembler::sar};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterShiftOp                 \
-      InstImpl<TraitsType>::InstX86Shl::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::shl,                               \
-          &InstImpl<TraitsType>::Assembler::shl};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterShiftOp                 \
-      InstImpl<TraitsType>::InstX86Shr::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::shr,                               \
-          &InstImpl<TraitsType>::Assembler::shr};                              \
-                                                                               \
-  /* Binary XMM ops */                                                         \
-  template <>                                                                  \
-  template <> /* uses specialized emitter. */                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86MovssRegs::Base::Emitter = {nullptr,        \
-                                                               nullptr};       \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Addss::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::addss,                             \
-          &InstImpl<TraitsType>::Assembler::addss};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Addps::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::addps,                             \
-          &InstImpl<TraitsType>::Assembler::addps};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Divss::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::divss,                             \
-          &InstImpl<TraitsType>::Assembler::divss};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Divps::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::divps,                             \
-          &InstImpl<TraitsType>::Assembler::divps};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Mulss::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::mulss,                             \
-          &InstImpl<TraitsType>::Assembler::mulss};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Mulps::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::mulps,                             \
-          &InstImpl<TraitsType>::Assembler::mulps};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Padd::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::padd,                              \
-          &InstImpl<TraitsType>::Assembler::padd};                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Padds::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::padds,                             \
-          &InstImpl<TraitsType>::Assembler::padds};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Paddus::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::paddus,                            \
-          &InstImpl<TraitsType>::Assembler::paddus};                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pand::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::pand,                              \
-          &InstImpl<TraitsType>::Assembler::pand};                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pandn::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::pandn,                             \
-          &InstImpl<TraitsType>::Assembler::pandn};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pcmpeq::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::pcmpeq,                            \
-          &InstImpl<TraitsType>::Assembler::pcmpeq};                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pcmpgt::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::pcmpgt,                            \
-          &InstImpl<TraitsType>::Assembler::pcmpgt};                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pmull::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::pmull,                             \
-          &InstImpl<TraitsType>::Assembler::pmull};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pmulhw::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::pmulhw,                            \
-          &InstImpl<TraitsType>::Assembler::pmulhw};                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pmulhuw::Base::Emitter = {                  \
-          &InstImpl<TraitsType>::Assembler::pmulhuw,                           \
-          &InstImpl<TraitsType>::Assembler::pmulhuw};                          \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pmaddwd::Base::Emitter = {                  \
-          &InstImpl<TraitsType>::Assembler::pmaddwd,                           \
-          &InstImpl<TraitsType>::Assembler::pmaddwd};                          \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pmuludq::Base::Emitter = {                  \
-          &InstImpl<TraitsType>::Assembler::pmuludq,                           \
-          &InstImpl<TraitsType>::Assembler::pmuludq};                          \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Por::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::por,                               \
-          &InstImpl<TraitsType>::Assembler::por};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Psub::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::psub,                              \
-          &InstImpl<TraitsType>::Assembler::psub};                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Psubs::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::psubs,                             \
-          &InstImpl<TraitsType>::Assembler::psubs};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Psubus::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::psubus,                            \
-          &InstImpl<TraitsType>::Assembler::psubus};                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pxor::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::pxor,                              \
-          &InstImpl<TraitsType>::Assembler::pxor};                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Subss::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::subss,                             \
-          &InstImpl<TraitsType>::Assembler::subss};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Subps::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::subps,                             \
-          &InstImpl<TraitsType>::Assembler::subps};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Andnps::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::andnps,                            \
-          &InstImpl<TraitsType>::Assembler::andnps};                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Andps::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::andps,                             \
-          &InstImpl<TraitsType>::Assembler::andps};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Maxss::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::maxss,                             \
-          &InstImpl<TraitsType>::Assembler::maxss};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Minss::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::minss,                             \
-          &InstImpl<TraitsType>::Assembler::minss};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Maxps::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::maxps,                             \
-          &InstImpl<TraitsType>::Assembler::maxps};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Minps::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::minps,                             \
-          &InstImpl<TraitsType>::Assembler::minps};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Orps::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::orps,                              \
-          &InstImpl<TraitsType>::Assembler::orps};                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Xorps::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::xorps,                             \
-          &InstImpl<TraitsType>::Assembler::xorps};                            \
-                                                                               \
-  /* Binary XMM Shift ops */                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterShiftOp                 \
-      InstImpl<TraitsType>::InstX86Psll::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::psll,                              \
-          &InstImpl<TraitsType>::Assembler::psll,                              \
-          &InstImpl<TraitsType>::Assembler::psll};                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterShiftOp                 \
-      InstImpl<TraitsType>::InstX86Psra::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::psra,                              \
-          &InstImpl<TraitsType>::Assembler::psra,                              \
-          &InstImpl<TraitsType>::Assembler::psra};                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterShiftOp                 \
-      InstImpl<TraitsType>::InstX86Psrl::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::psrl,                              \
-          &InstImpl<TraitsType>::Assembler::psrl,                              \
-          &InstImpl<TraitsType>::Assembler::psrl};                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pshufb::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::pshufb,                            \
-          &InstImpl<TraitsType>::Assembler::pshufb};                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Punpckl::Base::Emitter = {                  \
-          &InstImpl<TraitsType>::Assembler::punpckl,                           \
-          &InstImpl<TraitsType>::Assembler::punpckl};                          \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Punpckh::Base::Emitter = {                  \
-          &InstImpl<TraitsType>::Assembler::punpckh,                           \
-          &InstImpl<TraitsType>::Assembler::punpckh};                          \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Packss::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::packss,                            \
-          &InstImpl<TraitsType>::Assembler::packss};                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Packus::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::packus,                            \
-          &InstImpl<TraitsType>::Assembler::packus};                           \
-  }                                                                            \
-  }
-
-} // namespace X8632
-} // end of namespace Ice
-
-#include "IceInstX8632BaseImpl.h"
-
-#endif // SUBZERO_SRC_ICEINSTX8632BASE_H
diff --git a/third_party/subzero/src/IceInstX8632BaseImpl.h b/third_party/subzero/src/IceInstX8632BaseImpl.h
deleted file mode 100644
index c14354f..0000000
--- a/third_party/subzero/src/IceInstX8632BaseImpl.h
+++ /dev/null
@@ -1,3111 +0,0 @@
-//===- subzero/src/IceInstX8632BaseImpl.h - Generic X86 instructions -*- C++
-//-*=//
-//
-//                        The Subzero Code Generator
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// \brief Implements the InstX86Base class and its descendants.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef SUBZERO_SRC_ICEINSTX8632BASEIMPL_H
-#define SUBZERO_SRC_ICEINSTX8632BASEIMPL_H
-
-#include "IceInstX8632Base.h"
-
-#include "IceAssemblerX8632.h"
-#include "IceCfg.h"
-#include "IceCfgNode.h"
-#include "IceDefs.h"
-#include "IceInst.h"
-#include "IceOperand.h"
-#include "IceTargetLowering.h"
-#include "IceTargetLoweringX8632Base.h"
-
-namespace Ice {
-namespace X8632 {
-
-template <typename TraitsType>
-const char *InstImpl<TraitsType>::InstX86Base::getWidthString(Type Ty) {
-  return Traits::TypeAttributes[Ty].WidthString;
-}
-
-template <typename TraitsType>
-const char *InstImpl<TraitsType>::InstX86Base::getFldString(Type Ty) {
-  return Traits::TypeAttributes[Ty].FldString;
-}
-
-template <typename TraitsType>
-typename InstImpl<TraitsType>::Cond::BrCond
-InstImpl<TraitsType>::InstX86Base::getOppositeCondition(BrCond Cond) {
-  return Traits::InstBrAttributes[Cond].Opposite;
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86FakeRMW::InstX86FakeRMW(Cfg *Func, Operand *Data,
-                                                     Operand *Addr,
-                                                     InstArithmetic::OpKind Op,
-                                                     Variable *Beacon)
-    : InstX86Base(Func, InstX86Base::FakeRMW, 3, nullptr), Op(Op) {
-  this->addSource(Data);
-  this->addSource(Addr);
-  this->addSource(Beacon);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Mul::InstX86Mul(Cfg *Func, Variable *Dest,
-                                             Variable *Source1,
-                                             Operand *Source2)
-    : InstX86Base(Func, InstX86Base::Mul, 2, Dest) {
-  this->addSource(Source1);
-  this->addSource(Source2);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Shld::InstX86Shld(Cfg *Func, Variable *Dest,
-                                               Variable *Source1,
-                                               Operand *Source2)
-    : InstX86Base(Func, InstX86Base::Shld, 3, Dest) {
-  this->addSource(Dest);
-  this->addSource(Source1);
-  this->addSource(Source2);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Shrd::InstX86Shrd(Cfg *Func, Variable *Dest,
-                                               Variable *Source1,
-                                               Operand *Source2)
-    : InstX86Base(Func, InstX86Base::Shrd, 3, Dest) {
-  this->addSource(Dest);
-  this->addSource(Source1);
-  this->addSource(Source2);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Label::InstX86Label(Cfg *Func,
-                                                 TargetLowering *Target)
-    : InstX86Base(Func, InstX86Base::Label, 0, nullptr),
-      LabelNumber(Target->makeNextLabelNumber()) {
-  if (BuildDefs::dump()) {
-    Name = GlobalString::createWithString(
-        Func->getContext(), ".L" + Func->getFunctionName() + "$local$__" +
-                                std::to_string(LabelNumber));
-  } else {
-    Name = GlobalString::createWithoutString(Func->getContext());
-  }
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Br::InstX86Br(Cfg *Func, const CfgNode *TargetTrue,
-                                           const CfgNode *TargetFalse,
-                                           const InstX86Label *Label,
-                                           BrCond Condition, Mode Kind)
-    : InstX86Base(Func, InstX86Base::Br, 0, nullptr), Condition(Condition),
-      TargetTrue(TargetTrue), TargetFalse(TargetFalse), Label(Label),
-      Kind(Kind) {}
-
-template <typename TraitsType>
-bool InstImpl<TraitsType>::InstX86Br::optimizeBranch(const CfgNode *NextNode) {
-  // If there is no next block, then there can be no fallthrough to optimize.
-  if (NextNode == nullptr)
-    return false;
-  // Intra-block conditional branches can't be optimized.
-  if (Label)
-    return false;
-  // If there is no fallthrough node, such as a non-default case label for a
-  // switch instruction, then there is no opportunity to optimize.
-  if (getTargetFalse() == nullptr)
-    return false;
-
-  // Unconditional branch to the next node can be removed.
-  if (Condition == Cond::Br_None && getTargetFalse() == NextNode) {
-    assert(getTargetTrue() == nullptr);
-    this->setDeleted();
-    return true;
-  }
-  // If the fallthrough is to the next node, set fallthrough to nullptr to
-  // indicate.
-  if (getTargetFalse() == NextNode) {
-    TargetFalse = nullptr;
-    return true;
-  }
-  // If TargetTrue is the next node, and TargetFalse is not nullptr (which was
-  // already tested above), then invert the branch condition, swap the targets,
-  // and set new fallthrough to nullptr.
-  if (getTargetTrue() == NextNode) {
-    assert(Condition != Cond::Br_None);
-    Condition = this->getOppositeCondition(Condition);
-    TargetTrue = getTargetFalse();
-    TargetFalse = nullptr;
-    return true;
-  }
-  return false;
-}
-
-template <typename TraitsType>
-bool InstImpl<TraitsType>::InstX86Br::repointEdges(CfgNode *OldNode,
-                                                   CfgNode *NewNode) {
-  bool Found = false;
-  if (TargetFalse == OldNode) {
-    TargetFalse = NewNode;
-    Found = true;
-  }
-  if (TargetTrue == OldNode) {
-    TargetTrue = NewNode;
-    Found = true;
-  }
-  return Found;
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Jmp::InstX86Jmp(Cfg *Func, Operand *Target)
-    : InstX86Base(Func, InstX86Base::Jmp, 1, nullptr) {
-  this->addSource(Target);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Call::InstX86Call(Cfg *Func, Variable *Dest,
-                                               Operand *CallTarget)
-    : InstX86Base(Func, InstX86Base::Call, 1, Dest) {
-  this->HasSideEffects = true;
-  this->addSource(CallTarget);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Movmsk::InstX86Movmsk(Cfg *Func, Variable *Dest,
-                                                   Operand *Source)
-    : InstX86Base(Func, InstX86Base::Movmsk, 1, Dest) {
-  this->addSource(Source);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Cmov::InstX86Cmov(Cfg *Func, Variable *Dest,
-                                               Operand *Source,
-                                               BrCond Condition)
-    : InstX86Base(Func, InstX86Base::Cmov, 2, Dest), Condition(Condition) {
-  // The final result is either the original Dest, or Source, so mark both as
-  // sources.
-  this->addSource(Dest);
-  this->addSource(Source);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Cmpps::InstX86Cmpps(Cfg *Func, Variable *Dest,
-                                                 Operand *Source,
-                                                 CmppsCond Condition)
-    : InstX86Base(Func, InstX86Base::Cmpps, 2, Dest), Condition(Condition) {
-  this->addSource(Dest);
-  this->addSource(Source);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Cmpxchg::InstX86Cmpxchg(Cfg *Func,
-                                                     Operand *DestOrAddr,
-                                                     Variable *Eax,
-                                                     Variable *Desired,
-                                                     bool Locked)
-    : InstImpl<TraitsType>::InstX86BaseLockable(
-          Func, InstX86Base::Cmpxchg, 3, llvm::dyn_cast<Variable>(DestOrAddr),
-          Locked) {
-  constexpr uint16_t Encoded_rAX = 0;
-  (void)Encoded_rAX;
-  assert(Traits::getEncodedGPR(Eax->getRegNum()) == Encoded_rAX);
-  this->addSource(DestOrAddr);
-  this->addSource(Eax);
-  this->addSource(Desired);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Cmpxchg8b::InstX86Cmpxchg8b(
-    Cfg *Func, X86OperandMem *Addr, Variable *Edx, Variable *Eax, Variable *Ecx,
-    Variable *Ebx, bool Locked)
-    : InstImpl<TraitsType>::InstX86BaseLockable(Func, InstX86Base::Cmpxchg, 5,
-                                                nullptr, Locked) {
-  assert(Edx->getRegNum() == RegisterSet::Reg_edx);
-  assert(Eax->getRegNum() == RegisterSet::Reg_eax);
-  assert(Ecx->getRegNum() == RegisterSet::Reg_ecx);
-  assert(Ebx->getRegNum() == RegisterSet::Reg_ebx);
-  this->addSource(Addr);
-  this->addSource(Edx);
-  this->addSource(Eax);
-  this->addSource(Ecx);
-  this->addSource(Ebx);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Cvt::InstX86Cvt(Cfg *Func, Variable *Dest,
-                                             Operand *Source,
-                                             CvtVariant Variant)
-    : InstX86Base(Func, InstX86Base::Cvt, 1, Dest), Variant(Variant) {
-  this->addSource(Source);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Icmp::InstX86Icmp(Cfg *Func, Operand *Src0,
-                                               Operand *Src1)
-    : InstX86Base(Func, InstX86Base::Icmp, 2, nullptr) {
-  this->addSource(Src0);
-  this->addSource(Src1);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Ucomiss::InstX86Ucomiss(Cfg *Func, Operand *Src0,
-                                                     Operand *Src1)
-    : InstX86Base(Func, InstX86Base::Ucomiss, 2, nullptr) {
-  this->addSource(Src0);
-  this->addSource(Src1);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86UD2::InstX86UD2(Cfg *Func)
-    : InstX86Base(Func, InstX86Base::UD2, 0, nullptr) {}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Int3::InstX86Int3(Cfg *Func)
-    : InstX86Base(Func, InstX86Base::Int3, 0, nullptr) {}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Test::InstX86Test(Cfg *Func, Operand *Src1,
-                                               Operand *Src2)
-    : InstX86Base(Func, InstX86Base::Test, 2, nullptr) {
-  this->addSource(Src1);
-  this->addSource(Src2);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Mfence::InstX86Mfence(Cfg *Func)
-    : InstX86Base(Func, InstX86Base::Mfence, 0, nullptr) {
-  this->HasSideEffects = true;
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Store::InstX86Store(Cfg *Func, Operand *Value,
-                                                 X86Operand *Mem)
-    : InstX86Base(Func, InstX86Base::Store, 2, nullptr) {
-  this->addSource(Value);
-  this->addSource(Mem);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86StoreP::InstX86StoreP(Cfg *Func, Variable *Value,
-                                                   X86OperandMem *Mem)
-    : InstX86Base(Func, InstX86Base::StoreP, 2, nullptr) {
-  this->addSource(Value);
-  this->addSource(Mem);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86StoreQ::InstX86StoreQ(Cfg *Func, Operand *Value,
-                                                   X86OperandMem *Mem)
-    : InstX86Base(Func, InstX86Base::StoreQ, 2, nullptr) {
-  this->addSource(Value);
-  this->addSource(Mem);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86StoreD::InstX86StoreD(Cfg *Func, Operand *Value,
-                                                   X86OperandMem *Mem)
-    : InstX86Base(Func, InstX86Base::StoreD, 2, nullptr) {
-  this->addSource(Value);
-  this->addSource(Mem);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Nop::InstX86Nop(Cfg *Func, NopVariant Variant)
-    : InstX86Base(Func, InstX86Base::Nop, 0, nullptr), Variant(Variant) {}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Fld::InstX86Fld(Cfg *Func, Operand *Src)
-    : InstX86Base(Func, InstX86Base::Fld, 1, nullptr) {
-  this->addSource(Src);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Fstp::InstX86Fstp(Cfg *Func, Variable *Dest)
-    : InstX86Base(Func, InstX86Base::Fstp, 0, Dest) {}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Pop::InstX86Pop(Cfg *Func, Variable *Dest)
-    : InstX86Base(Func, InstX86Base::Pop, 0, Dest) {
-  // A pop instruction affects the stack pointer and so it should not be
-  // allowed to be automatically dead-code eliminated. (The corresponding push
-  // instruction doesn't need this treatment because it has no dest variable
-  // and therefore won't be dead-code eliminated.) This is needed for
-  // late-stage liveness analysis (e.g. asm-verbose mode).
-  this->HasSideEffects = true;
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Push::InstX86Push(Cfg *Func, Operand *Source)
-    : InstX86Base(Func, InstX86Base::Push, 1, nullptr) {
-  this->addSource(Source);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Push::InstX86Push(Cfg *Func, InstX86Label *L)
-    : InstX86Base(Func, InstX86Base::Push, 0, nullptr), Label(L) {}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Ret::InstX86Ret(Cfg *Func, Variable *Source)
-    : InstX86Base(Func, InstX86Base::Ret, Source ? 1 : 0, nullptr) {
-  if (Source)
-    this->addSource(Source);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Setcc::InstX86Setcc(Cfg *Func, Variable *Dest,
-                                                 BrCond Cond)
-    : InstX86Base(Func, InstX86Base::Setcc, 0, Dest), Condition(Cond) {}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Xadd::InstX86Xadd(Cfg *Func, Operand *Dest,
-                                               Variable *Source, bool Locked)
-    : InstImpl<TraitsType>::InstX86BaseLockable(
-          Func, InstX86Base::Xadd, 2, llvm::dyn_cast<Variable>(Dest), Locked) {
-  this->addSource(Dest);
-  this->addSource(Source);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Xchg::InstX86Xchg(Cfg *Func, Operand *Dest,
-                                               Variable *Source)
-    : InstX86Base(Func, InstX86Base::Xchg, 2, llvm::dyn_cast<Variable>(Dest)) {
-  this->addSource(Dest);
-  this->addSource(Source);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86IacaStart::InstX86IacaStart(Cfg *Func)
-    : InstX86Base(Func, InstX86Base::IacaStart, 0, nullptr) {
-  assert(getFlags().getAllowIacaMarks());
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86IacaEnd::InstX86IacaEnd(Cfg *Func)
-    : InstX86Base(Func, InstX86Base::IacaEnd, 0, nullptr) {
-  assert(getFlags().getAllowIacaMarks());
-}
-
-// ======================== Dump routines ======================== //
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Base::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "[" << Traits::TargetName << "] ";
-  Inst::dump(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86FakeRMW::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Type Ty = getData()->getType();
-  Str << "rmw " << InstArithmetic::getOpName(getOp()) << " " << Ty << " *";
-  getAddr()->dump(Func);
-  Str << ", ";
-  getData()->dump(Func);
-  Str << ", beacon=";
-  getBeacon()->dump(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Label::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Str << getLabelName() << ":";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Label::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Asm->bindLocalLabel(LabelNumber);
-  if (OffsetReloc != nullptr) {
-    Asm->bindRelocOffset(OffsetReloc);
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Label::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << getLabelName() << ":";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Br::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Str << "\t";
-
-  if (Condition == Cond::Br_None) {
-    Str << "jmp";
-  } else {
-    Str << Traits::InstBrAttributes[Condition].EmitString;
-  }
-
-  if (Label) {
-    Str << "\t" << Label->getLabelName();
-  } else {
-    if (Condition == Cond::Br_None) {
-      Str << "\t" << getTargetFalse()->getAsmName();
-    } else {
-      Str << "\t" << getTargetTrue()->getAsmName();
-      if (getTargetFalse()) {
-        Str << "\n\t"
-               "jmp\t"
-            << getTargetFalse()->getAsmName();
-      }
-    }
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Br::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  if (Label) {
-    auto *L = Asm->getOrCreateLocalLabel(Label->getLabelNumber());
-    if (Condition == Cond::Br_None) {
-      Asm->jmp(L, isNear());
-    } else {
-      Asm->j(Condition, L, isNear());
-    }
-  } else {
-    if (Condition == Cond::Br_None) {
-      auto *L = Asm->getOrCreateCfgNodeLabel(getTargetFalse()->getIndex());
-      assert(!getTargetTrue());
-      Asm->jmp(L, isNear());
-    } else {
-      auto *L = Asm->getOrCreateCfgNodeLabel(getTargetTrue()->getIndex());
-      Asm->j(Condition, L, isNear());
-      if (getTargetFalse()) {
-        auto *L2 = Asm->getOrCreateCfgNodeLabel(getTargetFalse()->getIndex());
-        Asm->jmp(L2, isNear());
-      }
-    }
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Br::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "br ";
-
-  if (Condition == Cond::Br_None) {
-    if (Label) {
-      Str << "label %" << Label->getLabelName();
-    } else {
-      Str << "label %" << getTargetFalse()->getName();
-    }
-    return;
-  }
-
-  Str << Traits::InstBrAttributes[Condition].DisplayString;
-  if (Label) {
-    Str << ", label %" << Label->getLabelName();
-  } else {
-    Str << ", label %" << getTargetTrue()->getName();
-    if (getTargetFalse()) {
-      Str << ", label %" << getTargetFalse()->getName();
-    }
-  }
-
-  Str << " // (" << (isNear() ? "near" : "far") << " jump)";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Jmp::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  const Operand *Src = this->getSrc(0);
-  if (Traits::Is64Bit) {
-    if (const auto *CR = llvm::dyn_cast<ConstantRelocatable>(Src)) {
-      Str << "\t"
-             "jmp"
-             "\t"
-          << CR->getName();
-      return;
-    }
-  }
-  Str << "\t"
-         "jmp"
-         "\t*";
-  getJmpTarget()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Jmp::emitIAS(const Cfg *Func) const {
-  // Note: Adapted (mostly copied) from
-  // InstImpl<TraitsType>::InstX86Call::emitIAS().
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Operand *Target = getJmpTarget();
-  if (const auto *Var = llvm::dyn_cast<Variable>(Target)) {
-    if (Var->hasReg()) {
-      Asm->jmp(Traits::getEncodedGPR(Var->getRegNum()));
-    } else {
-      // The jmp instruction with a memory operand should be possible to
-      // encode, but it isn't a valid sandboxed instruction, and there
-      // shouldn't be a register allocation issue to jump through a scratch
-      // register, so we don't really need to bother implementing it.
-      llvm::report_fatal_error("Assembler can't jmp to memory operand");
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Target)) {
-    (void)Mem;
-    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-    llvm::report_fatal_error("Assembler can't jmp to memory operand");
-  } else if (const auto *CR = llvm::dyn_cast<ConstantRelocatable>(Target)) {
-    Asm->jmp(CR);
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Target)) {
-    // NaCl trampoline calls refer to an address within the sandbox directly.
-    // This is usually only needed for non-IRT builds and otherwise not very
-    // portable or stable. Usually this is only done for "calls" and not jumps.
-    Asm->jmp(AssemblerImmediate(Imm->getValue()));
-  } else {
-    llvm::report_fatal_error("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Jmp::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "jmp ";
-  getJmpTarget()->dump(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Call::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  Str << "\t"
-         "call\t";
-  Operand *CallTarget = getCallTarget();
-  auto *Target = InstX86Base::getTarget(Func);
-  if (const auto *CI = llvm::dyn_cast<ConstantInteger32>(CallTarget)) {
-    // Emit without a leading '$'.
-    Str << CI->getValue();
-  } else if (const auto DirectCallTarget =
-                 llvm::dyn_cast<ConstantRelocatable>(CallTarget)) {
-    DirectCallTarget->emitWithoutPrefix(Target);
-  } else {
-    Str << "*";
-    CallTarget->emit(Func);
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Call::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Operand *CallTarget = getCallTarget();
-  auto *Target = InstX86Base::getTarget(Func);
-  if (const auto *Var = llvm::dyn_cast<Variable>(CallTarget)) {
-    if (Var->hasReg()) {
-      Asm->call(Traits::getEncodedGPR(Var->getRegNum()));
-    } else {
-      Asm->call(Target->stackVarToAsmOperand(Var));
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(CallTarget)) {
-    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-    Asm->call(Mem->toAsmAddress(Asm, Target));
-  } else if (const auto *CR = llvm::dyn_cast<ConstantRelocatable>(CallTarget)) {
-    Asm->call(CR);
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(CallTarget)) {
-    Asm->call(AssemblerImmediate(Imm->getValue()));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Call::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  if (this->getDest()) {
-    this->dumpDest(Func);
-    Str << " = ";
-  }
-  Str << "call ";
-  getCallTarget()->dump(Func);
-}
-
-// The this->Opcode parameter needs to be char* and not std::string because of
-// template issues.
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Base::emitTwoAddress(
-    const Cfg *Func, const char *Opcode, const char *Suffix) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(getSrcSize() == 2);
-  Operand *Dest = getDest();
-  if (Dest == nullptr)
-    Dest = getSrc(0);
-  assert(Dest == getSrc(0));
-  Operand *Src1 = getSrc(1);
-  Str << "\t" << Opcode << Suffix
-      << InstX86Base::getWidthString(Dest->getType()) << "\t";
-  Src1->emit(Func);
-  Str << ", ";
-  Dest->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASOpTyGPR(const Cfg *Func, Type Ty,
-                                          const Operand *Op,
-                                          const GPREmitterOneOp &Emitter) {
-  auto *Target = InstX86Base::getTarget(Func);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  if (const auto *Var = llvm::dyn_cast<Variable>(Op)) {
-    if (Var->hasReg()) {
-      // We cheat a little and use GPRRegister even for byte operations.
-      GPRRegister VarReg = Traits::getEncodedGPR(Var->getRegNum());
-      (Asm->*(Emitter.Reg))(Ty, VarReg);
-    } else {
-      Address StackAddr(Target->stackVarToAsmOperand(Var));
-      (Asm->*(Emitter.Addr))(Ty, StackAddr);
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Op)) {
-    Mem->emitSegmentOverride(Asm);
-    (Asm->*(Emitter.Addr))(Ty, Mem->toAsmAddress(Asm, Target));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-template <bool VarCanBeByte, bool SrcCanBeByte>
-void InstImpl<TraitsType>::emitIASRegOpTyGPR(const Cfg *Func, bool IsLea,
-                                             Type Ty, const Variable *Var,
-                                             const Operand *Src,
-                                             const GPREmitterRegOp &Emitter) {
-  auto *Target = InstX86Base::getTarget(Func);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(Var->hasReg());
-  // We cheat a little and use GPRRegister even for byte operations.
-  GPRRegister VarReg = VarCanBeByte ? Traits::getEncodedGPR(Var->getRegNum())
-                                    : Traits::getEncodedGPR(Var->getRegNum());
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-    if (SrcVar->hasReg()) {
-      GPRRegister SrcReg = Traits::getEncodedGPR(SrcVar->getRegNum());
-      (Asm->*(Emitter.GPRGPR))(Ty, VarReg, SrcReg);
-    } else {
-      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
-      (Asm->*(Emitter.GPRAddr))(Ty, VarReg, SrcStackAddr);
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
-    Mem->emitSegmentOverride(Asm);
-    (Asm->*(Emitter.GPRAddr))(Ty, VarReg,
-                              Mem->toAsmAddress(Asm, Target, IsLea));
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
-    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger64>(Src)) {
-    assert(Traits::Is64Bit);
-    assert(Utils::IsInt(32, Imm->getValue()));
-    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
-  } else if (const auto *Reloc = llvm::dyn_cast<ConstantRelocatable>(Src)) {
-    const auto FixupKind = (Reloc->getName().hasStdString() &&
-                            Reloc->getName().toString() == GlobalOffsetTable)
-                               ? Traits::FK_GotPC
-                               : Traits::TargetLowering::getAbsFixup();
-    AssemblerFixup *Fixup = Asm->createFixup(FixupKind, Reloc);
-    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Fixup));
-  } else if (const auto *Split = llvm::dyn_cast<VariableSplit>(Src)) {
-    (Asm->*(Emitter.GPRAddr))(Ty, VarReg, Split->toAsmAddress(Func));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
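-
-// Usage sketch: two-operand GPR instructions provide reg/reg, reg/addr and
-// reg/imm entry points, e.g. for cmp (see InstX86Icmp::emitIAS below):
-//   static const GPREmitterRegOp RegEmitter = {&Assembler::cmp,
-//                                              &Assembler::cmp,
-//                                              &Assembler::cmp};
-//   constexpr bool NotLea = false;
-//   emitIASRegOpTyGPR(Func, NotLea, Ty, SrcVar0, Src1, RegEmitter);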
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASAddrOpTyGPR(const Cfg *Func, Type Ty,
-                                              const Address &Addr,
-                                              const Operand *Src,
-                                              const GPREmitterAddrOp &Emitter) {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  // Src can only be Reg or AssemblerImmediate.
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-    assert(SrcVar->hasReg());
-    GPRRegister SrcReg = Traits::getEncodedGPR(SrcVar->getRegNum());
-    (Asm->*(Emitter.AddrGPR))(Ty, Addr, SrcReg);
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
-    (Asm->*(Emitter.AddrImm))(Ty, Addr, AssemblerImmediate(Imm->getValue()));
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger64>(Src)) {
-    assert(Traits::Is64Bit);
-    assert(Utils::IsInt(32, Imm->getValue()));
-    (Asm->*(Emitter.AddrImm))(Ty, Addr, AssemblerImmediate(Imm->getValue()));
-  } else if (const auto *Reloc = llvm::dyn_cast<ConstantRelocatable>(Src)) {
-    const auto FixupKind = (Reloc->getName().hasStdString() &&
-                            Reloc->getName().toString() == GlobalOffsetTable)
-                               ? Traits::FK_GotPC
-                               : Traits::TargetLowering::getAbsFixup();
-    AssemblerFixup *Fixup = Asm->createFixup(FixupKind, Reloc);
-    (Asm->*(Emitter.AddrImm))(Ty, Addr, AssemblerImmediate(Fixup));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASAsAddrOpTyGPR(
-    const Cfg *Func, Type Ty, const Operand *Op0, const Operand *Op1,
-    const GPREmitterAddrOp &Emitter) {
-  auto *Target = InstX86Base::getTarget(Func);
-  if (const auto *Op0Var = llvm::dyn_cast<Variable>(Op0)) {
-    assert(!Op0Var->hasReg());
-    Address StackAddr(Target->stackVarToAsmOperand(Op0Var));
-    emitIASAddrOpTyGPR(Func, Ty, StackAddr, Op1, Emitter);
-  } else if (const auto *Op0Mem = llvm::dyn_cast<X86OperandMem>(Op0)) {
-    Assembler *Asm = Func->getAssembler<Assembler>();
-    Op0Mem->emitSegmentOverride(Asm);
-    emitIASAddrOpTyGPR(Func, Ty, Op0Mem->toAsmAddress(Asm, Target), Op1,
-                       Emitter);
-  } else if (const auto *Split = llvm::dyn_cast<VariableSplit>(Op0)) {
-    emitIASAddrOpTyGPR(Func, Ty, Split->toAsmAddress(Func), Op1, Emitter);
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
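-
-// Usage sketch: instructions whose first operand may itself live in memory
-// (e.g. test and cmp) resolve it to an Address here and forward to
-// emitIASAddrOpTyGPR above:
-//   static const GPREmitterAddrOp AddrEmitter = {&Assembler::test,
-//                                                &Assembler::test};
-//   emitIASAsAddrOpTyGPR(Func, Ty, Src0, Src1, AddrEmitter);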
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASGPRShift(const Cfg *Func, Type Ty,
-                                           const Variable *Var,
-                                           const Operand *Src,
-                                           const GPREmitterShiftOp &Emitter) {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  // Technically, the Dest Var can be mem as well, but we only use Reg. We can
-  // extend this to check Dest if we decide to use that form.
-  assert(Var->hasReg());
-  // We cheat a little and use GPRRegister even for byte operations.
-  GPRRegister VarReg = Traits::getEncodedGPR(Var->getRegNum());
-  // Src must be reg == ECX or an Imm8. This is asserted by the assembler.
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-    assert(SrcVar->hasReg());
-    GPRRegister SrcReg = Traits::getEncodedGPR(SrcVar->getRegNum());
-    (Asm->*(Emitter.GPRGPR))(Ty, VarReg, SrcReg);
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
-    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger64>(Src)) {
-    assert(Traits::Is64Bit);
-    assert(Utils::IsInt(32, Imm->getValue()));
-    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
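-
-// Usage sketch (illustrative; the concrete tables live with the shift
-// instructions): a shift by CL or by an 8-bit immediate would be wired as
-//   static const GPREmitterShiftOp Emitter = {&Assembler::shl, &Assembler::shl};
-//   emitIASGPRShift(Func, Ty, DestVar, CountOperand, Emitter);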
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASGPRShiftDouble(
-    const Cfg *Func, const Variable *Dest, const Operand *Src1Op,
-    const Operand *Src2Op, const GPREmitterShiftD &Emitter) {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  // Dest can be reg or mem, but we only use the reg variant.
-  assert(Dest->hasReg());
-  GPRRegister DestReg = Traits::getEncodedGPR(Dest->getRegNum());
-  // SrcVar1 must be reg.
-  const auto *SrcVar1 = llvm::cast<Variable>(Src1Op);
-  assert(SrcVar1->hasReg());
-  GPRRegister SrcReg = Traits::getEncodedGPR(SrcVar1->getRegNum());
-  Type Ty = SrcVar1->getType();
-  // Src2 can be the implicit CL register or an immediate.
-  if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src2Op)) {
-    (Asm->*(Emitter.GPRGPRImm))(Ty, DestReg, SrcReg,
-                                AssemblerImmediate(Imm->getValue()));
-  } else {
-    assert(llvm::cast<Variable>(Src2Op)->getRegNum() == RegisterSet::Reg_cl);
-    (Asm->*(Emitter.GPRGPR))(Ty, DestReg, SrcReg);
-  }
-}
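-
-// Usage sketch: the double-precision shifts dispatch through this helper,
-// e.g. for shld (see InstX86Shld::emitIAS below):
-//   static const GPREmitterShiftD Emitter = {&Assembler::shld, &Assembler::shld};
-//   emitIASGPRShiftDouble(Func, Dest, Src1, Src2, Emitter);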
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASXmmShift(const Cfg *Func, Type Ty,
-                                           const Variable *Var,
-                                           const Operand *Src,
-                                           const XmmEmitterShiftOp &Emitter) {
-  auto *Target = InstX86Base::getTarget(Func);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(Var->hasReg());
-  XmmRegister VarReg = Traits::getEncodedXmm(Var->getRegNum());
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-    if (SrcVar->hasReg()) {
-      XmmRegister SrcReg = Traits::getEncodedXmm(SrcVar->getRegNum());
-      (Asm->*(Emitter.XmmXmm))(Ty, VarReg, SrcReg);
-    } else {
-      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
-      (Asm->*(Emitter.XmmAddr))(Ty, VarReg, SrcStackAddr);
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
-    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-    (Asm->*(Emitter.XmmAddr))(Ty, VarReg, Mem->toAsmAddress(Asm, Target));
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
-    (Asm->*(Emitter.XmmImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
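-
-// Usage sketch (illustrative; the concrete tables live with the packed-shift
-// instructions): a packed shift would be dispatched as
-//   static const XmmEmitterShiftOp Emitter = {&Assembler::psll, &Assembler::psll,
-//                                             &Assembler::psll};
-//   emitIASXmmShift(Func, Ty, DestVar, CountOperand, Emitter);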
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASRegOpTyXMM(const Cfg *Func, Type Ty,
-                                             const Variable *Var,
-                                             const Operand *Src,
-                                             const XmmEmitterRegOp &Emitter) {
-  auto *Target = InstX86Base::getTarget(Func);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(Var->hasReg());
-  XmmRegister VarReg = Traits::getEncodedXmm(Var->getRegNum());
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-    if (SrcVar->hasReg()) {
-      XmmRegister SrcReg = Traits::getEncodedXmm(SrcVar->getRegNum());
-      (Asm->*(Emitter.XmmXmm))(Ty, VarReg, SrcReg);
-    } else {
-      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
-      (Asm->*(Emitter.XmmAddr))(Ty, VarReg, SrcStackAddr);
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
-    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-    (Asm->*(Emitter.XmmAddr))(Ty, VarReg, Mem->toAsmAddress(Asm, Target));
-  } else if (const auto *Imm = llvm::dyn_cast<Constant>(Src)) {
-    (Asm->*(Emitter.XmmAddr))(Ty, VarReg,
-                              Traits::Address::ofConstPool(Asm, Imm));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
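-
-// Usage sketch: two-operand XMM instructions dispatch through this helper,
-// e.g. for ucomiss (see InstX86Ucomiss::emitIAS below):
-//   static const XmmEmitterRegOp Emitter = {&Assembler::ucomiss,
-//                                           &Assembler::ucomiss};
-//   emitIASRegOpTyXMM(Func, Ty, Src0Var, Src1, Emitter);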
-
-template <typename TraitsType>
-template <typename DReg_t, typename SReg_t, DReg_t (*destEnc)(RegNumT),
-          SReg_t (*srcEnc)(RegNumT)>
-void InstImpl<TraitsType>::emitIASCastRegOp(
-    const Cfg *Func, Type DestTy, const Variable *Dest, Type SrcTy,
-    const Operand *Src, const CastEmitterRegOp<DReg_t, SReg_t> &Emitter) {
-  auto *Target = InstX86Base::getTarget(Func);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(Dest->hasReg());
-  DReg_t DestReg = destEnc(Dest->getRegNum());
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-    if (SrcVar->hasReg()) {
-      SReg_t SrcReg = srcEnc(SrcVar->getRegNum());
-      (Asm->*(Emitter.RegReg))(DestTy, DestReg, SrcTy, SrcReg);
-    } else {
-      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
-      (Asm->*(Emitter.RegAddr))(DestTy, DestReg, SrcTy, SrcStackAddr);
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
-    Mem->emitSegmentOverride(Asm);
-    (Asm->*(Emitter.RegAddr))(DestTy, DestReg, SrcTy,
-                              Mem->toAsmAddress(Asm, Target));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
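-
-// Usage sketch: casts whose source and destination live in different register
-// classes supply the two encoders as template arguments, e.g. for cvtsi2ss
-// (see InstX86Cvt::emitIAS below):
-//   static const CastEmitterRegOp<XmmRegister, GPRRegister> Emitter = {
-//       &Assembler::cvtsi2ss, &Assembler::cvtsi2ss};
-//   emitIASCastRegOp<XmmRegister, GPRRegister, Traits::getEncodedXmm,
-//                    Traits::getEncodedGPR>(Func, DestTy, Dest, SrcTy, Src, Emitter);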
-
-template <typename TraitsType>
-template <typename DReg_t, typename SReg_t, DReg_t (*destEnc)(RegNumT),
-          SReg_t (*srcEnc)(RegNumT)>
-void InstImpl<TraitsType>::emitIASThreeOpImmOps(
-    const Cfg *Func, Type DispatchTy, const Variable *Dest, const Operand *Src0,
-    const Operand *Src1, const ThreeOpImmEmitter<DReg_t, SReg_t> Emitter) {
-  auto *Target = InstX86Base::getTarget(Func);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  // This only handles Dest being a register, and Src1 being an immediate.
-  assert(Dest->hasReg());
-  DReg_t DestReg = destEnc(Dest->getRegNum());
-  AssemblerImmediate Imm(llvm::cast<ConstantInteger32>(Src1)->getValue());
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src0)) {
-    if (SrcVar->hasReg()) {
-      SReg_t SrcReg = srcEnc(SrcVar->getRegNum());
-      (Asm->*(Emitter.RegRegImm))(DispatchTy, DestReg, SrcReg, Imm);
-    } else {
-      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
-      (Asm->*(Emitter.RegAddrImm))(DispatchTy, DestReg, SrcStackAddr, Imm);
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src0)) {
-    Mem->emitSegmentOverride(Asm);
-    (Asm->*(Emitter.RegAddrImm))(DispatchTy, DestReg,
-                                 Mem->toAsmAddress(Asm, Target), Imm);
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
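-
-// Usage sketch: three-operand instructions taking an immediate (imul-by-imm,
-// insertps, round) dispatch through this helper, e.g.:
-//   static const ThreeOpImmEmitter<GPRRegister, GPRRegister> Emitter = {
-//       &Assembler::imul, &Assembler::imul};
-//   emitIASThreeOpImmOps<GPRRegister, GPRRegister, Traits::getEncodedGPR,
-//                        Traits::getEncodedGPR>(Func, Ty, Dest, Src0, Src1, Emitter);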
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASMovlikeXMM(const Cfg *Func,
-                                             const Variable *Dest,
-                                             const Operand *Src,
-                                             const XmmEmitterMovOps Emitter) {
-  auto *Target = InstX86Base::getTarget(Func);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  if (Dest->hasReg()) {
-    XmmRegister DestReg = Traits::getEncodedXmm(Dest->getRegNum());
-    if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-      if (SrcVar->hasReg()) {
-        (Asm->*(Emitter.XmmXmm))(DestReg,
-                                 Traits::getEncodedXmm(SrcVar->getRegNum()));
-      } else {
-        Address StackAddr(Target->stackVarToAsmOperand(SrcVar));
-        (Asm->*(Emitter.XmmAddr))(DestReg, StackAddr);
-      }
-    } else if (const auto *SrcMem = llvm::dyn_cast<X86OperandMem>(Src)) {
-      assert(SrcMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-      (Asm->*(Emitter.XmmAddr))(DestReg, SrcMem->toAsmAddress(Asm, Target));
-    } else {
-      llvm_unreachable("Unexpected operand type");
-    }
-  } else {
-    Address StackAddr(Target->stackVarToAsmOperand(Dest));
-    // Src must be a register in this case.
-    const auto *SrcVar = llvm::cast<Variable>(Src);
-    assert(SrcVar->hasReg());
-    (Asm->*(Emitter.AddrXmm))(StackAddr,
-                              Traits::getEncodedXmm(SrcVar->getRegNum()));
-  }
-}
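-
-// Usage sketch (illustrative; the concrete tables live with the mov-like
-// instructions): a movq-style move would be dispatched as
-//   static const XmmEmitterMovOps Emitter = {&Assembler::movq, &Assembler::movq,
-//                                            &Assembler::movq};
-//   emitIASMovlikeXMM(Func, Dest, Src, Emitter);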
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movmsk::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  this->dumpDest(Func);
-  Str << " = movmsk." << this->getSrc(0)->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movmsk::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  Type SrcTy = this->getSrc(0)->getType();
-  assert(isVectorType(SrcTy));
-  switch (SrcTy) {
-  case IceType_v16i8:
-    Str << "\t"
-           "pmovmskb"
-           "\t";
-    break;
-  case IceType_v4i32:
-  case IceType_v4f32:
-    Str << "\t"
-           "movmskps"
-           "\t";
-    break;
-  default:
-    llvm_unreachable("Unexpected operand type");
-  }
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movmsk::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 1);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  const Variable *Dest = this->getDest();
-  const Variable *Src = llvm::cast<Variable>(this->getSrc(0));
-  const Type DestTy = Dest->getType();
-  (void)DestTy;
-  const Type SrcTy = Src->getType();
-  assert(isVectorType(SrcTy));
-  assert(isScalarIntegerType(DestTy));
-  if (Traits::Is64Bit) {
-    assert(DestTy == IceType_i32 || DestTy == IceType_i64);
-  } else {
-    assert(typeWidthInBytes(DestTy) <= 4);
-  }
-  XmmRegister SrcReg = Traits::getEncodedXmm(Src->getRegNum());
-  GPRRegister DestReg = Traits::getEncodedGPR(Dest->getRegNum());
-  Asm->movmsk(SrcTy, DestReg, SrcReg);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Sqrt::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  Type Ty = this->getSrc(0)->getType();
-  assert(isScalarFloatingType(Ty));
-  Str << "\t"
-         "sqrt"
-      << Traits::TypeAttributes[Ty].SpSdString << "\t";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Div::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 3);
-  Operand *Src1 = this->getSrc(1);
-  Str << "\t" << this->Opcode << this->getWidthString(Src1->getType()) << "\t";
-  Src1->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Div::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 3);
-  const Operand *Src = this->getSrc(1);
-  Type Ty = Src->getType();
-  static GPREmitterOneOp Emitter = {&Assembler::div, &Assembler::div};
-  emitIASOpTyGPR(Func, Ty, Src, Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Idiv::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 3);
-  Operand *Src1 = this->getSrc(1);
-  Str << "\t" << this->Opcode << this->getWidthString(Src1->getType()) << "\t";
-  Src1->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Idiv::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 3);
-  const Operand *Src = this->getSrc(1);
-  Type Ty = Src->getType();
-  static const GPREmitterOneOp Emitter = {&Assembler::idiv, &Assembler::idiv};
-  emitIASOpTyGPR(Func, Ty, Src, Emitter);
-}
-
-// pblendvb and blendvps take xmm0 as a final implicit argument.
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitVariableBlendInst(const char *Opcode,
-                                                 const Inst *Instr,
-                                                 const Cfg *Func) {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(Instr->getSrcSize() == 3);
-  assert(llvm::cast<Variable>(Instr->getSrc(2))->getRegNum() ==
-         RegisterSet::Reg_xmm0);
-  Str << "\t" << Opcode << "\t";
-  Instr->getSrc(1)->emit(Func);
-  Str << ", ";
-  Instr->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASVariableBlendInst(
-    const Inst *Instr, const Cfg *Func, const XmmEmitterRegOp &Emitter) {
-  assert(Instr->getSrcSize() == 3);
-  assert(llvm::cast<Variable>(Instr->getSrc(2))->getRegNum() ==
-         RegisterSet::Reg_xmm0);
-  const Variable *Dest = Instr->getDest();
-  const Operand *Src = Instr->getSrc(1);
-  emitIASRegOpTyXMM(Func, Dest->getType(), Dest, Src, Emitter);
-}
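-
-// Note: both users of these helpers (blendvps and pblendvb below) pass the
-// same assembler member for the reg and addr forms; the implicit xmm0
-// argument is only checked by the asserts above, never encoded explicitly.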
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Blendvps::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  emitVariableBlendInst(this->Opcode, this, Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Blendvps::emitIAS(const Cfg *Func) const {
-  static const XmmEmitterRegOp Emitter = {&Assembler::blendvps,
-                                          &Assembler::blendvps};
-  emitIASVariableBlendInst(this, Func, Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pblendvb::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  emitVariableBlendInst(this->Opcode, this, Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pblendvb::emitIAS(const Cfg *Func) const {
-  static const XmmEmitterRegOp Emitter = {&Assembler::pblendvb,
-                                          &Assembler::pblendvb};
-  emitIASVariableBlendInst(this, Func, Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Imul::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  Variable *Dest = this->getDest();
-  if (isByteSizedArithType(Dest->getType())) {
-    // The 8-bit version of imul only allows the form "imul r/m8".
-    const auto *Src0Var = llvm::cast<Variable>(this->getSrc(0));
-    (void)Src0Var;
-    assert(Src0Var->getRegNum() == RegisterSet::Reg_al);
-    Str << "\t"
-           "imulb\t";
-    this->getSrc(1)->emit(Func);
-  } else if (llvm::isa<Constant>(this->getSrc(1))) {
-    Str << "\t"
-           "imul"
-        << this->getWidthString(Dest->getType()) << "\t";
-    this->getSrc(1)->emit(Func);
-    Str << ", ";
-    this->getSrc(0)->emit(Func);
-    Str << ", ";
-    Dest->emit(Func);
-  } else {
-    this->emitTwoAddress(Func, this->Opcode);
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Imul::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  const Variable *Var = this->getDest();
-  Type Ty = Var->getType();
-  const Operand *Src = this->getSrc(1);
-  if (isByteSizedArithType(Ty)) {
-    // The 8-bit version of imul only allows the form "imul r/m8".
-    const auto *Src0Var = llvm::cast<Variable>(this->getSrc(0));
-    (void)Src0Var;
-    assert(Src0Var->getRegNum() == RegisterSet::Reg_al);
-    static const GPREmitterOneOp Emitter = {&Assembler::imul, &Assembler::imul};
-    emitIASOpTyGPR(Func, Ty, this->getSrc(1), Emitter);
-  } else {
-    // The two-address version handles the non-byte-sized multiplies; the
-    // 8-bit form was handled above.
-    assert(Var == this->getSrc(0));
-    static const GPREmitterRegOp Emitter = {&Assembler::imul, &Assembler::imul,
-                                            &Assembler::imul};
-    constexpr bool NotLea = false;
-    emitIASRegOpTyGPR(Func, NotLea, Ty, Var, Src, Emitter);
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86ImulImm::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  Variable *Dest = this->getDest();
-  assert(Dest->getType() == IceType_i16 || Dest->getType() == IceType_i32);
-  assert(llvm::isa<Constant>(this->getSrc(1)));
-  Str << "\t"
-         "imul"
-      << this->getWidthString(Dest->getType()) << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  Dest->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86ImulImm::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  const Variable *Dest = this->getDest();
-  Type Ty = Dest->getType();
-  assert(llvm::isa<Constant>(this->getSrc(1)));
-  static const ThreeOpImmEmitter<GPRRegister, GPRRegister> Emitter = {
-      &Assembler::imul, &Assembler::imul};
-  emitIASThreeOpImmOps<GPRRegister, GPRRegister, Traits::getEncodedGPR,
-                       Traits::getEncodedGPR>(Func, Ty, Dest, this->getSrc(0),
-                                              this->getSrc(1), Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Insertps::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 3);
-  assert(InstX86Base::getTarget(Func)->getInstructionSet() >= Traits::SSE4_1);
-  const Variable *Dest = this->getDest();
-  assert(Dest == this->getSrc(0));
-  Type Ty = Dest->getType();
-  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
-      &Assembler::insertps, &Assembler::insertps};
-  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
-                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(1),
-                                              this->getSrc(2), Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cbwdq::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  Operand *Src0 = this->getSrc(0);
-  const auto DestReg = this->getDest()->getRegNum();
-  const auto SrcReg = llvm::cast<Variable>(Src0)->getRegNum();
-  (void)DestReg;
-  (void)SrcReg;
-  switch (Src0->getType()) {
-  default:
-    llvm_unreachable("unexpected source type!");
-    break;
-  case IceType_i8:
-    assert(SrcReg == RegisterSet::Reg_al);
-    assert(DestReg == RegisterSet::Reg_ax || DestReg == RegisterSet::Reg_ah);
-    Str << "\t"
-           "cbtw";
-    break;
-  case IceType_i16:
-    assert(SrcReg == RegisterSet::Reg_ax);
-    assert(DestReg == RegisterSet::Reg_dx);
-    Str << "\t"
-           "cwtd";
-    break;
-  case IceType_i32:
-    assert(SrcReg == RegisterSet::Reg_eax);
-    assert(DestReg == RegisterSet::Reg_edx);
-    Str << "\t"
-           "cltd";
-    break;
-  case IceType_i64:
-    assert(Traits::Is64Bit);
-    assert(SrcReg == Traits::getRaxOrDie());
-    assert(DestReg == Traits::getRdxOrDie());
-    Str << "\t"
-           "cqo";
-    break;
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cbwdq::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(this->getSrcSize() == 1);
-  Operand *Src0 = this->getSrc(0);
-  const auto DestReg = this->getDest()->getRegNum();
-  const auto SrcReg = llvm::cast<Variable>(Src0)->getRegNum();
-  (void)DestReg;
-  (void)SrcReg;
-  switch (Src0->getType()) {
-  default:
-    llvm_unreachable("unexpected source type!");
-    break;
-  case IceType_i8:
-    assert(SrcReg == RegisterSet::Reg_al);
-    assert(DestReg == RegisterSet::Reg_ax || DestReg == RegisterSet::Reg_ah);
-    Asm->cbw();
-    break;
-  case IceType_i16:
-    assert(SrcReg == RegisterSet::Reg_ax);
-    assert(DestReg == RegisterSet::Reg_dx);
-    Asm->cwd();
-    break;
-  case IceType_i32:
-    assert(SrcReg == RegisterSet::Reg_eax);
-    assert(DestReg == RegisterSet::Reg_edx);
-    Asm->cdq();
-    break;
-  case IceType_i64:
-    assert(Traits::Is64Bit);
-    assert(SrcReg == Traits::getRaxOrDie());
-    assert(DestReg == Traits::getRdxOrDie());
-    Asm->cqo();
-    break;
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Mul::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  assert(llvm::isa<Variable>(this->getSrc(0)));
-  assert(llvm::cast<Variable>(this->getSrc(0))->getRegNum() ==
-         RegisterSet::Reg_eax);
-  // TODO: Allow edx?
-  assert(this->getDest()->getRegNum() == RegisterSet::Reg_eax);
-  Str << "\t"
-         "mul"
-      << this->getWidthString(this->getDest()->getType()) << "\t";
-  this->getSrc(1)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Mul::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  assert(llvm::isa<Variable>(this->getSrc(0)));
-  assert(llvm::cast<Variable>(this->getSrc(0))->getRegNum() ==
-         RegisterSet::Reg_eax);
-  // TODO: Allow edx?
-  assert(this->getDest()->getRegNum() == RegisterSet::Reg_eax);
-  const Operand *Src = this->getSrc(1);
-  Type Ty = Src->getType();
-  static const GPREmitterOneOp Emitter = {&Assembler::mul, &Assembler::mul};
-  emitIASOpTyGPR(Func, Ty, Src, Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Mul::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  this->dumpDest(Func);
-  Str << " = mul." << this->getDest()->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Shld::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Variable *Dest = this->getDest();
-  assert(this->getSrcSize() == 3);
-  assert(Dest == this->getSrc(0));
-  Str << "\t"
-         "shld"
-      << this->getWidthString(Dest->getType()) << "\t";
-  this->getSrc(2)->emit(Func);
-  Str << ", ";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  Dest->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Shld::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 3);
-  assert(this->getDest() == this->getSrc(0));
-  const Variable *Dest = this->getDest();
-  const Operand *Src1 = this->getSrc(1);
-  const Operand *Src2 = this->getSrc(2);
-  static const GPREmitterShiftD Emitter = {&Assembler::shld, &Assembler::shld};
-  emitIASGPRShiftDouble(Func, Dest, Src1, Src2, Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Shld::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  this->dumpDest(Func);
-  Str << " = shld." << this->getDest()->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Shrd::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Variable *Dest = this->getDest();
-  assert(this->getSrcSize() == 3);
-  assert(Dest == this->getSrc(0));
-  Str << "\t"
-         "shrd"
-      << this->getWidthString(Dest->getType()) << "\t";
-  this->getSrc(2)->emit(Func);
-  Str << ", ";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  Dest->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Shrd::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 3);
-  assert(this->getDest() == this->getSrc(0));
-  const Variable *Dest = this->getDest();
-  const Operand *Src1 = this->getSrc(1);
-  const Operand *Src2 = this->getSrc(2);
-  static const GPREmitterShiftD Emitter = {&Assembler::shrd, &Assembler::shrd};
-  emitIASGPRShiftDouble(Func, Dest, Src1, Src2, Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Shrd::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  this->dumpDest(Func);
-  Str << " = shrd." << this->getDest()->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmov::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Variable *Dest = this->getDest();
-  Str << "\t";
-  assert(Condition != Cond::Br_None);
-  assert(this->getDest()->hasReg());
-  Str << "cmov" << Traits::InstBrAttributes[Condition].DisplayString
-      << this->getWidthString(Dest->getType()) << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  Dest->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmov::emitIAS(const Cfg *Func) const {
-  assert(Condition != Cond::Br_None);
-  assert(this->getDest()->hasReg());
-  assert(this->getSrcSize() == 2);
-  Operand *Src = this->getSrc(1);
-  Type SrcTy = Src->getType();
-  assert(SrcTy == IceType_i16 || SrcTy == IceType_i32 ||
-         (Traits::Is64Bit && SrcTy == IceType_i64));
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  auto *Target = InstX86Base::getTarget(Func);
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-    if (SrcVar->hasReg()) {
-      Asm->cmov(SrcTy, Condition,
-                Traits::getEncodedGPR(this->getDest()->getRegNum()),
-                Traits::getEncodedGPR(SrcVar->getRegNum()));
-    } else {
-      Asm->cmov(SrcTy, Condition,
-                Traits::getEncodedGPR(this->getDest()->getRegNum()),
-                Target->stackVarToAsmOperand(SrcVar));
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
-    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-    Asm->cmov(SrcTy, Condition,
-              Traits::getEncodedGPR(this->getDest()->getRegNum()),
-              Mem->toAsmAddress(Asm, Target));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmov::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "cmov" << Traits::InstBrAttributes[Condition].DisplayString << ".";
-  Str << this->getDest()->getType() << " ";
-  this->dumpDest(Func);
-  Str << ", ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpps::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  assert(Condition < Cond::Cmpps_Invalid);
-  Type DestTy = this->Dest->getType();
-  Str << "\t"
-         "cmp"
-      << Traits::InstCmppsAttributes[Condition].EmitString
-      << Traits::TypeAttributes[DestTy].PdPsString << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpps::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(this->getSrcSize() == 2);
-  assert(Condition < Cond::Cmpps_Invalid);
-  // Assuming there isn't any load folding for cmpps, and vector constants are
-  // not allowed in PNaCl.
-  assert(llvm::isa<Variable>(this->getSrc(1)));
-  auto *Target = InstX86Base::getTarget(Func);
-  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(1));
-  if (SrcVar->hasReg()) {
-    Asm->cmpps(this->getDest()->getType(),
-               Traits::getEncodedXmm(this->getDest()->getRegNum()),
-               Traits::getEncodedXmm(SrcVar->getRegNum()), Condition);
-  } else {
-    Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
-    Asm->cmpps(this->getDest()->getType(),
-               Traits::getEncodedXmm(this->getDest()->getRegNum()),
-               SrcStackAddr, Condition);
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpps::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  assert(Condition < Cond::Cmpps_Invalid);
-  this->dumpDest(Func);
-  Str << " = cmp" << Traits::InstCmppsAttributes[Condition].EmitString
-      << "ps"
-         "\t";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpxchg::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 3);
-  if (this->Locked) {
-    Str << "\t"
-           "lock";
-  }
-  Str << "\t"
-         "cmpxchg"
-      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
-  this->getSrc(2)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpxchg::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 3);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Type Ty = this->getSrc(0)->getType();
-  auto *Target = InstX86Base::getTarget(Func);
-  const auto *Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
-  assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-  const Address Addr = Mem->toAsmAddress(Asm, Target);
-  const auto *VarReg = llvm::cast<Variable>(this->getSrc(2));
-  assert(VarReg->hasReg());
-  const GPRRegister Reg = Traits::getEncodedGPR(VarReg->getRegNum());
-  Asm->cmpxchg(Ty, Addr, Reg, this->Locked);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpxchg::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  if (this->Locked) {
-    Str << "lock ";
-  }
-  Str << "cmpxchg." << this->getSrc(0)->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpxchg8b::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 5);
-  if (this->Locked) {
-    Str << "\t"
-           "lock";
-  }
-  Str << "\t"
-         "cmpxchg8b\t";
-  this->getSrc(0)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpxchg8b::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 5);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  const auto *Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
-  assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-  auto *Target = InstX86Base::getTarget(Func);
-  const Address Addr = Mem->toAsmAddress(Asm, Target);
-  Asm->cmpxchg8b(Addr, this->Locked);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpxchg8b::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  if (this->Locked) {
-    Str << "lock ";
-  }
-  Str << "cmpxchg8b ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cvt::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  Str << "\t"
-         "cvt";
-  if (isTruncating())
-    Str << "t";
-  Str << Traits::TypeAttributes[this->getSrc(0)->getType()].CvtString << "2"
-      << Traits::TypeAttributes[this->getDest()->getType()].CvtString << "\t";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cvt::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 1);
-  const Variable *Dest = this->getDest();
-  const Operand *Src = this->getSrc(0);
-  Type DestTy = Dest->getType();
-  Type SrcTy = Src->getType();
-  switch (Variant) {
-  case Si2ss: {
-    assert(isScalarIntegerType(SrcTy));
-    if (!Traits::Is64Bit) {
-      assert(typeWidthInBytes(SrcTy) <= 4);
-    } else {
-      assert(SrcTy == IceType_i32 || SrcTy == IceType_i64);
-    }
-    assert(isScalarFloatingType(DestTy));
-    static const CastEmitterRegOp<XmmRegister, GPRRegister> Emitter = {
-        &Assembler::cvtsi2ss, &Assembler::cvtsi2ss};
-    emitIASCastRegOp<XmmRegister, GPRRegister, Traits::getEncodedXmm,
-                     Traits::getEncodedGPR>(Func, DestTy, Dest, SrcTy, Src,
-                                            Emitter);
-    return;
-  }
-  case Tss2si: {
-    assert(isScalarFloatingType(SrcTy));
-    assert(isScalarIntegerType(DestTy));
-    if (Traits::Is64Bit) {
-      assert(DestTy == IceType_i32 || DestTy == IceType_i64);
-    } else {
-      assert(typeWidthInBytes(DestTy) <= 4);
-    }
-    static const CastEmitterRegOp<GPRRegister, XmmRegister> Emitter = {
-        &Assembler::cvttss2si, &Assembler::cvttss2si};
-    emitIASCastRegOp<GPRRegister, XmmRegister, Traits::getEncodedGPR,
-                     Traits::getEncodedXmm>(Func, DestTy, Dest, SrcTy, Src,
-                                            Emitter);
-    return;
-  }
-  case Ss2si: {
-    assert(isScalarFloatingType(SrcTy));
-    assert(isScalarIntegerType(DestTy));
-    if (Traits::Is64Bit) {
-      assert(DestTy == IceType_i32 || DestTy == IceType_i64);
-    } else {
-      assert(typeWidthInBytes(DestTy) <= 4);
-    }
-    static const CastEmitterRegOp<GPRRegister, XmmRegister> Emitter = {
-        &Assembler::cvtss2si, &Assembler::cvtss2si};
-    emitIASCastRegOp<GPRRegister, XmmRegister, Traits::getEncodedGPR,
-                     Traits::getEncodedXmm>(Func, DestTy, Dest, SrcTy, Src,
-                                            Emitter);
-    return;
-  }
-  case Float2float: {
-    assert(isScalarFloatingType(SrcTy));
-    assert(isScalarFloatingType(DestTy));
-    assert(DestTy != SrcTy);
-    static const XmmEmitterRegOp Emitter = {&Assembler::cvtfloat2float,
-                                            &Assembler::cvtfloat2float};
-    emitIASRegOpTyXMM(Func, SrcTy, Dest, Src, Emitter);
-    return;
-  }
-  case Dq2ps: {
-    assert(isVectorIntegerType(SrcTy));
-    assert(isVectorFloatingType(DestTy));
-    static const XmmEmitterRegOp Emitter = {&Assembler::cvtdq2ps,
-                                            &Assembler::cvtdq2ps};
-    emitIASRegOpTyXMM(Func, DestTy, Dest, Src, Emitter);
-    return;
-  }
-  case Tps2dq: {
-    assert(isVectorFloatingType(SrcTy));
-    assert(isVectorIntegerType(DestTy));
-    static const XmmEmitterRegOp Emitter = {&Assembler::cvttps2dq,
-                                            &Assembler::cvttps2dq};
-    emitIASRegOpTyXMM(Func, DestTy, Dest, Src, Emitter);
-    return;
-  }
-  case Ps2dq: {
-    assert(isVectorFloatingType(SrcTy));
-    assert(isVectorIntegerType(DestTy));
-    static const XmmEmitterRegOp Emitter = {&Assembler::cvtps2dq,
-                                            &Assembler::cvtps2dq};
-    emitIASRegOpTyXMM(Func, DestTy, Dest, Src, Emitter);
-    return;
-  }
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cvt::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  this->dumpDest(Func);
-  Str << " = cvt";
-  if (isTruncating())
-    Str << "t";
-  Str << Traits::TypeAttributes[this->getSrc(0)->getType()].CvtString << "2"
-      << Traits::TypeAttributes[this->getDest()->getType()].CvtString << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Round::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  Str << "\t" << this->Opcode
-      << Traits::TypeAttributes[this->getDest()->getType()].SpSdString << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Round::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  assert(InstX86Base::getTarget(Func)->getInstructionSet() >= Traits::SSE4_1);
-  const Variable *Dest = this->getDest();
-  Type Ty = Dest->getType();
-  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
-      &Assembler::round, &Assembler::round};
-  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
-                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(0),
-                                              this->getSrc(1), Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Icmp::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  Str << "\t"
-         "cmp"
-      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Icmp::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  const Operand *Src0 = this->getSrc(0);
-  const Operand *Src1 = this->getSrc(1);
-  Type Ty = Src0->getType();
-  static const GPREmitterRegOp RegEmitter = {&Assembler::cmp, &Assembler::cmp,
-                                             &Assembler::cmp};
-  static const GPREmitterAddrOp AddrEmitter = {&Assembler::cmp,
-                                               &Assembler::cmp};
-  if (const auto *SrcVar0 = llvm::dyn_cast<Variable>(Src0)) {
-    if (SrcVar0->hasReg()) {
-      constexpr bool NotLea = false;
-      emitIASRegOpTyGPR(Func, NotLea, Ty, SrcVar0, Src1, RegEmitter);
-      return;
-    }
-  }
-  emitIASAsAddrOpTyGPR(Func, Ty, Src0, Src1, AddrEmitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Icmp::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "cmp." << this->getSrc(0)->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Ucomiss::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  Str << "\t"
-         "ucomi"
-      << Traits::TypeAttributes[this->getSrc(0)->getType()].SdSsString << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Ucomiss::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  // Currently src0 is always a variable by convention, to avoid having two
-  // memory operands.
-  assert(llvm::isa<Variable>(this->getSrc(0)));
-  const auto *Src0Var = llvm::cast<Variable>(this->getSrc(0));
-  Type Ty = Src0Var->getType();
-  static const XmmEmitterRegOp Emitter = {&Assembler::ucomiss,
-                                          &Assembler::ucomiss};
-  emitIASRegOpTyXMM(Func, Ty, Src0Var, this->getSrc(1), Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Ucomiss::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "ucomiss." << this->getSrc(0)->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86UD2::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 0);
-  Str << "\t"
-         "ud2";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86UD2::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Asm->ud2();
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86UD2::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "ud2";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Int3::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 0);
-  Str << "\t"
-         "int 3";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Int3::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Asm->int3();
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Int3::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "int 3";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Test::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  Str << "\t"
-         "test"
-      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Test::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  const Operand *Src0 = this->getSrc(0);
-  const Operand *Src1 = this->getSrc(1);
-  Type Ty = Src0->getType();
-  // The Reg/Addr form of test is not encodeable.
-  static const GPREmitterRegOp RegEmitter = {&Assembler::test, nullptr,
-                                             &Assembler::test};
-  static const GPREmitterAddrOp AddrEmitter = {&Assembler::test,
-                                               &Assembler::test};
-  if (const auto *SrcVar0 = llvm::dyn_cast<Variable>(Src0)) {
-    if (SrcVar0->hasReg()) {
-      constexpr bool NotLea = false;
-      emitIASRegOpTyGPR(Func, NotLea, Ty, SrcVar0, Src1, RegEmitter);
-      return;
-    }
-  }
-  emitIASAsAddrOpTyGPR(Func, Ty, Src0, Src1, AddrEmitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Test::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "test." << this->getSrc(0)->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Mfence::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 0);
-  Str << "\t"
-         "mfence";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Mfence::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Asm->mfence();
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Mfence::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "mfence";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Store::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  Type Ty = this->getSrc(0)->getType();
-  Str << "\t"
-         "mov"
-      << this->getWidthString(Ty) << Traits::TypeAttributes[Ty].SdSsString
-      << "\t";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getSrc(1)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Store::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  const Operand *Dest = this->getSrc(1);
-  const Operand *Src = this->getSrc(0);
-  Type DestTy = Dest->getType();
-  if (isScalarFloatingType(DestTy)) {
-    // Src must be a register, since Dest is a Mem operand of some kind.
-    const auto *SrcVar = llvm::cast<Variable>(Src);
-    assert(SrcVar->hasReg());
-    XmmRegister SrcReg = Traits::getEncodedXmm(SrcVar->getRegNum());
-    Assembler *Asm = Func->getAssembler<Assembler>();
-    auto *Target = InstX86Base::getTarget(Func);
-    if (const auto *DestVar = llvm::dyn_cast<Variable>(Dest)) {
-      assert(!DestVar->hasReg());
-      Address StackAddr(Target->stackVarToAsmOperand(DestVar));
-      Asm->movss(DestTy, StackAddr, SrcReg);
-    } else {
-      const auto *DestMem = llvm::cast<X86OperandMem>(Dest);
-      assert(DestMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-      Asm->movss(DestTy, DestMem->toAsmAddress(Asm, Target), SrcReg);
-    }
-    return;
-  }
-  assert(isScalarIntegerType(DestTy));
-  static const GPREmitterAddrOp GPRAddrEmitter = {&Assembler::mov,
-                                                  &Assembler::mov};
-  emitIASAsAddrOpTyGPR(Func, DestTy, Dest, Src, GPRAddrEmitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Store::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "mov." << this->getSrc(0)->getType() << " ";
-  this->getSrc(1)->dump(Func);
-  Str << ", ";
-  this->getSrc(0)->dump(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreP::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  assert(isVectorType(this->getSrc(1)->getType()));
-  Str << "\t"
-         "movups\t";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getSrc(1)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreP::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(this->getSrcSize() == 2);
-  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(0));
-  const auto *DestMem = llvm::cast<X86OperandMem>(this->getSrc(1));
-  assert(DestMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-  assert(SrcVar->hasReg());
-  auto *Target = InstX86Base::getTarget(Func);
-  Asm->movups(DestMem->toAsmAddress(Asm, Target),
-              Traits::getEncodedXmm(SrcVar->getRegNum()));
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreP::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "storep." << this->getSrc(0)->getType() << " ";
-  this->getSrc(1)->dump(Func);
-  Str << ", ";
-  this->getSrc(0)->dump(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreQ::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  assert(this->getSrc(1)->getType() == IceType_i64 ||
-         this->getSrc(1)->getType() == IceType_f64 ||
-         isVectorType(this->getSrc(1)->getType()));
-  Str << "\t"
-         "movq\t";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getSrc(1)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreQ::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(this->getSrcSize() == 2);
-  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(0));
-  const auto *DestMem = llvm::cast<X86OperandMem>(this->getSrc(1));
-  assert(DestMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-  assert(SrcVar->hasReg());
-  auto *Target = InstX86Base::getTarget(Func);
-  Asm->movq(DestMem->toAsmAddress(Asm, Target),
-            Traits::getEncodedXmm(SrcVar->getRegNum()));
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreQ::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "storeq." << this->getSrc(0)->getType() << " ";
-  this->getSrc(1)->dump(Func);
-  Str << ", ";
-  this->getSrc(0)->dump(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreD::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  assert(this->getSrc(1)->getType() == IceType_i64 ||
-         this->getSrc(1)->getType() == IceType_f64 ||
-         isVectorType(this->getSrc(1)->getType()));
-  Str << "\t"
-         "movd\t";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getSrc(1)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreD::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(this->getSrcSize() == 2);
-  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(0));
-  const auto *DestMem = llvm::cast<X86OperandMem>(this->getSrc(1));
-  assert(DestMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-  assert(SrcVar->hasReg());
-  auto *Target = InstX86Base::getTarget(Func);
-  Asm->movd(SrcVar->getType(), DestMem->toAsmAddress(Asm, Target),
-            Traits::getEncodedXmm(SrcVar->getRegNum()));
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreD::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "stored." << this->getSrc(0)->getType() << " ";
-  this->getSrc(1)->dump(Func);
-  Str << ", ";
-  this->getSrc(0)->dump(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Lea::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  if (auto *Add = this->deoptLeaToAddOrNull(Func)) {
-    Add->emit(Func);
-    return;
-  }
-
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  assert(this->getDest()->hasReg());
-  Str << "\t"
-         "lea"
-      << this->getWidthString(this->getDest()->getType()) << "\t";
-  Operand *Src0 = this->getSrc(0);
-  if (const auto *Src0Var = llvm::dyn_cast<Variable>(Src0)) {
-    Type Ty = Src0Var->getType();
-    // lea on x86-32 doesn't accept mem128 operands, so cast Src0Var to an
-    // acceptable type.
-    Src0Var->asType(Func, isVectorType(Ty) ? IceType_i32 : Ty, RegNumT())
-        ->emit(Func);
-  } else {
-    Src0->emit(Func);
-  }
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Mov::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  Operand *Src = this->getSrc(0);
-  Type SrcTy = Src->getType();
-  Type DestTy = this->getDest()->getType();
-  if (Traits::Is64Bit && DestTy == IceType_i64 &&
-      llvm::isa<ConstantInteger64>(Src) &&
-      !Utils::IsInt(32, llvm::cast<ConstantInteger64>(Src)->getValue())) {
-    Str << "\t"
-           "movabs"
-           "\t";
-  } else {
-    Str << "\t"
-           "mov"
-        << (!isScalarFloatingType(DestTy)
-                ? this->getWidthString(DestTy)
-                : Traits::TypeAttributes[DestTy].SdSsString)
-        << "\t";
-  }
-  // For an integer truncation operation, src is wider than dest. In this case,
-  // we use a mov instruction whose data width matches the narrower dest.
-  // TODO: This assert disallows usages such as copying a floating
-  // point value between a vector and a scalar (which movss is used for). Clean
-  // this up.
-  assert(InstX86Base::getTarget(Func)->typeWidthInBytesOnStack(DestTy) ==
-         InstX86Base::getTarget(Func)->typeWidthInBytesOnStack(SrcTy));
-  const Operand *NewSrc = Src;
-  if (auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-    RegNumT NewRegNum;
-    if (SrcVar->hasReg())
-      NewRegNum = Traits::getGprForType(DestTy, SrcVar->getRegNum());
-    if (SrcTy != DestTy)
-      NewSrc = SrcVar->asType(Func, DestTy, NewRegNum);
-  }
-  NewSrc->emit(Func);
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
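For the truncation case in the comment above, both operands are printed at the destination width, because the source register is re-typed to DestTy before emission. A minimal sketch of the suffix rule (the helper name is invented here as a stand-in for Traits::TypeAttributes[Ty].WidthString):

    #include <cassert>
    #include <string>

    // Assumed stand-in for the width-suffix table.
    std::string widthSuffix(int typeBytes) {
      switch (typeBytes) {
      case 1: return "b";
      case 2: return "w";
      case 4: return "l";
      default: return "q";
      }
    }

    int main() {
      // An i64-to-i32 truncating mov picks its suffix from the i32 dest,
      // so it prints as "movl %eax, ...", never "movq %rax, ...".
      assert("mov" + widthSuffix(4) == "movl");
    }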
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Mov::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 1);
-  const Variable *Dest = this->getDest();
-  const Operand *Src = this->getSrc(0);
-  Type DestTy = Dest->getType();
-  Type SrcTy = Src->getType();
-  // Mov can be used for GPRs or XMM registers. Also, the type does not
-  // necessarily match (Mov can be used for bitcasts). However, when the type
-  // does not match, one of the operands must be a register. Thus, the strategy
-  // is to find out if Src or Dest are a register, then use that register's
-  // type to decide on which emitter set to use. The emitter set will include
-  // reg-reg movs, but that case should be unused when the types don't match.
-  static const XmmEmitterRegOp XmmRegEmitter = {&Assembler::movss,
-                                                &Assembler::movss};
-  static const GPREmitterRegOp GPRRegEmitter = {
-      &Assembler::mov, &Assembler::mov, &Assembler::mov};
-  static const GPREmitterAddrOp GPRAddrEmitter = {&Assembler::mov,
-                                                  &Assembler::mov};
-  // For an integer truncation operation, src is wider than dest. In this case,
-  // we use a mov instruction whose data width matches the narrower dest.
-  // TODO: This assert disallows usages such as copying a floating
-  // point value between a vector and a scalar (which movss is used for). Clean
-  // this up.
-  auto *Target = InstX86Base::getTarget(Func);
-  assert(Target->typeWidthInBytesOnStack(this->getDest()->getType()) ==
-         Target->typeWidthInBytesOnStack(Src->getType()));
-  if (Dest->hasReg()) {
-    if (isScalarFloatingType(DestTy)) {
-      emitIASRegOpTyXMM(Func, DestTy, Dest, Src, XmmRegEmitter);
-      return;
-    } else {
-      assert(isScalarIntegerType(DestTy));
-      // Widen DestTy for truncation (see above note). We should only do this
-      // when both Src and Dest are integer types.
-      if (isScalarIntegerType(SrcTy)) {
-        SrcTy = DestTy;
-      }
-      constexpr bool NotLea = false;
-      emitIASRegOpTyGPR(Func, NotLea, DestTy, Dest, Src, GPRRegEmitter);
-      return;
-    }
-  } else {
-    // Dest must be Stack and Src *could* be a register. Use Src's type to
-    // decide on the emitters.
-    Address StackAddr(Target->stackVarToAsmOperand(Dest));
-    if (isScalarFloatingType(SrcTy)) {
-      // Src must be a register.
-      const auto *SrcVar = llvm::cast<Variable>(Src);
-      assert(SrcVar->hasReg());
-      Assembler *Asm = Func->getAssembler<Assembler>();
-      Asm->movss(SrcTy, StackAddr, Traits::getEncodedXmm(SrcVar->getRegNum()));
-      return;
-    } else if (isVectorType(SrcTy)) {
-      // Src must be a register
-      const auto *SrcVar = llvm::cast<Variable>(Src);
-      assert(SrcVar->hasReg());
-      Assembler *Asm = Func->getAssembler<Assembler>();
-      Asm->movups(StackAddr, Traits::getEncodedXmm(SrcVar->getRegNum()));
-    } else {
-      // Src can be a register or immediate.
-      assert(isScalarIntegerType(SrcTy));
-      emitIASAddrOpTyGPR(Func, SrcTy, StackAddr, Src, GPRAddrEmitter);
-      return;
-    }
-    return;
-  }
-}
-
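The dispatch logic above reduces to a small decision table: a register dest selects an emitter by the dest's register file, while a stack dest selects by the source's type. A hypothetical distillation (names invented):

    enum class MovEmitter { XmmRegOp, GprRegOp, XmmStore, VecStore, GprStore };

    // Mirrors the branch structure of InstX86Mov::emitIAS.
    constexpr MovEmitter pickMovEmitter(bool DestHasReg, bool DestIsFloat,
                                        bool SrcIsFloat, bool SrcIsVector) {
      if (DestHasReg)
        return DestIsFloat ? MovEmitter::XmmRegOp : MovEmitter::GprRegOp;
      if (SrcIsFloat)
        return MovEmitter::XmmStore; // movss into the stack slot
      if (SrcIsVector)
        return MovEmitter::VecStore; // movups into the stack slot
      return MovEmitter::GprStore;   // GPR or immediate into the stack slot
    }

    static_assert(pickMovEmitter(false, false, true, false) ==
                      MovEmitter::XmmStore,
                  "a scalar-float source headed to a stack slot uses movss");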
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movd::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  assert(this->getSrcSize() == 1);
-  Variable *Dest = this->getDest();
-  Operand *Src = this->getSrc(0);
-
-  if (Dest->getType() == IceType_i64 || Src->getType() == IceType_i64) {
-    assert(Dest->getType() == IceType_f64 || Src->getType() == IceType_f64);
-    assert(Dest->getType() != Src->getType());
-    Ostream &Str = Func->getContext()->getStrEmit();
-    Str << "\t"
-           "movq"
-           "\t";
-    Src->emit(Func);
-    Str << ", ";
-    Dest->emit(Func);
-    return;
-  }
-
-  InstX86BaseUnaryopXmm<InstX86Base::Movd>::emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movd::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(this->getSrcSize() == 1);
-  const Variable *Dest = this->getDest();
-  auto *Target = InstX86Base::getTarget(Func);
-  // For insert/extract element (one of Src/Dest is an Xmm vector and the other
-  // is an int type).
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(this->getSrc(0))) {
-    if (SrcVar->getType() == IceType_i32 ||
-        (Traits::Is64Bit && SrcVar->getType() == IceType_i64)) {
-      assert(isVectorType(Dest->getType()) ||
-             (isScalarFloatingType(Dest->getType()) &&
-              typeWidthInBytes(SrcVar->getType()) ==
-                  typeWidthInBytes(Dest->getType())));
-      assert(Dest->hasReg());
-      XmmRegister DestReg = Traits::getEncodedXmm(Dest->getRegNum());
-      if (SrcVar->hasReg()) {
-        Asm->movd(SrcVar->getType(), DestReg,
-                  Traits::getEncodedGPR(SrcVar->getRegNum()));
-      } else {
-        Address StackAddr(Target->stackVarToAsmOperand(SrcVar));
-        Asm->movd(SrcVar->getType(), DestReg, StackAddr);
-      }
-    } else {
-      assert(isVectorType(SrcVar->getType()) ||
-             (isScalarFloatingType(SrcVar->getType()) &&
-              typeWidthInBytes(SrcVar->getType()) ==
-                  typeWidthInBytes(Dest->getType())));
-      assert(SrcVar->hasReg());
-      assert(Dest->getType() == IceType_i32 ||
-             (Traits::Is64Bit && Dest->getType() == IceType_i64));
-      XmmRegister SrcReg = Traits::getEncodedXmm(SrcVar->getRegNum());
-      if (Dest->hasReg()) {
-        Asm->movd(Dest->getType(), Traits::getEncodedGPR(Dest->getRegNum()),
-                  SrcReg);
-      } else {
-        Address StackAddr(Target->stackVarToAsmOperand(Dest));
-        Asm->movd(Dest->getType(), StackAddr, SrcReg);
-      }
-    }
-  } else {
-    assert(Dest->hasReg());
-    XmmRegister DestReg = Traits::getEncodedXmm(Dest->getRegNum());
-    auto *Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
-    Asm->movd(Mem->getType(), DestReg, Mem->toAsmAddress(Asm, Target));
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movp::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  // TODO(wala,stichnot): movups works with all vector operands, but there
-  // exist other instructions (movaps, movdqa, movdqu) that may perform better,
-  // depending on the data type and alignment of the operands.
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  Str << "\t"
-         "movups\t";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movp::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 1);
-  assert(isVectorType(this->getDest()->getType()));
-  const Variable *Dest = this->getDest();
-  const Operand *Src = this->getSrc(0);
-  static const XmmEmitterMovOps Emitter = {
-      &Assembler::movups, &Assembler::movups, &Assembler::movups};
-  emitIASMovlikeXMM(Func, Dest, Src, Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movq::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  assert(this->getDest()->getType() == IceType_i64 ||
-         this->getDest()->getType() == IceType_f64);
-  Str << "\t"
-         "movq"
-         "\t";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movq::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 1);
-  assert(this->getDest()->getType() == IceType_i64 ||
-         this->getDest()->getType() == IceType_f64 ||
-         isVectorType(this->getDest()->getType()));
-  const Variable *Dest = this->getDest();
-  const Operand *Src = this->getSrc(0);
-  static const XmmEmitterMovOps Emitter = {&Assembler::movq, &Assembler::movq,
-                                           &Assembler::movq};
-  emitIASMovlikeXMM(Func, Dest, Src, Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86MovssRegs::emitIAS(const Cfg *Func) const {
-  // This Binop variant is only intended to be used for reg-reg moves where
-  // part of the Dest register is untouched.
-  assert(this->getSrcSize() == 2);
-  const Variable *Dest = this->getDest();
-  assert(Dest == this->getSrc(0));
-  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(1));
-  assert(Dest->hasReg() && SrcVar->hasReg());
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Asm->movss(IceType_f32, Traits::getEncodedXmm(Dest->getRegNum()),
-             Traits::getEncodedXmm(SrcVar->getRegNum()));
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movsx::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 1);
-  const Variable *Dest = this->getDest();
-  const Operand *Src = this->getSrc(0);
-  // Dest must be a > 8-bit register, but Src can be 8-bit. In practice we just
-  // use the full register for Dest to avoid having an OperandSizeOverride
-  // prefix. It also allows us to only dispatch on SrcTy.
-  Type SrcTy = Src->getType();
-  assert(typeWidthInBytes(Dest->getType()) > 1);
-  assert(typeWidthInBytes(Dest->getType()) > typeWidthInBytes(SrcTy));
-  constexpr bool NotLea = false;
-  emitIASRegOpTyGPR<false, true>(Func, NotLea, SrcTy, Dest, Src, this->Emitter);
-}
-
-template <typename TraitsType>
-bool InstImpl<TraitsType>::InstX86Movzx::mayBeElided(
-    const Variable *Dest, const Operand *SrcOpnd) const {
-  assert(Traits::Is64Bit);
-  const auto *Src = llvm::dyn_cast<Variable>(SrcOpnd);
-
-  // Src is not a Variable, so it does not have a register. Movzx can't be
-  // elided.
-  if (Src == nullptr)
-    return false;
-
-  // Movzx to/from memory can't be elided.
-  if (!Src->hasReg() || !Dest->hasReg())
-    return false;
-
-  // Reg/reg move with different source and dest can't be elided.
-  if (Traits::getEncodedGPR(Src->getRegNum()) !=
-      Traits::getEncodedGPR(Dest->getRegNum()))
-    return false;
-
-  // A must-keep movzx 32- to 64-bit is sometimes needed in x86-64 sandboxing.
-  return !MustKeep;
-}
-
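A free-standing restatement of the predicate, with the Variable and Traits plumbing stripped away (all parameters hypothetical), makes the bail-out order explicit:

    // A 32-to-64-bit movzx is removable only for a register moved onto
    // itself and not pinned by MustKeep: 32-bit writes already zero the
    // upper half of the register on x86-64.
    constexpr bool mayElide(bool SrcIsRegVar, bool DestHasReg, int SrcEnc,
                            int DestEnc, bool MustKeep) {
      if (!SrcIsRegVar || !DestHasReg)
        return false; // memory operands need the real instruction
      if (SrcEnc != DestEnc)
        return false; // a cross-register move must still be emitted
      return !MustKeep; // sandboxing can require keeping the zero-extend
    }

    static_assert(mayElide(true, true, 0, 0, false),
                  "same-register, unpinned movzx is elided");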
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movzx::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  if (Traits::Is64Bit) {
-    // There's no movzx %eXX, %rXX. To zero extend 32- to 64-bits, we emit a
-    // mov %eXX, %eXX. The processor will still do a movzx[bw]q.
-    assert(this->getSrcSize() == 1);
-    const Operand *Src = this->getSrc(0);
-    const Variable *Dest = this->Dest;
-    if (Src->getType() == IceType_i32 && Dest->getType() == IceType_i64) {
-      Ostream &Str = Func->getContext()->getStrEmit();
-      if (mayBeElided(Dest, Src)) {
-        Str << "\t/* elided movzx */";
-      } else {
-        Str << "\t"
-               "mov"
-               "\t";
-        Src->emit(Func);
-        Str << ", ";
-        Dest->asType(Func, IceType_i32,
-                     Traits::getGprForType(IceType_i32, Dest->getRegNum()))
-            ->emit(Func);
-        Str << " /* movzx */";
-      }
-      return;
-    }
-  }
-  InstX86BaseUnaryopGPR<InstX86Base::Movzx>::emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movzx::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 1);
-  const Variable *Dest = this->getDest();
-  const Operand *Src = this->getSrc(0);
-  Type SrcTy = Src->getType();
-  assert(typeWidthInBytes(Dest->getType()) > 1);
-  assert(typeWidthInBytes(Dest->getType()) > typeWidthInBytes(SrcTy));
-  if (Traits::Is64Bit) {
-    if (Src->getType() == IceType_i32 && Dest->getType() == IceType_i64 &&
-        mayBeElided(Dest, Src)) {
-      return;
-    }
-  }
-  constexpr bool NotLea = false;
-  emitIASRegOpTyGPR<false, true>(Func, NotLea, SrcTy, Dest, Src, this->Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Nop::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  // TODO: Emit the right code for each variant.
-  Str << "\t"
-         "nop\t/* variant = "
-      << Variant << " */";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Nop::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  // TODO: Emit the right code for the variant.
-  Asm->nop();
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Nop::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "nop (variant = " << Variant << ")";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Fld::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  Type Ty = this->getSrc(0)->getType();
-  const auto *Var = llvm::dyn_cast<Variable>(this->getSrc(0));
-  if (Var && Var->hasReg()) {
-    // This is a physical xmm register, so we need to spill it to a temporary
-    // stack slot.  Function prolog emission guarantees that there is sufficient
-    // space to do this.
-    Str << "\t"
-           "mov"
-        << Traits::TypeAttributes[Ty].SdSsString << "\t";
-    Var->emit(Func);
-    Str << ", (%esp)\n"
-           "\t"
-           "fld"
-        << this->getFldString(Ty)
-        << "\t"
-           "(%esp)";
-    return;
-  }
-  Str << "\t"
-         "fld"
-      << this->getFldString(Ty) << "\t";
-  this->getSrc(0)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Fld::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(this->getSrcSize() == 1);
-  const Operand *Src = this->getSrc(0);
-  auto *Target = InstX86Base::getTarget(Func);
-  Type Ty = Src->getType();
-  if (const auto *Var = llvm::dyn_cast<Variable>(Src)) {
-    if (Var->hasReg()) {
-      // This is a physical xmm register, so we need to spill it to a temporary
-      // stack slot.  Function prolog emission guarantees that there is
-      // sufficient space to do this.
-      Address StackSlot =
-          Address(RegisterSet::Encoded_Reg_esp, 0, AssemblerFixup::NoFixup);
-      Asm->movss(Ty, StackSlot, Traits::getEncodedXmm(Var->getRegNum()));
-      Asm->fld(Ty, StackSlot);
-    } else {
-      Address StackAddr(Target->stackVarToAsmOperand(Var));
-      Asm->fld(Ty, StackAddr);
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
-    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-    Asm->fld(Ty, Mem->toAsmAddress(Asm, Target));
-  } else if (const auto *Imm = llvm::dyn_cast<Constant>(Src)) {
-    Asm->fld(Ty, Traits::Address::ofConstPool(Asm, Imm));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Fld::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "fld." << this->getSrc(0)->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Fstp::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 0);
-  // TODO(jvoung,stichnot): Utilize this by setting Dest to nullptr to
-  // "partially" delete the fstp if the Dest is unused. Even if Dest is unused,
-  // the fstp should be kept for the SideEffects of popping the stack.
-  if (!this->getDest()) {
-    Str << "\t"
-           "fstp\t"
-           "st(0)";
-    return;
-  }
-  Type Ty = this->getDest()->getType();
-  if (!this->getDest()->hasReg()) {
-    Str << "\t"
-           "fstp"
-        << this->getFldString(Ty) << "\t";
-    this->getDest()->emit(Func);
-    return;
-  }
-  // Dest is a physical (xmm) register, so st(0) needs to go through memory.
-  // Hack this by using caller-reserved memory at the top of stack, spilling
-  // st(0) there, and loading it into the xmm register.
-  Str << "\t"
-         "fstp"
-      << this->getFldString(Ty)
-      << "\t"
-         "(%esp)\n";
-  Str << "\t"
-         "mov"
-      << Traits::TypeAttributes[Ty].SdSsString
-      << "\t"
-         "(%esp), ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Fstp::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(this->getSrcSize() == 0);
-  const Variable *Dest = this->getDest();
-  // TODO(jvoung,stichnot): Utilize this by setting Dest to nullptr to
-  // "partially" delete the fstp if the Dest is unused. Even if Dest is unused,
-  // the fstp should be kept for the SideEffects of popping the stack.
-  if (!Dest) {
-    Asm->fstp(RegisterSet::getEncodedSTReg(0));
-    return;
-  }
-  auto *Target = InstX86Base::getTarget(Func);
-  Type Ty = Dest->getType();
-  if (!Dest->hasReg()) {
-    Address StackAddr(Target->stackVarToAsmOperand(Dest));
-    Asm->fstp(Ty, StackAddr);
-  } else {
-    // Dest is a physical (xmm) register, so st(0) needs to go through memory.
-    // Hack this by using caller-reserved memory at the top of stack, spilling
-    // st(0) there, and loading it into the xmm register.
-    Address StackSlot =
-        Address(RegisterSet::Encoded_Reg_esp, 0, AssemblerFixup::NoFixup);
-    Asm->fstp(Ty, StackSlot);
-    Asm->movss(Ty, Traits::getEncodedXmm(Dest->getRegNum()), StackSlot);
-  }
-}
-
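Both the textual path and the integrated-assembler path shuffle st(0) through the caller-reserved slot at the top of the stack. Reconstructed from the format strings above (assuming the usual AT&T x87 suffixes and an arbitrary register choice), an f32 dest in %xmm0 comes out as:

    fstps	(%esp)
    movss	(%esp), %xmm0

An f64 dest would use fstpl and movsd instead.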
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Fstp::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  this->dumpDest(Func);
-  Str << " = fstp." << this->getDest()->getType() << ", st(0)";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pextr::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  // pextrb and pextrd are SSE4.1 instructions.
-  Str << "\t" << this->Opcode
-      << Traits::TypeAttributes[this->getSrc(0)->getType()].IntegralString
-      << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  Variable *Dest = this->getDest();
-  // pextrw must take a register dest. There is an SSE4.1 version that takes a
-  // memory dest, but we aren't using it. For uniformity, just restrict them
-  // all to have a register dest for now.
-  assert(Dest->hasReg());
-  Dest->asType(Func, IceType_i32, Dest->getRegNum())->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pextr::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  // pextrb and pextrd are SSE4.1 instructions.
-  const Variable *Dest = this->getDest();
-  Type DispatchTy = Traits::getInVectorElementType(this->getSrc(0)->getType());
-  // pextrw must take a register dest. There is an SSE4.1 version that takes a
-  // memory dest, but we aren't using it. For uniformity, just restrict them
-  // all to have a register dest for now.
-  assert(Dest->hasReg());
-  // pextrw's Src(0) must be a register (both SSE4.1 and SSE2).
-  assert(llvm::cast<Variable>(this->getSrc(0))->hasReg());
-  static const ThreeOpImmEmitter<GPRRegister, XmmRegister> Emitter = {
-      &Assembler::pextr, nullptr};
-  emitIASThreeOpImmOps<GPRRegister, XmmRegister, Traits::getEncodedGPR,
-                       Traits::getEncodedXmm>(
-      Func, DispatchTy, Dest, this->getSrc(0), this->getSrc(1), Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pinsr::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 3);
-  Str << "\t" << this->Opcode
-      << Traits::TypeAttributes[this->getDest()->getType()].IntegralString
-      << "\t";
-  this->getSrc(2)->emit(Func);
-  Str << ", ";
-  Operand *Src1 = this->getSrc(1);
-  if (const auto *Src1Var = llvm::dyn_cast<Variable>(Src1)) {
-    // If src1 is a register, it should always be r32.
-    if (Src1Var->hasReg()) {
-      const auto NewRegNum = Traits::getBaseReg(Src1Var->getRegNum());
-      const Variable *NewSrc = Src1Var->asType(Func, IceType_i32, NewRegNum);
-      NewSrc->emit(Func);
-    } else {
-      Src1Var->emit(Func);
-    }
-  } else {
-    Src1->emit(Func);
-  }
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
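The re-typing above keeps a register operand at its 32-bit name even when the value nominally lives in a byte register, since the pinsr encodings take r32. For a byte insert from a value allocated to %al (registers assumed), the expected output is along the lines of:

    pinsrb	$3, %eax, %xmm1

rather than an unencodable byte-register operand.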
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pinsr::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 3);
-  assert(this->getDest() == this->getSrc(0));
-  // pinsrb and pinsrd are SSE4.1 instructions.
-  const Operand *Src0 = this->getSrc(1);
-  Type DispatchTy = Src0->getType();
-  // If src1 is a register, it should always be r32 (this should fall out from
-  // the encodings for ByteRegs overlapping the encodings for r32), but we have
-  // to make sure the register allocator didn't choose an 8-bit high register
-  // like "ah".
-  if (BuildDefs::asserts()) {
-    if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0)) {
-      if (Src0Var->hasReg()) {
-        const auto RegNum = Src0Var->getRegNum();
-        const auto BaseRegNum = Traits::getBaseReg(RegNum);
-        (void)BaseRegNum;
-        assert(Traits::getEncodedGPR(RegNum) ==
-               Traits::getEncodedGPR(BaseRegNum));
-      }
-    }
-  }
-  static const ThreeOpImmEmitter<XmmRegister, GPRRegister> Emitter = {
-      &Assembler::pinsr, &Assembler::pinsr};
-  emitIASThreeOpImmOps<XmmRegister, GPRRegister, Traits::getEncodedXmm,
-                       Traits::getEncodedGPR>(Func, DispatchTy, this->getDest(),
-                                              Src0, this->getSrc(2), Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pshufd::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  const Variable *Dest = this->getDest();
-  Type Ty = Dest->getType();
-  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
-      &Assembler::pshufd, &Assembler::pshufd};
-  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
-                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(0),
-                                              this->getSrc(1), Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Shufps::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 3);
-  const Variable *Dest = this->getDest();
-  assert(Dest == this->getSrc(0));
-  Type Ty = Dest->getType();
-  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
-      &Assembler::shufps, &Assembler::shufps};
-  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
-                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(1),
-                                              this->getSrc(2), Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pop::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 0);
-  Str << "\t"
-         "pop\t";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pop::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 0);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  if (this->getDest()->hasReg()) {
-    Asm->popl(Traits::getEncodedGPR(this->getDest()->getRegNum()));
-  } else {
-    auto *Target = InstX86Base::getTarget(Func);
-    Asm->popl(Target->stackVarToAsmOperand(this->getDest()));
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pop::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  this->dumpDest(Func);
-  Str << " = pop." << this->getDest()->getType() << " ";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Push::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Str << "\t"
-         "push"
-         "\t";
-  assert(this->getSrcSize() == 1);
-  const Operand *Src = this->getSrc(0);
-  Src->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Push::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-
-  assert(this->getSrcSize() == 1);
-  const Operand *Src = this->getSrc(0);
-
-  if (const auto *Var = llvm::dyn_cast<Variable>(Src)) {
-    Asm->pushl(Traits::getEncodedGPR(Var->getRegNum()));
-  } else if (const auto *Const32 = llvm::dyn_cast<ConstantInteger32>(Src)) {
-    Asm->pushl(AssemblerImmediate(Const32->getValue()));
-  } else if (auto *CR = llvm::dyn_cast<ConstantRelocatable>(Src)) {
-    Asm->pushl(CR);
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Push::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "push." << this->getSrc(0)->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Ret::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Str << "\t"
-         "ret";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Ret::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Asm->ret();
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Ret::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Type Ty =
-      (this->getSrcSize() == 0 ? IceType_void : this->getSrc(0)->getType());
-  Str << "ret." << Ty << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Setcc::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Str << "\t"
-         "set"
-      << Traits::InstBrAttributes[Condition].DisplayString << "\t";
-  this->Dest->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Setcc::emitIAS(const Cfg *Func) const {
-  assert(Condition != Cond::Br_None);
-  assert(this->getDest()->getType() == IceType_i1);
-  assert(this->getSrcSize() == 0);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  auto *Target = InstX86Base::getTarget(Func);
-  if (this->getDest()->hasReg())
-    Asm->setcc(Condition,
-               Traits::getEncodedByteReg(this->getDest()->getRegNum()));
-  else
-    Asm->setcc(Condition, Target->stackVarToAsmOperand(this->getDest()));
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Setcc::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "setcc." << Traits::InstBrAttributes[Condition].DisplayString << " ";
-  this->dumpDest(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Xadd::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  if (this->Locked) {
-    Str << "\t"
-           "lock";
-  }
-  Str << "\t"
-         "xadd"
-      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Xadd::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Type Ty = this->getSrc(0)->getType();
-  const auto *Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
-  assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-  auto *Target = InstX86Base::getTarget(Func);
-  const Address Addr = Mem->toAsmAddress(Asm, Target);
-  const auto *VarReg = llvm::cast<Variable>(this->getSrc(1));
-  assert(VarReg->hasReg());
-  const GPRRegister Reg = Traits::getEncodedGPR(VarReg->getRegNum());
-  Asm->xadd(Ty, Addr, Reg, this->Locked);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Xadd::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  if (this->Locked) {
-    Str << "lock ";
-  }
-  Type Ty = this->getSrc(0)->getType();
-  Str << "xadd." << Ty << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Xchg::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Str << "\t"
-         "xchg"
-      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Xchg::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Type Ty = this->getSrc(0)->getType();
-  const auto *VarReg1 = llvm::cast<Variable>(this->getSrc(1));
-  assert(VarReg1->hasReg());
-  const GPRRegister Reg1 = Traits::getEncodedGPR(VarReg1->getRegNum());
-
-  if (const auto *VarReg0 = llvm::dyn_cast<Variable>(this->getSrc(0))) {
-    assert(VarReg0->hasReg());
-    const GPRRegister Reg0 = Traits::getEncodedGPR(VarReg0->getRegNum());
-    Asm->xchg(Ty, Reg0, Reg1);
-    return;
-  }
-
-  const auto *Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
-  assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-  auto *Target = InstX86Base::getTarget(Func);
-  const Address Addr = Mem->toAsmAddress(Asm, Target);
-  Asm->xchg(Ty, Addr, Reg1);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Xchg::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Type Ty = this->getSrc(0)->getType();
-  Str << "xchg." << Ty << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86IacaStart::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Str << "\t# IACA_START\n"
-         "\t.byte 0x0F, 0x0B\n"
-         "\t"
-         "movl\t$111, %ebx\n"
-         "\t.byte 0x64, 0x67, 0x90";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86IacaStart::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Asm->iaca_start();
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86IacaStart::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "IACA_START";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86IacaEnd::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Str << "\t# IACA_END\n"
-         "\t"
-         "movl\t$222, %ebx\n"
-         "\t.byte 0x64, 0x67, 0x90\n"
-         "\t.byte 0x0F, 0x0B";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86IacaEnd::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Asm->iaca_end();
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86IacaEnd::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "IACA_END";
-}
-
-} // end of namespace X8632
-} // end of namespace Ice
-
-#endif // SUBZERO_SRC_ICEINSTX8632BASEIMPL_H
diff --git a/third_party/subzero/src/IceInstX8664.cpp b/third_party/subzero/src/IceInstX8664.cpp
index d847ea1..36fbb25 100644
--- a/third_party/subzero/src/IceInstX8664.cpp
+++ b/third_party/subzero/src/IceInstX8664.cpp
@@ -16,21 +16,2751 @@
 /// This file also defines X8664 operand specific methods (dump and emit.)
 ///
 //===----------------------------------------------------------------------===//
+
 #include "IceInstX8664.h"
 
 #include "IceAssemblerX8664.h"
 #include "IceCfg.h"
 #include "IceCfgNode.h"
 #include "IceConditionCodesX86.h"
+#include "IceDefs.h"
 #include "IceInst.h"
 #include "IceOperand.h"
 #include "IceRegistersX8664.h"
+#include "IceTargetLowering.h"
 #include "IceTargetLoweringX8664.h"
 
 namespace Ice {
-
 namespace X8664 {
 
+const char *InstX86Base::getWidthString(Type Ty) {
+  return Traits::TypeAttributes[Ty].WidthString;
+}
+
+const char *InstX86Base::getFldString(Type Ty) {
+  return Traits::TypeAttributes[Ty].FldString;
+}
+
+typename Cond::BrCond InstX86Base::getOppositeCondition(BrCond Cond) {
+  return Traits::InstBrAttributes[Cond].Opposite;
+}
+
+InstX86FakeRMW::InstX86FakeRMW(Cfg *Func, Operand *Data, Operand *Addr,
+                               InstArithmetic::OpKind Op, Variable *Beacon)
+    : InstX86Base(Func, InstX86Base::FakeRMW, 3, nullptr), Op(Op) {
+  this->addSource(Data);
+  this->addSource(Addr);
+  this->addSource(Beacon);
+}
+
+InstX86Mul::InstX86Mul(Cfg *Func, Variable *Dest, Variable *Source1,
+                       Operand *Source2)
+    : InstX86Base(Func, InstX86Base::Mul, 2, Dest) {
+  this->addSource(Source1);
+  this->addSource(Source2);
+}
+
+InstX86Shld::InstX86Shld(Cfg *Func, Variable *Dest, Variable *Source1,
+                         Operand *Source2)
+    : InstX86Base(Func, InstX86Base::Shld, 3, Dest) {
+  this->addSource(Dest);
+  this->addSource(Source1);
+  this->addSource(Source2);
+}
+
+InstX86Shrd::InstX86Shrd(Cfg *Func, Variable *Dest, Variable *Source1,
+                         Operand *Source2)
+    : InstX86Base(Func, InstX86Base::Shrd, 3, Dest) {
+  this->addSource(Dest);
+  this->addSource(Source1);
+  this->addSource(Source2);
+}
+
+InstX86Label::InstX86Label(Cfg *Func, TargetLowering *Target)
+    : InstX86Base(Func, InstX86Base::Label, 0, nullptr),
+      LabelNumber(Target->makeNextLabelNumber()) {
+  if (BuildDefs::dump()) {
+    Name = GlobalString::createWithString(
+        Func->getContext(), ".L" + Func->getFunctionName() + "$local$__" +
+                                std::to_string(LabelNumber));
+  } else {
+    Name = GlobalString::createWithoutString(Func->getContext());
+  }
+}
+
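When dumping is enabled, the label name is assembled from fixed pieces; a minimal free-standing mirror of the scheme (the real code interns the result as a GlobalString):

    #include <cassert>
    #include <string>

    std::string localLabelName(const std::string &FuncName, int LabelNumber) {
      return ".L" + FuncName + "$local$__" + std::to_string(LabelNumber);
    }

    int main() { assert(localLabelName("foo", 3) == ".Lfoo$local$__3"); }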
+InstX86Br::InstX86Br(Cfg *Func, const CfgNode *TargetTrue,
+                     const CfgNode *TargetFalse, const InstX86Label *Label,
+                     BrCond Condition, Mode Kind)
+    : InstX86Base(Func, InstX86Base::Br, 0, nullptr), Condition(Condition),
+      TargetTrue(TargetTrue), TargetFalse(TargetFalse), Label(Label),
+      Kind(Kind) {}
+
+bool InstX86Br::optimizeBranch(const CfgNode *NextNode) {
+  // If there is no next block, then there can be no fallthrough to optimize.
+  if (NextNode == nullptr)
+    return false;
+  // Intra-block conditional branches can't be optimized.
+  if (Label)
+    return false;
+  // If there is no fallthrough node, such as a non-default case label for a
+  // switch instruction, then there is no opportunity to optimize.
+  if (getTargetFalse() == nullptr)
+    return false;
+
+  // Unconditional branch to the next node can be removed.
+  if (Condition == Cond::Br_None && getTargetFalse() == NextNode) {
+    assert(getTargetTrue() == nullptr);
+    this->setDeleted();
+    return true;
+  }
+  // If the fallthrough is to the next node, set fallthrough to nullptr to
+  // indicate the implicit fallthrough.
+  if (getTargetFalse() == NextNode) {
+    TargetFalse = nullptr;
+    return true;
+  }
+  // If TargetTrue is the next node, and TargetFalse is not nullptr (which was
+  // already tested above), then invert the branch condition, swap the targets,
+  // and set new fallthrough to nullptr.
+  if (getTargetTrue() == NextNode) {
+    assert(Condition != Cond::Br_None);
+    Condition = this->getOppositeCondition(Condition);
+    TargetTrue = getTargetFalse();
+    TargetFalse = nullptr;
+    return true;
+  }
+  return false;
+}
+
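The three rewrites can be exercised on a toy model; a hypothetical reduction of optimizeBranch with node indices standing in for CfgNode pointers (-1 meaning none; the Label and null-NextNode bail-outs are omitted):

    #include <cassert>

    struct ToyBr {
      int TargetTrue = -1;  // taken when the condition holds
      int TargetFalse = -1; // fallthrough / unconditional target
      bool Conditional = false;
      bool Deleted = false;

      bool optimize(int NextNode) {
        if (TargetFalse < 0)
          return false;
        // Unconditional branch to the next node: delete it outright.
        if (!Conditional && TargetFalse == NextNode) {
          Deleted = true;
          return true;
        }
        // Fallthrough already reaches the next node: make it implicit.
        if (TargetFalse == NextNode) {
          TargetFalse = -1;
          return true;
        }
        // Taken target is next: invert the condition (elided here), swap.
        if (Conditional && TargetTrue == NextNode) {
          TargetTrue = TargetFalse;
          TargetFalse = -1;
          return true;
        }
        return false;
      }
    };

    int main() {
      ToyBr Br;
      Br.TargetTrue = 2;
      Br.TargetFalse = 1;
      Br.Conditional = true;
      assert(Br.optimize(1) && Br.TargetFalse == -1);
    }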
+bool InstX86Br::repointEdges(CfgNode *OldNode, CfgNode *NewNode) {
+  bool Found = false;
+  if (TargetFalse == OldNode) {
+    TargetFalse = NewNode;
+    Found = true;
+  }
+  if (TargetTrue == OldNode) {
+    TargetTrue = NewNode;
+    Found = true;
+  }
+  return Found;
+}
+
+InstX86Jmp::InstX86Jmp(Cfg *Func, Operand *Target)
+    : InstX86Base(Func, InstX86Base::Jmp, 1, nullptr) {
+  this->addSource(Target);
+}
+
+InstX86Call::InstX86Call(Cfg *Func, Variable *Dest, Operand *CallTarget)
+    : InstX86Base(Func, InstX86Base::Call, 1, Dest) {
+  this->HasSideEffects = true;
+  this->addSource(CallTarget);
+}
+
+InstX86Movmsk::InstX86Movmsk(Cfg *Func, Variable *Dest, Operand *Source)
+    : InstX86Base(Func, InstX86Base::Movmsk, 1, Dest) {
+  this->addSource(Source);
+}
+
+InstX86Cmov::InstX86Cmov(Cfg *Func, Variable *Dest, Operand *Source,
+                         BrCond Condition)
+    : InstX86Base(Func, InstX86Base::Cmov, 2, Dest), Condition(Condition) {
+  // The final result is either the original Dest, or Source, so mark both as
+  // sources.
+  this->addSource(Dest);
+  this->addSource(Source);
+}
+
+InstX86Cmpps::InstX86Cmpps(Cfg *Func, Variable *Dest, Operand *Source,
+                           CmppsCond Condition)
+    : InstX86Base(Func, InstX86Base::Cmpps, 2, Dest), Condition(Condition) {
+  this->addSource(Dest);
+  this->addSource(Source);
+}
+
+InstX86Cmpxchg::InstX86Cmpxchg(Cfg *Func, Operand *DestOrAddr, Variable *Eax,
+                               Variable *Desired, bool Locked)
+    : InstX86BaseLockable(Func, InstX86Base::Cmpxchg, 3,
+                          llvm::dyn_cast<Variable>(DestOrAddr), Locked) {
+  constexpr uint16_t Encoded_rAX = 0;
+  (void)Encoded_rAX;
+  assert(Traits::getEncodedGPR(Eax->getRegNum()) == Encoded_rAX);
+  this->addSource(DestOrAddr);
+  this->addSource(Eax);
+  this->addSource(Desired);
+}
+
+InstX86Cmpxchg8b::InstX86Cmpxchg8b(Cfg *Func, X86OperandMem *Addr,
+                                   Variable *Edx, Variable *Eax, Variable *Ecx,
+                                   Variable *Ebx, bool Locked)
+    : InstX86BaseLockable(Func, InstX86Base::Cmpxchg8b, 5, nullptr, Locked) {
+  assert(Edx->getRegNum() == RegisterSet::Reg_edx);
+  assert(Eax->getRegNum() == RegisterSet::Reg_eax);
+  assert(Ecx->getRegNum() == RegisterSet::Reg_ecx);
+  assert(Ebx->getRegNum() == RegisterSet::Reg_ebx);
+  this->addSource(Addr);
+  this->addSource(Edx);
+  this->addSource(Eax);
+  this->addSource(Ecx);
+  this->addSource(Ebx);
+}
+
+InstX86Cvt::InstX86Cvt(Cfg *Func, Variable *Dest, Operand *Source,
+                       CvtVariant Variant)
+    : InstX86Base(Func, InstX86Base::Cvt, 1, Dest), Variant(Variant) {
+  this->addSource(Source);
+}
+
+InstX86Icmp::InstX86Icmp(Cfg *Func, Operand *Src0, Operand *Src1)
+    : InstX86Base(Func, InstX86Base::Icmp, 2, nullptr) {
+  this->addSource(Src0);
+  this->addSource(Src1);
+}
+
+InstX86Ucomiss::InstX86Ucomiss(Cfg *Func, Operand *Src0, Operand *Src1)
+    : InstX86Base(Func, InstX86Base::Ucomiss, 2, nullptr) {
+  this->addSource(Src0);
+  this->addSource(Src1);
+}
+
+InstX86UD2::InstX86UD2(Cfg *Func)
+    : InstX86Base(Func, InstX86Base::UD2, 0, nullptr) {}
+
+InstX86Int3::InstX86Int3(Cfg *Func)
+    : InstX86Base(Func, InstX86Base::Int3, 0, nullptr) {}
+
+InstX86Test::InstX86Test(Cfg *Func, Operand *Src1, Operand *Src2)
+    : InstX86Base(Func, InstX86Base::Test, 2, nullptr) {
+  this->addSource(Src1);
+  this->addSource(Src2);
+}
+
+InstX86Mfence::InstX86Mfence(Cfg *Func)
+    : InstX86Base(Func, InstX86Base::Mfence, 0, nullptr) {
+  this->HasSideEffects = true;
+}
+
+InstX86Store::InstX86Store(Cfg *Func, Operand *Value, X86Operand *Mem)
+    : InstX86Base(Func, InstX86Base::Store, 2, nullptr) {
+  this->addSource(Value);
+  this->addSource(Mem);
+}
+
+InstX86StoreP::InstX86StoreP(Cfg *Func, Variable *Value, X86OperandMem *Mem)
+    : InstX86Base(Func, InstX86Base::StoreP, 2, nullptr) {
+  this->addSource(Value);
+  this->addSource(Mem);
+}
+
+InstX86StoreQ::InstX86StoreQ(Cfg *Func, Operand *Value, X86OperandMem *Mem)
+    : InstX86Base(Func, InstX86Base::StoreQ, 2, nullptr) {
+  this->addSource(Value);
+  this->addSource(Mem);
+}
+
+InstX86StoreD::InstX86StoreD(Cfg *Func, Operand *Value, X86OperandMem *Mem)
+    : InstX86Base(Func, InstX86Base::StoreD, 2, nullptr) {
+  this->addSource(Value);
+  this->addSource(Mem);
+}
+
+InstX86Nop::InstX86Nop(Cfg *Func, NopVariant Variant)
+    : InstX86Base(Func, InstX86Base::Nop, 0, nullptr), Variant(Variant) {}
+
+InstX86Pop::InstX86Pop(Cfg *Func, Variable *Dest)
+    : InstX86Base(Func, InstX86Base::Pop, 0, Dest) {
+  // A pop instruction affects the stack pointer and so it should not be
+  // allowed to be automatically dead-code eliminated. (The corresponding push
+  // instruction doesn't need this treatment because it has no dest variable
+  // and therefore won't be dead-code eliminated.) This is needed for
+  // late-stage liveness analysis (e.g. asm-verbose mode).
+  this->HasSideEffects = true;
+}
+
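The flag matters because the generic dead-code rule would otherwise delete a pop whose destination is never read; schematically (a toy stand-in for the real liveness pass):

    struct ToyInst {
      bool DestIsLive;
      bool HasSideEffects;
      constexpr bool deadCodeEliminable() const {
        return !DestIsLive && !HasSideEffects;
      }
    };

    // A pop with an unused dest still adjusts the stack pointer, so it
    // must survive elimination.
    static_assert(!ToyInst{false, true}.deadCodeEliminable(),
                  "pop is kept even when its dest is dead");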
+InstX86Push::InstX86Push(Cfg *Func, Operand *Source)
+    : InstX86Base(Func, InstX86Base::Push, 1, nullptr) {
+  this->addSource(Source);
+}
+
+InstX86Ret::InstX86Ret(Cfg *Func, Variable *Source)
+    : InstX86Base(Func, InstX86Base::Ret, Source ? 1 : 0, nullptr) {
+  if (Source)
+    this->addSource(Source);
+}
+
+InstX86Setcc::InstX86Setcc(Cfg *Func, Variable *Dest, BrCond Cond)
+    : InstX86Base(Func, InstX86Base::Setcc, 0, Dest), Condition(Cond) {}
+
+InstX86Xadd::InstX86Xadd(Cfg *Func, Operand *Dest, Variable *Source,
+                         bool Locked)
+    : InstX86BaseLockable(Func, InstX86Base::Xadd, 2,
+                          llvm::dyn_cast<Variable>(Dest), Locked) {
+  this->addSource(Dest);
+  this->addSource(Source);
+}
+
+InstX86Xchg::InstX86Xchg(Cfg *Func, Operand *Dest, Variable *Source)
+    : InstX86Base(Func, InstX86Base::Xchg, 2, llvm::dyn_cast<Variable>(Dest)) {
+  this->addSource(Dest);
+  this->addSource(Source);
+}
+
+InstX86IacaStart::InstX86IacaStart(Cfg *Func)
+    : InstX86Base(Func, InstX86Base::IacaStart, 0, nullptr) {
+  assert(getFlags().getAllowIacaMarks());
+}
+
+InstX86IacaEnd::InstX86IacaEnd(Cfg *Func)
+    : InstX86Base(Func, InstX86Base::IacaEnd, 0, nullptr) {
+  assert(getFlags().getAllowIacaMarks());
+}
+
+// ======================== Dump routines ======================== //
+
+void InstX86Base::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "[" << Traits::TargetName << "] ";
+  Inst::dump(Func);
+}
+
+void InstX86FakeRMW::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Type Ty = getData()->getType();
+  Str << "rmw " << InstArithmetic::getOpName(getOp()) << " " << Ty << " *";
+  getAddr()->dump(Func);
+  Str << ", ";
+  getData()->dump(Func);
+  Str << ", beacon=";
+  getBeacon()->dump(Func);
+}
+
+void InstX86Label::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << getLabelName() << ":";
+}
+
+void InstX86Label::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Asm->bindLocalLabel(LabelNumber);
+  if (OffsetReloc != nullptr) {
+    Asm->bindRelocOffset(OffsetReloc);
+  }
+}
+
+void InstX86Label::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << getLabelName() << ":";
+}
+
+void InstX86Br::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t";
+
+  if (Condition == Cond::Br_None) {
+    Str << "jmp";
+  } else {
+    Str << Traits::InstBrAttributes[Condition].EmitString;
+  }
+
+  if (Label) {
+    Str << "\t" << Label->getLabelName();
+  } else {
+    if (Condition == Cond::Br_None) {
+      Str << "\t" << getTargetFalse()->getAsmName();
+    } else {
+      Str << "\t" << getTargetTrue()->getAsmName();
+      if (getTargetFalse()) {
+        Str << "\n\t"
+               "jmp\t"
+            << getTargetFalse()->getAsmName();
+      }
+    }
+  }
+}
+
+void InstX86Br::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  if (Label) {
+    auto *L = Asm->getOrCreateLocalLabel(Label->getLabelNumber());
+    if (Condition == Cond::Br_None) {
+      Asm->jmp(L, isNear());
+    } else {
+      Asm->j(Condition, L, isNear());
+    }
+  } else {
+    if (Condition == Cond::Br_None) {
+      auto *L = Asm->getOrCreateCfgNodeLabel(getTargetFalse()->getIndex());
+      assert(!getTargetTrue());
+      Asm->jmp(L, isNear());
+    } else {
+      auto *L = Asm->getOrCreateCfgNodeLabel(getTargetTrue()->getIndex());
+      Asm->j(Condition, L, isNear());
+      if (getTargetFalse()) {
+        auto *L2 = Asm->getOrCreateCfgNodeLabel(getTargetFalse()->getIndex());
+        Asm->jmp(L2, isNear());
+      }
+    }
+  }
+}
+
+void InstX86Br::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "br ";
+
+  if (Condition == Cond::Br_None) {
+    if (Label) {
+      Str << "label %" << Label->getLabelName();
+    } else {
+      Str << "label %" << getTargetFalse()->getName();
+    }
+    return;
+  }
+
+  Str << Traits::InstBrAttributes[Condition].DisplayString;
+  if (Label) {
+    Str << ", label %" << Label->getLabelName();
+  } else {
+    Str << ", label %" << getTargetTrue()->getName();
+    if (getTargetFalse()) {
+      Str << ", label %" << getTargetFalse()->getName();
+    }
+  }
+
+  Str << " // (" << (isNear() ? "near" : "far") << " jump)";
+}
+
+void InstX86Jmp::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  const Operand *Src = this->getSrc(0);
+  if (Traits::Is64Bit) {
+    if (const auto *CR = llvm::dyn_cast<ConstantRelocatable>(Src)) {
+      Str << "\t"
+             "jmp"
+             "\t"
+          << CR->getName();
+      return;
+    }
+  }
+  Str << "\t"
+         "jmp"
+         "\t*";
+  getJmpTarget()->emit(Func);
+}
+
+void InstX86Jmp::emitIAS(const Cfg *Func) const {
+  // Note: Adapted (mostly copied) from InstX86Call::emitIAS().
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Operand *Target = getJmpTarget();
+  if (const auto *Var = llvm::dyn_cast<Variable>(Target)) {
+    if (Var->hasReg()) {
+      Asm->jmp(Traits::getEncodedGPR(Var->getRegNum()));
+    } else {
+      // The jmp instruction with a memory operand should be possible to
+      // encode, but it isn't a valid sandboxed instruction, and there
+      // shouldn't be a register allocation issue to jump through a scratch
+      // register, so we don't really need to bother implementing it.
+      llvm::report_fatal_error("Assembler can't jmp to memory operand");
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Target)) {
+    (void)Mem;
+    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+    llvm::report_fatal_error("Assembler can't jmp to memory operand");
+  } else if (const auto *CR = llvm::dyn_cast<ConstantRelocatable>(Target)) {
+    Asm->jmp(CR);
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Target)) {
+    // NaCl trampoline calls refer to an address within the sandbox directly.
+    // This is usually only needed for non-IRT builds and otherwise not very
+    // portable or stable. Usually this is only done for "calls" and not jumps.
+    Asm->jmp(AssemblerImmediate(Imm->getValue()));
+  } else {
+    llvm::report_fatal_error("Unexpected operand type");
+  }
+}
+
+void InstX86Jmp::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "jmp ";
+  getJmpTarget()->dump(Func);
+}
+
+void InstX86Call::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  Str << "\t"
+         "call\t";
+  Operand *CallTarget = getCallTarget();
+  auto *Target = InstX86Base::getTarget(Func);
+  if (const auto *CI = llvm::dyn_cast<ConstantInteger32>(CallTarget)) {
+    // Emit without a leading '$'.
+    Str << CI->getValue();
+  } else if (const auto *DirectCallTarget =
+                 llvm::dyn_cast<ConstantRelocatable>(CallTarget)) {
+    DirectCallTarget->emitWithoutPrefix(Target);
+  } else {
+    Str << "*";
+    CallTarget->emit(Func);
+  }
+}
+
+void InstX86Call::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Operand *CallTarget = getCallTarget();
+  auto *Target = InstX86Base::getTarget(Func);
+  if (const auto *Var = llvm::dyn_cast<Variable>(CallTarget)) {
+    if (Var->hasReg()) {
+      Asm->call(Traits::getEncodedGPR(Var->getRegNum()));
+    } else {
+      Asm->call(Target->stackVarToAsmOperand(Var));
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(CallTarget)) {
+    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+    Asm->call(Mem->toAsmAddress(Asm, Target));
+  } else if (const auto *CR = llvm::dyn_cast<ConstantRelocatable>(CallTarget)) {
+    Asm->call(CR);
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(CallTarget)) {
+    Asm->call(AssemblerImmediate(Imm->getValue()));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void InstX86Call::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  if (this->getDest()) {
+    this->dumpDest(Func);
+    Str << " = ";
+  }
+  Str << "call ";
+  getCallTarget()->dump(Func);
+}
+
+// The this->Opcode parameter needs to be char* and not std::string because of
+// template issues.
+
+void InstX86Base::emitTwoAddress(const Cfg *Func, const char *Opcode,
+                                 const char *Suffix) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 2);
+  Operand *Dest = getDest();
+  if (Dest == nullptr)
+    Dest = getSrc(0);
+  assert(Dest == getSrc(0));
+  Operand *Src1 = getSrc(1);
+  Str << "\t" << Opcode << Suffix
+      << InstX86Base::getWidthString(Dest->getType()) << "\t";
+  Src1->emit(Func);
+  Str << ", ";
+  Dest->emit(Func);
+}
+
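Because Dest doubles as the first source, the printed form follows the usual AT&T two-address shape; for a hypothetical 32-bit add the output reads:

    addl	%ecx, %eax

that is, %eax receives %eax + %ecx, which is exactly why the assert above requires Dest == getSrc(0).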
+void emitIASOpTyGPR(const Cfg *Func, Type Ty, const Operand *Op,
+                    const GPREmitterOneOp &Emitter) {
+  auto *Target = InstX86Base::getTarget(Func);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  if (const auto *Var = llvm::dyn_cast<Variable>(Op)) {
+    if (Var->hasReg()) {
+      // We cheat a little and use GPRRegister even for byte operations.
+      GPRRegister VarReg = Traits::getEncodedGPR(Var->getRegNum());
+      (Asm->*(Emitter.Reg))(Ty, VarReg);
+    } else {
+      Address StackAddr(Target->stackVarToAsmOperand(Var));
+      (Asm->*(Emitter.Addr))(Ty, StackAddr);
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Op)) {
+    Mem->emitSegmentOverride(Asm);
+    (Asm->*(Emitter.Addr))(Ty, Mem->toAsmAddress(Asm, Target));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+template <bool VarCanBeByte, bool SrcCanBeByte>
+void emitIASRegOpTyGPR(const Cfg *Func, bool IsLea, Type Ty,
+                       const Variable *Var, const Operand *Src,
+                       const GPREmitterRegOp &Emitter) {
+  auto *Target = InstX86Base::getTarget(Func);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(Var->hasReg());
+  // We cheat a little and use GPRRegister even for byte operations.
+  GPRRegister VarReg = Traits::getEncodedGPR(Var->getRegNum());
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    if (SrcVar->hasReg()) {
+      GPRRegister SrcReg = Traits::getEncodedGPR(SrcVar->getRegNum());
+      (Asm->*(Emitter.GPRGPR))(Ty, VarReg, SrcReg);
+    } else {
+      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
+      (Asm->*(Emitter.GPRAddr))(Ty, VarReg, SrcStackAddr);
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
+    Mem->emitSegmentOverride(Asm);
+    (Asm->*(Emitter.GPRAddr))(Ty, VarReg,
+                              Mem->toAsmAddress(Asm, Target, IsLea));
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
+    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger64>(Src)) {
+    assert(Traits::Is64Bit);
+    assert(Utils::IsInt(32, Imm->getValue()));
+    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
+  } else if (const auto *Reloc = llvm::dyn_cast<ConstantRelocatable>(Src)) {
+    const auto FixupKind = (Reloc->getName().hasStdString() &&
+                            Reloc->getName().toString() == GlobalOffsetTable)
+                               ? Traits::FK_GotPC
+                               : Traits::TargetLowering::getAbsFixup();
+    AssemblerFixup *Fixup = Asm->createFixup(FixupKind, Reloc);
+    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Fixup));
+  } else if (const auto *Split = llvm::dyn_cast<VariableSplit>(Src)) {
+    (Asm->*(Emitter.GPRAddr))(Ty, VarReg, Split->toAsmAddress(Func));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
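The relocation choice buried in that chain depends only on the symbol name; a hypothetical distillation (the literal symbol name and the real fixup kinds come from Traits and are assumed here):

    #include <string>

    enum class FixupKind { GotPC, Abs };

    // Only references to the special _GLOBAL_OFFSET_TABLE_ symbol take the
    // PC-relative GOT fixup; every other relocatable gets the absolute one.
    FixupKind pickFixupKind(const std::string &SymName) {
      return SymName == "_GLOBAL_OFFSET_TABLE_" ? FixupKind::GotPC
                                                : FixupKind::Abs;
    }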
+void emitIASAddrOpTyGPR(const Cfg *Func, Type Ty, const Address &Addr,
+                        const Operand *Src, const GPREmitterAddrOp &Emitter) {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  // Src can only be Reg or AssemblerImmediate.
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    assert(SrcVar->hasReg());
+    GPRRegister SrcReg = Traits::getEncodedGPR(SrcVar->getRegNum());
+    (Asm->*(Emitter.AddrGPR))(Ty, Addr, SrcReg);
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
+    (Asm->*(Emitter.AddrImm))(Ty, Addr, AssemblerImmediate(Imm->getValue()));
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger64>(Src)) {
+    assert(Traits::Is64Bit);
+    assert(Utils::IsInt(32, Imm->getValue()));
+    (Asm->*(Emitter.AddrImm))(Ty, Addr, AssemblerImmediate(Imm->getValue()));
+  } else if (const auto *Reloc = llvm::dyn_cast<ConstantRelocatable>(Src)) {
+    const auto FixupKind = (Reloc->getName().hasStdString() &&
+                            Reloc->getName().toString() == GlobalOffsetTable)
+                               ? Traits::FK_GotPC
+                               : Traits::TargetLowering::getAbsFixup();
+    AssemblerFixup *Fixup = Asm->createFixup(FixupKind, Reloc);
+    (Asm->*(Emitter.AddrImm))(Ty, Addr, AssemblerImmediate(Fixup));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void emitIASAsAddrOpTyGPR(const Cfg *Func, Type Ty, const Operand *Op0,
+                          const Operand *Op1, const GPREmitterAddrOp &Emitter) {
+  auto *Target = InstX86Base::getTarget(Func);
+  if (const auto *Op0Var = llvm::dyn_cast<Variable>(Op0)) {
+    assert(!Op0Var->hasReg());
+    Address StackAddr(Target->stackVarToAsmOperand(Op0Var));
+    emitIASAddrOpTyGPR(Func, Ty, StackAddr, Op1, Emitter);
+  } else if (const auto *Op0Mem = llvm::dyn_cast<X86OperandMem>(Op0)) {
+    Assembler *Asm = Func->getAssembler<Assembler>();
+    Op0Mem->emitSegmentOverride(Asm);
+    emitIASAddrOpTyGPR(Func, Ty, Op0Mem->toAsmAddress(Asm, Target), Op1,
+                       Emitter);
+  } else if (const auto *Split = llvm::dyn_cast<VariableSplit>(Op0)) {
+    emitIASAddrOpTyGPR(Func, Ty, Split->toAsmAddress(Func), Op1, Emitter);
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void emitIASGPRShift(const Cfg *Func, Type Ty, const Variable *Var,
+                     const Operand *Src, const GPREmitterShiftOp &Emitter) {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  // Technically, the Dest Var can be mem as well, but we only use Reg. We can
+  // extend this to check Dest if we decide to use that form.
+  assert(Var->hasReg());
+  // We cheat a little and use GPRRegister even for byte operations.
+  GPRRegister VarReg = Traits::getEncodedGPR(Var->getRegNum());
+  // Src must be the count register CL (encoded like ECX) or an Imm8. This is
+  // asserted by the assembler.
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    assert(SrcVar->hasReg());
+    GPRRegister SrcReg = Traits::getEncodedGPR(SrcVar->getRegNum());
+    (Asm->*(Emitter.GPRGPR))(Ty, VarReg, SrcReg);
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
+    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger64>(Src)) {
+    assert(Traits::Is64Bit);
+    assert(Utils::IsInt(32, Imm->getValue()));
+    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void emitIASGPRShiftDouble(const Cfg *Func, const Variable *Dest,
+                           const Operand *Src1Op, const Operand *Src2Op,
+                           const GPREmitterShiftD &Emitter) {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  // Dest can be reg or mem, but we only use the reg variant.
+  assert(Dest->hasReg());
+  GPRRegister DestReg = Traits::getEncodedGPR(Dest->getRegNum());
+  // SrcVar1 must be reg.
+  const auto *SrcVar1 = llvm::cast<Variable>(Src1Op);
+  assert(SrcVar1->hasReg());
+  GPRRegister SrcReg = Traits::getEncodedGPR(SrcVar1->getRegNum());
+  Type Ty = SrcVar1->getType();
+  // Src2 can be the implicit CL register or an immediate.
+  if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src2Op)) {
+    (Asm->*(Emitter.GPRGPRImm))(Ty, DestReg, SrcReg,
+                                AssemblerImmediate(Imm->getValue()));
+  } else {
+    assert(llvm::cast<Variable>(Src2Op)->getRegNum() == RegisterSet::Reg_cl);
+    (Asm->*(Emitter.GPRGPR))(Ty, DestReg, SrcReg);
+  }
+}
+
+void emitIASXmmShift(const Cfg *Func, Type Ty, const Variable *Var,
+                     const Operand *Src, const XmmEmitterShiftOp &Emitter) {
+  auto *Target = InstX86Base::getTarget(Func);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(Var->hasReg());
+  XmmRegister VarReg = Traits::getEncodedXmm(Var->getRegNum());
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    if (SrcVar->hasReg()) {
+      XmmRegister SrcReg = Traits::getEncodedXmm(SrcVar->getRegNum());
+      (Asm->*(Emitter.XmmXmm))(Ty, VarReg, SrcReg);
+    } else {
+      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
+      (Asm->*(Emitter.XmmAddr))(Ty, VarReg, SrcStackAddr);
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
+    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+    (Asm->*(Emitter.XmmAddr))(Ty, VarReg, Mem->toAsmAddress(Asm, Target));
+  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
+    (Asm->*(Emitter.XmmImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void emitIASRegOpTyXMM(const Cfg *Func, Type Ty, const Variable *Var,
+                       const Operand *Src, const XmmEmitterRegOp &Emitter) {
+  auto *Target = InstX86Base::getTarget(Func);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(Var->hasReg());
+  XmmRegister VarReg = Traits::getEncodedXmm(Var->getRegNum());
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    if (SrcVar->hasReg()) {
+      XmmRegister SrcReg = Traits::getEncodedXmm(SrcVar->getRegNum());
+      (Asm->*(Emitter.XmmXmm))(Ty, VarReg, SrcReg);
+    } else {
+      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
+      (Asm->*(Emitter.XmmAddr))(Ty, VarReg, SrcStackAddr);
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
+    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+    (Asm->*(Emitter.XmmAddr))(Ty, VarReg, Mem->toAsmAddress(Asm, Target));
+  } else if (const auto *Imm = llvm::dyn_cast<Constant>(Src)) {
+    (Asm->*(Emitter.XmmAddr))(Ty, VarReg,
+                              Traits::Address::ofConstPool(Asm, Imm));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+template <typename DReg_t, typename SReg_t, DReg_t (*destEnc)(RegNumT),
+          SReg_t (*srcEnc)(RegNumT)>
+void emitIASCastRegOp(const Cfg *Func, Type DestTy, const Variable *Dest,
+                      Type SrcTy, const Operand *Src,
+                      const CastEmitterRegOp<DReg_t, SReg_t> &Emitter) {
+  auto *Target = InstX86Base::getTarget(Func);
+  Assembler *Asm = Func->getAssembler<Assembler>();
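+  // Dest must be in a register; Src may be a register, a stack slot, or a
+  // memory operand.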
+  assert(Dest->hasReg());
+  DReg_t DestReg = destEnc(Dest->getRegNum());
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    if (SrcVar->hasReg()) {
+      SReg_t SrcReg = srcEnc(SrcVar->getRegNum());
+      (Asm->*(Emitter.RegReg))(DestTy, DestReg, SrcTy, SrcReg);
+    } else {
+      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
+      (Asm->*(Emitter.RegAddr))(DestTy, DestReg, SrcTy, SrcStackAddr);
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
+    Mem->emitSegmentOverride(Asm);
+    (Asm->*(Emitter.RegAddr))(DestTy, DestReg, SrcTy,
+                              Mem->toAsmAddress(Asm, Target));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+template <typename DReg_t, typename SReg_t, DReg_t (*destEnc)(RegNumT),
+          SReg_t (*srcEnc)(RegNumT)>
+void emitIASThreeOpImmOps(const Cfg *Func, Type DispatchTy,
+                          const Variable *Dest, const Operand *Src0,
+                          const Operand *Src1,
+                          const ThreeOpImmEmitter<DReg_t, SReg_t> Emitter) {
+  auto *Target = InstX86Base::getTarget(Func);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  // This only handles Dest being a register, and Src1 being an immediate.
+  assert(Dest->hasReg());
+  DReg_t DestReg = destEnc(Dest->getRegNum());
+  AssemblerImmediate Imm(llvm::cast<ConstantInteger32>(Src1)->getValue());
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src0)) {
+    if (SrcVar->hasReg()) {
+      SReg_t SrcReg = srcEnc(SrcVar->getRegNum());
+      (Asm->*(Emitter.RegRegImm))(DispatchTy, DestReg, SrcReg, Imm);
+    } else {
+      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
+      (Asm->*(Emitter.RegAddrImm))(DispatchTy, DestReg, SrcStackAddr, Imm);
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src0)) {
+    Mem->emitSegmentOverride(Asm);
+    (Asm->*(Emitter.RegAddrImm))(DispatchTy, DestReg,
+                                 Mem->toAsmAddress(Asm, Target), Imm);
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void emitIASMovlikeXMM(const Cfg *Func, const Variable *Dest,
+                       const Operand *Src, const XmmEmitterMovOps Emitter) {
+  auto *Target = InstX86Base::getTarget(Func);
+  Assembler *Asm = Func->getAssembler<Assembler>();
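+  // At most one side can be in memory: pick the xmm<-xmm, xmm<-addr, or
+  // addr<-xmm form accordingly.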
+  if (Dest->hasReg()) {
+    XmmRegister DestReg = Traits::getEncodedXmm(Dest->getRegNum());
+    if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+      if (SrcVar->hasReg()) {
+        (Asm->*(Emitter.XmmXmm))(DestReg,
+                                 Traits::getEncodedXmm(SrcVar->getRegNum()));
+      } else {
+        Address StackAddr(Target->stackVarToAsmOperand(SrcVar));
+        (Asm->*(Emitter.XmmAddr))(DestReg, StackAddr);
+      }
+    } else if (const auto *SrcMem = llvm::dyn_cast<X86OperandMem>(Src)) {
+      assert(SrcMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+      (Asm->*(Emitter.XmmAddr))(DestReg, SrcMem->toAsmAddress(Asm, Target));
+    } else {
+      llvm_unreachable("Unexpected operand type");
+    }
+  } else {
+    Address StackAddr(Target->stackVarToAsmOperand(Dest));
+    // Src must be a register in this case.
+    const auto *SrcVar = llvm::cast<Variable>(Src);
+    assert(SrcVar->hasReg());
+    (Asm->*(Emitter.AddrXmm))(StackAddr,
+                              Traits::getEncodedXmm(SrcVar->getRegNum()));
+  }
+}
+
+void InstX86Movmsk::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  this->dumpDest(Func);
+  Str << " = movmsk." << this->getSrc(0)->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Movmsk::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  Type SrcTy = this->getSrc(0)->getType();
+  assert(isVectorType(SrcTy));
+  switch (SrcTy) {
+  case IceType_v16i8:
+    Str << "\t"
+           "pmovmskb"
+           "\t";
+    break;
+  case IceType_v4i32:
+  case IceType_v4f32:
+    Str << "\t"
+           "movmskps"
+           "\t";
+    break;
+  default:
+    llvm_unreachable("Unexpected operand type");
+  }
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Movmsk::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 1);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  const Variable *Dest = this->getDest();
+  const Variable *Src = llvm::cast<Variable>(this->getSrc(0));
+  const Type DestTy = Dest->getType();
+  (void)DestTy;
+  const Type SrcTy = Src->getType();
+  assert(isVectorType(SrcTy));
+  assert(isScalarIntegerType(DestTy));
+  if (Traits::Is64Bit) {
+    assert(DestTy == IceType_i32 || DestTy == IceType_i64);
+  } else {
+    assert(typeWidthInBytes(DestTy) <= 4);
+  }
+  XmmRegister SrcReg = Traits::getEncodedXmm(Src->getRegNum());
+  GPRRegister DestReg = Traits::getEncodedGPR(Dest->getRegNum());
+  Asm->movmsk(SrcTy, DestReg, SrcReg);
+}
+
+void InstX86Sqrt::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  Type Ty = this->getSrc(0)->getType();
+  assert(isScalarFloatingType(Ty));
+  Str << "\t"
+         "sqrt"
+      << Traits::TypeAttributes[Ty].SpSdString << "\t";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Div::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 3);
+  Operand *Src1 = this->getSrc(1);
+  Str << "\t" << this->Opcode << this->getWidthString(Src1->getType()) << "\t";
+  Src1->emit(Func);
+}
+
+void InstX86Div::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 3);
+  const Operand *Src = this->getSrc(1);
+  Type Ty = Src->getType();
+  static GPREmitterOneOp Emitter = {&Assembler::div, &Assembler::div};
+  emitIASOpTyGPR(Func, Ty, Src, Emitter);
+}
+
+void InstX86Idiv::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 3);
+  Operand *Src1 = this->getSrc(1);
+  Str << "\t" << this->Opcode << this->getWidthString(Src1->getType()) << "\t";
+  Src1->emit(Func);
+}
+
+void InstX86Idiv::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 3);
+  const Operand *Src = this->getSrc(1);
+  Type Ty = Src->getType();
+  static const GPREmitterOneOp Emitter = {&Assembler::idiv, &Assembler::idiv};
+  emitIASOpTyGPR(Func, Ty, Src, Emitter);
+}
+
+// pblendvb and blendvps take xmm0 as a final implicit argument.
+
+void emitVariableBlendInst(const char *Opcode, const Inst *Instr,
+                           const Cfg *Func) {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(Instr->getSrcSize() == 3);
+  assert(llvm::cast<Variable>(Instr->getSrc(2))->getRegNum() ==
+         RegisterSet::Reg_xmm0);
+  Str << "\t" << Opcode << "\t";
+  Instr->getSrc(1)->emit(Func);
+  Str << ", ";
+  Instr->getDest()->emit(Func);
+}
+
+void emitIASVariableBlendInst(const Inst *Instr, const Cfg *Func,
+                              const XmmEmitterRegOp &Emitter) {
+  assert(Instr->getSrcSize() == 3);
+  assert(llvm::cast<Variable>(Instr->getSrc(2))->getRegNum() ==
+         RegisterSet::Reg_xmm0);
+  const Variable *Dest = Instr->getDest();
+  const Operand *Src = Instr->getSrc(1);
+  emitIASRegOpTyXMM(Func, Dest->getType(), Dest, Src, Emitter);
+}
+
+void InstX86Blendvps::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  emitVariableBlendInst(this->Opcode, this, Func);
+}
+
+void InstX86Blendvps::emitIAS(const Cfg *Func) const {
+  static const XmmEmitterRegOp Emitter = {&Assembler::blendvps,
+                                          &Assembler::blendvps};
+  emitIASVariableBlendInst(this, Func, Emitter);
+}
+
+void InstX86Pblendvb::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  emitVariableBlendInst(this->Opcode, this, Func);
+}
+
+void InstX86Pblendvb::emitIAS(const Cfg *Func) const {
+  static const XmmEmitterRegOp Emitter = {&Assembler::pblendvb,
+                                          &Assembler::pblendvb};
+  emitIASVariableBlendInst(this, Func, Emitter);
+}
+
+void InstX86Imul::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  Variable *Dest = this->getDest();
+  if (isByteSizedArithType(Dest->getType())) {
+    // The 8-bit version of imul only allows the form "imul r/m8".
+    const auto *Src0Var = llvm::dyn_cast<Variable>(this->getSrc(0));
+    (void)Src0Var;
+    assert(Src0Var->getRegNum() == RegisterSet::Reg_al);
+    Str << "\t"
+           "imulb\t";
+    this->getSrc(1)->emit(Func);
+  } else if (llvm::isa<Constant>(this->getSrc(1))) {
+    Str << "\t"
+           "imul"
+        << this->getWidthString(Dest->getType()) << "\t";
+    this->getSrc(1)->emit(Func);
+    Str << ", ";
+    this->getSrc(0)->emit(Func);
+    Str << ", ";
+    Dest->emit(Func);
+  } else {
+    this->emitTwoAddress(Func, this->Opcode);
+  }
+}
+
+void InstX86Imul::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  const Variable *Var = this->getDest();
+  Type Ty = Var->getType();
+  const Operand *Src = this->getSrc(1);
+  if (isByteSizedArithType(Ty)) {
+    // The 8-bit version of imul only allows the form "imul r/m8".
+    const auto *Src0Var = llvm::dyn_cast<Variable>(this->getSrc(0));
+    (void)Src0Var;
+    assert(Src0Var->getRegNum() == RegisterSet::Reg_al);
+    static const GPREmitterOneOp Emitter = {&Assembler::imul, &Assembler::imul};
+    emitIASOpTyGPR(Func, Ty, this->getSrc(1), Emitter);
+  } else {
+    // The two-address form handles all non-byte-sized multiplies; the
+    // destination also serves as the first source.
+    assert(Var == this->getSrc(0));
+    static const GPREmitterRegOp Emitter = {&Assembler::imul, &Assembler::imul,
+                                            &Assembler::imul};
+    constexpr bool NotLea = false;
+    emitIASRegOpTyGPR(Func, NotLea, Ty, Var, Src, Emitter);
+  }
+}
+
+void InstX86ImulImm::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  Variable *Dest = this->getDest();
+  assert(Dest->getType() == IceType_i16 || Dest->getType() == IceType_i32);
+  assert(llvm::isa<Constant>(this->getSrc(1)));
+  Str << "\t"
+         "imul"
+      << this->getWidthString(Dest->getType()) << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  Dest->emit(Func);
+}
+
+void InstX86ImulImm::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  const Variable *Dest = this->getDest();
+  Type Ty = Dest->getType();
+  assert(llvm::isa<Constant>(this->getSrc(1)));
+  static const ThreeOpImmEmitter<GPRRegister, GPRRegister> Emitter = {
+      &Assembler::imul, &Assembler::imul};
+  emitIASThreeOpImmOps<GPRRegister, GPRRegister, Traits::getEncodedGPR,
+                       Traits::getEncodedGPR>(Func, Ty, Dest, this->getSrc(0),
+                                              this->getSrc(1), Emitter);
+}
+
+void InstX86Insertps::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 3);
+  assert(getInstructionSet(Func) >= SSE4_1);
+  const Variable *Dest = this->getDest();
+  assert(Dest == this->getSrc(0));
+  Type Ty = Dest->getType();
+  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
+      &Assembler::insertps, &Assembler::insertps};
+  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
+                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(1),
+                                              this->getSrc(2), Emitter);
+}
+
+void InstX86Cbwdq::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  Operand *Src0 = this->getSrc(0);
+  const auto DestReg = this->getDest()->getRegNum();
+  const auto SrcReg = llvm::cast<Variable>(Src0)->getRegNum();
+  (void)DestReg;
+  (void)SrcReg;
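+  // Note: cbtw, cwtd, and cltd are the AT&T names for cbw, cwd, and cdq.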
+  switch (Src0->getType()) {
+  default:
+    llvm_unreachable("unexpected source type!");
+    break;
+  case IceType_i8:
+    assert(SrcReg == RegisterSet::Reg_al);
+    assert(DestReg == RegisterSet::Reg_ax || DestReg == RegisterSet::Reg_ah);
+    Str << "\t"
+           "cbtw";
+    break;
+  case IceType_i16:
+    assert(SrcReg == RegisterSet::Reg_ax);
+    assert(DestReg == RegisterSet::Reg_dx);
+    Str << "\t"
+           "cwtd";
+    break;
+  case IceType_i32:
+    assert(SrcReg == RegisterSet::Reg_eax);
+    assert(DestReg == RegisterSet::Reg_edx);
+    Str << "\t"
+           "cltd";
+    break;
+  case IceType_i64:
+    assert(Traits::Is64Bit);
+    assert(SrcReg == Traits::getRaxOrDie());
+    assert(DestReg == Traits::getRdxOrDie());
+    Str << "\t"
+           "cqo";
+    break;
+  }
+}
+
+void InstX86Cbwdq::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(this->getSrcSize() == 1);
+  Operand *Src0 = this->getSrc(0);
+  const auto DestReg = this->getDest()->getRegNum();
+  const auto SrcReg = llvm::cast<Variable>(Src0)->getRegNum();
+  (void)DestReg;
+  (void)SrcReg;
+  switch (Src0->getType()) {
+  default:
+    llvm_unreachable("unexpected source type!");
+    break;
+  case IceType_i8:
+    assert(SrcReg == RegisterSet::Reg_al);
+    assert(DestReg == RegisterSet::Reg_ax || DestReg == RegisterSet::Reg_ah);
+    Asm->cbw();
+    break;
+  case IceType_i16:
+    assert(SrcReg == RegisterSet::Reg_ax);
+    assert(DestReg == RegisterSet::Reg_dx);
+    Asm->cwd();
+    break;
+  case IceType_i32:
+    assert(SrcReg == RegisterSet::Reg_eax);
+    assert(DestReg == RegisterSet::Reg_edx);
+    Asm->cdq();
+    break;
+  case IceType_i64:
+    assert(Traits::Is64Bit);
+    assert(SrcReg == Traits::getRaxOrDie());
+    assert(DestReg == Traits::getRdxOrDie());
+    Asm->cqo();
+    break;
+  }
+}
+
+void InstX86Mul::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  assert(llvm::isa<Variable>(this->getSrc(0)));
+  assert(llvm::cast<Variable>(this->getSrc(0))->getRegNum() ==
+         RegisterSet::Reg_eax);
+  // TODO: allow edx?
+  assert(this->getDest()->getRegNum() == RegisterSet::Reg_eax);
+  Str << "\t"
+         "mul"
+      << this->getWidthString(this->getDest()->getType()) << "\t";
+  this->getSrc(1)->emit(Func);
+}
+
+void InstX86Mul::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  assert(llvm::isa<Variable>(this->getSrc(0)));
+  assert(llvm::cast<Variable>(this->getSrc(0))->getRegNum() ==
+         RegisterSet::Reg_eax);
+  // TODO: allow edx?
+  assert(this->getDest()->getRegNum() == RegisterSet::Reg_eax);
+  const Operand *Src = this->getSrc(1);
+  Type Ty = Src->getType();
+  static const GPREmitterOneOp Emitter = {&Assembler::mul, &Assembler::mul};
+  emitIASOpTyGPR(Func, Ty, Src, Emitter);
+}
+
+void InstX86Mul::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  this->dumpDest(Func);
+  Str << " = mul." << this->getDest()->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Shld::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Variable *Dest = this->getDest();
+  assert(this->getSrcSize() == 3);
+  assert(Dest == this->getSrc(0));
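+  // shld shifts Dest left by the Src2 count (CL or an imm8), filling the
+  // vacated low-order bits from the high-order bits of Src1.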
+  Str << "\t"
+         "shld"
+      << this->getWidthString(Dest->getType()) << "\t";
+  this->getSrc(2)->emit(Func);
+  Str << ", ";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  Dest->emit(Func);
+}
+
+void InstX86Shld::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 3);
+  assert(this->getDest() == this->getSrc(0));
+  const Variable *Dest = this->getDest();
+  const Operand *Src1 = this->getSrc(1);
+  const Operand *Src2 = this->getSrc(2);
+  static const GPREmitterShiftD Emitter = {&Assembler::shld, &Assembler::shld};
+  emitIASGPRShiftDouble(Func, Dest, Src1, Src2, Emitter);
+}
+
+void InstX86Shld::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  this->dumpDest(Func);
+  Str << " = shld." << this->getDest()->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Shrd::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Variable *Dest = this->getDest();
+  assert(this->getSrcSize() == 3);
+  assert(Dest == this->getSrc(0));
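+  // shrd shifts Dest right by the Src2 count (CL or an imm8), filling the
+  // vacated high-order bits from the low-order bits of Src1.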
+  Str << "\t"
+         "shrd"
+      << this->getWidthString(Dest->getType()) << "\t";
+  this->getSrc(2)->emit(Func);
+  Str << ", ";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  Dest->emit(Func);
+}
+
+void InstX86Shrd::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 3);
+  assert(this->getDest() == this->getSrc(0));
+  const Variable *Dest = this->getDest();
+  const Operand *Src1 = this->getSrc(1);
+  const Operand *Src2 = this->getSrc(2);
+  static const GPREmitterShiftD Emitter = {&Assembler::shrd, &Assembler::shrd};
+  emitIASGPRShiftDouble(Func, Dest, Src1, Src2, Emitter);
+}
+
+void InstX86Shrd::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  this->dumpDest(Func);
+  Str << " = shrd." << this->getDest()->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Cmov::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Variable *Dest = this->getDest();
+  Str << "\t";
+  assert(Condition != Cond::Br_None);
+  assert(this->getDest()->hasReg());
+  Str << "cmov" << Traits::InstBrAttributes[Condition].DisplayString
+      << this->getWidthString(Dest->getType()) << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  Dest->emit(Func);
+}
+
+void InstX86Cmov::emitIAS(const Cfg *Func) const {
+  assert(Condition != Cond::Br_None);
+  assert(this->getDest()->hasReg());
+  assert(this->getSrcSize() == 2);
+  Operand *Src = this->getSrc(1);
+  Type SrcTy = Src->getType();
+  assert(SrcTy == IceType_i16 || SrcTy == IceType_i32 ||
+         (Traits::Is64Bit && SrcTy == IceType_i64));
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  auto *Target = InstX86Base::getTarget(Func);
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    if (SrcVar->hasReg()) {
+      Asm->cmov(SrcTy, Condition,
+                Traits::getEncodedGPR(this->getDest()->getRegNum()),
+                Traits::getEncodedGPR(SrcVar->getRegNum()));
+    } else {
+      Asm->cmov(SrcTy, Condition,
+                Traits::getEncodedGPR(this->getDest()->getRegNum()),
+                Target->stackVarToAsmOperand(SrcVar));
+    }
+  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
+    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+    Asm->cmov(SrcTy, Condition,
+              Traits::getEncodedGPR(this->getDest()->getRegNum()),
+              Mem->toAsmAddress(Asm, Target));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void InstX86Cmov::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "cmov" << Traits::InstBrAttributes[Condition].DisplayString << ".";
+  Str << this->getDest()->getType() << " ";
+  this->dumpDest(Func);
+  Str << ", ";
+  this->dumpSources(Func);
+}
+
+void InstX86Cmpps::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  assert(Condition < Cond::Cmpps_Invalid);
+  Type DestTy = this->Dest->getType();
+  Str << "\t"
+         "cmp"
+      << Traits::InstCmppsAttributes[Condition].EmitString
+      << Traits::TypeAttributes[DestTy].PdPsString << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Cmpps::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(this->getSrcSize() == 2);
+  assert(Condition < Cond::Cmpps_Invalid);
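+  // The assembler encodes Condition as the trailing imm8 predicate of cmpps.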
+  // Assuming there isn't any load folding for cmpps, and vector constants are
+  // not allowed in PNaCl.
+  assert(llvm::isa<Variable>(this->getSrc(1)));
+  auto *Target = InstX86Base::getTarget(Func);
+  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(1));
+  if (SrcVar->hasReg()) {
+    Asm->cmpps(this->getDest()->getType(),
+               Traits::getEncodedXmm(this->getDest()->getRegNum()),
+               Traits::getEncodedXmm(SrcVar->getRegNum()), Condition);
+  } else {
+    Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
+    Asm->cmpps(this->getDest()->getType(),
+               Traits::getEncodedXmm(this->getDest()->getRegNum()),
+               SrcStackAddr, Condition);
+  }
+}
+
+void InstX86Cmpps::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  assert(Condition < Cond::Cmpps_Invalid);
+  this->dumpDest(Func);
+  Str << " = cmp" << Traits::InstCmppsAttributes[Condition].EmitString
+      << "ps"
+         "\t";
+  this->dumpSources(Func);
+}
+
+void InstX86Cmpxchg::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 3);
+  if (this->Locked) {
+    Str << "\t"
+           "lock";
+  }
+  Str << "\t"
+         "cmpxchg"
+      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
+  this->getSrc(2)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+}
+
+void InstX86Cmpxchg::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 3);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Type Ty = this->getSrc(0)->getType();
+  auto *Target = InstX86Base::getTarget(Func);
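+  // getSrc(1) carries the implicit eax compare operand (modeled as a source
+  // for liveness); only the address and the desired-value register are
+  // encoded explicitly.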
+  const auto *Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
+  assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+  const Address Addr = Mem->toAsmAddress(Asm, Target);
+  const auto *VarReg = llvm::cast<Variable>(this->getSrc(2));
+  assert(VarReg->hasReg());
+  const GPRRegister Reg = Traits::getEncodedGPR(VarReg->getRegNum());
+  Asm->cmpxchg(Ty, Addr, Reg, this->Locked);
+}
+
+void InstX86Cmpxchg::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  if (this->Locked) {
+    Str << "lock ";
+  }
+  Str << "cmpxchg." << this->getSrc(0)->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Cmpxchg8b::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 5);
+  if (this->Locked) {
+    Str << "\t"
+           "lock";
+  }
+  Str << "\t"
+         "cmpxchg8b\t";
+  this->getSrc(0)->emit(Func);
+}
+
+void InstX86Cmpxchg8b::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 5);
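+  // Srcs 1-4 carry the implicit edx:eax and ecx:ebx register pairs (modeled
+  // as sources for liveness); only the memory operand is encoded explicitly.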
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  const auto *Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
+  assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+  auto *Target = InstX86Base::getTarget(Func);
+  const Address Addr = Mem->toAsmAddress(Asm, Target);
+  Asm->cmpxchg8b(Addr, this->Locked);
+}
+
+void InstX86Cmpxchg8b::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  if (this->Locked) {
+    Str << "lock ";
+  }
+  Str << "cmpxchg8b ";
+  this->dumpSources(Func);
+}
+
+void InstX86Cvt::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  Str << "\t"
+         "cvt";
+  if (isTruncating())
+    Str << "t";
+  Str << Traits::TypeAttributes[this->getSrc(0)->getType()].CvtString << "2"
+      << Traits::TypeAttributes[this->getDest()->getType()].CvtString << "\t";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Cvt::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 1);
+  const Variable *Dest = this->getDest();
+  const Operand *Src = this->getSrc(0);
+  Type DestTy = Dest->getType();
+  Type SrcTy = Src->getType();
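+  // The "T" variants (Tss2si, Tps2dq) truncate toward zero; the others use
+  // the current rounding mode.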
+  switch (Variant) {
+  case Si2ss: {
+    assert(isScalarIntegerType(SrcTy));
+    if (!Traits::Is64Bit) {
+      assert(typeWidthInBytes(SrcTy) <= 4);
+    } else {
+      assert(SrcTy == IceType_i32 || SrcTy == IceType_i64);
+    }
+    assert(isScalarFloatingType(DestTy));
+    static const CastEmitterRegOp<XmmRegister, GPRRegister> Emitter = {
+        &Assembler::cvtsi2ss, &Assembler::cvtsi2ss};
+    emitIASCastRegOp<XmmRegister, GPRRegister, Traits::getEncodedXmm,
+                     Traits::getEncodedGPR>(Func, DestTy, Dest, SrcTy, Src,
+                                            Emitter);
+    return;
+  }
+  case Tss2si: {
+    assert(isScalarFloatingType(SrcTy));
+    assert(isScalarIntegerType(DestTy));
+    if (Traits::Is64Bit) {
+      assert(DestTy == IceType_i32 || DestTy == IceType_i64);
+    } else {
+      assert(typeWidthInBytes(DestTy) <= 4);
+    }
+    static const CastEmitterRegOp<GPRRegister, XmmRegister> Emitter = {
+        &Assembler::cvttss2si, &Assembler::cvttss2si};
+    emitIASCastRegOp<GPRRegister, XmmRegister, Traits::getEncodedGPR,
+                     Traits::getEncodedXmm>(Func, DestTy, Dest, SrcTy, Src,
+                                            Emitter);
+    return;
+  }
+  case Ss2si: {
+    assert(isScalarFloatingType(SrcTy));
+    assert(isScalarIntegerType(DestTy));
+    if (Traits::Is64Bit) {
+      assert(DestTy == IceType_i32 || DestTy == IceType_i64);
+    } else {
+      assert(typeWidthInBytes(DestTy) <= 4);
+    }
+    static const CastEmitterRegOp<GPRRegister, XmmRegister> Emitter = {
+        &Assembler::cvtss2si, &Assembler::cvtss2si};
+    emitIASCastRegOp<GPRRegister, XmmRegister, Traits::getEncodedGPR,
+                     Traits::getEncodedXmm>(Func, DestTy, Dest, SrcTy, Src,
+                                            Emitter);
+    return;
+  }
+  case Float2float: {
+    assert(isScalarFloatingType(SrcTy));
+    assert(isScalarFloatingType(DestTy));
+    assert(DestTy != SrcTy);
+    static const XmmEmitterRegOp Emitter = {&Assembler::cvtfloat2float,
+                                            &Assembler::cvtfloat2float};
+    emitIASRegOpTyXMM(Func, SrcTy, Dest, Src, Emitter);
+    return;
+  }
+  case Dq2ps: {
+    assert(isVectorIntegerType(SrcTy));
+    assert(isVectorFloatingType(DestTy));
+    static const XmmEmitterRegOp Emitter = {&Assembler::cvtdq2ps,
+                                            &Assembler::cvtdq2ps};
+    emitIASRegOpTyXMM(Func, DestTy, Dest, Src, Emitter);
+    return;
+  }
+  case Tps2dq: {
+    assert(isVectorFloatingType(SrcTy));
+    assert(isVectorIntegerType(DestTy));
+    static const XmmEmitterRegOp Emitter = {&Assembler::cvttps2dq,
+                                            &Assembler::cvttps2dq};
+    emitIASRegOpTyXMM(Func, DestTy, Dest, Src, Emitter);
+    return;
+  }
+  case Ps2dq: {
+    assert(isVectorFloatingType(SrcTy));
+    assert(isVectorIntegerType(DestTy));
+    static const XmmEmitterRegOp Emitter = {&Assembler::cvtps2dq,
+                                            &Assembler::cvtps2dq};
+    emitIASRegOpTyXMM(Func, DestTy, Dest, Src, Emitter);
+    return;
+  }
+  }
+}
+
+void InstX86Cvt::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  this->dumpDest(Func);
+  Str << " = cvt";
+  if (isTruncating())
+    Str << "t";
+  Str << Traits::TypeAttributes[this->getSrc(0)->getType()].CvtString << "2"
+      << Traits::TypeAttributes[this->getDest()->getType()].CvtString << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Round::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 3);
+  Str << "\t" << this->Opcode
+      << Traits::TypeAttributes[this->getDest()->getType()].SpSdString << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Round::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  assert(getInstructionSet(Func) >= SSE4_1);
+  const Variable *Dest = this->getDest();
+  Type Ty = Dest->getType();
+  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
+      &Assembler::round, &Assembler::round};
+  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
+                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(0),
+                                              this->getSrc(1), Emitter);
+}
+
+void InstX86Icmp::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  Str << "\t"
+         "cmp"
+      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+}
+
+void InstX86Icmp::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  const Operand *Src0 = this->getSrc(0);
+  const Operand *Src1 = this->getSrc(1);
+  Type Ty = Src0->getType();
+  static const GPREmitterRegOp RegEmitter = {&Assembler::cmp, &Assembler::cmp,
+                                             &Assembler::cmp};
+  static const GPREmitterAddrOp AddrEmitter = {&Assembler::cmp,
+                                               &Assembler::cmp};
+  if (const auto *SrcVar0 = llvm::dyn_cast<Variable>(Src0)) {
+    if (SrcVar0->hasReg()) {
+      constexpr bool NotLea = false;
+      emitIASRegOpTyGPR(Func, NotLea, Ty, SrcVar0, Src1, RegEmitter);
+      return;
+    }
+  }
+  emitIASAsAddrOpTyGPR(Func, Ty, Src0, Src1, AddrEmitter);
+}
+
+void InstX86Icmp::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "cmp." << this->getSrc(0)->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Ucomiss::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  Str << "\t"
+         "ucomi"
+      << Traits::TypeAttributes[this->getSrc(0)->getType()].SdSsString << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+}
+
+void InstX86Ucomiss::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  // Currently src0 is always a variable by convention, to avoid having two
+  // memory operands.
+  assert(llvm::isa<Variable>(this->getSrc(0)));
+  const auto *Src0Var = llvm::cast<Variable>(this->getSrc(0));
+  Type Ty = Src0Var->getType();
+  static const XmmEmitterRegOp Emitter = {&Assembler::ucomiss,
+                                          &Assembler::ucomiss};
+  emitIASRegOpTyXMM(Func, Ty, Src0Var, this->getSrc(1), Emitter);
+}
+
+void InstX86Ucomiss::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "ucomiss." << this->getSrc(0)->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86UD2::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 0);
+  Str << "\t"
+         "ud2";
+}
+
+void InstX86UD2::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Asm->ud2();
+}
+
+void InstX86UD2::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "ud2";
+}
+
+void InstX86Int3::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 0);
+  Str << "\t"
+         "int 3";
+}
+
+void InstX86Int3::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Asm->int3();
+}
+
+void InstX86Int3::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "int 3";
+}
+
+void InstX86Test::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  Str << "\t"
+         "test"
+      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+}
+
+void InstX86Test::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  const Operand *Src0 = this->getSrc(0);
+  const Operand *Src1 = this->getSrc(1);
+  Type Ty = Src0->getType();
+  // The Reg/Addr form of test is not encodeable.
+  static const GPREmitterRegOp RegEmitter = {&Assembler::test, nullptr,
+                                             &Assembler::test};
+  static const GPREmitterAddrOp AddrEmitter = {&Assembler::test,
+                                               &Assembler::test};
+  if (const auto *SrcVar0 = llvm::dyn_cast<Variable>(Src0)) {
+    if (SrcVar0->hasReg()) {
+      constexpr bool NotLea = false;
+      emitIASRegOpTyGPR(Func, NotLea, Ty, SrcVar0, Src1, RegEmitter);
+      return;
+    }
+  }
+  emitIASAsAddrOpTyGPR(Func, Ty, Src0, Src1, AddrEmitter);
+}
+
+void InstX86Test::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "test." << this->getSrc(0)->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Mfence::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 0);
+  Str << "\t"
+         "mfence";
+}
+
+void InstX86Mfence::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Asm->mfence();
+}
+
+void InstX86Mfence::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "mfence";
+}
+
+void InstX86Store::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  Type Ty = this->getSrc(0)->getType();
+  Str << "\t"
+         "mov"
+      << this->getWidthString(Ty) << Traits::TypeAttributes[Ty].SdSsString
+      << "\t";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getSrc(1)->emit(Func);
+}
+
+void InstX86Store::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  const Operand *Dest = this->getSrc(1);
+  const Operand *Src = this->getSrc(0);
+  Type DestTy = Dest->getType();
+  if (isScalarFloatingType(DestTy)) {
+    // Src must be a register, since Dest is a Mem operand of some kind.
+    const auto *SrcVar = llvm::cast<Variable>(Src);
+    assert(SrcVar->hasReg());
+    XmmRegister SrcReg = Traits::getEncodedXmm(SrcVar->getRegNum());
+    Assembler *Asm = Func->getAssembler<Assembler>();
+    auto *Target = InstX86Base::getTarget(Func);
+    if (const auto *DestVar = llvm::dyn_cast<Variable>(Dest)) {
+      assert(!DestVar->hasReg());
+      Address StackAddr(Target->stackVarToAsmOperand(DestVar));
+      Asm->movss(DestTy, StackAddr, SrcReg);
+    } else {
+      const auto *DestMem = llvm::cast<X86OperandMem>(Dest);
+      assert(DestMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+      Asm->movss(DestTy, DestMem->toAsmAddress(Asm, Target), SrcReg);
+    }
+    return;
+  }
+  assert(isScalarIntegerType(DestTy));
+  static const GPREmitterAddrOp GPRAddrEmitter = {&Assembler::mov,
+                                                  &Assembler::mov};
+  emitIASAsAddrOpTyGPR(Func, DestTy, Dest, Src, GPRAddrEmitter);
+}
+
+void InstX86Store::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "mov." << this->getSrc(0)->getType() << " ";
+  this->getSrc(1)->dump(Func);
+  Str << ", ";
+  this->getSrc(0)->dump(Func);
+}
+
+void InstX86StoreP::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  assert(isVectorType(this->getSrc(1)->getType()));
+  Str << "\t"
+         "movups\t";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getSrc(1)->emit(Func);
+}
+
+void InstX86StoreP::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(this->getSrcSize() == 2);
+  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(0));
+  const auto *DestMem = llvm::cast<X86OperandMem>(this->getSrc(1));
+  assert(DestMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+  assert(SrcVar->hasReg());
+  auto *Target = InstX86Base::getTarget(Func);
+  Asm->movups(DestMem->toAsmAddress(Asm, Target),
+              Traits::getEncodedXmm(SrcVar->getRegNum()));
+}
+
+void InstX86StoreP::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "storep." << this->getSrc(0)->getType() << " ";
+  this->getSrc(1)->dump(Func);
+  Str << ", ";
+  this->getSrc(0)->dump(Func);
+}
+
+void InstX86StoreQ::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  assert(this->getSrc(1)->getType() == IceType_i64 ||
+         this->getSrc(1)->getType() == IceType_f64 ||
+         isVectorType(this->getSrc(1)->getType()));
+  Str << "\t"
+         "movq\t";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getSrc(1)->emit(Func);
+}
+
+void InstX86StoreQ::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(this->getSrcSize() == 2);
+  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(0));
+  const auto *DestMem = llvm::cast<X86OperandMem>(this->getSrc(1));
+  assert(DestMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+  assert(SrcVar->hasReg());
+  auto *Target = InstX86Base::getTarget(Func);
+  Asm->movq(DestMem->toAsmAddress(Asm, Target),
+            Traits::getEncodedXmm(SrcVar->getRegNum()));
+}
+
+void InstX86StoreQ::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "storeq." << this->getSrc(0)->getType() << " ";
+  this->getSrc(1)->dump(Func);
+  Str << ", ";
+  this->getSrc(0)->dump(Func);
+}
+
+void InstX86StoreD::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  assert(this->getSrc(1)->getType() == IceType_i64 ||
+         this->getSrc(1)->getType() == IceType_f64 ||
+         isVectorType(this->getSrc(1)->getType()));
+  Str << "\t"
+         "movd\t";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getSrc(1)->emit(Func);
+}
+
+void InstX86StoreD::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(this->getSrcSize() == 2);
+  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(0));
+  const auto *DestMem = llvm::cast<X86OperandMem>(this->getSrc(1));
+  assert(DestMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+  assert(SrcVar->hasReg());
+  auto *Target = InstX86Base::getTarget(Func);
+  Asm->movd(SrcVar->getType(), DestMem->toAsmAddress(Asm, Target),
+            Traits::getEncodedXmm(SrcVar->getRegNum()));
+}
+
+void InstX86StoreD::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "stored." << this->getSrc(0)->getType() << " ";
+  this->getSrc(1)->dump(Func);
+  Str << ", ";
+  this->getSrc(0)->dump(Func);
+}
+
+void InstX86Lea::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  if (auto *Add = this->deoptToAddOrNull(Func)) {
+    Add->emit(Func);
+    return;
+  }
+
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  assert(this->getDest()->hasReg());
+  Str << "\t"
+         "lea"
+      << this->getWidthString(this->getDest()->getType()) << "\t";
+  Operand *Src0 = this->getSrc(0);
+  if (const auto *Src0Var = llvm::dyn_cast<Variable>(Src0)) {
+    Type Ty = Src0Var->getType();
+    // lea on x86-32 doesn't accept mem128 operands, so cast Src0Var to an
+    // acceptable type.
+    Src0Var->asType(Func, isVectorType(Ty) ? IceType_i32 : Ty, RegNumT())
+        ->emit(Func);
+  } else {
+    Src0->emit(Func);
+  }
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Lea::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 1);
+  const Variable *Var = this->getDest();
+  Type Ty = Var->getType();
+  const Operand *Src = this->getSrc(0);
+  constexpr bool IsLea = true;
+
+  if (auto *Add = this->deoptToAddOrNull(Func)) {
+    Add->emitIAS(Func);
+    return;
+  }
+
+  emitIASRegOpTyGPR(Func, IsLea, Ty, Var, Src, Emitter);
+}
+
+Inst *InstX86Lea::deoptToAddOrNull(const Cfg *Func) const {
+  // Revert to an Add when the Lea is effectively a 2-address instruction.
+  // The caller has to emit it; this only creates the add instruction.
+  if (auto *MemOp = llvm::dyn_cast<X86OperandMem>(this->getSrc(0))) {
+    if (getFlags().getAggressiveLea() &&
+        MemOp->getBase()->getRegNum() == this->getDest()->getRegNum() &&
+        MemOp->getIndex() == nullptr && MemOp->getShift() == 0) {
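+      // With Dest == Base, no index, and no shift, "lea off(%reg), %reg" is
+      // exactly "add $off, %reg".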
+      auto *Add = InstX86Add::create(const_cast<Cfg *>(Func), this->getDest(),
+                                     MemOp->getOffset());
+      // TODO(manasijm): Remove const_cast by emitting code for add
+      // directly.
+      return Add;
+    }
+  }
+  return nullptr;
+}
+
+void InstX86Mov::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  Operand *Src = this->getSrc(0);
+  Type SrcTy = Src->getType();
+  Type DestTy = this->getDest()->getType();
+  if (Traits::Is64Bit && DestTy == IceType_i64 &&
+      llvm::isa<ConstantInteger64>(Src) &&
+      !Utils::IsInt(32, llvm::cast<ConstantInteger64>(Src)->getValue())) {
+    Str << "\t"
+           "movabs"
+           "\t";
+  } else {
+    Str << "\t"
+           "mov"
+        << (!isScalarFloatingType(DestTy)
+                ? this->getWidthString(DestTy)
+                : Traits::TypeAttributes[DestTy].SdSsString)
+        << "\t";
+  }
+  // For an integer truncation operation, src is wider than dest. In this case,
+  // we use a mov instruction whose data width matches the narrower dest.
+  // TODO: This assert disallows usages such as copying a floating
+  // point value between a vector and a scalar (which movss is used for). Clean
+  // this up.
+  assert(InstX86Base::getTarget(Func)->typeWidthInBytesOnStack(DestTy) ==
+         InstX86Base::getTarget(Func)->typeWidthInBytesOnStack(SrcTy));
+  const Operand *NewSrc = Src;
+  if (auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    RegNumT NewRegNum;
+    if (SrcVar->hasReg())
+      NewRegNum = Traits::getGprForType(DestTy, SrcVar->getRegNum());
+    if (SrcTy != DestTy)
+      NewSrc = SrcVar->asType(Func, DestTy, NewRegNum);
+  }
+  NewSrc->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Mov::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 1);
+  const Variable *Dest = this->getDest();
+  const Operand *Src = this->getSrc(0);
+  Type DestTy = Dest->getType();
+  Type SrcTy = Src->getType();
+  // Mov can be used for GPRs or XMM registers. Also, the type does not
+  // necessarily match (Mov can be used for bitcasts). However, when the type
+  // does not match, one of the operands must be a register. Thus, the strategy
+  // is to find out if Src or Dest are a register, then use that register's
+  // type to decide on which emitter set to use. The emitter set will include
+  // reg-reg movs, but that case should be unused when the types don't match.
+  static const XmmEmitterRegOp XmmRegEmitter = {&Assembler::movss,
+                                                &Assembler::movss};
+  static const GPREmitterRegOp GPRRegEmitter = {
+      &Assembler::mov, &Assembler::mov, &Assembler::mov};
+  static const GPREmitterAddrOp GPRAddrEmitter = {&Assembler::mov,
+                                                  &Assembler::mov};
+  // For an integer truncation operation, src is wider than dest. In this case,
+  // we use a mov instruction whose data width matches the narrower dest.
+  // TODO: This assert disallows usages such as copying a floating
+  // point value between a vector and a scalar (which movss is used for). Clean
+  // this up.
+  auto *Target = InstX86Base::getTarget(Func);
+  assert(Target->typeWidthInBytesOnStack(this->getDest()->getType()) ==
+         Target->typeWidthInBytesOnStack(Src->getType()));
+  if (Dest->hasReg()) {
+    if (isScalarFloatingType(DestTy)) {
+      emitIASRegOpTyXMM(Func, DestTy, Dest, Src, XmmRegEmitter);
+      return;
+    } else {
+      assert(isScalarIntegerType(DestTy));
+      // Widen DestTy for truncation (see above note). We should only do this
+      // when both Src and Dest are integer types.
+      if (Traits::Is64Bit && DestTy == IceType_i64) {
+        if (const auto *C64 = llvm::dyn_cast<ConstantInteger64>(Src)) {
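+          // Only movabs can carry a full 64-bit immediate; plain mov takes
+          // at most a sign-extended imm32.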
+          Func->getAssembler<Assembler>()->movabs(
+              Traits::getEncodedGPR(Dest->getRegNum()), C64->getValue());
+          return;
+        }
+      }
+      if (isScalarIntegerType(SrcTy)) {
+        SrcTy = DestTy;
+      }
+      constexpr bool NotLea = false;
+      emitIASRegOpTyGPR(Func, NotLea, DestTy, Dest, Src, GPRRegEmitter);
+      return;
+    }
+  } else {
+    // Dest must be Stack and Src *could* be a register. Use Src's type to
+    // decide on the emitters.
+    Address StackAddr(Target->stackVarToAsmOperand(Dest));
+    if (isScalarFloatingType(SrcTy)) {
+      // Src must be a register.
+      const auto *SrcVar = llvm::cast<Variable>(Src);
+      assert(SrcVar->hasReg());
+      Assembler *Asm = Func->getAssembler<Assembler>();
+      Asm->movss(SrcTy, StackAddr, Traits::getEncodedXmm(SrcVar->getRegNum()));
+      return;
+    } else if (isVectorType(SrcTy)) {
+      // Src must be a register
+      const auto *SrcVar = llvm::cast<Variable>(Src);
+      assert(SrcVar->hasReg());
+      Assembler *Asm = Func->getAssembler<Assembler>();
+      Asm->movups(StackAddr, Traits::getEncodedXmm(SrcVar->getRegNum()));
+    } else {
+      // Src can be a register or immediate.
+      assert(isScalarIntegerType(SrcTy));
+      emitIASAddrOpTyGPR(Func, SrcTy, StackAddr, Src, GPRAddrEmitter);
+      return;
+    }
+    return;
+  }
+}
+
+void InstX86Movd::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  assert(this->getSrcSize() == 1);
+  Variable *Dest = this->getDest();
+  Operand *Src = this->getSrc(0);
+
+  if (Dest->getType() == IceType_i64 || Src->getType() == IceType_i64) {
+    assert(Dest->getType() == IceType_f64 || Src->getType() == IceType_f64);
+    assert(Dest->getType() != Src->getType());
+    Ostream &Str = Func->getContext()->getStrEmit();
+    Str << "\t"
+           "movq"
+           "\t";
+    Src->emit(Func);
+    Str << ", ";
+    Dest->emit(Func);
+    return;
+  }
+
+  InstX86BaseUnaryopXmm<InstX86Base::Movd>::emit(Func);
+}
+
+void InstX86Movd::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(this->getSrcSize() == 1);
+  const Variable *Dest = this->getDest();
+  auto *Target = InstX86Base::getTarget(Func);
+  // For insert/extract element (one of Src/Dest is an Xmm vector and the other
+  // is an int type).
+  if (const auto *SrcVar = llvm::dyn_cast<Variable>(this->getSrc(0))) {
+    if (SrcVar->getType() == IceType_i32 ||
+        (Traits::Is64Bit && SrcVar->getType() == IceType_i64)) {
+      assert(isVectorType(Dest->getType()) ||
+             (isScalarFloatingType(Dest->getType()) &&
+              typeWidthInBytes(SrcVar->getType()) ==
+                  typeWidthInBytes(Dest->getType())));
+      assert(Dest->hasReg());
+      XmmRegister DestReg = Traits::getEncodedXmm(Dest->getRegNum());
+      if (SrcVar->hasReg()) {
+        Asm->movd(SrcVar->getType(), DestReg,
+                  Traits::getEncodedGPR(SrcVar->getRegNum()));
+      } else {
+        Address StackAddr(Target->stackVarToAsmOperand(SrcVar));
+        Asm->movd(SrcVar->getType(), DestReg, StackAddr);
+      }
+    } else {
+      assert(isVectorType(SrcVar->getType()) ||
+             (isScalarFloatingType(SrcVar->getType()) &&
+              typeWidthInBytes(SrcVar->getType()) ==
+                  typeWidthInBytes(Dest->getType())));
+      assert(SrcVar->hasReg());
+      assert(Dest->getType() == IceType_i32 ||
+             (Traits::Is64Bit && Dest->getType() == IceType_i64));
+      XmmRegister SrcReg = Traits::getEncodedXmm(SrcVar->getRegNum());
+      if (Dest->hasReg()) {
+        Asm->movd(Dest->getType(), Traits::getEncodedGPR(Dest->getRegNum()),
+                  SrcReg);
+      } else {
+        Address StackAddr(Target->stackVarToAsmOperand(Dest));
+        Asm->movd(Dest->getType(), StackAddr, SrcReg);
+      }
+    }
+  } else {
+    assert(Dest->hasReg());
+    XmmRegister DestReg = Traits::getEncodedXmm(Dest->getRegNum());
+    auto *Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
+    Asm->movd(Mem->getType(), DestReg, Mem->toAsmAddress(Asm, Target));
+  }
+}
+
+void InstX86Movp::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  // TODO(wala,stichnot): movups works with all vector operands, but there
+  // exist other instructions (movaps, movdqa, movdqu) that may perform better,
+  // depending on the data type and alignment of the operands.
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  Str << "\t"
+         "movups\t";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Movp::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 1);
+  assert(isVectorType(this->getDest()->getType()));
+  const Variable *Dest = this->getDest();
+  const Operand *Src = this->getSrc(0);
+  static const XmmEmitterMovOps Emitter = {
+      &Assembler::movups, &Assembler::movups, &Assembler::movups};
+  emitIASMovlikeXMM(Func, Dest, Src, Emitter);
+}
+
+void InstX86Movq::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 1);
+  assert(this->getDest()->getType() == IceType_i64 ||
+         this->getDest()->getType() == IceType_f64);
+  Str << "\t"
+         "movq"
+         "\t";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Movq::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 1);
+  assert(this->getDest()->getType() == IceType_i64 ||
+         this->getDest()->getType() == IceType_f64 ||
+         isVectorType(this->getDest()->getType()));
+  const Variable *Dest = this->getDest();
+  const Operand *Src = this->getSrc(0);
+  static const XmmEmitterMovOps Emitter = {&Assembler::movq, &Assembler::movq,
+                                           &Assembler::movq};
+  emitIASMovlikeXMM(Func, Dest, Src, Emitter);
+}
+
+void InstX86MovssRegs::emitIAS(const Cfg *Func) const {
+  // This Binop variant is only intended for reg-reg moves where part of the
+  // Dest register is left untouched.
+  assert(this->getSrcSize() == 2);
+  const Variable *Dest = this->getDest();
+  assert(Dest == this->getSrc(0));
+  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(1));
+  assert(Dest->hasReg() && SrcVar->hasReg());
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Asm->movss(IceType_f32, Traits::getEncodedXmm(Dest->getRegNum()),
+             Traits::getEncodedXmm(SrcVar->getRegNum()));
+}
+
+void InstX86Movsx::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 1);
+  const Variable *Dest = this->getDest();
+  const Operand *Src = this->getSrc(0);
+  // Dest must be a > 8-bit register, but Src can be 8-bit. In practice we just
+  // use the full register for Dest to avoid having an OperandSizeOverride
+  // prefix. It also allows us to only dispatch on SrcTy.
+  Type SrcTy = Src->getType();
+  assert(typeWidthInBytes(Dest->getType()) > 1);
+  assert(typeWidthInBytes(Dest->getType()) > typeWidthInBytes(SrcTy));
+  constexpr bool IsLea = false;
+  emitIASRegOpTyGPR<false, true>(Func, IsLea, SrcTy, Dest, Src, this->Emitter);
+}
+
+bool InstX86Movzx::mayBeElided(const Variable *Dest,
+                               const Operand *SrcOpnd) const {
+  assert(Traits::Is64Bit);
+  const auto *Src = llvm::dyn_cast<Variable>(SrcOpnd);
+
+  // Src is not a Variable, so it does not have a register. Movzx can't be
+  // elided.
+  if (Src == nullptr)
+    return false;
+
+  // Movzx to/from memory can't be elided.
+  if (!Src->hasReg() || !Dest->hasReg())
+    return false;
+
+  // Reg/reg move with different source and dest can't be elided.
+  if (Traits::getEncodedGPR(Src->getRegNum()) !=
+      Traits::getEncodedGPR(Dest->getRegNum()))
+    return false;
+
+  // A must-keep 32- to 64-bit movzx is sometimes needed for x86-64 sandboxing.
+  return !MustKeep;
+}
+
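+// On x86-64, writing a 32-bit register implicitly zero-extends it to 64 bits,
+// so a 32- to 64-bit movzx is emitted as a plain 32-bit mov, and when source
+// and destination share an encoding (e.g. "mov %eax, %eax") the move may be
+// elided entirely, unless MustKeep pins it (see mayBeElided() above).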
+void InstX86Movzx::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  if (Traits::Is64Bit) {
+    // There's no movzx %eXX, %rXX. To zero extend 32- to 64-bits, we emit a
+    // mov %eXX, %eXX. The processor will still do a movzx[bw]q.
+    assert(this->getSrcSize() == 1);
+    const Operand *Src = this->getSrc(0);
+    const Variable *Dest = this->Dest;
+    if (Src->getType() == IceType_i32 && Dest->getType() == IceType_i64) {
+      Ostream &Str = Func->getContext()->getStrEmit();
+      if (mayBeElided(Dest, Src)) {
+        Str << "\t/* elided movzx */";
+      } else {
+        Str << "\t"
+               "mov"
+               "\t";
+        Src->emit(Func);
+        Str << ", ";
+        Dest->asType(Func, IceType_i32,
+                     Traits::getGprForType(IceType_i32, Dest->getRegNum()))
+            ->emit(Func);
+        Str << " /* movzx */";
+      }
+      return;
+    }
+  }
+  InstX86BaseUnaryopGPR<InstX86Base::Movzx>::emit(Func);
+}
+
+void InstX86Movzx::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 1);
+  const Variable *Dest = this->getDest();
+  const Operand *Src = this->getSrc(0);
+  Type SrcTy = Src->getType();
+  assert(typeWidthInBytes(Dest->getType()) > 1);
+  assert(typeWidthInBytes(Dest->getType()) > typeWidthInBytes(SrcTy));
+  if (Traits::Is64Bit) {
+    if (Src->getType() == IceType_i32 && Dest->getType() == IceType_i64 &&
+        mayBeElided(Dest, Src)) {
+      return;
+    }
+  }
+  constexpr bool IsLea = false;
+  emitIASRegOpTyGPR<false, true>(Func, IsLea, SrcTy, Dest, Src, this->Emitter);
+}
+
+void InstX86Nop::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  // TODO: Emit the right code for each variant.
+  Str << "\t"
+         "nop\t/* variant = "
+      << Variant << " */";
+}
+
+void InstX86Nop::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  // TODO: Emit the right code for the variant.
+  Asm->nop();
+}
+
+void InstX86Nop::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "nop (variant = " << Variant << ")";
+}
+
+void InstX86Pextr::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  // pextrb and pextrd are SSE4.1 instructions.
+  Str << "\t" << this->Opcode
+      << Traits::TypeAttributes[this->getSrc(0)->getType()].IntegralString
+      << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  Variable *Dest = this->getDest();
+  // pextrw must take a register dest. There is an SSE4.1 version that takes a
+  // memory dest, but we aren't using it. For uniformity, just restrict them
+  // all to have a register dest for now.
+  assert(Dest->hasReg());
+  Dest->asType(Func, IceType_i32, Dest->getRegNum())->emit(Func);
+}
+
+void InstX86Pextr::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  // pextrb and pextrd are SSE4.1 instructions.
+  const Variable *Dest = this->getDest();
+  Type DispatchTy = Traits::getInVectorElementType(this->getSrc(0)->getType());
+  // pextrw must take a register dest. There is an SSE4.1 version that takes a
+  // memory dest, but we aren't using it. For uniformity, just restrict them
+  // all to have a register dest for now.
+  assert(Dest->hasReg());
+  // pextrw's Src(0) must be a register (both SSE4.1 and SSE2).
+  assert(llvm::cast<Variable>(this->getSrc(0))->hasReg());
+  static const ThreeOpImmEmitter<GPRRegister, XmmRegister> Emitter = {
+      &Assembler::pextr, nullptr};
+  emitIASThreeOpImmOps<GPRRegister, XmmRegister, Traits::getEncodedGPR,
+                       Traits::getEncodedXmm>(
+      Func, DispatchTy, Dest, this->getSrc(0), this->getSrc(1), Emitter);
+}
+
+void InstX86Pinsr::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 3);
+  Str << "\t" << this->Opcode
+      << Traits::TypeAttributes[this->getDest()->getType()].IntegralString
+      << "\t";
+  this->getSrc(2)->emit(Func);
+  Str << ", ";
+  Operand *Src1 = this->getSrc(1);
+  if (const auto *Src1Var = llvm::dyn_cast<Variable>(Src1)) {
+    // If src1 is a register, it should always be r32.
+    if (Src1Var->hasReg()) {
+      const auto NewRegNum = Traits::getBaseReg(Src1Var->getRegNum());
+      const Variable *NewSrc = Src1Var->asType(Func, IceType_i32, NewRegNum);
+      NewSrc->emit(Func);
+    } else {
+      Src1Var->emit(Func);
+    }
+  } else {
+    Src1->emit(Func);
+  }
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Pinsr::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 3);
+  assert(this->getDest() == this->getSrc(0));
+  // pinsrb and pinsrd are SSE4.1 instructions.
+  const Operand *Src0 = this->getSrc(1);
+  Type DispatchTy = Src0->getType();
+  // If src1 is a register, it should always be r32 (this should fall out from
+  // the encodings for ByteRegs overlapping the encodings for r32), but we have
+  // to make sure the register allocator didn't choose an 8-bit high register
+  // like "ah".
+  if (BuildDefs::asserts()) {
+    if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0)) {
+      if (Src0Var->hasReg()) {
+        const auto RegNum = Src0Var->getRegNum();
+        const auto BaseRegNum = Traits::getBaseReg(RegNum);
+        (void)BaseRegNum;
+        assert(Traits::getEncodedGPR(RegNum) ==
+               Traits::getEncodedGPR(BaseRegNum));
+      }
+    }
+  }
+  static const ThreeOpImmEmitter<XmmRegister, GPRRegister> Emitter = {
+      &Assembler::pinsr, &Assembler::pinsr};
+  emitIASThreeOpImmOps<XmmRegister, GPRRegister, Traits::getEncodedXmm,
+                       Traits::getEncodedGPR>(Func, DispatchTy, this->getDest(),
+                                              Src0, this->getSrc(2), Emitter);
+}
+
+void InstX86Pshufd::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  const Variable *Dest = this->getDest();
+  Type Ty = Dest->getType();
+  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
+      &Assembler::pshufd, &Assembler::pshufd};
+  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
+                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(0),
+                                              this->getSrc(1), Emitter);
+}
+
+void InstX86Shufps::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 3);
+  const Variable *Dest = this->getDest();
+  assert(Dest == this->getSrc(0));
+  Type Ty = Dest->getType();
+  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
+      &Assembler::shufps, &Assembler::shufps};
+  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
+                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(1),
+                                              this->getSrc(2), Emitter);
+}
+
+void InstX86Pop::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 0);
+  Str << "\t"
+         "pop\t";
+  this->getDest()->emit(Func);
+}
+
+void InstX86Pop::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 0);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  if (this->getDest()->hasReg()) {
+    Asm->popl(Traits::getEncodedGPR(this->getDest()->getRegNum()));
+  } else {
+    auto *Target = InstX86Base::getTarget(Func);
+    Asm->popl(Target->stackVarToAsmOperand(this->getDest()));
+  }
+}
+
+void InstX86Pop::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  this->dumpDest(Func);
+  Str << " = pop." << this->getDest()->getType() << " ";
+}
+
+void InstX86Push::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t"
+         "push"
+         "\t";
+  assert(this->getSrcSize() == 1);
+  const Operand *Src = this->getSrc(0);
+  Src->emit(Func);
+}
+
+void InstX86Push::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+
+  assert(this->getSrcSize() == 1);
+  const Operand *Src = this->getSrc(0);
+
+  if (const auto *Var = llvm::dyn_cast<Variable>(Src)) {
+    Asm->pushl(Traits::getEncodedGPR(Var->getRegNum()));
+  } else if (const auto *Const32 = llvm::dyn_cast<ConstantInteger32>(Src)) {
+    Asm->pushl(AssemblerImmediate(Const32->getValue()));
+  } else if (auto *CR = llvm::dyn_cast<ConstantRelocatable>(Src)) {
+    Asm->pushl(CR);
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+}
+
+void InstX86Push::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "push." << this->getSrc(0)->getType() << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Ret::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t"
+         "ret";
+}
+
+void InstX86Ret::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Asm->ret();
+}
+
+void InstX86Ret::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Type Ty =
+      (this->getSrcSize() == 0 ? IceType_void : this->getSrc(0)->getType());
+  Str << "ret." << Ty << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Setcc::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t"
+         "set"
+      << Traits::InstBrAttributes[Condition].DisplayString << "\t";
+  this->Dest->emit(Func);
+}
+
+void InstX86Setcc::emitIAS(const Cfg *Func) const {
+  assert(Condition != Cond::Br_None);
+  assert(this->getDest()->getType() == IceType_i1);
+  assert(this->getSrcSize() == 0);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  auto *Target = InstX86Base::getTarget(Func);
+  if (this->getDest()->hasReg())
+    Asm->setcc(Condition,
+               Traits::getEncodedByteReg(this->getDest()->getRegNum()));
+  else
+    Asm->setcc(Condition, Target->stackVarToAsmOperand(this->getDest()));
+}
+
+void InstX86Setcc::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "setcc." << Traits::InstBrAttributes[Condition].DisplayString << " ";
+  this->dumpDest(Func);
+}
+
+void InstX86Xadd::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  if (this->Locked) {
+    Str << "\t"
+           "lock";
+  }
+  Str << "\t"
+         "xadd"
+      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+}
+
+void InstX86Xadd::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Type Ty = this->getSrc(0)->getType();
+  const auto *Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
+  assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+  auto *Target = InstX86Base::getTarget(Func);
+  const Address Addr = Mem->toAsmAddress(Asm, Target);
+  const auto *VarReg = llvm::cast<Variable>(this->getSrc(1));
+  assert(VarReg->hasReg());
+  const GPRRegister Reg = Traits::getEncodedGPR(VarReg->getRegNum());
+  Asm->xadd(Ty, Addr, Reg, this->Locked);
+}
+
+void InstX86Xadd::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  if (this->Locked) {
+    Str << "lock ";
+  }
+  Type Ty = this->getSrc(0)->getType();
+  Str << "xadd." << Ty << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86Xchg::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t"
+         "xchg"
+      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+}
+
+void InstX86Xchg::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Type Ty = this->getSrc(0)->getType();
+  const auto *VarReg1 = llvm::cast<Variable>(this->getSrc(1));
+  assert(VarReg1->hasReg());
+  const GPRRegister Reg1 = Traits::getEncodedGPR(VarReg1->getRegNum());
+
+  if (const auto *VarReg0 = llvm::dyn_cast<Variable>(this->getSrc(0))) {
+    assert(VarReg0->hasReg());
+    const GPRRegister Reg0 = Traits::getEncodedGPR(VarReg0->getRegNum());
+    Asm->xchg(Ty, Reg0, Reg1);
+    return;
+  }
+
+  const auto *Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
+  assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
+  auto *Target = InstX86Base::getTarget(Func);
+  const Address Addr = Mem->toAsmAddress(Asm, Target);
+  Asm->xchg(Ty, Addr, Reg1);
+}
+
+void InstX86Xchg::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Type Ty = this->getSrc(0)->getType();
+  Str << "xchg." << Ty << " ";
+  this->dumpSources(Func);
+}
+
+void InstX86IacaStart::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t# IACA_START\n"
+         "\t.byte 0x0F, 0x0B\n"
+         "\t"
+         "movl\t$111, %ebx\n"
+         "\t.byte 0x64, 0x67, 0x90";
+}
+
+void InstX86IacaStart::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Asm->iaca_start();
+}
+
+void InstX86IacaStart::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "IACA_START";
+}
+
+void InstX86IacaEnd::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t# IACA_END\n"
+         "\t"
+         "movl\t$222, %ebx\n"
+         "\t.byte 0x64, 0x67, 0x90\n"
+         "\t.byte 0x0F, 0x0B";
+}
+
+void InstX86IacaEnd::emitIAS(const Cfg *Func) const {
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  Asm->iaca_end();
+}
+
+void InstX86IacaEnd::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "IACA_END";
+}
+
 const TargetX8664Traits::InstBrAttributesType
     TargetX8664Traits::InstBrAttributes[] = {
 #define X(val, encode, opp, dump, emit) {CondX86::opp, dump, emit},
@@ -143,7 +2873,23 @@
 
   Str << "(";
   if (Base != nullptr) {
-    Base->emit(Func);
+    const Variable *B = Base;
+    // TODO(jpp): stop abusing the operand's type to identify LEAs.
+    const Type MemType = getType();
+    if (Base->getType() != IceType_i32 && MemType != IceType_void &&
+        !isVectorType(MemType)) {
+      // X86-64 is ILP32, but %rsp and %rbp are accessed as 64-bit registers.
+      // For filetype=asm, they need to be emitted as their 32-bit siblings.
+      assert(Base->getType() == IceType_i64);
+      assert(getEncodedGPR(Base->getRegNum()) == RegX8664::Encoded_Reg_rsp ||
+             getEncodedGPR(Base->getRegNum()) == RegX8664::Encoded_Reg_rbp ||
+             getType() == IceType_void);
+      B = B->asType(
+          Func, IceType_i32,
+          X8664::Traits::getGprForType(IceType_i32, Base->getRegNum()));
+    }
+
+    B->emit(Func);
   }
 
   if (Index != nullptr) {
diff --git a/third_party/subzero/src/IceInstX8664.h b/third_party/subzero/src/IceInstX8664.h
index 1ed80d7..5cbb0e9c 100644
--- a/third_party/subzero/src/IceInstX8664.h
+++ b/third_party/subzero/src/IceInstX8664.h
@@ -1,4 +1,4 @@
-//===- subzero/src/IceInstX8664.h - x86-64 machine instructions -*- C++ -*-===//
+//===- subzero/src/IceInstX8664.h - Generic x86 instructions ----*- C++ -*-===//
 //
 //                        The Subzero Code Generator
 //
@@ -8,28 +8,3482 @@
 //===----------------------------------------------------------------------===//
 ///
 /// \file
-/// \brief (Note: x86 instructions are templates, and they are defined in
-/// src/IceInstX86Base.)
+/// \brief This file defines the InstX86Base class, as well as the generic
+/// X86 instruction class hierarchy.
 ///
-/// When interacting with the X8664 target (which should only happen in the
-/// X8664 TargetLowering) clients have should use the Ice::X8664::Traits::Insts
-/// traits, which hides all the template verboseness behind a type alias.
-///
-/// For example, to create an X8664 MOV Instruction, clients should do
-///
-/// ::Ice::X8664::Traits::Insts::Mov::create
+/// Only X86 instructions common across all/most X86 targets should be defined
+/// here, with target-specific instructions declared in the target's traits.
 ///
 //===----------------------------------------------------------------------===//
 
 #ifndef SUBZERO_SRC_ICEINSTX8664_H
 #define SUBZERO_SRC_ICEINSTX8664_H
 
-#include "IceDefs.h"
-#include "IceInst.h"
-#include "IceInstX8664Base.h"
-#include "IceOperand.h"
 #include "IceTargetLoweringX8664Traits.h"
 
-X86INSTS_DEFINE_STATIC_DATA(X8664::Traits)
+#include "IceAssemblerX8664.h"
+#include "IceDefs.h"
+#include "IceInst.h"
+#include "IceOperand.h"
+#include "IceTargetLoweringX86.h"
+
+namespace Ice {
+namespace X8664 {
+
+using Traits = TargetX8664Traits;
+using Assembler = typename Traits::Assembler;
+using AssemblerImmediate = typename Assembler::Immediate;
+using TargetLowering = typename Traits::TargetLowering;
+using Address = typename Traits::Address;
+using X86Operand = typename Traits::X86Operand;
+using X86OperandMem = typename Traits::X86OperandMem;
+using VariableSplit = typename Traits::VariableSplit;
+
+using GPRRegister = typename Traits::RegisterSet::GPRRegister;
+using RegisterSet = typename Traits::RegisterSet;
+using XmmRegister = typename Traits::RegisterSet::XmmRegister;
+
+using Cond = CondX86;
+using BrCond = Cond::BrCond;
+using CmppsCond = Cond::CmppsCond;
+
+template <typename SReg_t, typename DReg_t>
+using CastEmitterRegOp =
+    typename Traits::Assembler::template CastEmitterRegOp<SReg_t, DReg_t>;
+template <typename SReg_t, typename DReg_t>
+using ThreeOpImmEmitter =
+    typename Traits::Assembler::template ThreeOpImmEmitter<SReg_t, DReg_t>;
+using GPREmitterAddrOp = typename Traits::Assembler::GPREmitterAddrOp;
+using GPREmitterRegOp = typename Traits::Assembler::GPREmitterRegOp;
+using GPREmitterShiftD = typename Traits::Assembler::GPREmitterShiftD;
+using GPREmitterShiftOp = typename Traits::Assembler::GPREmitterShiftOp;
+using GPREmitterOneOp = typename Traits::Assembler::GPREmitterOneOp;
+using XmmEmitterRegOp = typename Traits::Assembler::XmmEmitterRegOp;
+using XmmEmitterShiftOp = typename Traits::Assembler::XmmEmitterShiftOp;
+using XmmEmitterMovOps = typename Traits::Assembler::XmmEmitterMovOps;
+
+class InstX86Base : public InstTarget {
+  InstX86Base() = delete;
+  InstX86Base(const InstX86Base &) = delete;
+  InstX86Base &operator=(const InstX86Base &) = delete;
+
+public:
+  enum InstKindX86 {
+    k__Start = Inst::Target,
+    Adc,
+    AdcRMW,
+    Add,
+    AddRMW,
+    Addps,
+    Addss,
+    And,
+    Andnps,
+    Andps,
+    AndRMW,
+    Blendvps,
+    Br,
+    Bsf,
+    Bsr,
+    Bswap,
+    Call,
+    Cbwdq,
+    Cmov,
+    Cmpps,
+    Cmpxchg,
+    Cmpxchg8b,
+    Cvt,
+    Div,
+    Divps,
+    Divss,
+    FakeRMW,
+    Icmp,
+    Idiv,
+    Imul,
+    ImulImm,
+    Insertps,
+    Int3,
+    Jmp,
+    Label,
+    Lea,
+    Load,
+    Mfence,
+    Minps,
+    Maxps,
+    Minss,
+    Maxss,
+    Mov,
+    Movd,
+    Movmsk,
+    Movp,
+    Movq,
+    MovssRegs,
+    Movsx,
+    Movzx,
+    Mul,
+    Mulps,
+    Mulss,
+    Neg,
+    Nop,
+    Or,
+    Orps,
+    OrRMW,
+    Padd,
+    Padds,
+    Paddus,
+    Pand,
+    Pandn,
+    Pblendvb,
+    Pcmpeq,
+    Pcmpgt,
+    Pextr,
+    Pinsr,
+    Pmull,
+    Pmulhw,
+    Pmulhuw,
+    Pmaddwd,
+    Pmuludq,
+    Pop,
+    Por,
+    Pshufb,
+    Pshufd,
+    Punpckl,
+    Punpckh,
+    Packss,
+    Packus,
+    Psll,
+    Psra,
+    Psrl,
+    Psub,
+    Psubs,
+    Psubus,
+    Push,
+    Pxor,
+    Ret,
+    Rol,
+    Round,
+    Sar,
+    Sbb,
+    SbbRMW,
+    Setcc,
+    Shl,
+    Shld,
+    Shr,
+    Shrd,
+    Shufps,
+    Sqrt,
+    Store,
+    StoreP,
+    StoreQ,
+    StoreD,
+    Sub,
+    SubRMW,
+    Subps,
+    Subss,
+    Test,
+    Ucomiss,
+    UD2,
+    Xadd,
+    Xchg,
+    Xor,
+    Xorps,
+    XorRMW,
+
+    /// Intel Architecture Code Analyzer markers. These are not executable so
+    /// must only be used for analysis.
+    IacaStart,
+    IacaEnd
+  };
+
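+  // The SSE suffix selects the type-dependent opcode spelling; for example,
+  // "add" + Packed emits "addps" for v4f32, "add" + Scalar emits "addss" for
+  // f32, and "psll" + Integral emits "pslld" for v4i32.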
+  enum SseSuffix { None, Packed, Unpack, Scalar, Integral, Pack };
+
+  static const char *getWidthString(Type Ty);
+  static const char *getFldString(Type Ty);
+  static BrCond getOppositeCondition(BrCond Cond);
+  void dump(const Cfg *Func) const override;
+
+  // Shared emit routines for common forms of instructions.
+  void emitTwoAddress(const Cfg *Func, const char *Opcode,
+                      const char *Suffix = "") const;
+
+  static TargetLowering *getTarget(const Cfg *Func) {
+    return reinterpret_cast<TargetLowering *>(Func->getTarget());
+  }
+
+protected:
+  InstX86Base(Cfg *Func, InstKindX86 Kind, SizeT Maxsrcs, Variable *Dest)
+      : InstTarget(Func, static_cast<InstKind>(Kind), Maxsrcs, Dest) {}
+
+  static bool isClassof(const Inst *Instr, InstKindX86 MyKind) {
+    return Instr->getKind() == static_cast<InstKind>(MyKind);
+  }
+  // Most instructions that operate on vector arguments require vector memory
+  // operands to be fully aligned (16-byte alignment for PNaCl vector types).
+  // The stack frame layout and call ABI ensure proper alignment for stack
+  // operands, but memory operands (originating from load/store bitcode
+  // instructions) only have element-size alignment guarantees. This function
+  // validates that none of the operands is a memory operand of vector type,
+  // calling report_fatal_error() if one is found. This function should be
+  // called during emission, and maybe also in the ctor (as long as that fits
+  // the lowering style).
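+  // For example, a non-AVX SSE instruction such as "paddd xmm0, [mem]" faults
+  // unless [mem] is 16-byte aligned, so vector-typed memory operands must be
+  // legalized into registers (or known-aligned stack slots) beforehand.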
+  void validateVectorAddrMode() const {
+    if (this->getDest())
+      this->validateVectorAddrModeOpnd(this->getDest());
+    for (SizeT i = 0; i < this->getSrcSize(); ++i) {
+      this->validateVectorAddrModeOpnd(this->getSrc(i));
+    }
+  }
+
+private:
+  static void validateVectorAddrModeOpnd(const Operand *Opnd) {
+    if (llvm::isa<X86OperandMem>(Opnd) && isVectorType(Opnd->getType())) {
+      llvm::report_fatal_error("Possible misaligned vector memory operation");
+    }
+  }
+};
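+
+// Each concrete instruction below defines classof() in terms of isClassof()
+// above, enabling LLVM-style RTTI; e.g. one can write
+// "if (const auto *Br = llvm::dyn_cast<InstX86Br>(Instr)) { ... }".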
+
+/// InstX86FakeRMW represents a non-atomic read-modify-write operation on a
+/// memory location. An InstX86FakeRMW is a "fake" instruction in that it
+/// still needs to be lowered to some actual RMW instruction.
+///
+/// If A is some memory address, D is some data value to apply, and OP is an
+/// arithmetic operator, the instruction operates as: (*A) = (*A) OP D
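+///
+/// For instance, a FakeRMW with OP = Add might ultimately be lowered to a
+/// single memory-destination instruction along the lines of "add [A], D"; the
+/// exact choice of RMW instruction is left to the lowering.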
+class InstX86FakeRMW final : public InstX86Base {
+  InstX86FakeRMW() = delete;
+  InstX86FakeRMW(const InstX86FakeRMW &) = delete;
+  InstX86FakeRMW &operator=(const InstX86FakeRMW &) = delete;
+
+public:
+  static InstX86FakeRMW *create(Cfg *Func, Operand *Data, Operand *Addr,
+                                Variable *Beacon, InstArithmetic::OpKind Op,
+                                uint32_t Align = 1) {
+    // TODO(stichnot): Stop ignoring alignment specification.
+    (void)Align;
+    return new (Func->allocate<InstX86FakeRMW>())
+        InstX86FakeRMW(Func, Data, Addr, Op, Beacon);
+  }
+  Operand *getAddr() const { return this->getSrc(1); }
+  Operand *getData() const { return this->getSrc(0); }
+  InstArithmetic::OpKind getOp() const { return Op; }
+  Variable *getBeacon() const { return llvm::cast<Variable>(this->getSrc(2)); }
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::FakeRMW);
+  }
+
+private:
+  InstArithmetic::OpKind Op;
+  InstX86FakeRMW(Cfg *Func, Operand *Data, Operand *Addr,
+                 InstArithmetic::OpKind Op, Variable *Beacon);
+};
+
+/// InstX86Label represents an intra-block label that is the target of an
+/// intra-block branch. The offset between the label and the branch must fit
+/// into one byte (considered "near"). These are used for lowering i1
+/// calculations, Select instructions, and 64-bit compares on a 32-bit
+/// architecture, without basic block splitting. Basic block splitting is not
+/// so desirable for several reasons, one of which is the impact on decisions
+/// based on whether a variable's live range spans multiple basic blocks.
+///
+/// Intra-block control flow must be used with caution. Consider the sequence
+/// for "c = (a >= b ? x : y)".
+///     cmp a, b
+///     br lt, L1
+///     mov c, x
+///     jmp L2
+///   L1:
+///     mov c, y
+///   L2:
+///
+/// Labels L1 and L2 are intra-block labels. Without knowledge of the
+/// intra-block control flow, liveness analysis will determine the "mov c, x"
+/// instruction to be dead. One way to prevent this is to insert a
+/// "FakeUse(c)" instruction anywhere between the two "mov c, ..."
+/// instructions, e.g.:
+///
+///     cmp a, b
+///     br lt, L1
+///     mov c, x
+///     jmp L2
+///     FakeUse(c)
+///   L1:
+///     mov c, y
+///   L2:
+///
+/// The down-side is that "mov c, x" can never be dead-code eliminated even if
+/// there are no uses of c. As unlikely as this situation is, it may be
+/// prevented by running dead code elimination before lowering.
+class InstX86Label final : public InstX86Base {
+  InstX86Label() = delete;
+  InstX86Label(const InstX86Label &) = delete;
+  InstX86Label &operator=(const InstX86Label &) = delete;
+
+public:
+  static InstX86Label *create(Cfg *Func, TargetLowering *Target) {
+    return new (Func->allocate<InstX86Label>()) InstX86Label(Func, Target);
+  }
+  uint32_t getEmitInstCount() const override { return 0; }
+  GlobalString getLabelName() const { return Name; }
+  SizeT getLabelNumber() const { return LabelNumber; }
+  bool isLabel() const override { return true; }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  void setRelocOffset(RelocOffset *Value) { OffsetReloc = Value; }
+
+private:
+  InstX86Label(Cfg *Func, TargetLowering *Target);
+
+  SizeT LabelNumber; // used for unique label generation.
+  RelocOffset *OffsetReloc = nullptr;
+  GlobalString Name;
+};
+
+/// Conditional and unconditional branch instruction.
+class InstX86Br final : public InstX86Base {
+  InstX86Br() = delete;
+  InstX86Br(const InstX86Br &) = delete;
+  InstX86Br &operator=(const InstX86Br &) = delete;
+
+public:
+  enum Mode { Near, Far };
+
+  /// Create a conditional branch to a node.
+  static InstX86Br *create(Cfg *Func, CfgNode *TargetTrue, CfgNode *TargetFalse,
+                           BrCond Condition, Mode Kind) {
+    assert(Condition != Cond::Br_None);
+    constexpr InstX86Label *NoLabel = nullptr;
+    return new (Func->allocate<InstX86Br>())
+        InstX86Br(Func, TargetTrue, TargetFalse, NoLabel, Condition, Kind);
+  }
+  /// Create an unconditional branch to a node.
+  static InstX86Br *create(Cfg *Func, CfgNode *Target, Mode Kind) {
+    constexpr CfgNode *NoCondTarget = nullptr;
+    constexpr InstX86Label *NoLabel = nullptr;
+    return new (Func->allocate<InstX86Br>())
+        InstX86Br(Func, NoCondTarget, Target, NoLabel, Cond::Br_None, Kind);
+  }
+  /// Create a non-terminator conditional branch to a node, with a fallthrough
+  /// to the next instruction in the current node. This is used for switch
+  /// lowering.
+  static InstX86Br *create(Cfg *Func, CfgNode *Target, BrCond Condition,
+                           Mode Kind) {
+    assert(Condition != Cond::Br_None);
+    constexpr CfgNode *NoUncondTarget = nullptr;
+    constexpr InstX86Label *NoLabel = nullptr;
+    return new (Func->allocate<InstX86Br>())
+        InstX86Br(Func, Target, NoUncondTarget, NoLabel, Condition, Kind);
+  }
+  /// Create a conditional intra-block branch (or unconditional, if
+  /// Condition==Br_None) to a label in the current block.
+  static InstX86Br *create(Cfg *Func, InstX86Label *Label, BrCond Condition,
+                           Mode Kind) {
+    constexpr CfgNode *NoCondTarget = nullptr;
+    constexpr CfgNode *NoUncondTarget = nullptr;
+    return new (Func->allocate<InstX86Br>())
+        InstX86Br(Func, NoCondTarget, NoUncondTarget, Label, Condition, Kind);
+  }
+  const CfgNode *getTargetTrue() const { return TargetTrue; }
+  const CfgNode *getTargetFalse() const { return TargetFalse; }
+  bool isNear() const { return Kind == Near; }
+  bool optimizeBranch(const CfgNode *NextNode);
+  uint32_t getEmitInstCount() const override {
+    uint32_t Sum = 0;
+    if (Label)
+      ++Sum;
+    if (getTargetTrue())
+      ++Sum;
+    if (getTargetFalse())
+      ++Sum;
+    return Sum;
+  }
+  bool isUnconditionalBranch() const override {
+    return !Label && Condition == Cond::Br_None;
+  }
+  const Inst *getIntraBlockBranchTarget() const override { return Label; }
+  bool repointEdges(CfgNode *OldNode, CfgNode *NewNode) override;
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Br);
+  }
+
+private:
+  InstX86Br(Cfg *Func, const CfgNode *TargetTrue, const CfgNode *TargetFalse,
+            const InstX86Label *Label, BrCond Condition, Mode Kind);
+
+  BrCond Condition;
+  const CfgNode *TargetTrue;
+  const CfgNode *TargetFalse;
+  const InstX86Label *Label; // Intra-block branch target
+  const Mode Kind;
+};
+
+/// Jump to a target outside this function, such as tailcall, nacljump,
+/// naclret, unreachable. This is different from a Branch instruction in that
+/// there is no intra-function control flow to represent.
+class InstX86Jmp final : public InstX86Base {
+  InstX86Jmp() = delete;
+  InstX86Jmp(const InstX86Jmp &) = delete;
+  InstX86Jmp &operator=(const InstX86Jmp &) = delete;
+
+public:
+  static InstX86Jmp *create(Cfg *Func, Operand *Target) {
+    return new (Func->allocate<InstX86Jmp>()) InstX86Jmp(Func, Target);
+  }
+  Operand *getJmpTarget() const { return this->getSrc(0); }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Jmp);
+  }
+
+private:
+  InstX86Jmp(Cfg *Func, Operand *Target);
+};
+
+/// Call instruction. Arguments should have already been pushed.
+class InstX86Call final : public InstX86Base {
+  InstX86Call() = delete;
+  InstX86Call(const InstX86Call &) = delete;
+  InstX86Call &operator=(const InstX86Call &) = delete;
+
+public:
+  static InstX86Call *create(Cfg *Func, Variable *Dest, Operand *CallTarget) {
+    return new (Func->allocate<InstX86Call>())
+        InstX86Call(Func, Dest, CallTarget);
+  }
+  Operand *getCallTarget() const { return this->getSrc(0); }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Call);
+  }
+
+private:
+  InstX86Call(Cfg *Func, Variable *Dest, Operand *CallTarget);
+};
+
+/// Emit a one-operand (GPR) instruction.
+void emitIASOpTyGPR(const Cfg *Func, Type Ty, const Operand *Var,
+                    const GPREmitterOneOp &Emitter);
+
+void emitIASAsAddrOpTyGPR(const Cfg *Func, Type Ty, const Operand *Op0,
+                          const Operand *Op1, const GPREmitterAddrOp &Emitter);
+
+void emitIASGPRShift(const Cfg *Func, Type Ty, const Variable *Var,
+                     const Operand *Src, const GPREmitterShiftOp &Emitter);
+
+void emitIASAddrOpTyGPR(const Cfg *Func, Type Ty, const Address &Addr,
+                        const Operand *Src, const GPREmitterAddrOp &Emitter);
+
+void emitIASRegOpTyXMM(const Cfg *Func, Type Ty, const Variable *Var,
+                       const Operand *Src, const XmmEmitterRegOp &Emitter);
+
+void emitIASGPRShiftDouble(const Cfg *Func, const Variable *Dest,
+                           const Operand *Src1Op, const Operand *Src2Op,
+                           const GPREmitterShiftD &Emitter);
+
+template <typename DReg_t, typename SReg_t, DReg_t (*destEnc)(RegNumT),
+          SReg_t (*srcEnc)(RegNumT)>
+void emitIASCastRegOp(const Cfg *Func, Type DestTy, const Variable *Dest,
+                      Type SrcTy, const Operand *Src,
+                      const CastEmitterRegOp<DReg_t, SReg_t> &Emitter);
+
+template <typename DReg_t, typename SReg_t, DReg_t (*destEnc)(RegNumT),
+          SReg_t (*srcEnc)(RegNumT)>
+void emitIASThreeOpImmOps(const Cfg *Func, Type DispatchTy,
+                          const Variable *Dest, const Operand *Src0,
+                          const Operand *Src1,
+                          const ThreeOpImmEmitter<DReg_t, SReg_t> Emitter);
+
+void emitIASMovlikeXMM(const Cfg *Func, const Variable *Dest,
+                       const Operand *Src, const XmmEmitterMovOps Emitter);
+
+void emitVariableBlendInst(const char *Opcode, const Inst *Instr,
+                           const Cfg *Func);
+
+void emitIASVariableBlendInst(const Inst *Instr, const Cfg *Func,
+                              const XmmEmitterRegOp &Emitter);
+
+void emitIASXmmShift(const Cfg *Func, Type Ty, const Variable *Var,
+                     const Operand *Src, const XmmEmitterShiftOp &Emitter);
+
+/// Emit a two-operand (GPR) instruction, where the dest operand is a Variable
+/// that's guaranteed to be a register.
+template <bool VarCanBeByte = true, bool SrcCanBeByte = true>
+void emitIASRegOpTyGPR(const Cfg *Func, bool IsLea, Type Ty,
+                       const Variable *Dst, const Operand *Src,
+                       const GPREmitterRegOp &Emitter);
+
+/// Instructions of the form x := op(x).
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseInplaceopGPR : public InstX86Base {
+  InstX86BaseInplaceopGPR() = delete;
+  InstX86BaseInplaceopGPR(const InstX86BaseInplaceopGPR &) = delete;
+  InstX86BaseInplaceopGPR &operator=(const InstX86BaseInplaceopGPR &) = delete;
+
+public:
+  using Base = InstX86BaseInplaceopGPR<K>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrEmit();
+    assert(this->getSrcSize() == 1);
+    Str << "\t" << Opcode << "\t";
+    this->getSrc(0)->emit(Func);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    assert(this->getSrcSize() == 1);
+    const Variable *Var = this->getDest();
+    Type Ty = Var->getType();
+    emitIASOpTyGPR(Func, Ty, Var, Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseInplaceopGPR(Cfg *Func, Operand *SrcDest)
+      : InstX86Base(Func, K, 1, llvm::dyn_cast<Variable>(SrcDest)) {
+    this->addSource(SrcDest);
+  }
+
+private:
+  static const char *const Opcode;
+  static const GPREmitterOneOp Emitter;
+};
+
+/// Instructions of the form x := op(y).
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseUnaryopGPR : public InstX86Base {
+  InstX86BaseUnaryopGPR() = delete;
+  InstX86BaseUnaryopGPR(const InstX86BaseUnaryopGPR &) = delete;
+  InstX86BaseUnaryopGPR &operator=(const InstX86BaseUnaryopGPR &) = delete;
+
+public:
+  using Base = InstX86BaseUnaryopGPR<K>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrEmit();
+    assert(this->getSrcSize() == 1);
+    Type SrcTy = this->getSrc(0)->getType();
+    Type DestTy = this->getDest()->getType();
+    Str << "\t" << Opcode << this->getWidthString(SrcTy);
+    // Movsx and movzx need both the source and dest type width letter to
+    // define the operation. The other unary operations have the same source
+    // and dest type and as a result need only one letter.
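+    // For example, an i8 -> i32 movzx emits "movzbl" ("movz" + "b" + "l"),
+    // whereas a same-type op such as bsf on i32 emits just "bsfl".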
+    if (SrcTy != DestTy)
+      Str << this->getWidthString(DestTy);
+    Str << "\t";
+    this->getSrc(0)->emit(Func);
+    Str << ", ";
+    this->getDest()->emit(Func);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    assert(this->getSrcSize() == 1 && K != InstX86Base::Lea);
+    const Variable *Var = this->getDest();
+    Type Ty = Var->getType();
+    const Operand *Src = this->getSrc(0);
+    constexpr bool IsLea = false;
+    emitIASRegOpTyGPR(Func, IsLea, Ty, Var, Src, Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getSrc(0)->getType() << " ";
+    this->dumpSources(Func);
+  }
+
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseUnaryopGPR(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86Base(Func, K, 1, Dest) {
+    this->addSource(Src);
+  }
+
+  static const char *const Opcode;
+  static const GPREmitterRegOp Emitter;
+};
+
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseUnaryopXmm : public InstX86Base {
+  InstX86BaseUnaryopXmm() = delete;
+  InstX86BaseUnaryopXmm(const InstX86BaseUnaryopXmm &) = delete;
+  InstX86BaseUnaryopXmm &operator=(const InstX86BaseUnaryopXmm &) = delete;
+
+public:
+  using Base = InstX86BaseUnaryopXmm<K>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrEmit();
+    assert(this->getSrcSize() == 1);
+    Str << "\t" << Opcode << "\t";
+    this->getSrc(0)->emit(Func);
+    Str << ", ";
+    this->getDest()->emit(Func);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    Type Ty = this->getDest()->getType();
+    assert(this->getSrcSize() == 1);
+    emitIASRegOpTyXMM(Func, Ty, this->getDest(), this->getSrc(0), Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseUnaryopXmm(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86Base(Func, K, 1, Dest) {
+    this->addSource(Src);
+  }
+
+  static const char *const Opcode;
+  static const XmmEmitterRegOp Emitter;
+};
+
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseBinopGPRShift : public InstX86Base {
+  InstX86BaseBinopGPRShift() = delete;
+  InstX86BaseBinopGPRShift(const InstX86BaseBinopGPRShift &) = delete;
+  InstX86BaseBinopGPRShift &
+  operator=(const InstX86BaseBinopGPRShift &) = delete;
+
+public:
+  using Base = InstX86BaseBinopGPRShift<K>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    this->emitTwoAddress(Func, Opcode);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    Type Ty = this->getDest()->getType();
+    assert(this->getSrcSize() == 2);
+    emitIASGPRShift(Func, Ty, this->getDest(), this->getSrc(1), Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseBinopGPRShift(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86Base(Func, K, 2, Dest) {
+    this->addSource(Dest);
+    this->addSource(Source);
+  }
+
+  static const char *const Opcode;
+  static const GPREmitterShiftOp Emitter;
+};
+
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseBinopGPR : public InstX86Base {
+  InstX86BaseBinopGPR() = delete;
+  InstX86BaseBinopGPR(const InstX86BaseBinopGPR &) = delete;
+  InstX86BaseBinopGPR &operator=(const InstX86BaseBinopGPR &) = delete;
+
+public:
+  using Base = InstX86BaseBinopGPR<K>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    this->emitTwoAddress(Func, Opcode);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    Type Ty = this->getDest()->getType();
+    assert(this->getSrcSize() == 2);
+    constexpr bool ThisIsLEA = K == InstX86Base::Lea;
+    static_assert(!ThisIsLEA, "Lea should be a unaryop.");
+    emitIASRegOpTyGPR(Func, !ThisIsLEA, Ty, this->getDest(), this->getSrc(1),
+                      Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseBinopGPR(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86Base(Func, K, 2, Dest) {
+    this->addSource(Dest);
+    this->addSource(Source);
+  }
+
+  static const char *const Opcode;
+  static const GPREmitterRegOp Emitter;
+};
+
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseBinopRMW : public InstX86Base {
+  InstX86BaseBinopRMW() = delete;
+  InstX86BaseBinopRMW(const InstX86BaseBinopRMW &) = delete;
+  InstX86BaseBinopRMW &operator=(const InstX86BaseBinopRMW &) = delete;
+
+public:
+  using Base = InstX86BaseBinopRMW<K>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    this->emitTwoAddress(Func, Opcode);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    Type Ty = this->getSrc(0)->getType();
+    assert(this->getSrcSize() == 2);
+    emitIASAsAddrOpTyGPR(Func, Ty, this->getSrc(0), this->getSrc(1), Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    Str << Opcode << "." << this->getSrc(0)->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseBinopRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
+      : InstX86Base(Func, K, 2, nullptr) {
+    this->addSource(DestSrc0);
+    this->addSource(Src1);
+  }
+
+  static const char *const Opcode;
+  static const GPREmitterAddrOp Emitter;
+};
+
+template <typename InstX86Base::InstKindX86 K, bool NeedsElementType,
+          typename InstX86Base::SseSuffix Suffix>
+class InstX86BaseBinopXmm : public InstX86Base {
+  InstX86BaseBinopXmm() = delete;
+  InstX86BaseBinopXmm(const InstX86BaseBinopXmm &) = delete;
+  InstX86BaseBinopXmm &operator=(const InstX86BaseBinopXmm &) = delete;
+
+public:
+  using Base = InstX86BaseBinopXmm<K, NeedsElementType, Suffix>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    this->validateVectorAddrMode();
+    const Type DestTy = ArithmeticTypeOverride == IceType_void
+                            ? this->getDest()->getType()
+                            : ArithmeticTypeOverride;
+    const char *SuffixString = "";
+    switch (Suffix) {
+    case InstX86Base::SseSuffix::None:
+      break;
+    case InstX86Base::SseSuffix::Packed:
+      SuffixString = Traits::TypeAttributes[DestTy].PdPsString;
+      break;
+    case InstX86Base::SseSuffix::Unpack:
+      SuffixString = Traits::TypeAttributes[DestTy].UnpackString;
+      break;
+    case InstX86Base::SseSuffix::Scalar:
+      SuffixString = Traits::TypeAttributes[DestTy].SdSsString;
+      break;
+    case InstX86Base::SseSuffix::Integral:
+      SuffixString = Traits::TypeAttributes[DestTy].IntegralString;
+      break;
+    case InstX86Base::SseSuffix::Pack:
+      SuffixString = Traits::TypeAttributes[DestTy].PackString;
+      break;
+    }
+    this->emitTwoAddress(Func, Opcode, SuffixString);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    this->validateVectorAddrMode();
+    Type Ty = this->getDest()->getType();
+    if (NeedsElementType)
+      Ty = typeElementType(Ty);
+    assert(this->getSrcSize() == 2);
+    emitIASRegOpTyXMM(Func, Ty, this->getDest(), this->getSrc(1), Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseBinopXmm(Cfg *Func, Variable *Dest, Operand *Source,
+                      Type ArithmeticTypeOverride = IceType_void)
+      : InstX86Base(Func, K, 2, Dest),
+        ArithmeticTypeOverride(ArithmeticTypeOverride) {
+    this->addSource(Dest);
+    this->addSource(Source);
+  }
+
+  const Type ArithmeticTypeOverride;
+  static const char *const Opcode;
+  static const XmmEmitterRegOp Emitter;
+};
+
+template <typename InstX86Base::InstKindX86 K, bool AllowAllTypes = false>
+class InstX86BaseBinopXmmShift : public InstX86Base {
+  InstX86BaseBinopXmmShift() = delete;
+  InstX86BaseBinopXmmShift(const InstX86BaseBinopXmmShift &) = delete;
+  InstX86BaseBinopXmmShift &
+  operator=(const InstX86BaseBinopXmmShift &) = delete;
+
+public:
+  using Base = InstX86BaseBinopXmmShift<K, AllowAllTypes>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    this->validateVectorAddrMode();
+    // Shift operations are always integral, and hence always need a suffix.
+    const Type DestTy = this->getDest()->getType();
+    this->emitTwoAddress(Func, this->Opcode,
+                         Traits::TypeAttributes[DestTy].IntegralString);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    this->validateVectorAddrMode();
+    Type Ty = this->getDest()->getType();
+    assert(AllowAllTypes || isVectorType(Ty));
+    Type ElementTy = typeElementType(Ty);
+    assert(this->getSrcSize() == 2);
+    emitIASXmmShift(Func, ElementTy, this->getDest(), this->getSrc(1), Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseBinopXmmShift(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86Base(Func, K, 2, Dest) {
+    this->addSource(Dest);
+    this->addSource(Source);
+  }
+
+  static const char *const Opcode;
+  static const XmmEmitterShiftOp Emitter;
+};
+
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseTernop : public InstX86Base {
+  InstX86BaseTernop() = delete;
+  InstX86BaseTernop(const InstX86BaseTernop &) = delete;
+  InstX86BaseTernop &operator=(const InstX86BaseTernop &) = delete;
+
+public:
+  using Base = InstX86BaseTernop<K>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrEmit();
+    assert(this->getSrcSize() == 3);
+    Str << "\t" << Opcode << "\t";
+    this->getSrc(2)->emit(Func);
+    Str << ", ";
+    this->getSrc(1)->emit(Func);
+    Str << ", ";
+    this->getDest()->emit(Func);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseTernop(Cfg *Func, Variable *Dest, Operand *Source1,
+                    Operand *Source2)
+      : InstX86Base(Func, K, 3, Dest) {
+    this->addSource(Dest);
+    this->addSource(Source1);
+    this->addSource(Source2);
+  }
+
+  static const char *const Opcode;
+};
+
+/// Instructions of the form x := y op z.
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseThreeAddressop : public InstX86Base {
+  InstX86BaseThreeAddressop() = delete;
+  InstX86BaseThreeAddressop(const InstX86BaseThreeAddressop &) = delete;
+  InstX86BaseThreeAddressop &
+  operator=(const InstX86BaseThreeAddressop &) = delete;
+
+public:
+  using Base = InstX86BaseThreeAddressop<K>;
+
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrEmit();
+    assert(this->getSrcSize() == 2);
+    Str << "\t" << Opcode << "\t";
+    this->getSrc(1)->emit(Func);
+    Str << ", ";
+    this->getSrc(0)->emit(Func);
+    Str << ", ";
+    this->getDest()->emit(Func);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    this->dumpDest(Func);
+    Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseThreeAddressop(Cfg *Func, Variable *Dest, Operand *Source0,
+                            Operand *Source1)
+      : InstX86Base(Func, K, 2, Dest) {
+    this->addSource(Source0);
+    this->addSource(Source1);
+  }
+
+  static const char *const Opcode;
+};
+
+/// Base class for assignment instructions
+template <typename InstX86Base::InstKindX86 K>
+class InstX86BaseMovlike : public InstX86Base {
+  InstX86BaseMovlike() = delete;
+  InstX86BaseMovlike(const InstX86BaseMovlike &) = delete;
+  InstX86BaseMovlike &operator=(const InstX86BaseMovlike &) = delete;
+
+public:
+  using Base = InstX86BaseMovlike<K>;
+
+  bool isRedundantAssign() const override {
+    if (const auto *SrcVar = llvm::dyn_cast<const Variable>(this->getSrc(0))) {
+      if (SrcVar->hasReg() && this->Dest->hasReg()) {
+        // An assignment between physical registers is considered redundant if
+        // they have the same base register and the same encoding. E.g.:
+        //   mov cl, ecx ==> redundant
+        //   mov ch, ecx ==> not redundant due to different encodings
+        //   mov ch, ebp ==> not redundant due to different base registers
+        //   mov ecx, ecx ==> redundant, and dangerous in x86-64. i64 zexting
+        //                    is handled by InstX86Zext.
+        const auto SrcReg = SrcVar->getRegNum();
+        const auto DestReg = this->Dest->getRegNum();
+        return (Traits::getEncoding(SrcReg) == Traits::getEncoding(DestReg)) &&
+               (Traits::getBaseReg(SrcReg) == Traits::getBaseReg(DestReg));
+      }
+    }
+    return checkForRedundantAssign(this->getDest(), this->getSrc(0));
+  }
+  bool isVarAssign() const override {
+    return llvm::isa<Variable>(this->getSrc(0));
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    Str << Opcode << "." << this->getDest()->getType() << " ";
+    this->dumpDest(Func);
+    Str << ", ";
+    this->dumpSources(Func);
+  }
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, K);
+  }
+
+protected:
+  InstX86BaseMovlike(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86Base(Func, K, 1, Dest) {
+    this->addSource(Source);
+    // For an integer assignment, make sure it's either a same-type assignment
+    // or a truncation.
+    assert(!isScalarIntegerType(Dest->getType()) ||
+           (typeWidthInBytes(Dest->getType()) <=
+            typeWidthInBytes(Source->getType())));
+  }
+
+  static const char *const Opcode;
+};
+
+class InstX86Bswap : public InstX86BaseInplaceopGPR<InstX86Base::Bswap> {
+public:
+  static InstX86Bswap *create(Cfg *Func, Operand *SrcDest) {
+    return new (Func->allocate<InstX86Bswap>()) InstX86Bswap(Func, SrcDest);
+  }
+
+private:
+  InstX86Bswap(Cfg *Func, Operand *SrcDest)
+      : InstX86BaseInplaceopGPR<InstX86Base::Bswap>(Func, SrcDest) {}
+};
+
+class InstX86Neg : public InstX86BaseInplaceopGPR<InstX86Base::Neg> {
+public:
+  static InstX86Neg *create(Cfg *Func, Operand *SrcDest) {
+    return new (Func->allocate<InstX86Neg>()) InstX86Neg(Func, SrcDest);
+  }
+
+private:
+  InstX86Neg(Cfg *Func, Operand *SrcDest)
+      : InstX86BaseInplaceopGPR<InstX86Base::Neg>(Func, SrcDest) {}
+};
+
+class InstX86Bsf : public InstX86BaseUnaryopGPR<InstX86Base::Bsf> {
+public:
+  static InstX86Bsf *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    return new (Func->allocate<InstX86Bsf>()) InstX86Bsf(Func, Dest, Src);
+  }
+
+private:
+  InstX86Bsf(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86BaseUnaryopGPR<InstX86Base::Bsf>(Func, Dest, Src) {}
+};
+
+class InstX86Bsr : public InstX86BaseUnaryopGPR<InstX86Base::Bsr> {
+public:
+  static InstX86Bsr *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    return new (Func->allocate<InstX86Bsr>()) InstX86Bsr(Func, Dest, Src);
+  }
+
+private:
+  InstX86Bsr(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86BaseUnaryopGPR<InstX86Base::Bsr>(Func, Dest, Src) {}
+};
+
+class InstX86Lea : public InstX86BaseUnaryopGPR<InstX86Base::Lea> {
+public:
+  static InstX86Lea *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    return new (Func->allocate<InstX86Lea>()) InstX86Lea(Func, Dest, Src);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Lea(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86BaseUnaryopGPR<InstX86Base::Lea>(Func, Dest, Src) {}
+
+  Inst *deoptToAddOrNull(const Cfg *Func) const;
+};
+
+/// Cbwdq instruction - wrapper for cbw, cwd, and cdq.
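+/// Semantics: cbw sign-extends al into ax, cwd sign-extends ax into dx:ax,
+/// and cdq sign-extends eax into edx:eax.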
+class InstX86Cbwdq : public InstX86BaseUnaryopGPR<InstX86Base::Cbwdq> {
+public:
+  static InstX86Cbwdq *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    return new (Func->allocate<InstX86Cbwdq>()) InstX86Cbwdq(Func, Dest, Src);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Cbwdq(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86BaseUnaryopGPR<InstX86Base::Cbwdq>(Func, Dest, Src) {}
+};
+
+class InstX86Movsx : public InstX86BaseUnaryopGPR<InstX86Base::Movsx> {
+public:
+  static InstX86Movsx *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    assert(typeWidthInBytes(Dest->getType()) >
+           typeWidthInBytes(Src->getType()));
+    return new (Func->allocate<InstX86Movsx>()) InstX86Movsx(Func, Dest, Src);
+  }
+
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Movsx(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86BaseUnaryopGPR<InstX86Base::Movsx>(Func, Dest, Src) {}
+};
+
+class InstX86Movzx : public InstX86BaseUnaryopGPR<InstX86Base::Movzx> {
+public:
+  static InstX86Movzx *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    assert(typeWidthInBytes(Dest->getType()) >
+           typeWidthInBytes(Src->getType()));
+    return new (Func->allocate<InstX86Movzx>()) InstX86Movzx(Func, Dest, Src);
+  }
+
+  void emit(const Cfg *Func) const override;
+
+  void emitIAS(const Cfg *Func) const override;
+
+  void setMustKeep() { MustKeep = true; }
+
+private:
+  bool MustKeep = false;
+
+  InstX86Movzx(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86BaseUnaryopGPR<InstX86Base::Movzx>(Func, Dest, Src) {}
+
+  bool mayBeElided(const Variable *Dest, const Operand *Src) const;
+};
+
+class InstX86Movd : public InstX86BaseUnaryopXmm<InstX86Base::Movd> {
+public:
+  static InstX86Movd *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    return new (Func->allocate<InstX86Movd>()) InstX86Movd(Func, Dest, Src);
+  }
+
+  void emit(const Cfg *Func) const override;
+
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Movd(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86BaseUnaryopXmm<InstX86Base::Movd>(Func, Dest, Src) {}
+};
+
+class InstX86Movmsk final : public InstX86Base {
+  InstX86Movmsk() = delete;
+  InstX86Movmsk(const InstX86Movmsk &) = delete;
+  InstX86Movmsk &operator=(const InstX86Movmsk &) = delete;
+
+public:
+  static InstX86Movmsk *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Movmsk>())
+        InstX86Movmsk(Func, Dest, Source);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Movmsk);
+  }
+
+private:
+  InstX86Movmsk(Cfg *Func, Variable *Dest, Operand *Source);
+};
+
+class InstX86Sqrt : public InstX86BaseUnaryopXmm<InstX86Base::Sqrt> {
+public:
+  static InstX86Sqrt *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    return new (Func->allocate<InstX86Sqrt>()) InstX86Sqrt(Func, Dest, Src);
+  }
+
+  void emit(const Cfg *Func) const override;
+
+private:
+  InstX86Sqrt(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX86BaseUnaryopXmm<InstX86Base::Sqrt>(Func, Dest, Src) {}
+};
+
+/// Move/assignment instruction - wrapper for mov/movss/movsd.
+class InstX86Mov : public InstX86BaseMovlike<InstX86Base::Mov> {
+public:
+  static InstX86Mov *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    assert(!isScalarIntegerType(Dest->getType()) ||
+           (typeWidthInBytes(Dest->getType()) <=
+            typeWidthInBytes(Source->getType())));
+    return new (Func->allocate<InstX86Mov>()) InstX86Mov(Func, Dest, Source);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Mov(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseMovlike<InstX86Base::Mov>(Func, Dest, Source) {}
+};
+
+/// Move packed - copy 128 bit values between XMM registers, or mem128 and XMM
+/// registers.
+class InstX86Movp : public InstX86BaseMovlike<InstX86Base::Movp> {
+public:
+  static InstX86Movp *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Movp>()) InstX86Movp(Func, Dest, Source);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Movp(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseMovlike<InstX86Base::Movp>(Func, Dest, Source) {}
+};
+
+/// Movq - copy between XMM registers, or mem64 and XMM registers.
+class InstX86Movq : public InstX86BaseMovlike<InstX86Base::Movq> {
+public:
+  static InstX86Movq *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Movq>()) InstX86Movq(Func, Dest, Source);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Movq(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseMovlike<InstX86Base::Movq>(Func, Dest, Source) {}
+};
+
+class InstX86Add : public InstX86BaseBinopGPR<InstX86Base::Add> {
+public:
+  static InstX86Add *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Add>()) InstX86Add(Func, Dest, Source);
+  }
+
+private:
+  InstX86Add(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPR<InstX86Base::Add>(Func, Dest, Source) {}
+};
+
+class InstX86AddRMW : public InstX86BaseBinopRMW<InstX86Base::AddRMW> {
+public:
+  static InstX86AddRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
+                               Operand *Src1) {
+    return new (Func->allocate<InstX86AddRMW>())
+        InstX86AddRMW(Func, DestSrc0, Src1);
+  }
+
+private:
+  InstX86AddRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
+      : InstX86BaseBinopRMW<InstX86Base::AddRMW>(Func, DestSrc0, Src1) {}
+};
+
+class InstX86Addps
+    : public InstX86BaseBinopXmm<InstX86Base::Addps, true,
+                                 InstX86Base::SseSuffix::Packed> {
+public:
+  static InstX86Addps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Addps>())
+        InstX86Addps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Addps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Addps, true,
+                            InstX86Base::SseSuffix::Packed>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Adc : public InstX86BaseBinopGPR<InstX86Base::Adc> {
+public:
+  static InstX86Adc *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Adc>()) InstX86Adc(Func, Dest, Source);
+  }
+
+private:
+  InstX86Adc(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPR<InstX86Base::Adc>(Func, Dest, Source) {}
+};
+
+class InstX86AdcRMW : public InstX86BaseBinopRMW<InstX86Base::AdcRMW> {
+public:
+  static InstX86AdcRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
+                               Operand *Src1) {
+    return new (Func->allocate<InstX86AdcRMW>())
+        InstX86AdcRMW(Func, DestSrc0, Src1);
+  }
+
+private:
+  InstX86AdcRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
+      : InstX86BaseBinopRMW<InstX86Base::AdcRMW>(Func, DestSrc0, Src1) {}
+};
+
+class InstX86Addss
+    : public InstX86BaseBinopXmm<InstX86Base::Addss, false,
+                                 InstX86Base::SseSuffix::Scalar> {
+public:
+  static InstX86Addss *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Addss>())
+        InstX86Addss(Func, Dest, Source);
+  }
+
+private:
+  InstX86Addss(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Addss, false,
+                            InstX86Base::SseSuffix::Scalar>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Padd
+    : public InstX86BaseBinopXmm<InstX86Base::Padd, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Padd *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Padd>()) InstX86Padd(Func, Dest, Source);
+  }
+
+private:
+  InstX86Padd(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Padd, true,
+                            InstX86Base::SseSuffix::Integral>(Func, Dest,
+                                                              Source) {}
+};
+
+class InstX86Padds
+    : public InstX86BaseBinopXmm<InstX86Base::Padds, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Padds *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Padds>())
+        InstX86Padds(Func, Dest, Source);
+  }
+
+private:
+  InstX86Padds(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Padds, true,
+                            InstX86Base::SseSuffix::Integral>(Func, Dest,
+                                                              Source) {}
+};
+
+class InstX86Paddus
+    : public InstX86BaseBinopXmm<InstX86Base::Paddus, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Paddus *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Paddus>())
+        InstX86Paddus(Func, Dest, Source);
+  }
+
+private:
+  InstX86Paddus(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Paddus, true,
+                            InstX86Base::SseSuffix::Integral>(Func, Dest,
+                                                              Source) {}
+};
+
+class InstX86Sub : public InstX86BaseBinopGPR<InstX86Base::Sub> {
+public:
+  static InstX86Sub *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Sub>()) InstX86Sub(Func, Dest, Source);
+  }
+
+private:
+  InstX86Sub(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPR<InstX86Base::Sub>(Func, Dest, Source) {}
+};
+
+class InstX86SubRMW : public InstX86BaseBinopRMW<InstX86Base::SubRMW> {
+public:
+  static InstX86SubRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
+                               Operand *Src1) {
+    return new (Func->allocate<InstX86SubRMW>())
+        InstX86SubRMW(Func, DestSrc0, Src1);
+  }
+
+private:
+  InstX86SubRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
+      : InstX86BaseBinopRMW<InstX86Base::SubRMW>(Func, DestSrc0, Src1) {}
+};
+
+class InstX86Subps
+    : public InstX86BaseBinopXmm<InstX86Base::Subps, true,
+                                 InstX86Base::SseSuffix::Packed> {
+public:
+  static InstX86Subps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Subps>())
+        InstX86Subps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Subps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Subps, true,
+                            InstX86Base::SseSuffix::Packed>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Subss
+    : public InstX86BaseBinopXmm<InstX86Base::Subss, false,
+                                 InstX86Base::SseSuffix::Scalar> {
+public:
+  static InstX86Subss *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Subss>())
+        InstX86Subss(Func, Dest, Source);
+  }
+
+private:
+  InstX86Subss(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Subss, false,
+                            InstX86Base::SseSuffix::Scalar>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Sbb : public InstX86BaseBinopGPR<InstX86Base::Sbb> {
+public:
+  static InstX86Sbb *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Sbb>()) InstX86Sbb(Func, Dest, Source);
+  }
+
+private:
+  InstX86Sbb(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPR<InstX86Base::Sbb>(Func, Dest, Source) {}
+};
+
+class InstX86SbbRMW : public InstX86BaseBinopRMW<InstX86Base::SbbRMW> {
+public:
+  static InstX86SbbRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
+                               Operand *Src1) {
+    return new (Func->allocate<InstX86SbbRMW>())
+        InstX86SbbRMW(Func, DestSrc0, Src1);
+  }
+
+private:
+  InstX86SbbRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
+      : InstX86BaseBinopRMW<InstX86Base::SbbRMW>(Func, DestSrc0, Src1) {}
+};
+
+class InstX86Psub
+    : public InstX86BaseBinopXmm<InstX86Base::Psub, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Psub *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Psub>()) InstX86Psub(Func, Dest, Source);
+  }
+
+private:
+  InstX86Psub(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Psub, true,
+                            InstX86Base::SseSuffix::Integral>(Func, Dest,
+                                                              Source) {}
+};
+
+class InstX86Psubs
+    : public InstX86BaseBinopXmm<InstX86Base::Psubs, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Psubs *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Psubs>())
+        InstX86Psubs(Func, Dest, Source);
+  }
+
+private:
+  InstX86Psubs(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Psubs, true,
+                            InstX86Base::SseSuffix::Integral>(Func, Dest,
+                                                              Source) {}
+};
+
+class InstX86Psubus
+    : public InstX86BaseBinopXmm<InstX86Base::Psubus, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Psubus *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Psubus>())
+        InstX86Psubus(Func, Dest, Source);
+  }
+
+private:
+  InstX86Psubus(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Psubus, true,
+                            InstX86Base::SseSuffix::Integral>(Func, Dest,
+                                                              Source) {}
+};
+
+class InstX86And : public InstX86BaseBinopGPR<InstX86Base::And> {
+public:
+  static InstX86And *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86And>()) InstX86And(Func, Dest, Source);
+  }
+
+private:
+  InstX86And(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPR<InstX86Base::And>(Func, Dest, Source) {}
+};
+
+class InstX86Andnps
+    : public InstX86BaseBinopXmm<InstX86Base::Andnps, true,
+                                 InstX86Base::SseSuffix::Packed> {
+public:
+  static InstX86Andnps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Andnps>())
+        InstX86Andnps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Andnps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Andnps, true,
+                            InstX86Base::SseSuffix::Packed>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Andps
+    : public InstX86BaseBinopXmm<InstX86Base::Andps, true,
+                                 InstX86Base::SseSuffix::Packed> {
+public:
+  static InstX86Andps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Andps>())
+        InstX86Andps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Andps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Andps, true,
+                            InstX86Base::SseSuffix::Packed>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86AndRMW : public InstX86BaseBinopRMW<InstX86Base::AndRMW> {
+public:
+  static InstX86AndRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
+                               Operand *Src1) {
+    return new (Func->allocate<InstX86AndRMW>())
+        InstX86AndRMW(Func, DestSrc0, Src1);
+  }
+
+private:
+  InstX86AndRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
+      : InstX86BaseBinopRMW<InstX86Base::AndRMW>(Func, DestSrc0, Src1) {}
+};
+
+class InstX86Pand : public InstX86BaseBinopXmm<InstX86Base::Pand, false,
+                                               InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Pand *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Pand>()) InstX86Pand(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pand(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pand, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Pandn : public InstX86BaseBinopXmm<InstX86Base::Pandn, false,
+                                                InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Pandn *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Pandn>())
+        InstX86Pandn(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pandn(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pandn, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Maxss
+    : public InstX86BaseBinopXmm<InstX86Base::Maxss, true,
+                                 InstX86Base::SseSuffix::Scalar> {
+public:
+  static InstX86Maxss *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Maxss>())
+        InstX86Maxss(Func, Dest, Source);
+  }
+
+private:
+  InstX86Maxss(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Maxss, true,
+                            InstX86Base::SseSuffix::Scalar>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Minss
+    : public InstX86BaseBinopXmm<InstX86Base::Minss, true,
+                                 InstX86Base::SseSuffix::Scalar> {
+public:
+  static InstX86Minss *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Minss>())
+        InstX86Minss(Func, Dest, Source);
+  }
+
+private:
+  InstX86Minss(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Minss, true,
+                            InstX86Base::SseSuffix::Scalar>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Maxps : public InstX86BaseBinopXmm<InstX86Base::Maxps, true,
+                                                InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Maxps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Maxps>())
+        InstX86Maxps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Maxps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Maxps, true,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Minps : public InstX86BaseBinopXmm<InstX86Base::Minps, true,
+                                                InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Minps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Minps>())
+        InstX86Minps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Minps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Minps, true,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Or : public InstX86BaseBinopGPR<InstX86Base::Or> {
+public:
+  static InstX86Or *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Or>()) InstX86Or(Func, Dest, Source);
+  }
+
+private:
+  InstX86Or(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPR<InstX86Base::Or>(Func, Dest, Source) {}
+};
+
+class InstX86Orps : public InstX86BaseBinopXmm<InstX86Base::Orps, true,
+                                               InstX86Base::SseSuffix::Packed> {
+public:
+  static InstX86Orps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Orps>()) InstX86Orps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Orps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Orps, true,
+                            InstX86Base::SseSuffix::Packed>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86OrRMW : public InstX86BaseBinopRMW<InstX86Base::OrRMW> {
+public:
+  static InstX86OrRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
+                              Operand *Src1) {
+    return new (Func->allocate<InstX86OrRMW>())
+        InstX86OrRMW(Func, DestSrc0, Src1);
+  }
+
+private:
+  InstX86OrRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
+      : InstX86BaseBinopRMW<InstX86Base::OrRMW>(Func, DestSrc0, Src1) {}
+};
+
+class InstX86Por : public InstX86BaseBinopXmm<InstX86Base::Por, false,
+                                              InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Por *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Por>()) InstX86Por(Func, Dest, Source);
+  }
+
+private:
+  InstX86Por(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Por, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Xor : public InstX86BaseBinopGPR<InstX86Base::Xor> {
+public:
+  static InstX86Xor *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Xor>()) InstX86Xor(Func, Dest, Source);
+  }
+
+private:
+  InstX86Xor(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPR<InstX86Base::Xor>(Func, Dest, Source) {}
+};
+
+class InstX86Xorps
+    : public InstX86BaseBinopXmm<InstX86Base::Xorps, true,
+                                 InstX86Base::SseSuffix::Packed> {
+public:
+  static InstX86Xorps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Xorps>())
+        InstX86Xorps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Xorps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Xorps, true,
+                            InstX86Base::SseSuffix::Packed>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86XorRMW : public InstX86BaseBinopRMW<InstX86Base::XorRMW> {
+public:
+  static InstX86XorRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
+                               Operand *Src1) {
+    return new (Func->allocate<InstX86XorRMW>())
+        InstX86XorRMW(Func, DestSrc0, Src1);
+  }
+
+private:
+  InstX86XorRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
+      : InstX86BaseBinopRMW<InstX86Base::XorRMW>(Func, DestSrc0, Src1) {}
+};
+
+class InstX86Pxor : public InstX86BaseBinopXmm<InstX86Base::Pxor, false,
+                                               InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Pxor *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Pxor>()) InstX86Pxor(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pxor(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pxor, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Imul : public InstX86BaseBinopGPR<InstX86Base::Imul> {
+public:
+  static InstX86Imul *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Imul>()) InstX86Imul(Func, Dest, Source);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Imul(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPR<InstX86Base::Imul>(Func, Dest, Source) {}
+};
+
+class InstX86ImulImm : public InstX86BaseThreeAddressop<InstX86Base::ImulImm> {
+public:
+  static InstX86ImulImm *create(Cfg *Func, Variable *Dest, Operand *Source0,
+                                Operand *Source1) {
+    return new (Func->allocate<InstX86ImulImm>())
+        InstX86ImulImm(Func, Dest, Source0, Source1);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86ImulImm(Cfg *Func, Variable *Dest, Operand *Source0, Operand *Source1)
+      : InstX86BaseThreeAddressop<InstX86Base::ImulImm>(Func, Dest, Source0,
+                                                        Source1) {}
+};
+
+class InstX86Mulps
+    : public InstX86BaseBinopXmm<InstX86Base::Mulps, true,
+                                 InstX86Base::SseSuffix::Packed> {
+public:
+  static InstX86Mulps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Mulps>())
+        InstX86Mulps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Mulps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Mulps, true,
+                            InstX86Base::SseSuffix::Packed>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Mulss
+    : public InstX86BaseBinopXmm<InstX86Base::Mulss, false,
+                                 InstX86Base::SseSuffix::Scalar> {
+public:
+  static InstX86Mulss *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Mulss>())
+        InstX86Mulss(Func, Dest, Source);
+  }
+
+private:
+  InstX86Mulss(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Mulss, false,
+                            InstX86Base::SseSuffix::Scalar>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Pmull
+    : public InstX86BaseBinopXmm<InstX86Base::Pmull, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Pmull *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    bool TypesAreValid =
+        Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16;
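+    // pmullw covers v8i16 and is SSE2, but pmulld (v4i32) was introduced
+    // with SSE4.1, hence the instruction-set requirement below.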
+    bool InstructionSetIsValid =
+        Dest->getType() == IceType_v8i16 || getInstructionSet(Func) >= SSE4_1;
+    (void)TypesAreValid;
+    (void)InstructionSetIsValid;
+    assert(TypesAreValid);
+    assert(InstructionSetIsValid);
+    return new (Func->allocate<InstX86Pmull>())
+        InstX86Pmull(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pmull(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pmull, true,
+                            InstX86Base::SseSuffix::Integral>(Func, Dest,
+                                                              Source) {}
+};
+
+class InstX86Pmulhw : public InstX86BaseBinopXmm<InstX86Base::Pmulhw, false,
+                                                 InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Pmulhw *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    assert(Dest->getType() == IceType_v8i16 &&
+           Source->getType() == IceType_v8i16);
+    return new (Func->allocate<InstX86Pmulhw>())
+        InstX86Pmulhw(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pmulhw(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pmulhw, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Pmulhuw
+    : public InstX86BaseBinopXmm<InstX86Base::Pmulhuw, false,
+                                 InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Pmulhuw *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    assert(Dest->getType() == IceType_v8i16 &&
+           Source->getType() == IceType_v8i16);
+    return new (Func->allocate<InstX86Pmulhuw>())
+        InstX86Pmulhuw(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pmulhuw(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pmulhuw, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Pmaddwd
+    : public InstX86BaseBinopXmm<InstX86Base::Pmaddwd, false,
+                                 InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Pmaddwd *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    assert(Dest->getType() == IceType_v8i16 &&
+           Source->getType() == IceType_v8i16);
+    return new (Func->allocate<InstX86Pmaddwd>())
+        InstX86Pmaddwd(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pmaddwd(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pmaddwd, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Pmuludq
+    : public InstX86BaseBinopXmm<InstX86Base::Pmuludq, false,
+                                 InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Pmuludq *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    assert(Dest->getType() == IceType_v4i32 &&
+           Source->getType() == IceType_v4i32);
+    return new (Func->allocate<InstX86Pmuludq>())
+        InstX86Pmuludq(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pmuludq(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pmuludq, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Divps
+    : public InstX86BaseBinopXmm<InstX86Base::Divps, true,
+                                 InstX86Base::SseSuffix::Packed> {
+public:
+  static InstX86Divps *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Divps>())
+        InstX86Divps(Func, Dest, Source);
+  }
+
+private:
+  InstX86Divps(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Divps, true,
+                            InstX86Base::SseSuffix::Packed>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Divss
+    : public InstX86BaseBinopXmm<InstX86Base::Divss, false,
+                                 InstX86Base::SseSuffix::Scalar> {
+public:
+  static InstX86Divss *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Divss>())
+        InstX86Divss(Func, Dest, Source);
+  }
+
+private:
+  InstX86Divss(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Divss, false,
+                            InstX86Base::SseSuffix::Scalar>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Rol : public InstX86BaseBinopGPRShift<InstX86Base::Rol> {
+public:
+  static InstX86Rol *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Rol>()) InstX86Rol(Func, Dest, Source);
+  }
+
+private:
+  InstX86Rol(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPRShift<InstX86Base::Rol>(Func, Dest, Source) {}
+};
+
+class InstX86Shl : public InstX86BaseBinopGPRShift<InstX86Base::Shl> {
+public:
+  static InstX86Shl *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Shl>()) InstX86Shl(Func, Dest, Source);
+  }
+
+private:
+  InstX86Shl(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPRShift<InstX86Base::Shl>(Func, Dest, Source) {}
+};
+
+class InstX86Psll : public InstX86BaseBinopXmmShift<InstX86Base::Psll> {
+public:
+  static InstX86Psll *create(Cfg *Func, Variable *Dest, Operand *Source) {
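+    // Only 16- and 32-bit element shifts are supported here; note that x86
+    // has no psllb for v16i8.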
+    assert(Dest->getType() == IceType_v8i16 ||
+           Dest->getType() == IceType_v8i1 ||
+           Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v4i1);
+    return new (Func->allocate<InstX86Psll>()) InstX86Psll(Func, Dest, Source);
+  }
+
+private:
+  InstX86Psll(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmmShift<InstX86Base::Psll>(Func, Dest, Source) {}
+};
+
+class InstX86Psrl : public InstX86BaseBinopXmmShift<InstX86Base::Psrl, true> {
+public:
+  static InstX86Psrl *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Psrl>()) InstX86Psrl(Func, Dest, Source);
+  }
+
+private:
+  InstX86Psrl(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmmShift<InstX86Base::Psrl, true>(Func, Dest, Source) {}
+};
+
+class InstX86Shr : public InstX86BaseBinopGPRShift<InstX86Base::Shr> {
+public:
+  static InstX86Shr *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Shr>()) InstX86Shr(Func, Dest, Source);
+  }
+
+private:
+  InstX86Shr(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPRShift<InstX86Base::Shr>(Func, Dest, Source) {}
+};
+
+class InstX86Sar : public InstX86BaseBinopGPRShift<InstX86Base::Sar> {
+public:
+  static InstX86Sar *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Sar>()) InstX86Sar(Func, Dest, Source);
+  }
+
+private:
+  InstX86Sar(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopGPRShift<InstX86Base::Sar>(Func, Dest, Source) {}
+};
+
+class InstX86Psra : public InstX86BaseBinopXmmShift<InstX86Base::Psra> {
+public:
+  static InstX86Psra *create(Cfg *Func, Variable *Dest, Operand *Source) {
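+    // x86 has neither psrab nor (before AVX-512) psraq, so only 16- and
+    // 32-bit element types are allowed.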
+    assert(Dest->getType() == IceType_v8i16 ||
+           Dest->getType() == IceType_v8i1 ||
+           Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v4i1);
+    return new (Func->allocate<InstX86Psra>()) InstX86Psra(Func, Dest, Source);
+  }
+
+private:
+  InstX86Psra(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmmShift<InstX86Base::Psra>(Func, Dest, Source) {}
+};
+
+class InstX86Pcmpeq
+    : public InstX86BaseBinopXmm<InstX86Base::Pcmpeq, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Pcmpeq *create(Cfg *Func, Variable *Dest, Operand *Source,
+                               Type ArithmeticTypeOverride = IceType_void) {
+    const Type Ty = ArithmeticTypeOverride == IceType_void
+                        ? Dest->getType()
+                        : ArithmeticTypeOverride;
+    (void)Ty;
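+    // pcmpeqb/pcmpeqw/pcmpeqd are SSE2; pcmpeqq, needed for 64-bit elements,
+    // was introduced with SSE4.1, hence the check below.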
+    assert((Ty != IceType_f64 && Ty != IceType_i64) ||
+           getInstructionSet(Func) >= SSE4_1);
+    return new (Func->allocate<InstX86Pcmpeq>())
+        InstX86Pcmpeq(Func, Dest, Source, ArithmeticTypeOverride);
+  }
+
+private:
+  InstX86Pcmpeq(Cfg *Func, Variable *Dest, Operand *Source,
+                Type ArithmeticTypeOverride)
+      : InstX86BaseBinopXmm<InstX86Base::Pcmpeq, true,
+                            InstX86Base::SseSuffix::Integral>(
+            Func, Dest, Source, ArithmeticTypeOverride) {}
+};
+
+class InstX86Pcmpgt
+    : public InstX86BaseBinopXmm<InstX86Base::Pcmpgt, true,
+                                 InstX86Base::SseSuffix::Integral> {
+public:
+  static InstX86Pcmpgt *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    assert(Dest->getType() != IceType_f64 ||
+           getInstructionSet(Func) >= SSE4_1);
+    return new (Func->allocate<InstX86Pcmpgt>())
+        InstX86Pcmpgt(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pcmpgt(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pcmpgt, true,
+                            InstX86Base::SseSuffix::Integral>(Func, Dest,
+                                                              Source) {}
+};
+
+/// movss is only a binary operation when the source and dest operands are
+/// both registers (the high bits of dest are left untouched). In other cases,
+/// it behaves like a copy (mov-like) operation (and the high bits of dest are
+/// cleared). InstX86MovssRegs will assert that both its source and dest
+/// operands are registers, so the lowering code should use _mov instead of
+/// _movss in cases where a copy operation is intended.
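+/// A minimal sketch of the distinction (AT&T operand order assumed):
+///   movss %xmm1, %xmm0   -> reg-reg: writes the low 32 bits of %xmm0 and
+///                           preserves the upper 96 bits
+///   movss (%eax), %xmm0  -> mem-reg: loads 32 bits and zeroes the upper 96
+///                           bits, i.e. a plain copy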
+class InstX86MovssRegs
+    : public InstX86BaseBinopXmm<InstX86Base::MovssRegs, false,
+                                 InstX86Base::SseSuffix::None> {
+public:
+  static InstX86MovssRegs *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86MovssRegs>())
+        InstX86MovssRegs(Func, Dest, Source);
+  }
+
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86MovssRegs(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::MovssRegs, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Idiv : public InstX86BaseTernop<InstX86Base::Idiv> {
+public:
+  static InstX86Idiv *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                             Operand *Source2) {
+    return new (Func->allocate<InstX86Idiv>())
+        InstX86Idiv(Func, Dest, Source1, Source2);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Idiv(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
+      : InstX86BaseTernop<InstX86Base::Idiv>(Func, Dest, Source1, Source2) {}
+};
+
+class InstX86Div : public InstX86BaseTernop<InstX86Base::Div> {
+public:
+  static InstX86Div *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                            Operand *Source2) {
+    return new (Func->allocate<InstX86Div>())
+        InstX86Div(Func, Dest, Source1, Source2);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Div(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
+      : InstX86BaseTernop<InstX86Base::Div>(Func, Dest, Source1, Source2) {}
+};
+
+class InstX86Insertps : public InstX86BaseTernop<InstX86Base::Insertps> {
+public:
+  static InstX86Insertps *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                                 Operand *Source2) {
+    return new (Func->allocate<InstX86Insertps>())
+        InstX86Insertps(Func, Dest, Source1, Source2);
+  }
+
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Insertps(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
+      : InstX86BaseTernop<InstX86Base::Insertps>(Func, Dest, Source1, Source2) {
+  }
+};
+
+class InstX86Pinsr : public InstX86BaseTernop<InstX86Base::Pinsr> {
+public:
+  static InstX86Pinsr *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                              Operand *Source2) {
+    // pinsrb and pinsrd are SSE4.1 instructions.
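+    // pinsrw (used for v8i16 and v8i1) is available from SSE2 and needs no
+    // check.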
+    assert(Dest->getType() == IceType_v8i16 ||
+           Dest->getType() == IceType_v8i1 ||
+           getInstructionSet(Func) >= SSE4_1);
+    return new (Func->allocate<InstX86Pinsr>())
+        InstX86Pinsr(Func, Dest, Source1, Source2);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Pinsr(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
+      : InstX86BaseTernop<InstX86Base::Pinsr>(Func, Dest, Source1, Source2) {}
+};
+
+class InstX86Shufps : public InstX86BaseTernop<InstX86Base::Shufps> {
+public:
+  static InstX86Shufps *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                               Operand *Source2) {
+    return new (Func->allocate<InstX86Shufps>())
+        InstX86Shufps(Func, Dest, Source1, Source2);
+  }
+
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Shufps(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
+      : InstX86BaseTernop<InstX86Base::Shufps>(Func, Dest, Source1, Source2) {}
+};
+
+class InstX86Blendvps : public InstX86BaseTernop<InstX86Base::Blendvps> {
+public:
+  static InstX86Blendvps *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                                 Operand *Source2) {
+    assert(getInstructionSet(Func) >= SSE4_1);
+    return new (Func->allocate<InstX86Blendvps>())
+        InstX86Blendvps(Func, Dest, Source1, Source2);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Blendvps(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
+      : InstX86BaseTernop<InstX86Base::Blendvps>(Func, Dest, Source1, Source2) {
+  }
+};
+
+class InstX86Pblendvb : public InstX86BaseTernop<InstX86Base::Pblendvb> {
+public:
+  static InstX86Pblendvb *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                                 Operand *Source2) {
+    assert(getInstructionSet(Func) >= SSE4_1);
+    return new (Func->allocate<InstX86Pblendvb>())
+        InstX86Pblendvb(Func, Dest, Source1, Source2);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Pblendvb(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
+      : InstX86BaseTernop<InstX86Base::Pblendvb>(Func, Dest, Source1, Source2) {
+  }
+};
+
+class InstX86Pextr : public InstX86BaseThreeAddressop<InstX86Base::Pextr> {
+public:
+  static InstX86Pextr *create(Cfg *Func, Variable *Dest, Operand *Source0,
+                              Operand *Source1) {
+    assert(Source0->getType() == IceType_v8i16 ||
+           Source0->getType() == IceType_v8i1 ||
+           getInstructionSet(Func) >= SSE4_1);
+    return new (Func->allocate<InstX86Pextr>())
+        InstX86Pextr(Func, Dest, Source0, Source1);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Pextr(Cfg *Func, Variable *Dest, Operand *Source0, Operand *Source1)
+      : InstX86BaseThreeAddressop<InstX86Base::Pextr>(Func, Dest, Source0,
+                                                      Source1) {}
+};
+
+class InstX86Pshufd : public InstX86BaseThreeAddressop<InstX86Base::Pshufd> {
+public:
+  static InstX86Pshufd *create(Cfg *Func, Variable *Dest, Operand *Source0,
+                               Operand *Source1) {
+    return new (Func->allocate<InstX86Pshufd>())
+        InstX86Pshufd(Func, Dest, Source0, Source1);
+  }
+
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Pshufd(Cfg *Func, Variable *Dest, Operand *Source0, Operand *Source1)
+      : InstX86BaseThreeAddressop<InstX86Base::Pshufd>(Func, Dest, Source0,
+                                                       Source1) {}
+};
+
+/// Base class for a lockable x86-32 instruction (emits a lock prefix).
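+/// When Locked is set, emission prefixes the instruction accordingly, e.g.
+/// (a sketch in AT&T syntax): "lock xadd %ecx, (%edx)".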
+class InstX86BaseLockable : public InstX86Base {
+  InstX86BaseLockable() = delete;
+  InstX86BaseLockable(const InstX86BaseLockable &) = delete;
+  InstX86BaseLockable &operator=(const InstX86BaseLockable &) = delete;
+
+protected:
+  bool Locked;
+
+  InstX86BaseLockable(Cfg *Func, InstX86Base::InstKindX86 Kind, SizeT Maxsrcs,
+                      Variable *Dest, bool Locked)
+      : InstX86Base(Func, Kind, Maxsrcs, Dest), Locked(Locked) {
+    // Assume that such instructions are used for Atomics and be careful with
+    // optimizations.
+    this->HasSideEffects = Locked;
+  }
+};
+
+/// Mul instruction - unsigned multiply.
+class InstX86Mul final : public InstX86Base {
+  InstX86Mul() = delete;
+  InstX86Mul(const InstX86Mul &) = delete;
+  InstX86Mul &operator=(const InstX86Mul &) = delete;
+
+public:
+  static InstX86Mul *create(Cfg *Func, Variable *Dest, Variable *Source1,
+                            Operand *Source2) {
+    return new (Func->allocate<InstX86Mul>())
+        InstX86Mul(Func, Dest, Source1, Source2);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Mul);
+  }
+
+private:
+  InstX86Mul(Cfg *Func, Variable *Dest, Variable *Source1, Operand *Source2);
+};
+
+/// Shld instruction - shift across a pair of operands.
+class InstX86Shld final : public InstX86Base {
+  InstX86Shld() = delete;
+  InstX86Shld(const InstX86Shld &) = delete;
+  InstX86Shld &operator=(const InstX86Shld &) = delete;
+
+public:
+  static InstX86Shld *create(Cfg *Func, Variable *Dest, Variable *Source1,
+                             Operand *Source2) {
+    return new (Func->allocate<InstX86Shld>())
+        InstX86Shld(Func, Dest, Source1, Source2);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Shld);
+  }
+
+private:
+  InstX86Shld(Cfg *Func, Variable *Dest, Variable *Source1, Operand *Source2);
+};
+
+/// Shrd instruction - shift across a pair of operands.
+class InstX86Shrd final : public InstX86Base {
+  InstX86Shrd() = delete;
+  InstX86Shrd(const InstX86Shrd &) = delete;
+  InstX86Shrd &operator=(const InstX86Shrd &) = delete;
+
+public:
+  static InstX86Shrd *create(Cfg *Func, Variable *Dest, Variable *Source1,
+                             Operand *Source2) {
+    return new (Func->allocate<InstX86Shrd>())
+        InstX86Shrd(Func, Dest, Source1, Source2);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Shrd);
+  }
+
+private:
+  InstX86Shrd(Cfg *Func, Variable *Dest, Variable *Source1, Operand *Source2);
+};
+
+/// Conditional move instruction.
+class InstX86Cmov final : public InstX86Base {
+  InstX86Cmov() = delete;
+  InstX86Cmov(const InstX86Cmov &) = delete;
+  InstX86Cmov &operator=(const InstX86Cmov &) = delete;
+
+public:
+  static InstX86Cmov *create(Cfg *Func, Variable *Dest, Operand *Source,
+                             BrCond Cond) {
+    return new (Func->allocate<InstX86Cmov>())
+        InstX86Cmov(Func, Dest, Source, Cond);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Cmov);
+  }
+
+private:
+  InstX86Cmov(Cfg *Func, Variable *Dest, Operand *Source, BrCond Cond);
+
+  BrCond Condition;
+};
+
+/// Cmpps instruction - compare packed single-precision floating-point values.
+class InstX86Cmpps final : public InstX86Base {
+  InstX86Cmpps() = delete;
+  InstX86Cmpps(const InstX86Cmpps &) = delete;
+  InstX86Cmpps &operator=(const InstX86Cmpps &) = delete;
+
+public:
+  static InstX86Cmpps *create(Cfg *Func, Variable *Dest, Operand *Source,
+                              CmppsCond Condition) {
+    return new (Func->allocate<InstX86Cmpps>())
+        InstX86Cmpps(Func, Dest, Source, Condition);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Cmpps);
+  }
+
+private:
+  InstX86Cmpps(Cfg *Func, Variable *Dest, Operand *Source, CmppsCond Cond);
+
+  CmppsCond Condition;
+};
+
+/// Cmpxchg instruction - "cmpxchg <dest>, <desired>" compares <dest> with
+/// eax. If they are equal, ZF is set and <desired> is stored in <dest>;
+/// otherwise, ZF is cleared and <dest> is copied to eax (or subregister).
+/// <dest> can be a register or memory, while <desired> must be a register.
+/// It is the user's responsibility to mark eax with a FakeDef.
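+/// Roughly, executed atomically when the lock prefix is present:
+///   if (eax == <dest>) { ZF = 1; <dest> = <desired>; }
+///   else               { ZF = 0; eax = <dest>; }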
+class InstX86Cmpxchg final : public InstX86BaseLockable {
+  InstX86Cmpxchg() = delete;
+  InstX86Cmpxchg(const InstX86Cmpxchg &) = delete;
+  InstX86Cmpxchg &operator=(const InstX86Cmpxchg &) = delete;
+
+public:
+  static InstX86Cmpxchg *create(Cfg *Func, Operand *DestOrAddr, Variable *Eax,
+                                Variable *Desired, bool Locked) {
+    return new (Func->allocate<InstX86Cmpxchg>())
+        InstX86Cmpxchg(Func, DestOrAddr, Eax, Desired, Locked);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Cmpxchg);
+  }
+
+private:
+  InstX86Cmpxchg(Cfg *Func, Operand *DestOrAddr, Variable *Eax,
+                 Variable *Desired, bool Locked);
+};
+
+/// Cmpxchg8b instruction - "cmpxchg8b <m64>" compares <m64> with edx:eax.
+/// If they are equal, ZF is set and ecx:ebx is stored in <m64>; if not, ZF
+/// is cleared and <m64> is copied to edx:eax. The caller is responsible for
+/// inserting FakeDefs to mark edx and eax as modified. <m64> must be a
+/// memory operand.
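+/// Roughly, executed atomically when the lock prefix is present:
+///   if (edx:eax == <m64>) { ZF = 1; <m64> = ecx:ebx; }
+///   else                  { ZF = 0; edx:eax = <m64>; }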
+class InstX86Cmpxchg8b final : public InstX86BaseLockable {
+  InstX86Cmpxchg8b() = delete;
+  InstX86Cmpxchg8b(const InstX86Cmpxchg8b &) = delete;
+  InstX86Cmpxchg8b &operator=(const InstX86Cmpxchg8b &) = delete;
+
+public:
+  static InstX86Cmpxchg8b *create(Cfg *Func, X86OperandMem *Dest, Variable *Edx,
+                                  Variable *Eax, Variable *Ecx, Variable *Ebx,
+                                  bool Locked) {
+    return new (Func->allocate<InstX86Cmpxchg8b>())
+        InstX86Cmpxchg8b(Func, Dest, Edx, Eax, Ecx, Ebx, Locked);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Cmpxchg8b);
+  }
+
+private:
+  InstX86Cmpxchg8b(Cfg *Func, X86OperandMem *Dest, Variable *Edx, Variable *Eax,
+                   Variable *Ecx, Variable *Ebx, bool Locked);
+};
+
+/// Cvt instruction - wrapper for cvtsX2sY where X and Y are in {s,d,i} as
+/// appropriate.  s=float, d=double, i=int. X and Y are determined from
+/// dest/src types. Sign and zero extension on the integer operand needs to be
+/// done separately.
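+/// Assuming the usual SSE mnemonics, the variants map roughly as follows:
+///   Si2ss -> cvtsi2ss, Tss2si -> cvttss2si, Ss2si -> cvtss2si,
+///   Float2float -> cvtss2sd or cvtsd2ss, Dq2ps -> cvtdq2ps,
+///   Tps2dq -> cvttps2dq, Ps2dq -> cvtps2dq.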
+class InstX86Cvt final : public InstX86Base {
+  InstX86Cvt() = delete;
+  InstX86Cvt(const InstX86Cvt &) = delete;
+  InstX86Cvt &operator=(const InstX86Cvt &) = delete;
+
+public:
+  enum CvtVariant { Si2ss, Tss2si, Ss2si, Float2float, Dq2ps, Tps2dq, Ps2dq };
+  static InstX86Cvt *create(Cfg *Func, Variable *Dest, Operand *Source,
+                            CvtVariant Variant) {
+    return new (Func->allocate<InstX86Cvt>())
+        InstX86Cvt(Func, Dest, Source, Variant);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Cvt);
+  }
+  bool isTruncating() const { return Variant == Tss2si || Variant == Tps2dq; }
+
+private:
+  CvtVariant Variant;
+  InstX86Cvt(Cfg *Func, Variable *Dest, Operand *Source, CvtVariant Variant);
+};
+
+/// Round instruction
+class InstX86Round final
+    : public InstX86BaseThreeAddressop<InstX86Base::Round> {
+public:
+  static InstX86Round *create(Cfg *Func, Variable *Dest, Operand *Source,
+                              Operand *Imm) {
+    return new (Func->allocate<InstX86Round>())
+        InstX86Round(Func, Dest, Source, Imm);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86Round(Cfg *Func, Variable *Dest, Operand *Source, Operand *Imm)
+      : InstX86BaseThreeAddressop<InstX86Base::Round>(Func, Dest, Source, Imm) {
+  }
+};
+
+/// cmp - Integer compare instruction.
+class InstX86Icmp final : public InstX86Base {
+  InstX86Icmp() = delete;
+  InstX86Icmp(const InstX86Icmp &) = delete;
+  InstX86Icmp &operator=(const InstX86Icmp &) = delete;
+
+public:
+  static InstX86Icmp *create(Cfg *Func, Operand *Src1, Operand *Src2) {
+    return new (Func->allocate<InstX86Icmp>()) InstX86Icmp(Func, Src1, Src2);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Icmp);
+  }
+
+private:
+  InstX86Icmp(Cfg *Func, Operand *Src1, Operand *Src2);
+};
+
+/// ucomiss/ucomisd - floating-point compare instruction.
+class InstX86Ucomiss final : public InstX86Base {
+  InstX86Ucomiss() = delete;
+  InstX86Ucomiss(const InstX86Ucomiss &) = delete;
+  InstX86Ucomiss &operator=(const InstX86Ucomiss &) = delete;
+
+public:
+  static InstX86Ucomiss *create(Cfg *Func, Operand *Src1, Operand *Src2) {
+    return new (Func->allocate<InstX86Ucomiss>())
+        InstX86Ucomiss(Func, Src1, Src2);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Ucomiss);
+  }
+
+private:
+  InstX86Ucomiss(Cfg *Func, Operand *Src1, Operand *Src2);
+};
+
+/// UD2 instruction.
+class InstX86UD2 final : public InstX86Base {
+  InstX86UD2() = delete;
+  InstX86UD2(const InstX86UD2 &) = delete;
+  InstX86UD2 &operator=(const InstX86UD2 &) = delete;
+
+public:
+  static InstX86UD2 *create(Cfg *Func) {
+    return new (Func->allocate<InstX86UD2>()) InstX86UD2(Func);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::UD2);
+  }
+
+private:
+  explicit InstX86UD2(Cfg *Func);
+};
+
+/// Int3 instruction.
+class InstX86Int3 final : public InstX86Base {
+  InstX86Int3() = delete;
+  InstX86Int3(const InstX86Int3 &) = delete;
+  InstX86Int3 &operator=(const InstX86Int3 &) = delete;
+
+public:
+  static InstX86Int3 *create(Cfg *Func) {
+    return new (Func->allocate<InstX86Int3>()) InstX86Int3(Func);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Int3);
+  }
+
+private:
+  explicit InstX86Int3(Cfg *Func);
+};
+
+/// Test instruction.
+class InstX86Test final : public InstX86Base {
+  InstX86Test() = delete;
+  InstX86Test(const InstX86Test &) = delete;
+  InstX86Test &operator=(const InstX86Test &) = delete;
+
+public:
+  static InstX86Test *create(Cfg *Func, Operand *Source1, Operand *Source2) {
+    return new (Func->allocate<InstX86Test>())
+        InstX86Test(Func, Source1, Source2);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Test);
+  }
+
+private:
+  InstX86Test(Cfg *Func, Operand *Source1, Operand *Source2);
+};
+
+/// Mfence instruction.
+class InstX86Mfence final : public InstX86Base {
+  InstX86Mfence() = delete;
+  InstX86Mfence(const InstX86Mfence &) = delete;
+  InstX86Mfence &operator=(const InstX86Mfence &) = delete;
+
+public:
+  static InstX86Mfence *create(Cfg *Func) {
+    return new (Func->allocate<InstX86Mfence>()) InstX86Mfence(Func);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Mfence);
+  }
+
+private:
+  explicit InstX86Mfence(Cfg *Func);
+};
+
+/// This is essentially a "mov" instruction with anX86OperandMem operand
+/// instead of Variable as the destination. It's important for liveness that
+/// there is no Dest operand.
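+///
+/// An illustrative sketch, not part of this change (hypothetical names,
+/// assuming a lowering context that already has a Cfg *Func, an Operand *Val,
+/// and an X86Operand *Mem), using the create() factory below:
+///
+///   InstX86Store *Store = InstX86Store::create(Func, Val, Mem);
+///   // No Dest: liveness treats both Val and Mem purely as sources.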
+class InstX86Store final : public InstX86Base {
+  InstX86Store() = delete;
+  InstX86Store(const InstX86Store &) = delete;
+  InstX86Store &operator=(const InstX86Store &) = delete;
+
+public:
+  static InstX86Store *create(Cfg *Func, Operand *Value, X86Operand *Mem) {
+    return new (Func->allocate<InstX86Store>()) InstX86Store(Func, Value, Mem);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Store);
+  }
+
+private:
+  InstX86Store(Cfg *Func, Operand *Value, X86Operand *Mem);
+};
+
+/// This is essentially a vector "mov" instruction with an X86OperandMem
+/// operand instead of a Variable as the destination. It's important for
+/// liveness that there is no Dest operand. The source must be an Xmm
+/// register, since Dest is mem.
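+///
+/// A minimal sketch with hypothetical names (a Variable *Vec known to be in
+/// an Xmm register and an X86OperandMem *Mem from the lowering context):
+///
+///   InstX86StoreP *StoreP = InstX86StoreP::create(Func, Vec, Mem);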
+class InstX86StoreP final : public InstX86Base {
+  InstX86StoreP() = delete;
+  InstX86StoreP(const InstX86StoreP &) = delete;
+  InstX86StoreP &operator=(const InstX86StoreP &) = delete;
+
+public:
+  static InstX86StoreP *create(Cfg *Func, Variable *Value, X86OperandMem *Mem) {
+    return new (Func->allocate<InstX86StoreP>())
+        InstX86StoreP(Func, Value, Mem);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::StoreP);
+  }
+
+private:
+  InstX86StoreP(Cfg *Func, Variable *Value, X86OperandMem *Mem);
+};
+
+class InstX86StoreQ final : public InstX86Base {
+  InstX86StoreQ() = delete;
+  InstX86StoreQ(const InstX86StoreQ &) = delete;
+  InstX86StoreQ &operator=(const InstX86StoreQ &) = delete;
+
+public:
+  static InstX86StoreQ *create(Cfg *Func, Operand *Value, X86OperandMem *Mem) {
+    return new (Func->allocate<InstX86StoreQ>())
+        InstX86StoreQ(Func, Value, Mem);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::StoreQ);
+  }
+
+private:
+  InstX86StoreQ(Cfg *Func, Operand *Value, X86OperandMem *Mem);
+};
+
+class InstX86StoreD final : public InstX86Base {
+  InstX86StoreD() = delete;
+  InstX86StoreD(const InstX86StoreD &) = delete;
+  InstX86StoreD &operator=(const InstX86StoreD &) = delete;
+
+public:
+  static InstX86StoreD *create(Cfg *Func, Operand *Value, X86OperandMem *Mem) {
+    return new (Func->allocate<InstX86StoreD>())
+        InstX86StoreD(Func, Value, Mem);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::StoreD);
+  }
+
+private:
+  InstX86StoreD(Cfg *Func, Operand *Value, X86OperandMem *Mem);
+};
+
+/// Nop instructions of varying length
+class InstX86Nop final : public InstX86Base {
+  InstX86Nop() = delete;
+  InstX86Nop(const InstX86Nop &) = delete;
+  InstX86Nop &operator=(const InstX86Nop &) = delete;
+
+public:
+  // TODO: Replace with enum.
+  using NopVariant = unsigned;
+
+  static InstX86Nop *create(Cfg *Func, NopVariant Variant) {
+    return new (Func->allocate<InstX86Nop>()) InstX86Nop(Func, Variant);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Nop);
+  }
+
+private:
+  InstX86Nop(Cfg *Func, NopVariant Variant);
+
+  NopVariant Variant;
+};
+
+class InstX86Pop final : public InstX86Base {
+  InstX86Pop() = delete;
+  InstX86Pop(const InstX86Pop &) = delete;
+  InstX86Pop &operator=(const InstX86Pop &) = delete;
+
+public:
+  static InstX86Pop *create(Cfg *Func, Variable *Dest) {
+    return new (Func->allocate<InstX86Pop>()) InstX86Pop(Func, Dest);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Pop);
+  }
+
+private:
+  InstX86Pop(Cfg *Func, Variable *Dest);
+};
+
+class InstX86Push final : public InstX86Base {
+  InstX86Push() = delete;
+  InstX86Push(const InstX86Push &) = delete;
+  InstX86Push &operator=(const InstX86Push &) = delete;
+
+public:
+  static InstX86Push *create(Cfg *Func, Operand *Source) {
+    return new (Func->allocate<InstX86Push>()) InstX86Push(Func, Source);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Push);
+  }
+
+private:
+  InstX86Push(Cfg *Func, Operand *Source);
+};
+
+/// Ret instruction. Currently only supports the "ret" version that does not
+/// pop arguments. This instruction takes a Source operand (for non-void
+/// returning functions) for liveness analysis, though a FakeUse before the
+/// ret would do just as well.
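+///
+/// A minimal sketch with hypothetical names (RetVal being the Variable
+/// pre-assigned to the return register, or omitted for void functions):
+///
+///   InstX86Ret *Ret = InstX86Ret::create(Func, RetVal);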
+class InstX86Ret final : public InstX86Base {
+  InstX86Ret() = delete;
+  InstX86Ret(const InstX86Ret &) = delete;
+  InstX86Ret &operator=(const InstX86Ret &) = delete;
+
+public:
+  static InstX86Ret *create(Cfg *Func, Variable *Source = nullptr) {
+    return new (Func->allocate<InstX86Ret>()) InstX86Ret(Func, Source);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Ret);
+  }
+
+private:
+  InstX86Ret(Cfg *Func, Variable *Source);
+};
+
+/// Conditional set-byte instruction.
+class InstX86Setcc final : public InstX86Base {
+  InstX86Setcc() = delete;
+  InstX86Setcc(const InstX86Cmov &) = delete;
+  InstX86Setcc &operator=(const InstX86Setcc &) = delete;
+
+public:
+  static InstX86Setcc *create(Cfg *Func, Variable *Dest, BrCond Cond) {
+    return new (Func->allocate<InstX86Setcc>()) InstX86Setcc(Func, Dest, Cond);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Setcc);
+  }
+
+private:
+  InstX86Setcc(Cfg *Func, Variable *Dest, BrCond Cond);
+
+  const BrCond Condition;
+};
+
+/// Exchanging Add instruction. Exchanges the first operand (destination
+/// operand) with the second operand (source operand), then loads the sum of
+/// the two values into the destination operand. The destination may be a
+/// register or memory, while the source must be a register.
+///
+/// Both the dest and source are updated. The caller should then insert a
+/// FakeDef to reflect the second update.
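+///
+/// An illustrative sketch with hypothetical names (Addr an X86OperandMem and
+/// Val a Variable, both from the lowering context):
+///
+///   constexpr bool Locked = true;
+///   InstX86Xadd *Xadd = InstX86Xadd::create(Func, Addr, Val, Locked);
+///   // Val now carries the old memory contents, so a FakeDef of Val should
+///   // follow to make the second update visible to liveness.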
+class InstX86Xadd final : public InstX86BaseLockable {
+  InstX86Xadd() = delete;
+  InstX86Xadd(const InstX86Xadd &) = delete;
+  InstX86Xadd &operator=(const InstX86Xadd &) = delete;
+
+public:
+  static InstX86Xadd *create(Cfg *Func, Operand *Dest, Variable *Source,
+                             bool Locked) {
+    return new (Func->allocate<InstX86Xadd>())
+        InstX86Xadd(Func, Dest, Source, Locked);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Xadd);
+  }
+
+private:
+  InstX86Xadd(Cfg *Func, Operand *Dest, Variable *Source, bool Locked);
+};
+
+/// Exchange instruction. Exchanges the first operand (destination operand)
+/// with the second operand (source operand). At least one of the operands
+/// must be a register (and the other can be reg or mem). Both the Dest and
+/// Source are updated. If there is a memory operand, then the instruction is
+/// automatically "locked" without the need for a lock prefix.
+class InstX86Xchg final : public InstX86Base {
+  InstX86Xchg() = delete;
+  InstX86Xchg(const InstX86Xchg &) = delete;
+  InstX86Xchg &operator=(const InstX86Xchg &) = delete;
+
+public:
+  static InstX86Xchg *create(Cfg *Func, Operand *Dest, Variable *Source) {
+    return new (Func->allocate<InstX86Xchg>()) InstX86Xchg(Func, Dest, Source);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::Xchg);
+  }
+
+private:
+  InstX86Xchg(Cfg *Func, Operand *Dest, Variable *Source);
+};
+
+/// Start marker for the Intel Architecture Code Analyzer. This is not an
+/// executable instruction and must only be used for analysis.
+class InstX86IacaStart final : public InstX86Base {
+  InstX86IacaStart() = delete;
+  InstX86IacaStart(const InstX86IacaStart &) = delete;
+  InstX86IacaStart &operator=(const InstX86IacaStart &) = delete;
+
+public:
+  static InstX86IacaStart *create(Cfg *Func) {
+    return new (Func->allocate<InstX86IacaStart>()) InstX86IacaStart(Func);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::IacaStart);
+  }
+
+private:
+  explicit InstX86IacaStart(Cfg *Func);
+};
+
+/// End marker for the Intel Architecture Code Analyzer. This is not an
+/// executable instruction and must only be used for analysis.
+class InstX86IacaEnd final : public InstX86Base {
+  InstX86IacaEnd() = delete;
+  InstX86IacaEnd(const InstX86IacaEnd &) = delete;
+  InstX86IacaEnd &operator=(const InstX86IacaEnd &) = delete;
+
+public:
+  static InstX86IacaEnd *create(Cfg *Func) {
+    return new (Func->allocate<InstX86IacaEnd>()) InstX86IacaEnd(Func);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return InstX86Base::isClassof(Instr, InstX86Base::IacaEnd);
+  }
+
+private:
+  explicit InstX86IacaEnd(Cfg *Func);
+};
+
+class InstX86Pshufb : public InstX86BaseBinopXmm<InstX86Base::Pshufb, false,
+                                                 InstX86Base::SseSuffix::None> {
+public:
+  static InstX86Pshufb *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Pshufb>())
+        InstX86Pshufb(Func, Dest, Source);
+  }
+
+private:
+  InstX86Pshufb(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Pshufb, false,
+                            InstX86Base::SseSuffix::None>(Func, Dest, Source) {}
+};
+
+class InstX86Punpckl
+    : public InstX86BaseBinopXmm<InstX86Base::Punpckl, false,
+                                 InstX86Base::SseSuffix::Unpack> {
+public:
+  static InstX86Punpckl *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Punpckl>())
+        InstX86Punpckl(Func, Dest, Source);
+  }
+
+private:
+  InstX86Punpckl(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Punpckl, false,
+                            InstX86Base::SseSuffix::Unpack>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Punpckh
+    : public InstX86BaseBinopXmm<InstX86Base::Punpckh, false,
+                                 InstX86Base::SseSuffix::Unpack> {
+public:
+  static InstX86Punpckh *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Punpckh>())
+        InstX86Punpckh(Func, Dest, Source);
+  }
+
+private:
+  InstX86Punpckh(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Punpckh, false,
+                            InstX86Base::SseSuffix::Unpack>(Func, Dest,
+                                                            Source) {}
+};
+
+class InstX86Packss : public InstX86BaseBinopXmm<InstX86Base::Packss, false,
+                                                 InstX86Base::SseSuffix::Pack> {
+public:
+  static InstX86Packss *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Packss>())
+        InstX86Packss(Func, Dest, Source);
+  }
+
+private:
+  InstX86Packss(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Packss, false,
+                            InstX86Base::SseSuffix::Pack>(Func, Dest, Source) {}
+};
+
+class InstX86Packus : public InstX86BaseBinopXmm<InstX86Base::Packus, false,
+                                                 InstX86Base::SseSuffix::Pack> {
+public:
+  static InstX86Packus *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX86Packus>())
+        InstX86Packus(Func, Dest, Source);
+  }
+
+private:
+  InstX86Packus(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX86BaseBinopXmm<InstX86Base::Packus, false,
+                            InstX86Base::SseSuffix::Pack>(Func, Dest, Source) {}
+};
+
+/// struct Insts collects all the X86 instruction classes under a single name,
+/// so that a target can refer to them with a simple
+///
+/// using Insts = ::Ice::X8664::Insts;
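+///
+/// Lowering code can then name instructions through the aliases, e.g. (with
+/// hypothetical Dest/Src operands):
+///
+///   auto *Mov = Insts::Mov::create(Func, Dest, Src);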
+struct Insts {
+  using FakeRMW = InstX86FakeRMW;
+  using Label = InstX86Label;
+
+  using Call = InstX86Call;
+
+  using Br = InstX86Br;
+  using Jmp = InstX86Jmp;
+  using Bswap = InstX86Bswap;
+  using Neg = InstX86Neg;
+  using Bsf = InstX86Bsf;
+  using Bsr = InstX86Bsr;
+  using Lea = InstX86Lea;
+  using Cbwdq = InstX86Cbwdq;
+  using Movsx = InstX86Movsx;
+  using Movzx = InstX86Movzx;
+  using Movd = InstX86Movd;
+  using Movmsk = InstX86Movmsk;
+  using Sqrt = InstX86Sqrt;
+  using Mov = InstX86Mov;
+  using Movp = InstX86Movp;
+  using Movq = InstX86Movq;
+  using Add = InstX86Add;
+  using AddRMW = InstX86AddRMW;
+  using Addps = InstX86Addps;
+  using Adc = InstX86Adc;
+  using AdcRMW = InstX86AdcRMW;
+  using Addss = InstX86Addss;
+  using Andnps = InstX86Andnps;
+  using Andps = InstX86Andps;
+  using Padd = InstX86Padd;
+  using Padds = InstX86Padds;
+  using Paddus = InstX86Paddus;
+  using Sub = InstX86Sub;
+  using SubRMW = InstX86SubRMW;
+  using Subps = InstX86Subps;
+  using Subss = InstX86Subss;
+  using Sbb = InstX86Sbb;
+  using SbbRMW = InstX86SbbRMW;
+  using Psub = InstX86Psub;
+  using Psubs = InstX86Psubs;
+  using Psubus = InstX86Psubus;
+  using And = InstX86And;
+  using AndRMW = InstX86AndRMW;
+  using Pand = InstX86Pand;
+  using Pandn = InstX86Pandn;
+  using Or = InstX86Or;
+  using Orps = InstX86Orps;
+  using OrRMW = InstX86OrRMW;
+  using Por = InstX86Por;
+  using Xor = InstX86Xor;
+  using Xorps = InstX86Xorps;
+  using XorRMW = InstX86XorRMW;
+  using Pxor = InstX86Pxor;
+  using Maxss = InstX86Maxss;
+  using Minss = InstX86Minss;
+  using Maxps = InstX86Maxps;
+  using Minps = InstX86Minps;
+  using Imul = InstX86Imul;
+  using ImulImm = InstX86ImulImm;
+  using Mulps = InstX86Mulps;
+  using Mulss = InstX86Mulss;
+  using Pmull = InstX86Pmull;
+  using Pmulhw = InstX86Pmulhw;
+  using Pmulhuw = InstX86Pmulhuw;
+  using Pmaddwd = InstX86Pmaddwd;
+  using Pmuludq = InstX86Pmuludq;
+  using Divps = InstX86Divps;
+  using Divss = InstX86Divss;
+  using Rol = InstX86Rol;
+  using Shl = InstX86Shl;
+  using Psll = InstX86Psll;
+  using Psrl = InstX86Psrl;
+  using Shr = InstX86Shr;
+  using Sar = InstX86Sar;
+  using Psra = InstX86Psra;
+  using Pcmpeq = InstX86Pcmpeq;
+  using Pcmpgt = InstX86Pcmpgt;
+  using MovssRegs = InstX86MovssRegs;
+  using Idiv = InstX86Idiv;
+  using Div = InstX86Div;
+  using Insertps = InstX86Insertps;
+  using Pinsr = InstX86Pinsr;
+  using Shufps = InstX86Shufps;
+  using Blendvps = InstX86Blendvps;
+  using Pblendvb = InstX86Pblendvb;
+  using Pextr = InstX86Pextr;
+  using Pshufd = InstX86Pshufd;
+  using Lockable = InstX86BaseLockable;
+  using Mul = InstX86Mul;
+  using Shld = InstX86Shld;
+  using Shrd = InstX86Shrd;
+  using Cmov = InstX86Cmov;
+  using Cmpps = InstX86Cmpps;
+  using Cmpxchg = InstX86Cmpxchg;
+  using Cmpxchg8b = InstX86Cmpxchg8b;
+  using Cvt = InstX86Cvt;
+  using Round = InstX86Round;
+  using Icmp = InstX86Icmp;
+  using Ucomiss = InstX86Ucomiss;
+  using UD2 = InstX86UD2;
+  using Int3 = InstX86Int3;
+  using Test = InstX86Test;
+  using Mfence = InstX86Mfence;
+  using Store = InstX86Store;
+  using StoreP = InstX86StoreP;
+  using StoreQ = InstX86StoreQ;
+  using StoreD = InstX86StoreD;
+  using Nop = InstX86Nop;
+  using Pop = InstX86Pop;
+  using Push = InstX86Push;
+  using Ret = InstX86Ret;
+  using Setcc = InstX86Setcc;
+  using Xadd = InstX86Xadd;
+  using Xchg = InstX86Xchg;
+
+  using IacaStart = InstX86IacaStart;
+  using IacaEnd = InstX86IacaEnd;
+
+  using Pshufb = InstX86Pshufb;
+  using Punpckl = InstX86Punpckl;
+  using Punpckh = InstX86Punpckh;
+  using Packss = InstX86Packss;
+  using Packus = InstX86Packus;
+};
+
+/// X86 Instructions have static data (particularly, opcodes and instruction
+/// emitters). Each X86 target needs to define all of these; they are grouped
+/// here so that, if something changes, every definition can be updated in one
+/// place.
+/* In-place ops */
+template <> constexpr const char *InstX86Bswap::Base::Opcode = "bswap";
+template <> constexpr const char *InstX86Neg::Base::Opcode = "neg";
+/* Unary ops */
+template <> constexpr const char *InstX86Bsf::Base::Opcode = "bsf";
+template <> constexpr const char *InstX86Bsr::Base::Opcode = "bsr";
+template <> constexpr const char *InstX86Lea::Base::Opcode = "lea";
+template <> constexpr const char *InstX86Movd::Base::Opcode = "movd";
+template <> constexpr const char *InstX86Movsx::Base::Opcode = "movs";
+template <> constexpr const char *InstX86Movzx::Base::Opcode = "movz";
+template <> constexpr const char *InstX86Sqrt::Base::Opcode = "sqrt";
+template <> constexpr const char *InstX86Cbwdq::Base::Opcode = "cbw/cwd/cdq";
+/* Mov-like ops */
+template <> constexpr const char *InstX86Mov::Base::Opcode = "mov";
+template <> constexpr const char *InstX86Movp::Base::Opcode = "movups";
+template <> constexpr const char *InstX86Movq::Base::Opcode = "movq";
+/* Binary ops */
+template <> constexpr const char *InstX86Add::Base::Opcode = "add";
+template <> constexpr const char *InstX86AddRMW::Base::Opcode = "add";
+template <> constexpr const char *InstX86Addps::Base::Opcode = "add";
+template <> constexpr const char *InstX86Adc::Base::Opcode = "adc";
+template <> constexpr const char *InstX86AdcRMW::Base::Opcode = "adc";
+template <> constexpr const char *InstX86Addss::Base::Opcode = "add";
+template <> constexpr const char *InstX86Andnps::Base::Opcode = "andn";
+template <> constexpr const char *InstX86Andps::Base::Opcode = "and";
+template <> constexpr const char *InstX86Maxss::Base::Opcode = "max";
+template <> constexpr const char *InstX86Minss::Base::Opcode = "min";
+template <> constexpr const char *InstX86Maxps::Base::Opcode = "max";
+template <> constexpr const char *InstX86Minps::Base::Opcode = "min";
+template <> constexpr const char *InstX86Padd::Base::Opcode = "padd";
+template <> constexpr const char *InstX86Padds::Base::Opcode = "padds";
+template <> constexpr const char *InstX86Paddus::Base::Opcode = "paddus";
+template <> constexpr const char *InstX86Sub::Base::Opcode = "sub";
+template <> constexpr const char *InstX86SubRMW::Base::Opcode = "sub";
+template <> constexpr const char *InstX86Subps::Base::Opcode = "sub";
+template <> constexpr const char *InstX86Subss::Base::Opcode = "sub";
+template <> constexpr const char *InstX86Sbb::Base::Opcode = "sbb";
+template <> constexpr const char *InstX86SbbRMW::Base::Opcode = "sbb";
+template <> constexpr const char *InstX86Psub::Base::Opcode = "psub";
+template <> constexpr const char *InstX86Psubs::Base::Opcode = "psubs";
+template <> constexpr const char *InstX86Psubus::Base::Opcode = "psubus";
+template <> constexpr const char *InstX86And::Base::Opcode = "and";
+template <> constexpr const char *InstX86AndRMW::Base::Opcode = "and";
+template <> constexpr const char *InstX86Pand::Base::Opcode = "pand";
+template <> constexpr const char *InstX86Pandn::Base::Opcode = "pandn";
+template <> constexpr const char *InstX86Or::Base::Opcode = "or";
+template <> constexpr const char *InstX86Orps::Base::Opcode = "or";
+template <> constexpr const char *InstX86OrRMW::Base::Opcode = "or";
+template <> constexpr const char *InstX86Por::Base::Opcode = "por";
+template <> constexpr const char *InstX86Xor::Base::Opcode = "xor";
+template <> constexpr const char *InstX86Xorps::Base::Opcode = "xor";
+template <> constexpr const char *InstX86XorRMW::Base::Opcode = "xor";
+template <> constexpr const char *InstX86Pxor::Base::Opcode = "pxor";
+template <> constexpr const char *InstX86Imul::Base::Opcode = "imul";
+template <> constexpr const char *InstX86ImulImm::Base::Opcode = "imul";
+template <> constexpr const char *InstX86Mulps::Base::Opcode = "mul";
+template <> constexpr const char *InstX86Mulss::Base::Opcode = "mul";
+template <> constexpr const char *InstX86Pmull::Base::Opcode = "pmull";
+template <> constexpr const char *InstX86Pmulhw::Base::Opcode = "pmulhw";
+template <> constexpr const char *InstX86Pmulhuw::Base::Opcode = "pmulhuw";
+template <> constexpr const char *InstX86Pmaddwd::Base::Opcode = "pmaddwd";
+template <> constexpr const char *InstX86Pmuludq::Base::Opcode = "pmuludq";
+template <> constexpr const char *InstX86Div::Base::Opcode = "div";
+template <> constexpr const char *InstX86Divps::Base::Opcode = "div";
+template <> constexpr const char *InstX86Divss::Base::Opcode = "div";
+template <> constexpr const char *InstX86Idiv::Base::Opcode = "idiv";
+template <> constexpr const char *InstX86Rol::Base::Opcode = "rol";
+template <> constexpr const char *InstX86Shl::Base::Opcode = "shl";
+template <> constexpr const char *InstX86Psll::Base::Opcode = "psll";
+template <> constexpr const char *InstX86Shr::Base::Opcode = "shr";
+template <> constexpr const char *InstX86Sar::Base::Opcode = "sar";
+template <> constexpr const char *InstX86Psra::Base::Opcode = "psra";
+template <> constexpr const char *InstX86Psrl::Base::Opcode = "psrl";
+template <> constexpr const char *InstX86Pcmpeq::Base::Opcode = "pcmpeq";
+template <> constexpr const char *InstX86Pcmpgt::Base::Opcode = "pcmpgt";
+template <> constexpr const char *InstX86MovssRegs::Base::Opcode = "movss";
+/* Ternary ops */
+template <> constexpr const char *InstX86Insertps::Base::Opcode = "insertps";
+template <> constexpr const char *InstX86Round::Base::Opcode = "round";
+template <> constexpr const char *InstX86Shufps::Base::Opcode = "shufps";
+template <> constexpr const char *InstX86Pinsr::Base::Opcode = "pinsr";
+template <> constexpr const char *InstX86Blendvps::Base::Opcode = "blendvps";
+template <> constexpr const char *InstX86Pblendvb::Base::Opcode = "pblendvb";
+/* Three address ops */
+template <> constexpr const char *InstX86Pextr::Base::Opcode = "pextr";
+template <> constexpr const char *InstX86Pshufd::Base::Opcode = "pshufd";
+template <> constexpr const char *InstX86Pshufb::Base::Opcode = "pshufb";
+template <> constexpr const char *InstX86Punpckl::Base::Opcode = "punpckl";
+template <> constexpr const char *InstX86Punpckh::Base::Opcode = "punpckh";
+template <> constexpr const char *InstX86Packss::Base::Opcode = "packss";
+template <> constexpr const char *InstX86Packus::Base::Opcode = "packus";
+/* Inplace GPR ops */
+template <>
+constexpr const Assembler::GPREmitterOneOp InstX86Bswap::Base::Emitter = {
+    &Assembler::bswap, nullptr /* only a reg form exists */
+};
+template <>
+constexpr const Assembler::GPREmitterOneOp InstX86Neg::Base::Emitter = {
+    &Assembler::neg, &Assembler::neg};
+/* Unary GPR ops */
+/* uses specialized emitter. */
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Cbwdq::Base::Emitter = {
+    nullptr, nullptr, nullptr};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Bsf::Base::Emitter = {
+    &Assembler::bsf, &Assembler::bsf, nullptr};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Bsr::Base::Emitter = {
+    &Assembler::bsr, &Assembler::bsr, nullptr};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Lea::Base::Emitter = {
+    /* reg/reg and reg/imm are illegal */ nullptr, &Assembler::lea, nullptr};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Movsx::Base::Emitter = {
+    &Assembler::movsx, &Assembler::movsx, nullptr};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Movzx::Base::Emitter = {
+    &Assembler::movzx, &Assembler::movzx, nullptr};
+/* Unary XMM ops */
+/* uses specialized emitter. */
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Movd::Base::Emitter = {
+    nullptr, nullptr};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Sqrt::Base::Emitter = {
+    &Assembler::sqrt, &Assembler::sqrt};
+/* Binary GPR ops */
+/* uses specialized emitter. */
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Imul::Base::Emitter = {
+    nullptr, nullptr, nullptr};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Add::Base::Emitter = {
+    &Assembler::add, &Assembler::add, &Assembler::add};
+template <>
+constexpr const Assembler::GPREmitterAddrOp InstX86AddRMW::Base::Emitter = {
+    &Assembler::add, &Assembler::add};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Adc::Base::Emitter = {
+    &Assembler::adc, &Assembler::adc, &Assembler::adc};
+template <>
+constexpr const Assembler::GPREmitterAddrOp InstX86AdcRMW::Base::Emitter = {
+    &Assembler::adc, &Assembler::adc};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86And::Base::Emitter = {
+    &Assembler::And, &Assembler::And, &Assembler::And};
+template <>
+constexpr const Assembler::GPREmitterAddrOp InstX86AndRMW::Base::Emitter = {
+    &Assembler::And, &Assembler::And};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Or::Base::Emitter = {
+    &Assembler::Or, &Assembler::Or, &Assembler::Or};
+template <>
+constexpr const Assembler::GPREmitterAddrOp InstX86OrRMW::Base::Emitter = {
+    &Assembler::Or, &Assembler::Or};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Sbb::Base::Emitter = {
+    &Assembler::sbb, &Assembler::sbb, &Assembler::sbb};
+template <>
+constexpr const Assembler::GPREmitterAddrOp InstX86SbbRMW::Base::Emitter = {
+    &Assembler::sbb, &Assembler::sbb};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Sub::Base::Emitter = {
+    &Assembler::sub, &Assembler::sub, &Assembler::sub};
+template <>
+constexpr const Assembler::GPREmitterAddrOp InstX86SubRMW::Base::Emitter = {
+    &Assembler::sub, &Assembler::sub};
+template <>
+constexpr const Assembler::GPREmitterRegOp InstX86Xor::Base::Emitter = {
+    &Assembler::Xor, &Assembler::Xor, &Assembler::Xor};
+template <>
+constexpr const Assembler::GPREmitterAddrOp InstX86XorRMW::Base::Emitter = {
+    &Assembler::Xor, &Assembler::Xor};
+/* Binary Shift GPR ops */
+template <>
+constexpr const Assembler::GPREmitterShiftOp InstX86Rol::Base::Emitter = {
+    &Assembler::rol, &Assembler::rol};
+template <>
+constexpr const Assembler::GPREmitterShiftOp InstX86Sar::Base::Emitter = {
+    &Assembler::sar, &Assembler::sar};
+template <>
+constexpr const Assembler::GPREmitterShiftOp InstX86Shl::Base::Emitter = {
+    &Assembler::shl, &Assembler::shl};
+template <>
+constexpr const Assembler::GPREmitterShiftOp InstX86Shr::Base::Emitter = {
+    &Assembler::shr, &Assembler::shr};
+/* Binary XMM ops */
+/* uses specialized emitter. */
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86MovssRegs::Base::Emitter = {
+    nullptr, nullptr};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Addss::Base::Emitter = {
+    &Assembler::addss, &Assembler::addss};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Addps::Base::Emitter = {
+    &Assembler::addps, &Assembler::addps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Divss::Base::Emitter = {
+    &Assembler::divss, &Assembler::divss};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Divps::Base::Emitter = {
+    &Assembler::divps, &Assembler::divps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Mulss::Base::Emitter = {
+    &Assembler::mulss, &Assembler::mulss};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Mulps::Base::Emitter = {
+    &Assembler::mulps, &Assembler::mulps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Padd::Base::Emitter = {
+    &Assembler::padd, &Assembler::padd};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Padds::Base::Emitter = {
+    &Assembler::padds, &Assembler::padds};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Paddus::Base::Emitter = {
+    &Assembler::paddus, &Assembler::paddus};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pand::Base::Emitter = {
+    &Assembler::pand, &Assembler::pand};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pandn::Base::Emitter = {
+    &Assembler::pandn, &Assembler::pandn};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pcmpeq::Base::Emitter = {
+    &Assembler::pcmpeq, &Assembler::pcmpeq};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pcmpgt::Base::Emitter = {
+    &Assembler::pcmpgt, &Assembler::pcmpgt};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pmull::Base::Emitter = {
+    &Assembler::pmull, &Assembler::pmull};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pmulhw::Base::Emitter = {
+    &Assembler::pmulhw, &Assembler::pmulhw};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pmulhuw::Base::Emitter = {
+    &Assembler::pmulhuw, &Assembler::pmulhuw};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pmaddwd::Base::Emitter = {
+    &Assembler::pmaddwd, &Assembler::pmaddwd};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pmuludq::Base::Emitter = {
+    &Assembler::pmuludq, &Assembler::pmuludq};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Por::Base::Emitter = {
+    &Assembler::por, &Assembler::por};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Psub::Base::Emitter = {
+    &Assembler::psub, &Assembler::psub};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Psubs::Base::Emitter = {
+    &Assembler::psubs, &Assembler::psubs};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Psubus::Base::Emitter = {
+    &Assembler::psubus, &Assembler::psubus};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pxor::Base::Emitter = {
+    &Assembler::pxor, &Assembler::pxor};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Subss::Base::Emitter = {
+    &Assembler::subss, &Assembler::subss};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Subps::Base::Emitter = {
+    &Assembler::subps, &Assembler::subps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Andnps::Base::Emitter = {
+    &Assembler::andnps, &Assembler::andnps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Andps::Base::Emitter = {
+    &Assembler::andps, &Assembler::andps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Maxss::Base::Emitter = {
+    &Assembler::maxss, &Assembler::maxss};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Minss::Base::Emitter = {
+    &Assembler::minss, &Assembler::minss};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Maxps::Base::Emitter = {
+    &Assembler::maxps, &Assembler::maxps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Minps::Base::Emitter = {
+    &Assembler::minps, &Assembler::minps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Orps::Base::Emitter = {
+    &Assembler::orps, &Assembler::orps};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Xorps::Base::Emitter = {
+    &Assembler::xorps, &Assembler::xorps}; /* Binary XMM Shift ops */
+template <>
+constexpr const Assembler::XmmEmitterShiftOp InstX86Psll::Base::Emitter = {
+    &Assembler::psll, &Assembler::psll, &Assembler::psll};
+template <>
+constexpr const Assembler::XmmEmitterShiftOp InstX86Psra::Base::Emitter = {
+    &Assembler::psra, &Assembler::psra, &Assembler::psra};
+template <>
+constexpr const Assembler::XmmEmitterShiftOp InstX86Psrl::Base::Emitter = {
+    &Assembler::psrl, &Assembler::psrl, &Assembler::psrl};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Pshufb::Base::Emitter = {
+    &Assembler::pshufb, &Assembler::pshufb};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Punpckl::Base::Emitter = {
+    &Assembler::punpckl, &Assembler::punpckl};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Punpckh::Base::Emitter = {
+    &Assembler::punpckh, &Assembler::punpckh};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Packss::Base::Emitter = {
+    &Assembler::packss, &Assembler::packss};
+template <>
+constexpr const Assembler::XmmEmitterRegOp InstX86Packus::Base::Emitter = {
+    &Assembler::packus, &Assembler::packus};
+
+} // end of namespace X8664
+} // end of namespace Ice
 
 #endif // SUBZERO_SRC_ICEINSTX8664_H
diff --git a/third_party/subzero/src/IceInstX8664Base.h b/third_party/subzero/src/IceInstX8664Base.h
deleted file mode 100644
index 70d650f..0000000
--- a/third_party/subzero/src/IceInstX8664Base.h
+++ /dev/null
@@ -1,4056 +0,0 @@
-//===- subzero/src/IceInstX8664Base.h - Generic x86 instructions -*- C++
-//-*--===//
-//
-//                        The Subzero Code Generator
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// \brief This file defines the InstX86Base template class, as well as the
-/// generic X86 Instruction class hierarchy.
-///
-/// Only X86 instructions common across all/most X86 targets should be defined
-/// here, with target-specific instructions declared in the target's traits.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef SUBZERO_SRC_ICEINSTX8664BASE_H
-#define SUBZERO_SRC_ICEINSTX8664BASE_H
-
-#include "IceAssemblerX8664.h"
-#include "IceDefs.h"
-#include "IceInst.h"
-#include "IceOperand.h"
-
-namespace Ice {
-namespace X8664 {
-
-template <typename TraitsType> struct InstImpl {
-  using Traits = TraitsType;
-  using Assembler = AssemblerX8664;
-  using AssemblerLabel = typename Assembler::Label;
-  using AssemblerImmediate = typename Assembler::Immediate;
-  using TargetLowering = typename Traits::TargetLowering;
-  using Address = typename Traits::Address;
-  using X86Operand = typename Traits::X86Operand;
-  using X86OperandMem = typename Traits::X86OperandMem;
-  using VariableSplit = typename Traits::VariableSplit;
-
-  using GPRRegister = typename Traits::RegisterSet::GPRRegister;
-  using RegisterSet = typename Traits::RegisterSet;
-  using XmmRegister = typename Traits::RegisterSet::XmmRegister;
-
-  using Cond = CondX86;
-  using BrCond = Cond::BrCond;
-  using CmppsCond = Cond::CmppsCond;
-
-  template <typename SReg_t, typename DReg_t>
-  using CastEmitterRegOp =
-      typename Traits::Assembler::template CastEmitterRegOp<SReg_t, DReg_t>;
-  template <typename SReg_t, typename DReg_t>
-  using ThreeOpImmEmitter =
-      typename Traits::Assembler::template ThreeOpImmEmitter<SReg_t, DReg_t>;
-  using GPREmitterAddrOp = typename Traits::Assembler::GPREmitterAddrOp;
-  using GPREmitterRegOp = typename Traits::Assembler::GPREmitterRegOp;
-  using GPREmitterShiftD = typename Traits::Assembler::GPREmitterShiftD;
-  using GPREmitterShiftOp = typename Traits::Assembler::GPREmitterShiftOp;
-  using GPREmitterOneOp = typename Traits::Assembler::GPREmitterOneOp;
-  using XmmEmitterRegOp = typename Traits::Assembler::XmmEmitterRegOp;
-  using XmmEmitterShiftOp = typename Traits::Assembler::XmmEmitterShiftOp;
-  using XmmEmitterMovOps = typename Traits::Assembler::XmmEmitterMovOps;
-
-  class InstX86Base : public InstTarget {
-    InstX86Base() = delete;
-    InstX86Base(const InstX86Base &) = delete;
-    InstX86Base &operator=(const InstX86Base &) = delete;
-
-  public:
-    enum InstKindX86 {
-      k__Start = Inst::Target,
-      Adc,
-      AdcRMW,
-      Add,
-      AddRMW,
-      Addps,
-      Addss,
-      And,
-      Andnps,
-      Andps,
-      AndRMW,
-      Blendvps,
-      Br,
-      Bsf,
-      Bsr,
-      Bswap,
-      Call,
-      Cbwdq,
-      Cmov,
-      Cmpps,
-      Cmpxchg,
-      Cmpxchg8b,
-      Cvt,
-      Div,
-      Divps,
-      Divss,
-      FakeRMW,
-      Fld,
-      Fstp,
-      Icmp,
-      Idiv,
-      Imul,
-      ImulImm,
-      Insertps,
-      Int3,
-      Jmp,
-      Label,
-      Lea,
-      Load,
-      Mfence,
-      Minps,
-      Maxps,
-      Minss,
-      Maxss,
-      Mov,
-      Movd,
-      Movmsk,
-      Movp,
-      Movq,
-      MovssRegs,
-      Movsx,
-      Movzx,
-      Mul,
-      Mulps,
-      Mulss,
-      Neg,
-      Nop,
-      Or,
-      Orps,
-      OrRMW,
-      Padd,
-      Padds,
-      Paddus,
-      Pand,
-      Pandn,
-      Pblendvb,
-      Pcmpeq,
-      Pcmpgt,
-      Pextr,
-      Pinsr,
-      Pmull,
-      Pmulhw,
-      Pmulhuw,
-      Pmaddwd,
-      Pmuludq,
-      Pop,
-      Por,
-      Pshufb,
-      Pshufd,
-      Punpckl,
-      Punpckh,
-      Packss,
-      Packus,
-      Psll,
-      Psra,
-      Psrl,
-      Psub,
-      Psubs,
-      Psubus,
-      Push,
-      Pxor,
-      Ret,
-      Rol,
-      Round,
-      Sar,
-      Sbb,
-      SbbRMW,
-      Setcc,
-      Shl,
-      Shld,
-      Shr,
-      Shrd,
-      Shufps,
-      Sqrt,
-      Store,
-      StoreP,
-      StoreQ,
-      StoreD,
-      Sub,
-      SubRMW,
-      Subps,
-      Subss,
-      Test,
-      Ucomiss,
-      UD2,
-      Xadd,
-      Xchg,
-      Xor,
-      Xorps,
-      XorRMW,
-
-      /// Intel Architecture Code Analyzer markers. These are not executable so
-      /// must only be used for analysis.
-      IacaStart,
-      IacaEnd
-    };
-
-    enum SseSuffix { None, Packed, Unpack, Scalar, Integral, Pack };
-
-    static const char *getWidthString(Type Ty);
-    static const char *getFldString(Type Ty);
-    static BrCond getOppositeCondition(BrCond Cond);
-    void dump(const Cfg *Func) const override;
-
-    // Shared emit routines for common forms of instructions.
-    void emitTwoAddress(const Cfg *Func, const char *Opcode,
-                        const char *Suffix = "") const;
-
-    static TargetLowering *getTarget(const Cfg *Func) {
-      return static_cast<TargetLowering *>(Func->getTarget());
-    }
-
-  protected:
-    InstX86Base(Cfg *Func, InstKindX86 Kind, SizeT Maxsrcs, Variable *Dest)
-        : InstTarget(Func, static_cast<InstKind>(Kind), Maxsrcs, Dest) {}
-
-    static bool isClassof(const Inst *Instr, InstKindX86 MyKind) {
-      return Instr->getKind() == static_cast<InstKind>(MyKind);
-    }
-    // Most instructions that operate on vector arguments require vector memory
-    // operands to be fully aligned (16-byte alignment for PNaCl vector types).
-    // The stack frame layout and call ABI ensure proper alignment for stack
-    // operands, but memory operands (originating from load/store bitcode
-    // instructions) only have element-size alignment guarantees. This function
-    // validates that none of the operands is a memory operand of vector type,
-    // calling report_fatal_error() if one is found. This function should be
-    // called during emission, and maybe also in the ctor (as long as that fits
-    // the lowering style).
-    void validateVectorAddrMode() const {
-      if (this->getDest())
-        this->validateVectorAddrModeOpnd(this->getDest());
-      for (SizeT i = 0; i < this->getSrcSize(); ++i) {
-        this->validateVectorAddrModeOpnd(this->getSrc(i));
-      }
-    }
-
-  private:
-    static void validateVectorAddrModeOpnd(const Operand *Opnd) {
-      if (llvm::isa<X86OperandMem>(Opnd) && isVectorType(Opnd->getType())) {
-        llvm::report_fatal_error("Possible misaligned vector memory operation");
-      }
-    }
-  };
-
-  /// InstX86FakeRMW represents a non-atomic read-modify-write operation on a
-  /// memory location. An InstX86FakeRMW is a "fake" instruction in that it
-  /// still needs to be lowered to some actual RMW instruction.
-  ///
-  /// If A is some memory address, D is some data value to apply, and OP is an
-  /// arithmetic operator, the instruction operates as: (*A) = (*A) OP D
-  class InstX86FakeRMW final : public InstX86Base {
-    InstX86FakeRMW() = delete;
-    InstX86FakeRMW(const InstX86FakeRMW &) = delete;
-    InstX86FakeRMW &operator=(const InstX86FakeRMW &) = delete;
-
-  public:
-    static InstX86FakeRMW *create(Cfg *Func, Operand *Data, Operand *Addr,
-                                  Variable *Beacon, InstArithmetic::OpKind Op,
-                                  uint32_t Align = 1) {
-      // TODO(stichnot): Stop ignoring alignment specification.
-      (void)Align;
-      return new (Func->allocate<InstX86FakeRMW>())
-          InstX86FakeRMW(Func, Data, Addr, Op, Beacon);
-    }
-    Operand *getAddr() const { return this->getSrc(1); }
-    Operand *getData() const { return this->getSrc(0); }
-    InstArithmetic::OpKind getOp() const { return Op; }
-    Variable *getBeacon() const {
-      return llvm::cast<Variable>(this->getSrc(2));
-    }
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::FakeRMW);
-    }
-
-  private:
-    InstArithmetic::OpKind Op;
-    InstX86FakeRMW(Cfg *Func, Operand *Data, Operand *Addr,
-                   InstArithmetic::OpKind Op, Variable *Beacon);
-  };
-
-  /// InstX86Label represents an intra-block label that is the target of an
-  /// intra-block branch. The offset between the label and the branch must be
-  /// fit into one byte (considered "near"). These are used for lowering i1
-  /// calculations, Select instructions, and 64-bit compares on a 32-bit
-  /// architecture, without basic block splitting. Basic block splitting is not
-  /// so desirable for several reasons, one of which is the impact on decisions
-  /// based on whether a variable's live range spans multiple basic blocks.
-  ///
-  /// Intra-block control flow must be used with caution. Consider the sequence
-  /// for "c = (a >= b ? x : y)".
-  ///     cmp a, b
-  ///     br lt, L1
-  ///     mov c, x
-  ///     jmp L2
-  ///   L1:
-  ///     mov c, y
-  ///   L2:
-  ///
-  /// Labels L1 and L2 are intra-block labels. Without knowledge of the
-  /// intra-block control flow, liveness analysis will determine the "mov c, x"
-  /// instruction to be dead. One way to prevent this is to insert a
-  /// "FakeUse(c)" instruction anywhere between the two "mov c, ..."
-  /// instructions, e.g.:
-  ///
-  ///     cmp a, b
-  ///     br lt, L1
-  ///     mov c, x
-  ///     jmp L2
-  ///     FakeUse(c)
-  ///   L1:
-  ///     mov c, y
-  ///   L2:
-  ///
-  /// The down-side is that "mov c, x" can never be dead-code eliminated even if
-  /// there are no uses of c. As unlikely as this situation is, it may be
-  /// prevented by running dead code elimination before lowering.
-  class InstX86Label final : public InstX86Base {
-    InstX86Label() = delete;
-    InstX86Label(const InstX86Label &) = delete;
-    InstX86Label &operator=(const InstX86Label &) = delete;
-
-  public:
-    static InstX86Label *create(Cfg *Func, TargetLowering *Target) {
-      return new (Func->allocate<InstX86Label>()) InstX86Label(Func, Target);
-    }
-    uint32_t getEmitInstCount() const override { return 0; }
-    GlobalString getLabelName() const { return Name; }
-    SizeT getLabelNumber() const { return LabelNumber; }
-    bool isLabel() const override { return true; }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    void setRelocOffset(RelocOffset *Value) { OffsetReloc = Value; }
-
-  private:
-    InstX86Label(Cfg *Func, TargetLowering *Target);
-
-    SizeT LabelNumber; // used for unique label generation.
-    RelocOffset *OffsetReloc = nullptr;
-    GlobalString Name;
-  };
-
-  /// Conditional and unconditional branch instruction.
-  class InstX86Br final : public InstX86Base {
-    InstX86Br() = delete;
-    InstX86Br(const InstX86Br &) = delete;
-    InstX86Br &operator=(const InstX86Br &) = delete;
-
-  public:
-    enum Mode { Near, Far };
-
-    /// Create a conditional branch to a node.
-    static InstX86Br *create(Cfg *Func, CfgNode *TargetTrue,
-                             CfgNode *TargetFalse, BrCond Condition,
-                             Mode Kind) {
-      assert(Condition != Cond::Br_None);
-      constexpr InstX86Label *NoLabel = nullptr;
-      return new (Func->allocate<InstX86Br>())
-          InstX86Br(Func, TargetTrue, TargetFalse, NoLabel, Condition, Kind);
-    }
-    /// Create an unconditional branch to a node.
-    static InstX86Br *create(Cfg *Func, CfgNode *Target, Mode Kind) {
-      constexpr CfgNode *NoCondTarget = nullptr;
-      constexpr InstX86Label *NoLabel = nullptr;
-      return new (Func->allocate<InstX86Br>())
-          InstX86Br(Func, NoCondTarget, Target, NoLabel, Cond::Br_None, Kind);
-    }
-    /// Create a non-terminator conditional branch to a node, with a fallthrough
-    /// to the next instruction in the current node. This is used for switch
-    /// lowering.
-    static InstX86Br *create(Cfg *Func, CfgNode *Target, BrCond Condition,
-                             Mode Kind) {
-      assert(Condition != Cond::Br_None);
-      constexpr CfgNode *NoUncondTarget = nullptr;
-      constexpr InstX86Label *NoLabel = nullptr;
-      return new (Func->allocate<InstX86Br>())
-          InstX86Br(Func, Target, NoUncondTarget, NoLabel, Condition, Kind);
-    }
-    /// Create a conditional intra-block branch (or unconditional, if
-    /// Condition==Br_None) to a label in the current block.
-    static InstX86Br *create(Cfg *Func, InstX86Label *Label, BrCond Condition,
-                             Mode Kind) {
-      constexpr CfgNode *NoCondTarget = nullptr;
-      constexpr CfgNode *NoUncondTarget = nullptr;
-      return new (Func->allocate<InstX86Br>())
-          InstX86Br(Func, NoCondTarget, NoUncondTarget, Label, Condition, Kind);
-    }
-    const CfgNode *getTargetTrue() const { return TargetTrue; }
-    const CfgNode *getTargetFalse() const { return TargetFalse; }
-    bool isNear() const { return Kind == Near; }
-    bool optimizeBranch(const CfgNode *NextNode);
-    uint32_t getEmitInstCount() const override {
-      uint32_t Sum = 0;
-      if (Label)
-        ++Sum;
-      if (getTargetTrue())
-        ++Sum;
-      if (getTargetFalse())
-        ++Sum;
-      return Sum;
-    }
-    bool isUnconditionalBranch() const override {
-      return !Label && Condition == Cond::Br_None;
-    }
-    const Inst *getIntraBlockBranchTarget() const override { return Label; }
-    bool repointEdges(CfgNode *OldNode, CfgNode *NewNode) override;
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Br);
-    }
-
-  private:
-    InstX86Br(Cfg *Func, const CfgNode *TargetTrue, const CfgNode *TargetFalse,
-              const InstX86Label *Label, BrCond Condition, Mode Kind);
-
-    BrCond Condition;
-    const CfgNode *TargetTrue;
-    const CfgNode *TargetFalse;
-    const InstX86Label *Label; // Intra-block branch target
-    const Mode Kind;
-  };
-
-  /// Jump to a target outside this function, such as tailcall, nacljump,
-  /// naclret, unreachable. This is different from a Branch instruction in that
-  /// there is no intra-function control flow to represent.
-  class InstX86Jmp final : public InstX86Base {
-    InstX86Jmp() = delete;
-    InstX86Jmp(const InstX86Jmp &) = delete;
-    InstX86Jmp &operator=(const InstX86Jmp &) = delete;
-
-  public:
-    static InstX86Jmp *create(Cfg *Func, Operand *Target) {
-      return new (Func->allocate<InstX86Jmp>()) InstX86Jmp(Func, Target);
-    }
-    Operand *getJmpTarget() const { return this->getSrc(0); }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Jmp);
-    }
-
-  private:
-    InstX86Jmp(Cfg *Func, Operand *Target);
-  };
-
-  /// Call instruction. Arguments should have already been pushed.
-  class InstX86Call final : public InstX86Base {
-    InstX86Call() = delete;
-    InstX86Call(const InstX86Call &) = delete;
-    InstX86Call &operator=(const InstX86Call &) = delete;
-
-  public:
-    static InstX86Call *create(Cfg *Func, Variable *Dest, Operand *CallTarget) {
-      return new (Func->allocate<InstX86Call>())
-          InstX86Call(Func, Dest, CallTarget);
-    }
-    Operand *getCallTarget() const { return this->getSrc(0); }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Call);
-    }
-
-  private:
-    InstX86Call(Cfg *Func, Variable *Dest, Operand *CallTarget);
-  };
-
-  /// Emit a one-operand (GPR) instruction.
-  static void emitIASOpTyGPR(const Cfg *Func, Type Ty, const Operand *Var,
-                             const GPREmitterOneOp &Emitter);
-
-  static void emitIASAsAddrOpTyGPR(const Cfg *Func, Type Ty, const Operand *Op0,
-                                   const Operand *Op1,
-                                   const GPREmitterAddrOp &Emitter);
-
-  static void emitIASGPRShift(const Cfg *Func, Type Ty, const Variable *Var,
-                              const Operand *Src,
-                              const GPREmitterShiftOp &Emitter);
-
-  static void emitIASAddrOpTyGPR(const Cfg *Func, Type Ty, const Address &Addr,
-                                 const Operand *Src,
-                                 const GPREmitterAddrOp &Emitter);
-
-  static void emitIASRegOpTyXMM(const Cfg *Func, Type Ty, const Variable *Var,
-                                const Operand *Src,
-                                const XmmEmitterRegOp &Emitter);
-
-  static void emitIASGPRShiftDouble(const Cfg *Func, const Variable *Dest,
-                                    const Operand *Src1Op,
-                                    const Operand *Src2Op,
-                                    const GPREmitterShiftD &Emitter);
-
-  template <typename DReg_t, typename SReg_t, DReg_t (*destEnc)(RegNumT),
-            SReg_t (*srcEnc)(RegNumT)>
-  static void emitIASCastRegOp(const Cfg *Func, Type DestTy,
-                               const Variable *Dest, Type SrcTy,
-                               const Operand *Src,
-                               const CastEmitterRegOp<DReg_t, SReg_t> &Emitter);
-
-  template <typename DReg_t, typename SReg_t, DReg_t (*destEnc)(RegNumT),
-            SReg_t (*srcEnc)(RegNumT)>
-  static void
-  emitIASThreeOpImmOps(const Cfg *Func, Type DispatchTy, const Variable *Dest,
-                       const Operand *Src0, const Operand *Src1,
-                       const ThreeOpImmEmitter<DReg_t, SReg_t> Emitter);
-
-  static void emitIASMovlikeXMM(const Cfg *Func, const Variable *Dest,
-                                const Operand *Src,
-                                const XmmEmitterMovOps Emitter);
-
-  static void emitVariableBlendInst(const char *Opcode, const Inst *Instr,
-                                    const Cfg *Func);
-
-  static void emitIASVariableBlendInst(const Inst *Instr, const Cfg *Func,
-                                       const XmmEmitterRegOp &Emitter);
-
-  static void emitIASXmmShift(const Cfg *Func, Type Ty, const Variable *Var,
-                              const Operand *Src,
-                              const XmmEmitterShiftOp &Emitter);
-
-  /// Emit a two-operand (GPR) instruction, where the dest operand is a Variable
-  /// that's guaranteed to be a register.
-  template <bool VarCanBeByte = true, bool SrcCanBeByte = true>
-  static void emitIASRegOpTyGPR(const Cfg *Func, bool IsLea, Type Ty,
-                                const Variable *Dst, const Operand *Src,
-                                const GPREmitterRegOp &Emitter);
-
-  /// Instructions of the form x := op(x).
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseInplaceopGPR : public InstX86Base {
-    InstX86BaseInplaceopGPR() = delete;
-    InstX86BaseInplaceopGPR(const InstX86BaseInplaceopGPR &) = delete;
-    InstX86BaseInplaceopGPR &
-    operator=(const InstX86BaseInplaceopGPR &) = delete;
-
-  public:
-    using Base = InstX86BaseInplaceopGPR<K>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrEmit();
-      assert(this->getSrcSize() == 1);
-      Str << "\t" << Opcode << "\t";
-      this->getSrc(0)->emit(Func);
-    }
-    void emitIAS(const Cfg *Func) const override {
-      assert(this->getSrcSize() == 1);
-      const Variable *Var = this->getDest();
-      Type Ty = Var->getType();
-      emitIASOpTyGPR(Func, Ty, Var, Emitter);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::K);
-    }
-
-  protected:
-    InstX86BaseInplaceopGPR(Cfg *Func, Operand *SrcDest)
-        : InstX86Base(Func, K, 1, llvm::dyn_cast<Variable>(SrcDest)) {
-      this->addSource(SrcDest);
-    }
-
-  private:
-    static const char *const Opcode;
-    static const GPREmitterOneOp Emitter;
-  };
-
-  /// Instructions of the form x := op(y).
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseUnaryopGPR : public InstX86Base {
-    InstX86BaseUnaryopGPR() = delete;
-    InstX86BaseUnaryopGPR(const InstX86BaseUnaryopGPR &) = delete;
-    InstX86BaseUnaryopGPR &operator=(const InstX86BaseUnaryopGPR &) = delete;
-
-  public:
-    using Base = InstX86BaseUnaryopGPR<K>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrEmit();
-      assert(this->getSrcSize() == 1);
-      Type SrcTy = this->getSrc(0)->getType();
-      Type DestTy = this->getDest()->getType();
-      Str << "\t" << Opcode << this->getWidthString(SrcTy);
-      // Movsx and movzx need both the source and dest type width letter to
-      // define the operation. The other unary operations have the same source
-      // and dest type and as a result need only one letter.
-      if (SrcTy != DestTy)
-        Str << this->getWidthString(DestTy);
-      Str << "\t";
-      this->getSrc(0)->emit(Func);
-      Str << ", ";
-      this->getDest()->emit(Func);
-    }
-    void emitIAS(const Cfg *Func) const override {
-      assert(this->getSrcSize() == 1);
-      const Variable *Var = this->getDest();
-      Type Ty = Var->getType();
-      const Operand *Src = this->getSrc(0);
-      constexpr bool IsLea = K == InstX86Base::Lea;
-
-      if (IsLea) {
-        if (auto *Add = deoptLeaToAddOrNull(Func)) {
-          Add->emitIAS(Func);
-          return;
-        }
-      }
-      emitIASRegOpTyGPR(Func, IsLea, Ty, Var, Src, Emitter);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getSrc(0)->getType() << " ";
-      this->dumpSources(Func);
-    }
-
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::K);
-    }
-
-  protected:
-    InstX86BaseUnaryopGPR(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86Base(Func, K, 1, Dest) {
-      this->addSource(Src);
-    }
-
-    Inst *deoptLeaToAddOrNull(const Cfg *Func) const {
-      // Revert to an Add when the Lea is a 2-address instruction.
-      // The caller has to emit it; this just produces the add instruction.
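-      // The candidate is `lea reg, [reg + offset]` with no index and no
-      // shift, which computes the same result as `add reg, offset`.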
-      if (auto *MemOp = llvm::dyn_cast<X86OperandMem>(this->getSrc(0))) {
-        if (getFlags().getAggressiveLea() &&
-            MemOp->getBase()->getRegNum() == this->getDest()->getRegNum() &&
-            MemOp->getIndex() == nullptr && MemOp->getShift() == 0) {
-          auto *Add = InstImpl<TraitsType>::InstX86Add::create(
-              const_cast<Cfg *>(Func), this->getDest(), MemOp->getOffset());
-          // TODO(manasijm): Remove const_cast by emitting code for add
-          // directly.
-          return Add;
-        }
-      }
-      return nullptr;
-    }
-
-    static const char *const Opcode;
-    static const GPREmitterRegOp Emitter;
-  };
-
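-  /// Unary XMM instructions of the form x := op(y).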
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseUnaryopXmm : public InstX86Base {
-    InstX86BaseUnaryopXmm() = delete;
-    InstX86BaseUnaryopXmm(const InstX86BaseUnaryopXmm &) = delete;
-    InstX86BaseUnaryopXmm &operator=(const InstX86BaseUnaryopXmm &) = delete;
-
-  public:
-    using Base = InstX86BaseUnaryopXmm<K>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrEmit();
-      assert(this->getSrcSize() == 1);
-      Str << "\t" << Opcode << "\t";
-      this->getSrc(0)->emit(Func);
-      Str << ", ";
-      this->getDest()->emit(Func);
-    }
-    void emitIAS(const Cfg *Func) const override {
-      Type Ty = this->getDest()->getType();
-      assert(this->getSrcSize() == 1);
-      emitIASRegOpTyXMM(Func, Ty, this->getDest(), this->getSrc(0), Emitter);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::K);
-    }
-
-  protected:
-    InstX86BaseUnaryopXmm(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86Base(Func, K, 1, Dest) {
-      this->addSource(Src);
-    }
-
-    static const char *const Opcode;
-    static const XmmEmitterRegOp Emitter;
-  };
-
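-  /// Binary GPR shift instructions (e.g. shl, shr, sar, rol); the shift
-  /// amount is the second source operand.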
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseBinopGPRShift : public InstX86Base {
-    InstX86BaseBinopGPRShift() = delete;
-    InstX86BaseBinopGPRShift(const InstX86BaseBinopGPRShift &) = delete;
-    InstX86BaseBinopGPRShift &
-    operator=(const InstX86BaseBinopGPRShift &) = delete;
-
-  public:
-    using Base = InstX86BaseBinopGPRShift<K>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      this->emitTwoAddress(Func, Opcode);
-    }
-    void emitIAS(const Cfg *Func) const override {
-      Type Ty = this->getDest()->getType();
-      assert(this->getSrcSize() == 2);
-      emitIASGPRShift(Func, Ty, this->getDest(), this->getSrc(1), Emitter);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::K);
-    }
-
-  protected:
-    InstX86BaseBinopGPRShift(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86Base(Func, K, 2, Dest) {
-      this->addSource(Dest);
-      this->addSource(Source);
-    }
-
-    static const char *const Opcode;
-    static const GPREmitterShiftOp Emitter;
-  };
-
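-  /// Binary GPR instructions of the form x := x op y (e.g. add, sub, and).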
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseBinopGPR : public InstX86Base {
-    InstX86BaseBinopGPR() = delete;
-    InstX86BaseBinopGPR(const InstX86BaseBinopGPR &) = delete;
-    InstX86BaseBinopGPR &operator=(const InstX86BaseBinopGPR &) = delete;
-
-  public:
-    using Base = InstX86BaseBinopGPR<K>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      this->emitTwoAddress(Func, Opcode);
-    }
-    void emitIAS(const Cfg *Func) const override {
-      Type Ty = this->getDest()->getType();
-      assert(this->getSrcSize() == 2);
-      constexpr bool ThisIsLEA = K == InstX86Base::Lea;
-      static_assert(!ThisIsLEA, "Lea should be a unaryop.");
-      emitIASRegOpTyGPR(Func, !ThisIsLEA, Ty, this->getDest(), this->getSrc(1),
-                        Emitter);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::K);
-    }
-
-  protected:
-    InstX86BaseBinopGPR(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86Base(Func, K, 2, Dest) {
-      this->addSource(Dest);
-      this->addSource(Source);
-    }
-
-    static const char *const Opcode;
-    static const GPREmitterRegOp Emitter;
-  };
-
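-  /// Binary read-modify-write instructions: the memory operand acts as both
-  /// the destination and the first source, so no dest Variable is tracked.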
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseBinopRMW : public InstX86Base {
-    InstX86BaseBinopRMW() = delete;
-    InstX86BaseBinopRMW(const InstX86BaseBinopRMW &) = delete;
-    InstX86BaseBinopRMW &operator=(const InstX86BaseBinopRMW &) = delete;
-
-  public:
-    using Base = InstX86BaseBinopRMW<K>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      this->emitTwoAddress(Func, Opcode);
-    }
-    void emitIAS(const Cfg *Func) const override {
-      Type Ty = this->getSrc(0)->getType();
-      assert(this->getSrcSize() == 2);
-      emitIASAsAddrOpTyGPR(Func, Ty, this->getSrc(0), this->getSrc(1), Emitter);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      Str << Opcode << "." << this->getSrc(0)->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::K);
-    }
-
-  protected:
-    InstX86BaseBinopRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
-        : InstX86Base(Func, K, 2, nullptr) {
-      this->addSource(DestSrc0);
-      this->addSource(Src1);
-    }
-
-    static const char *const Opcode;
-    static const GPREmitterAddrOp Emitter;
-  };
-
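-  /// Binary XMM instructions. NeedsElementType selects whether the integrated
-  /// assembler is invoked with the vector's element type rather than the
-  /// vector type itself, and Suffix selects the mnemonic suffix (e.g. ps/pd,
-  /// ss/sd, or the integral suffix).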
-  template <typename InstX86Base::InstKindX86 K, bool NeedsElementType,
-            typename InstX86Base::SseSuffix Suffix>
-  class InstX86BaseBinopXmm : public InstX86Base {
-    InstX86BaseBinopXmm() = delete;
-    InstX86BaseBinopXmm(const InstX86BaseBinopXmm &) = delete;
-    InstX86BaseBinopXmm &operator=(const InstX86BaseBinopXmm &) = delete;
-
-  public:
-    using Base = InstX86BaseBinopXmm<K, NeedsElementType, Suffix>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      this->validateVectorAddrMode();
-      const Type DestTy = ArithmeticTypeOverride == IceType_void
-                              ? this->getDest()->getType()
-                              : ArithmeticTypeOverride;
-      const char *SuffixString = "";
-      switch (Suffix) {
-      case InstX86Base::SseSuffix::None:
-        break;
-      case InstX86Base::SseSuffix::Packed:
-        SuffixString = Traits::TypeAttributes[DestTy].PdPsString;
-        break;
-      case InstX86Base::SseSuffix::Unpack:
-        SuffixString = Traits::TypeAttributes[DestTy].UnpackString;
-        break;
-      case InstX86Base::SseSuffix::Scalar:
-        SuffixString = Traits::TypeAttributes[DestTy].SdSsString;
-        break;
-      case InstX86Base::SseSuffix::Integral:
-        SuffixString = Traits::TypeAttributes[DestTy].IntegralString;
-        break;
-      case InstX86Base::SseSuffix::Pack:
-        SuffixString = Traits::TypeAttributes[DestTy].PackString;
-        break;
-      }
-      this->emitTwoAddress(Func, Opcode, SuffixString);
-    }
-    void emitIAS(const Cfg *Func) const override {
-      this->validateVectorAddrMode();
-      Type Ty = this->getDest()->getType();
-      if (NeedsElementType)
-        Ty = typeElementType(Ty);
-      assert(this->getSrcSize() == 2);
-      emitIASRegOpTyXMM(Func, Ty, this->getDest(), this->getSrc(1), Emitter);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::K);
-    }
-
-  protected:
-    InstX86BaseBinopXmm(Cfg *Func, Variable *Dest, Operand *Source,
-                        Type ArithmeticTypeOverride = IceType_void)
-        : InstX86Base(Func, K, 2, Dest),
-          ArithmeticTypeOverride(ArithmeticTypeOverride) {
-      this->addSource(Dest);
-      this->addSource(Source);
-    }
-
-    const Type ArithmeticTypeOverride;
-
-    static const char *const Opcode;
-    static const XmmEmitterRegOp Emitter;
-  };
-
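-  /// Binary XMM shift instructions. AllowAllTypes relaxes the emitIAS
-  /// assertion that the destination is a vector type (used by psrl).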
-  template <typename InstX86Base::InstKindX86 K, bool AllowAllTypes = false>
-  class InstX86BaseBinopXmmShift : public InstX86Base {
-    InstX86BaseBinopXmmShift() = delete;
-    InstX86BaseBinopXmmShift(const InstX86BaseBinopXmmShift &) = delete;
-    InstX86BaseBinopXmmShift &
-    operator=(const InstX86BaseBinopXmmShift &) = delete;
-
-  public:
-    using Base = InstX86BaseBinopXmmShift<K, AllowAllTypes>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      this->validateVectorAddrMode();
-      // Shift operations are always integral, and hence always need a suffix.
-      const Type DestTy = this->getDest()->getType();
-      this->emitTwoAddress(Func, this->Opcode,
-                           Traits::TypeAttributes[DestTy].IntegralString);
-    }
-    void emitIAS(const Cfg *Func) const override {
-      this->validateVectorAddrMode();
-      Type Ty = this->getDest()->getType();
-      assert(AllowAllTypes || isVectorType(Ty));
-      Type ElementTy = typeElementType(Ty);
-      assert(this->getSrcSize() == 2);
-      emitIASXmmShift(Func, ElementTy, this->getDest(), this->getSrc(1),
-                      Emitter);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::K);
-    }
-
-  protected:
-    InstX86BaseBinopXmmShift(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86Base(Func, K, 2, Dest) {
-      this->addSource(Dest);
-      this->addSource(Source);
-    }
-
-    static const char *const Opcode;
-    static const XmmEmitterShiftOp Emitter;
-  };
-
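-  /// Ternary instructions where the destination also acts as the first
-  /// source.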
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseTernop : public InstX86Base {
-    InstX86BaseTernop() = delete;
-    InstX86BaseTernop(const InstX86BaseTernop &) = delete;
-    InstX86BaseTernop &operator=(const InstX86BaseTernop &) = delete;
-
-  public:
-    using Base = InstX86BaseTernop<K>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrEmit();
-      assert(this->getSrcSize() == 3);
-      Str << "\t" << Opcode << "\t";
-      this->getSrc(2)->emit(Func);
-      Str << ", ";
-      this->getSrc(1)->emit(Func);
-      Str << ", ";
-      this->getDest()->emit(Func);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::K);
-    }
-
-  protected:
-    InstX86BaseTernop(Cfg *Func, Variable *Dest, Operand *Source1,
-                      Operand *Source2)
-        : InstX86Base(Func, K, 3, Dest) {
-      this->addSource(Dest);
-      this->addSource(Source1);
-      this->addSource(Source2);
-    }
-
-    static const char *const Opcode;
-  };
-
-  /// Instructions of the form x := y op z.
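-  /// (e.g. imul by an immediate; unlike Ternop, the dest is not also a
-  /// source).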
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseThreeAddressop : public InstX86Base {
-    InstX86BaseThreeAddressop() = delete;
-    InstX86BaseThreeAddressop(const InstX86BaseThreeAddressop &) = delete;
-    InstX86BaseThreeAddressop &
-    operator=(const InstX86BaseThreeAddressop &) = delete;
-
-  public:
-    using Base = InstX86BaseThreeAddressop<K>;
-
-    void emit(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrEmit();
-      assert(this->getSrcSize() == 2);
-      Str << "\t" << Opcode << "\t";
-      this->getSrc(1)->emit(Func);
-      Str << ", ";
-      this->getSrc(0)->emit(Func);
-      Str << ", ";
-      this->getDest()->emit(Func);
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      this->dumpDest(Func);
-      Str << " = " << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::K);
-    }
-
-  protected:
-    InstX86BaseThreeAddressop(Cfg *Func, Variable *Dest, Operand *Source0,
-                              Operand *Source1)
-        : InstX86Base(Func, K, 2, Dest) {
-      this->addSource(Source0);
-      this->addSource(Source1);
-    }
-
-    static const char *const Opcode;
-  };
-
-  /// Base class for assignment instructions.
-  template <typename InstX86Base::InstKindX86 K>
-  class InstX86BaseMovlike : public InstX86Base {
-    InstX86BaseMovlike() = delete;
-    InstX86BaseMovlike(const InstX86BaseMovlike &) = delete;
-    InstX86BaseMovlike &operator=(const InstX86BaseMovlike &) = delete;
-
-  public:
-    using Base = InstX86BaseMovlike<K>;
-
-    bool isRedundantAssign() const override {
-      if (const auto *SrcVar =
-              llvm::dyn_cast<const Variable>(this->getSrc(0))) {
-        if (SrcVar->hasReg() && this->Dest->hasReg()) {
-          // An assignment between physical registers is considered redundant if
-          // they have the same base register and the same encoding. E.g.:
-          //   mov cl, ecx ==> redundant
-          //   mov ch, ecx ==> not redundant due to different encodings
-          //   mov ch, ebp ==> not redundant due to different base registers
-          //   mov ecx, ecx ==> redundant, and dangerous in x86-64. i64
-          //                    zero-extension is handled by InstX86Zext.
-          const auto SrcReg = SrcVar->getRegNum();
-          const auto DestReg = this->Dest->getRegNum();
-          return (Traits::getEncoding(SrcReg) ==
-                  Traits::getEncoding(DestReg)) &&
-                 (Traits::getBaseReg(SrcReg) == Traits::getBaseReg(DestReg));
-        }
-      }
-      return checkForRedundantAssign(this->getDest(), this->getSrc(0));
-    }
-    bool isVarAssign() const override {
-      return llvm::isa<Variable>(this->getSrc(0));
-    }
-    void dump(const Cfg *Func) const override {
-      if (!BuildDefs::dump())
-        return;
-      Ostream &Str = Func->getContext()->getStrDump();
-      Str << Opcode << "." << this->getDest()->getType() << " ";
-      this->dumpDest(Func);
-      Str << ", ";
-      this->dumpSources(Func);
-    }
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::K);
-    }
-
-  protected:
-    InstX86BaseMovlike(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86Base(Func, K, 1, Dest) {
-      this->addSource(Source);
-      // For an integer assignment, make sure it's either a same-type assignment
-      // or a truncation.
-      assert(!isScalarIntegerType(Dest->getType()) ||
-             (typeWidthInBytes(Dest->getType()) <=
-              typeWidthInBytes(Source->getType())));
-    }
-
-    static const char *const Opcode;
-  };
-
-  class InstX86Bswap : public InstX86BaseInplaceopGPR<InstX86Base::Bswap> {
-  public:
-    static InstX86Bswap *create(Cfg *Func, Operand *SrcDest) {
-      return new (Func->allocate<InstX86Bswap>()) InstX86Bswap(Func, SrcDest);
-    }
-
-  private:
-    InstX86Bswap(Cfg *Func, Operand *SrcDest)
-        : InstX86BaseInplaceopGPR<InstX86Base::Bswap>(Func, SrcDest) {}
-  };
-
-  class InstX86Neg : public InstX86BaseInplaceopGPR<InstX86Base::Neg> {
-  public:
-    static InstX86Neg *create(Cfg *Func, Operand *SrcDest) {
-      return new (Func->allocate<InstX86Neg>()) InstX86Neg(Func, SrcDest);
-    }
-
-  private:
-    InstX86Neg(Cfg *Func, Operand *SrcDest)
-        : InstX86BaseInplaceopGPR<InstX86Base::Neg>(Func, SrcDest) {}
-  };
-
-  class InstX86Bsf : public InstX86BaseUnaryopGPR<InstX86Base::Bsf> {
-  public:
-    static InstX86Bsf *create(Cfg *Func, Variable *Dest, Operand *Src) {
-      return new (Func->allocate<InstX86Bsf>()) InstX86Bsf(Func, Dest, Src);
-    }
-
-  private:
-    InstX86Bsf(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86BaseUnaryopGPR<InstX86Base::Bsf>(Func, Dest, Src) {}
-  };
-
-  class InstX86Bsr : public InstX86BaseUnaryopGPR<InstX86Base::Bsr> {
-  public:
-    static InstX86Bsr *create(Cfg *Func, Variable *Dest, Operand *Src) {
-      return new (Func->allocate<InstX86Bsr>()) InstX86Bsr(Func, Dest, Src);
-    }
-
-  private:
-    InstX86Bsr(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86BaseUnaryopGPR<InstX86Base::Bsr>(Func, Dest, Src) {}
-  };
-
-  class InstX86Lea : public InstX86BaseUnaryopGPR<InstX86Base::Lea> {
-  public:
-    static InstX86Lea *create(Cfg *Func, Variable *Dest, Operand *Src) {
-      return new (Func->allocate<InstX86Lea>()) InstX86Lea(Func, Dest, Src);
-    }
-
-    void emit(const Cfg *Func) const override;
-
-  private:
-    InstX86Lea(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86BaseUnaryopGPR<InstX86Base::Lea>(Func, Dest, Src) {}
-  };
-
-  /// Cbwdq instruction - wrapper for cbw, cwd, and cdq.
-  class InstX86Cbwdq : public InstX86BaseUnaryopGPR<InstX86Base::Cbwdq> {
-  public:
-    static InstX86Cbwdq *create(Cfg *Func, Variable *Dest, Operand *Src) {
-      return new (Func->allocate<InstX86Cbwdq>()) InstX86Cbwdq(Func, Dest, Src);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Cbwdq(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86BaseUnaryopGPR<InstX86Base::Cbwdq>(Func, Dest, Src) {}
-  };
-
-  class InstX86Movsx : public InstX86BaseUnaryopGPR<InstX86Base::Movsx> {
-  public:
-    static InstX86Movsx *create(Cfg *Func, Variable *Dest, Operand *Src) {
-      assert(typeWidthInBytes(Dest->getType()) >
-             typeWidthInBytes(Src->getType()));
-      return new (Func->allocate<InstX86Movsx>()) InstX86Movsx(Func, Dest, Src);
-    }
-
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Movsx(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86BaseUnaryopGPR<InstX86Base::Movsx>(Func, Dest, Src) {}
-  };
-
-  class InstX86Movzx : public InstX86BaseUnaryopGPR<InstX86Base::Movzx> {
-  public:
-    static InstX86Movzx *create(Cfg *Func, Variable *Dest, Operand *Src) {
-      assert(typeWidthInBytes(Dest->getType()) >
-             typeWidthInBytes(Src->getType()));
-      return new (Func->allocate<InstX86Movzx>()) InstX86Movzx(Func, Dest, Src);
-    }
-
-    void emit(const Cfg *Func) const override;
-
-    void emitIAS(const Cfg *Func) const override;
-
-    void setMustKeep() { MustKeep = true; }
-
-  private:
-    bool MustKeep = false;
-
-    InstX86Movzx(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86BaseUnaryopGPR<InstX86Base::Movzx>(Func, Dest, Src) {}
-
-    bool mayBeElided(const Variable *Dest, const Operand *Src) const;
-  };
-
-  class InstX86Movd : public InstX86BaseUnaryopXmm<InstX86Base::Movd> {
-  public:
-    static InstX86Movd *create(Cfg *Func, Variable *Dest, Operand *Src) {
-      return new (Func->allocate<InstX86Movd>()) InstX86Movd(Func, Dest, Src);
-    }
-
-    void emit(const Cfg *Func) const override;
-
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Movd(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86BaseUnaryopXmm<InstX86Base::Movd>(Func, Dest, Src) {}
-  };
-
-  class InstX86Movmsk final : public InstX86Base {
-    InstX86Movmsk() = delete;
-    InstX86Movmsk(const InstX86Movmsk &) = delete;
-    InstX86Movmsk &operator=(const InstX86Movmsk &) = delete;
-
-  public:
-    static InstX86Movmsk *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Movmsk>())
-          InstX86Movmsk(Func, Dest, Source);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Movmsk);
-    }
-
-  private:
-    InstX86Movmsk(Cfg *Func, Variable *Dest, Operand *Source);
-  };
-
-  class InstX86Sqrt : public InstX86BaseUnaryopXmm<InstX86Base::Sqrt> {
-  public:
-    static InstX86Sqrt *create(Cfg *Func, Variable *Dest, Operand *Src) {
-      return new (Func->allocate<InstX86Sqrt>()) InstX86Sqrt(Func, Dest, Src);
-    }
-
-    void emit(const Cfg *Func) const override;
-
-  private:
-    InstX86Sqrt(Cfg *Func, Variable *Dest, Operand *Src)
-        : InstX86BaseUnaryopXmm<InstX86Base::Sqrt>(Func, Dest, Src) {}
-  };
-
-  /// Move/assignment instruction - wrapper for mov/movss/movsd.
-  class InstX86Mov : public InstX86BaseMovlike<InstX86Base::Mov> {
-  public:
-    static InstX86Mov *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      assert(!isScalarIntegerType(Dest->getType()) ||
-             (typeWidthInBytes(Dest->getType()) <=
-              typeWidthInBytes(Source->getType())));
-      return new (Func->allocate<InstX86Mov>()) InstX86Mov(Func, Dest, Source);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Mov(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseMovlike<InstX86Base::Mov>(Func, Dest, Source) {}
-  };
-
-  /// Move packed - copy 128-bit values between XMM registers, or mem128 and XMM
-  /// registers.
-  class InstX86Movp : public InstX86BaseMovlike<InstX86Base::Movp> {
-  public:
-    static InstX86Movp *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Movp>())
-          InstX86Movp(Func, Dest, Source);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Movp(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseMovlike<InstX86Base::Movp>(Func, Dest, Source) {}
-  };
-
-  /// Movq - copy between XMM registers, or mem64 and XMM registers.
-  class InstX86Movq : public InstX86BaseMovlike<InstX86Base::Movq> {
-  public:
-    static InstX86Movq *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Movq>())
-          InstX86Movq(Func, Dest, Source);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Movq(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseMovlike<InstX86Base::Movq>(Func, Dest, Source) {}
-  };
-
-  class InstX86Add : public InstX86BaseBinopGPR<InstX86Base::Add> {
-  public:
-    static InstX86Add *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Add>()) InstX86Add(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Add(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPR<InstX86Base::Add>(Func, Dest, Source) {}
-  };
-
-  class InstX86AddRMW : public InstX86BaseBinopRMW<InstX86Base::AddRMW> {
-  public:
-    static InstX86AddRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
-                                 Operand *Src1) {
-      return new (Func->allocate<InstX86AddRMW>())
-          InstX86AddRMW(Func, DestSrc0, Src1);
-    }
-
-  private:
-    InstX86AddRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
-        : InstX86BaseBinopRMW<InstX86Base::AddRMW>(Func, DestSrc0, Src1) {}
-  };
-
-  class InstX86Addps
-      : public InstX86BaseBinopXmm<InstX86Base::Addps, true,
-                                   InstX86Base::SseSuffix::Packed> {
-  public:
-    static InstX86Addps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Addps>())
-          InstX86Addps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Addps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Addps, true,
-                              InstX86Base::SseSuffix::Packed>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Adc : public InstX86BaseBinopGPR<InstX86Base::Adc> {
-  public:
-    static InstX86Adc *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Adc>()) InstX86Adc(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Adc(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPR<InstX86Base::Adc>(Func, Dest, Source) {}
-  };
-
-  class InstX86AdcRMW : public InstX86BaseBinopRMW<InstX86Base::AdcRMW> {
-  public:
-    static InstX86AdcRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
-                                 Operand *Src1) {
-      return new (Func->allocate<InstX86AdcRMW>())
-          InstX86AdcRMW(Func, DestSrc0, Src1);
-    }
-
-  private:
-    InstX86AdcRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
-        : InstX86BaseBinopRMW<InstX86Base::AdcRMW>(Func, DestSrc0, Src1) {}
-  };
-
-  class InstX86Addss
-      : public InstX86BaseBinopXmm<InstX86Base::Addss, false,
-                                   InstX86Base::SseSuffix::Scalar> {
-  public:
-    static InstX86Addss *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Addss>())
-          InstX86Addss(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Addss(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Addss, false,
-                              InstX86Base::SseSuffix::Scalar>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Padd
-      : public InstX86BaseBinopXmm<InstX86Base::Padd, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Padd *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Padd>())
-          InstX86Padd(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Padd(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Padd, true,
-                              InstX86Base::SseSuffix::Integral>(Func, Dest,
-                                                                Source) {}
-  };
-
-  class InstX86Padds
-      : public InstX86BaseBinopXmm<InstX86Base::Padds, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Padds *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Padds>())
-          InstX86Padds(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Padds(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Padds, true,
-                              InstX86Base::SseSuffix::Integral>(Func, Dest,
-                                                                Source) {}
-  };
-
-  class InstX86Paddus
-      : public InstX86BaseBinopXmm<InstX86Base::Paddus, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Paddus *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Paddus>())
-          InstX86Paddus(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Paddus(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Paddus, true,
-                              InstX86Base::SseSuffix::Integral>(Func, Dest,
-                                                                Source) {}
-  };
-
-  class InstX86Sub : public InstX86BaseBinopGPR<InstX86Base::Sub> {
-  public:
-    static InstX86Sub *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Sub>()) InstX86Sub(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Sub(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPR<InstX86Base::Sub>(Func, Dest, Source) {}
-  };
-
-  class InstX86SubRMW : public InstX86BaseBinopRMW<InstX86Base::SubRMW> {
-  public:
-    static InstX86SubRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
-                                 Operand *Src1) {
-      return new (Func->allocate<InstX86SubRMW>())
-          InstX86SubRMW(Func, DestSrc0, Src1);
-    }
-
-  private:
-    InstX86SubRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
-        : InstX86BaseBinopRMW<InstX86Base::SubRMW>(Func, DestSrc0, Src1) {}
-  };
-
-  class InstX86Subps
-      : public InstX86BaseBinopXmm<InstX86Base::Subps, true,
-                                   InstX86Base::SseSuffix::Packed> {
-  public:
-    static InstX86Subps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Subps>())
-          InstX86Subps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Subps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Subps, true,
-                              InstX86Base::SseSuffix::Packed>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Subss
-      : public InstX86BaseBinopXmm<InstX86Base::Subss, false,
-                                   InstX86Base::SseSuffix::Scalar> {
-  public:
-    static InstX86Subss *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Subss>())
-          InstX86Subss(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Subss(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Subss, false,
-                              InstX86Base::SseSuffix::Scalar>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Sbb : public InstX86BaseBinopGPR<InstX86Base::Sbb> {
-  public:
-    static InstX86Sbb *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Sbb>()) InstX86Sbb(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Sbb(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPR<InstX86Base::Sbb>(Func, Dest, Source) {}
-  };
-
-  class InstX86SbbRMW : public InstX86BaseBinopRMW<InstX86Base::SbbRMW> {
-  public:
-    static InstX86SbbRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
-                                 Operand *Src1) {
-      return new (Func->allocate<InstX86SbbRMW>())
-          InstX86SbbRMW(Func, DestSrc0, Src1);
-    }
-
-  private:
-    InstX86SbbRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
-        : InstX86BaseBinopRMW<InstX86Base::SbbRMW>(Func, DestSrc0, Src1) {}
-  };
-
-  class InstX86Psub
-      : public InstX86BaseBinopXmm<InstX86Base::Psub, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Psub *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Psub>())
-          InstX86Psub(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Psub(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Psub, true,
-                              InstX86Base::SseSuffix::Integral>(Func, Dest,
-                                                                Source) {}
-  };
-
-  class InstX86Psubs
-      : public InstX86BaseBinopXmm<InstX86Base::Psubs, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Psubs *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Psubs>())
-          InstX86Psubs(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Psubs(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Psubs, true,
-                              InstX86Base::SseSuffix::Integral>(Func, Dest,
-                                                                Source) {}
-  };
-
-  class InstX86Psubus
-      : public InstX86BaseBinopXmm<InstX86Base::Psubus, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Psubus *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Psubus>())
-          InstX86Psubus(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Psubus(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Psubus, true,
-                              InstX86Base::SseSuffix::Integral>(Func, Dest,
-                                                                Source) {}
-  };
-
-  class InstX86And : public InstX86BaseBinopGPR<InstX86Base::And> {
-  public:
-    static InstX86And *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86And>()) InstX86And(Func, Dest, Source);
-    }
-
-  private:
-    InstX86And(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPR<InstX86Base::And>(Func, Dest, Source) {}
-  };
-
-  class InstX86Andnps
-      : public InstX86BaseBinopXmm<InstX86Base::Andnps, true,
-                                   InstX86Base::SseSuffix::Packed> {
-  public:
-    static InstX86Andnps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Andnps>())
-          InstX86Andnps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Andnps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Andnps, true,
-                              InstX86Base::SseSuffix::Packed>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Andps
-      : public InstX86BaseBinopXmm<InstX86Base::Andps, true,
-                                   InstX86Base::SseSuffix::Packed> {
-  public:
-    static InstX86Andps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Andps>())
-          InstX86Andps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Andps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Andps, true,
-                              InstX86Base::SseSuffix::Packed>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86AndRMW : public InstX86BaseBinopRMW<InstX86Base::AndRMW> {
-  public:
-    static InstX86AndRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
-                                 Operand *Src1) {
-      return new (Func->allocate<InstX86AndRMW>())
-          InstX86AndRMW(Func, DestSrc0, Src1);
-    }
-
-  private:
-    InstX86AndRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
-        : InstX86BaseBinopRMW<InstX86Base::AndRMW>(Func, DestSrc0, Src1) {}
-  };
-
-  class InstX86Pand : public InstX86BaseBinopXmm<InstX86Base::Pand, false,
-                                                 InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Pand *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Pand>())
-          InstX86Pand(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pand(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pand, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Pandn
-      : public InstX86BaseBinopXmm<InstX86Base::Pandn, false,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Pandn *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Pandn>())
-          InstX86Pandn(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pandn(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pandn, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Maxss
-      : public InstX86BaseBinopXmm<InstX86Base::Maxss, true,
-                                   InstX86Base::SseSuffix::Scalar> {
-  public:
-    static InstX86Maxss *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Maxss>())
-          InstX86Maxss(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Maxss(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Maxss, true,
-                              InstX86Base::SseSuffix::Scalar>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Minss
-      : public InstX86BaseBinopXmm<InstX86Base::Minss, true,
-                                   InstX86Base::SseSuffix::Scalar> {
-  public:
-    static InstX86Minss *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Minss>())
-          InstX86Minss(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Minss(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Minss, true,
-                              InstX86Base::SseSuffix::Scalar>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Maxps
-      : public InstX86BaseBinopXmm<InstX86Base::Maxps, true,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Maxps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Maxps>())
-          InstX86Maxps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Maxps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Maxps, true,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Minps
-      : public InstX86BaseBinopXmm<InstX86Base::Minps, true,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Minps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Minps>())
-          InstX86Minps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Minps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Minps, true,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Or : public InstX86BaseBinopGPR<InstX86Base::Or> {
-  public:
-    static InstX86Or *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Or>()) InstX86Or(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Or(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPR<InstX86Base::Or>(Func, Dest, Source) {}
-  };
-
-  class InstX86Orps
-      : public InstX86BaseBinopXmm<InstX86Base::Orps, true,
-                                   InstX86Base::SseSuffix::Packed> {
-  public:
-    static InstX86Orps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Orps>())
-          InstX86Orps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Orps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Orps, true,
-                              InstX86Base::SseSuffix::Packed>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86OrRMW : public InstX86BaseBinopRMW<InstX86Base::OrRMW> {
-  public:
-    static InstX86OrRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
-                                Operand *Src1) {
-      return new (Func->allocate<InstX86OrRMW>())
-          InstX86OrRMW(Func, DestSrc0, Src1);
-    }
-
-  private:
-    InstX86OrRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
-        : InstX86BaseBinopRMW<InstX86Base::OrRMW>(Func, DestSrc0, Src1) {}
-  };
-
-  class InstX86Por : public InstX86BaseBinopXmm<InstX86Base::Por, false,
-                                                InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Por *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Por>()) InstX86Por(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Por(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Por, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Xor : public InstX86BaseBinopGPR<InstX86Base::Xor> {
-  public:
-    static InstX86Xor *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Xor>()) InstX86Xor(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Xor(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPR<InstX86Base::Xor>(Func, Dest, Source) {}
-  };
-
-  class InstX86Xorps
-      : public InstX86BaseBinopXmm<InstX86Base::Xorps, true,
-                                   InstX86Base::SseSuffix::Packed> {
-  public:
-    static InstX86Xorps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Xorps>())
-          InstX86Xorps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Xorps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Xorps, true,
-                              InstX86Base::SseSuffix::Packed>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86XorRMW : public InstX86BaseBinopRMW<InstX86Base::XorRMW> {
-  public:
-    static InstX86XorRMW *create(Cfg *Func, X86OperandMem *DestSrc0,
-                                 Operand *Src1) {
-      return new (Func->allocate<InstX86XorRMW>())
-          InstX86XorRMW(Func, DestSrc0, Src1);
-    }
-
-  private:
-    InstX86XorRMW(Cfg *Func, X86OperandMem *DestSrc0, Operand *Src1)
-        : InstX86BaseBinopRMW<InstX86Base::XorRMW>(Func, DestSrc0, Src1) {}
-  };
-
-  class InstX86Pxor : public InstX86BaseBinopXmm<InstX86Base::Pxor, false,
-                                                 InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Pxor *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Pxor>())
-          InstX86Pxor(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pxor(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pxor, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Imul : public InstX86BaseBinopGPR<InstX86Base::Imul> {
-  public:
-    static InstX86Imul *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Imul>())
-          InstX86Imul(Func, Dest, Source);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Imul(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPR<InstX86Base::Imul>(Func, Dest, Source) {}
-  };
-
-  class InstX86ImulImm
-      : public InstX86BaseThreeAddressop<InstX86Base::ImulImm> {
-  public:
-    static InstX86ImulImm *create(Cfg *Func, Variable *Dest, Operand *Source0,
-                                  Operand *Source1) {
-      return new (Func->allocate<InstX86ImulImm>())
-          InstX86ImulImm(Func, Dest, Source0, Source1);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86ImulImm(Cfg *Func, Variable *Dest, Operand *Source0,
-                   Operand *Source1)
-        : InstX86BaseThreeAddressop<InstX86Base::ImulImm>(Func, Dest, Source0,
-                                                          Source1) {}
-  };
-
-  class InstX86Mulps
-      : public InstX86BaseBinopXmm<InstX86Base::Mulps, true,
-                                   InstX86Base::SseSuffix::Packed> {
-  public:
-    static InstX86Mulps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Mulps>())
-          InstX86Mulps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Mulps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Mulps, true,
-                              InstX86Base::SseSuffix::Packed>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Mulss
-      : public InstX86BaseBinopXmm<InstX86Base::Mulss, false,
-                                   InstX86Base::SseSuffix::Scalar> {
-  public:
-    static InstX86Mulss *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Mulss>())
-          InstX86Mulss(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Mulss(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Mulss, false,
-                              InstX86Base::SseSuffix::Scalar>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Pmull
-      : public InstX86BaseBinopXmm<InstX86Base::Pmull, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Pmull *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      bool TypesAreValid =
-          Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16;
-      auto *Target = InstX86Base::getTarget(Func);
-      bool InstructionSetIsValid =
-          Dest->getType() == IceType_v8i16 ||
-          Target->getInstructionSet() >= Traits::SSE4_1;
-      (void)TypesAreValid;
-      (void)InstructionSetIsValid;
-      assert(TypesAreValid);
-      assert(InstructionSetIsValid);
-      return new (Func->allocate<InstX86Pmull>())
-          InstX86Pmull(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pmull(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pmull, true,
-                              InstX86Base::SseSuffix::Integral>(Func, Dest,
-                                                                Source) {}
-  };
-
-  class InstX86Pmulhw
-      : public InstX86BaseBinopXmm<InstX86Base::Pmulhw, false,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Pmulhw *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      assert(Dest->getType() == IceType_v8i16 &&
-             Source->getType() == IceType_v8i16);
-      return new (Func->allocate<InstX86Pmulhw>())
-          InstX86Pmulhw(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pmulhw(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pmulhw, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Pmulhuw
-      : public InstX86BaseBinopXmm<InstX86Base::Pmulhuw, false,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Pmulhuw *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      assert(Dest->getType() == IceType_v8i16 &&
-             Source->getType() == IceType_v8i16);
-      return new (Func->allocate<InstX86Pmulhuw>())
-          InstX86Pmulhuw(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pmulhuw(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pmulhuw, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Pmaddwd
-      : public InstX86BaseBinopXmm<InstX86Base::Pmaddwd, false,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Pmaddwd *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      assert(Dest->getType() == IceType_v8i16 &&
-             Source->getType() == IceType_v8i16);
-      return new (Func->allocate<InstX86Pmaddwd>())
-          InstX86Pmaddwd(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pmaddwd(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pmaddwd, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Pmuludq
-      : public InstX86BaseBinopXmm<InstX86Base::Pmuludq, false,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Pmuludq *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      assert(Dest->getType() == IceType_v4i32 &&
-             Source->getType() == IceType_v4i32);
-      return new (Func->allocate<InstX86Pmuludq>())
-          InstX86Pmuludq(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pmuludq(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pmuludq, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Divps
-      : public InstX86BaseBinopXmm<InstX86Base::Divps, true,
-                                   InstX86Base::SseSuffix::Packed> {
-  public:
-    static InstX86Divps *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Divps>())
-          InstX86Divps(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Divps(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Divps, true,
-                              InstX86Base::SseSuffix::Packed>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Divss
-      : public InstX86BaseBinopXmm<InstX86Base::Divss, false,
-                                   InstX86Base::SseSuffix::Scalar> {
-  public:
-    static InstX86Divss *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Divss>())
-          InstX86Divss(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Divss(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Divss, false,
-                              InstX86Base::SseSuffix::Scalar>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Rol : public InstX86BaseBinopGPRShift<InstX86Base::Rol> {
-  public:
-    static InstX86Rol *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Rol>()) InstX86Rol(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Rol(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPRShift<InstX86Base::Rol>(Func, Dest, Source) {}
-  };
-
-  class InstX86Shl : public InstX86BaseBinopGPRShift<InstX86Base::Shl> {
-  public:
-    static InstX86Shl *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Shl>()) InstX86Shl(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Shl(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPRShift<InstX86Base::Shl>(Func, Dest, Source) {}
-  };
-
-  class InstX86Psll : public InstX86BaseBinopXmmShift<InstX86Base::Psll> {
-  public:
-    static InstX86Psll *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      assert(
-          Dest->getType() == IceType_v8i16 || Dest->getType() == IceType_v8i1 ||
-          Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v4i1);
-      return new (Func->allocate<InstX86Psll>())
-          InstX86Psll(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Psll(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmmShift<InstX86Base::Psll>(Func, Dest, Source) {}
-  };
-
-  class InstX86Psrl : public InstX86BaseBinopXmmShift<InstX86Base::Psrl, true> {
-  public:
-    static InstX86Psrl *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Psrl>())
-          InstX86Psrl(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Psrl(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmmShift<InstX86Base::Psrl, true>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Shr : public InstX86BaseBinopGPRShift<InstX86Base::Shr> {
-  public:
-    static InstX86Shr *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Shr>()) InstX86Shr(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Shr(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPRShift<InstX86Base::Shr>(Func, Dest, Source) {}
-  };
-
-  class InstX86Sar : public InstX86BaseBinopGPRShift<InstX86Base::Sar> {
-  public:
-    static InstX86Sar *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Sar>()) InstX86Sar(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Sar(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopGPRShift<InstX86Base::Sar>(Func, Dest, Source) {}
-  };
-
-  class InstX86Psra : public InstX86BaseBinopXmmShift<InstX86Base::Psra> {
-  public:
-    static InstX86Psra *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      assert(
-          Dest->getType() == IceType_v8i16 || Dest->getType() == IceType_v8i1 ||
-          Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v4i1);
-      return new (Func->allocate<InstX86Psra>())
-          InstX86Psra(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Psra(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmmShift<InstX86Base::Psra>(Func, Dest, Source) {}
-  };
-
-  class InstX86Pcmpeq
-      : public InstX86BaseBinopXmm<InstX86Base::Pcmpeq, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Pcmpeq *create(Cfg *Func, Variable *Dest, Operand *Source,
-                                 Type ArithmeticTypeOverride = IceType_void) {
-      const Type Ty = ArithmeticTypeOverride == IceType_void
-                          ? Dest->getType()
-                          : ArithmeticTypeOverride;
-      (void)Ty;
-      assert((Ty != IceType_f64 && Ty != IceType_i64) ||
-             InstX86Base::getTarget(Func)->getInstructionSet() >=
-                 Traits::SSE4_1);
-      return new (Func->allocate<InstX86Pcmpeq>())
-          InstX86Pcmpeq(Func, Dest, Source, ArithmeticTypeOverride);
-    }
-
-  private:
-    InstX86Pcmpeq(Cfg *Func, Variable *Dest, Operand *Source,
-                  Type ArithmeticTypeOverride)
-        : InstX86BaseBinopXmm<InstX86Base::Pcmpeq, true,
-                              InstX86Base::SseSuffix::Integral>(
-              Func, Dest, Source, ArithmeticTypeOverride) {}
-  };
-
-  class InstX86Pcmpgt
-      : public InstX86BaseBinopXmm<InstX86Base::Pcmpgt, true,
-                                   InstX86Base::SseSuffix::Integral> {
-  public:
-    static InstX86Pcmpgt *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      assert(Dest->getType() != IceType_f64 ||
-             InstX86Base::getTarget(Func)->getInstructionSet() >=
-                 Traits::SSE4_1);
-      return new (Func->allocate<InstX86Pcmpgt>())
-          InstX86Pcmpgt(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pcmpgt(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pcmpgt, true,
-                              InstX86Base::SseSuffix::Integral>(Func, Dest,
-                                                                Source) {}
-  };
-
-  /// movss is only a binary operation when the source and dest operands are
-  /// both registers (the high bits of dest are left untouched). In other cases,
-  /// it behaves like a copy (mov-like) operation (and the high bits of dest are
-  /// cleared). InstX86Movss will assert that both its source and dest operands
-  /// are registers, so the lowering code should use _mov instead of _movss in
-  /// cases where a copy operation is intended.
-  class InstX86MovssRegs
-      : public InstX86BaseBinopXmm<InstX86Base::MovssRegs, false,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86MovssRegs *create(Cfg *Func, Variable *Dest,
-                                    Operand *Source) {
-      return new (Func->allocate<InstX86MovssRegs>())
-          InstX86MovssRegs(Func, Dest, Source);
-    }
-
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86MovssRegs(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::MovssRegs, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
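A minimal sketch of the choice the comment above describes, assuming a lowering context where Func is the Cfg, Node is the current CfgNode, and Dest/Src are the operands (none of these names are from this diff):

  // Register-register: movss merges Src into the low 32 bits of Dest, so
  // the dedicated MovssRegs instruction applies.
  Variable *SrcVar = llvm::dyn_cast<Variable>(Src);
  if (SrcVar != nullptr && SrcVar->hasReg() && Dest->hasReg()) {
    Node->appendInst(InstX86MovssRegs::create(Func, Dest, SrcVar));
  } else {
    // Any other combination behaves as a copy, so use a plain mov.
    Node->appendInst(InstX86Mov::create(Func, Dest, Src));
  }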
-
-  class InstX86Idiv : public InstX86BaseTernop<InstX86Base::Idiv> {
-  public:
-    static InstX86Idiv *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                               Operand *Source2) {
-      return new (Func->allocate<InstX86Idiv>())
-          InstX86Idiv(Func, Dest, Source1, Source2);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Idiv(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
-        : InstX86BaseTernop<InstX86Base::Idiv>(Func, Dest, Source1, Source2) {}
-  };
-
-  class InstX86Div : public InstX86BaseTernop<InstX86Base::Div> {
-  public:
-    static InstX86Div *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                              Operand *Source2) {
-      return new (Func->allocate<InstX86Div>())
-          InstX86Div(Func, Dest, Source1, Source2);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Div(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
-        : InstX86BaseTernop<InstX86Base::Div>(Func, Dest, Source1, Source2) {}
-  };
-
-  class InstX86Insertps : public InstX86BaseTernop<InstX86Base::Insertps> {
-  public:
-    static InstX86Insertps *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                                   Operand *Source2) {
-      return new (Func->allocate<InstX86Insertps>())
-          InstX86Insertps(Func, Dest, Source1, Source2);
-    }
-
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Insertps(Cfg *Func, Variable *Dest, Operand *Source1,
-                    Operand *Source2)
-        : InstX86BaseTernop<InstX86Base::Insertps>(Func, Dest, Source1,
-                                                   Source2) {}
-  };
-
-  class InstX86Pinsr : public InstX86BaseTernop<InstX86Base::Pinsr> {
-  public:
-    static InstX86Pinsr *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                                Operand *Source2) {
-      // pinsrb and pinsrd are SSE4.1 instructions.
-      assert(
-          Dest->getType() == IceType_v8i16 || Dest->getType() == IceType_v8i1 ||
-          InstX86Base::getTarget(Func)->getInstructionSet() >= Traits::SSE4_1);
-      return new (Func->allocate<InstX86Pinsr>())
-          InstX86Pinsr(Func, Dest, Source1, Source2);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Pinsr(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
-        : InstX86BaseTernop<InstX86Base::Pinsr>(Func, Dest, Source1, Source2) {}
-  };
-
-  class InstX86Shufps : public InstX86BaseTernop<InstX86Base::Shufps> {
-  public:
-    static InstX86Shufps *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                                 Operand *Source2) {
-      return new (Func->allocate<InstX86Shufps>())
-          InstX86Shufps(Func, Dest, Source1, Source2);
-    }
-
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Shufps(Cfg *Func, Variable *Dest, Operand *Source1, Operand *Source2)
-        : InstX86BaseTernop<InstX86Base::Shufps>(Func, Dest, Source1, Source2) {
-    }
-  };
-
-  class InstX86Blendvps : public InstX86BaseTernop<InstX86Base::Blendvps> {
-  public:
-    static InstX86Blendvps *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                                   Operand *Source2) {
-      assert(InstX86Base::getTarget(Func)->getInstructionSet() >=
-             Traits::SSE4_1);
-      return new (Func->allocate<InstX86Blendvps>())
-          InstX86Blendvps(Func, Dest, Source1, Source2);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Blendvps(Cfg *Func, Variable *Dest, Operand *Source1,
-                    Operand *Source2)
-        : InstX86BaseTernop<InstX86Base::Blendvps>(Func, Dest, Source1,
-                                                   Source2) {}
-  };
-
-  class InstX86Pblendvb : public InstX86BaseTernop<InstX86Base::Pblendvb> {
-  public:
-    static InstX86Pblendvb *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                                   Operand *Source2) {
-      assert(InstX86Base::getTarget(Func)->getInstructionSet() >=
-             Traits::SSE4_1);
-      return new (Func->allocate<InstX86Pblendvb>())
-          InstX86Pblendvb(Func, Dest, Source1, Source2);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Pblendvb(Cfg *Func, Variable *Dest, Operand *Source1,
-                    Operand *Source2)
-        : InstX86BaseTernop<InstX86Base::Pblendvb>(Func, Dest, Source1,
-                                                   Source2) {}
-  };
-
-  class InstX86Pextr : public InstX86BaseThreeAddressop<InstX86Base::Pextr> {
-  public:
-    static InstX86Pextr *create(Cfg *Func, Variable *Dest, Operand *Source0,
-                                Operand *Source1) {
-      assert(Source0->getType() == IceType_v8i16 ||
-             Source0->getType() == IceType_v8i1 ||
-             InstX86Base::getTarget(Func)->getInstructionSet() >=
-                 Traits::SSE4_1);
-      return new (Func->allocate<InstX86Pextr>())
-          InstX86Pextr(Func, Dest, Source0, Source1);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Pextr(Cfg *Func, Variable *Dest, Operand *Source0, Operand *Source1)
-        : InstX86BaseThreeAddressop<InstX86Base::Pextr>(Func, Dest, Source0,
-                                                        Source1) {}
-  };
-
-  class InstX86Pshufd : public InstX86BaseThreeAddressop<InstX86Base::Pshufd> {
-  public:
-    static InstX86Pshufd *create(Cfg *Func, Variable *Dest, Operand *Source0,
-                                 Operand *Source1) {
-      return new (Func->allocate<InstX86Pshufd>())
-          InstX86Pshufd(Func, Dest, Source0, Source1);
-    }
-
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Pshufd(Cfg *Func, Variable *Dest, Operand *Source0, Operand *Source1)
-        : InstX86BaseThreeAddressop<InstX86Base::Pshufd>(Func, Dest, Source0,
-                                                         Source1) {}
-  };
-
-  /// Base class for a lockable x86 instruction (emits a locked prefix).
-  class InstX86BaseLockable : public InstX86Base {
-    InstX86BaseLockable() = delete;
-    InstX86BaseLockable(const InstX86BaseLockable &) = delete;
-    InstX86BaseLockable &operator=(const InstX86BaseLockable &) = delete;
-
-  protected:
-    bool Locked;
-
-    InstX86BaseLockable(Cfg *Func, typename InstX86Base::InstKindX86 Kind,
-                        SizeT Maxsrcs, Variable *Dest, bool Locked)
-        : InstX86Base(Func, Kind, Maxsrcs, Dest), Locked(Locked) {
-      // Assume that such instructions are used for atomics, so optimizations
-      // must not reorder or eliminate them.
-      this->HasSideEffects = Locked;
-    }
-  };
-
-  /// Mul instruction - unsigned multiply.
-  class InstX86Mul final : public InstX86Base {
-    InstX86Mul() = delete;
-    InstX86Mul(const InstX86Mul &) = delete;
-    InstX86Mul &operator=(const InstX86Mul &) = delete;
-
-  public:
-    static InstX86Mul *create(Cfg *Func, Variable *Dest, Variable *Source1,
-                              Operand *Source2) {
-      return new (Func->allocate<InstX86Mul>())
-          InstX86Mul(Func, Dest, Source1, Source2);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Mul);
-    }
-
-  private:
-    InstX86Mul(Cfg *Func, Variable *Dest, Variable *Source1, Operand *Source2);
-  };
-
-  /// Shld instruction - shift across a pair of operands.
-  class InstX86Shld final : public InstX86Base {
-    InstX86Shld() = delete;
-    InstX86Shld(const InstX86Shld &) = delete;
-    InstX86Shld &operator=(const InstX86Shld &) = delete;
-
-  public:
-    static InstX86Shld *create(Cfg *Func, Variable *Dest, Variable *Source1,
-                               Operand *Source2) {
-      return new (Func->allocate<InstX86Shld>())
-          InstX86Shld(Func, Dest, Source1, Source2);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Shld);
-    }
-
-  private:
-    InstX86Shld(Cfg *Func, Variable *Dest, Variable *Source1, Operand *Source2);
-  };
-
-  /// Shrd instruction - shift across a pair of operands.
-  class InstX86Shrd final : public InstX86Base {
-    InstX86Shrd() = delete;
-    InstX86Shrd(const InstX86Shrd &) = delete;
-    InstX86Shrd &operator=(const InstX86Shrd &) = delete;
-
-  public:
-    static InstX86Shrd *create(Cfg *Func, Variable *Dest, Variable *Source1,
-                               Operand *Source2) {
-      return new (Func->allocate<InstX86Shrd>())
-          InstX86Shrd(Func, Dest, Source1, Source2);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Shrd);
-    }
-
-  private:
-    InstX86Shrd(Cfg *Func, Variable *Dest, Variable *Source1, Operand *Source2);
-  };
-
-  /// Conditional move instruction.
-  class InstX86Cmov final : public InstX86Base {
-    InstX86Cmov() = delete;
-    InstX86Cmov(const InstX86Cmov &) = delete;
-    InstX86Cmov &operator=(const InstX86Cmov &) = delete;
-
-  public:
-    static InstX86Cmov *create(Cfg *Func, Variable *Dest, Operand *Source,
-                               BrCond Cond) {
-      return new (Func->allocate<InstX86Cmov>())
-          InstX86Cmov(Func, Dest, Source, Cond);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Cmov);
-    }
-
-  private:
-    InstX86Cmov(Cfg *Func, Variable *Dest, Operand *Source, BrCond Cond);
-
-    BrCond Condition;
-  };
-
-  /// Cmpps instruction - compare packed single-precision floating point
-  /// values.
-  class InstX86Cmpps final : public InstX86Base {
-    InstX86Cmpps() = delete;
-    InstX86Cmpps(const InstX86Cmpps &) = delete;
-    InstX86Cmpps &operator=(const InstX86Cmpps &) = delete;
-
-  public:
-    static InstX86Cmpps *create(Cfg *Func, Variable *Dest, Operand *Source,
-                                CmppsCond Condition) {
-      return new (Func->allocate<InstX86Cmpps>())
-          InstX86Cmpps(Func, Dest, Source, Condition);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Cmpps);
-    }
-
-  private:
-    InstX86Cmpps(Cfg *Func, Variable *Dest, Operand *Source, CmppsCond Cond);
-
-    CmppsCond Condition;
-  };
-
-  /// Cmpxchg instruction - cmpxchg <dest>, <desired> will compare if <dest>
-  /// equals eax. If so, the ZF is set and <desired> is stored in <dest>. If
-  /// not, ZF is cleared and <dest> is copied to eax (or subregister). <dest>
-  /// can be a register or memory, while <desired> must be a register. It is
-  /// the user's responsibility to mark eax with a FakeDef.
-  class InstX86Cmpxchg final : public InstX86BaseLockable {
-    InstX86Cmpxchg() = delete;
-    InstX86Cmpxchg(const InstX86Cmpxchg &) = delete;
-    InstX86Cmpxchg &operator=(const InstX86Cmpxchg &) = delete;
-
-  public:
-    static InstX86Cmpxchg *create(Cfg *Func, Operand *DestOrAddr, Variable *Eax,
-                                  Variable *Desired, bool Locked) {
-      return new (Func->allocate<InstX86Cmpxchg>())
-          InstX86Cmpxchg(Func, DestOrAddr, Eax, Desired, Locked);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Cmpxchg);
-    }
-
-  private:
-    InstX86Cmpxchg(Cfg *Func, Operand *DestOrAddr, Variable *Eax,
-                   Variable *Desired, bool Locked);
-  };
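A hedged usage sketch of the cmpxchg contract above; Func, Node, Addr, Eax, and Desired are assumed to come from surrounding lowering code:

  constexpr bool Locked = true;
  Node->appendInst(InstX86Cmpxchg::create(Func, Addr, Eax, Desired, Locked));
  // On failure, eax receives the old memory value, so mark it for liveness
  // with Subzero's generic fake-def instruction, as the comment requires.
  Node->appendInst(InstFakeDef::create(Func, Eax));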
-
-  /// Cmpxchg8b instruction - cmpxchg8b <m64> will compare if <m64> equals
-  /// edx:eax. If so, the ZF is set and ecx:ebx is stored in <m64>. If not, ZF
-  /// is cleared and <m64> is copied to edx:eax. The caller is responsible for
-  /// inserting FakeDefs to mark edx and eax as modified. <m64> must be a memory
-  /// operand.
-  class InstX86Cmpxchg8b final : public InstX86BaseLockable {
-    InstX86Cmpxchg8b() = delete;
-    InstX86Cmpxchg8b(const InstX86Cmpxchg8b &) = delete;
-    InstX86Cmpxchg8b &operator=(const InstX86Cmpxchg8b &) = delete;
-
-  public:
-    static InstX86Cmpxchg8b *create(Cfg *Func, X86OperandMem *Dest,
-                                    Variable *Edx, Variable *Eax, Variable *Ecx,
-                                    Variable *Ebx, bool Locked) {
-      return new (Func->allocate<InstX86Cmpxchg8b>())
-          InstX86Cmpxchg8b(Func, Dest, Edx, Eax, Ecx, Ebx, Locked);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Cmpxchg8b);
-    }
-
-  private:
-    InstX86Cmpxchg8b(Cfg *Func, X86OperandMem *Dest, Variable *Edx,
-                     Variable *Eax, Variable *Ecx, Variable *Ebx, bool Locked);
-  };
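The same idea for cmpxchg8b, where both halves of the edx:eax pair need fake definitions (all names assumed):

  Node->appendInst(
      InstX86Cmpxchg8b::create(Func, Addr, Edx, Eax, Ecx, Ebx, Locked));
  // edx:eax receives the old 64-bit memory value when the compare fails.
  Node->appendInst(InstFakeDef::create(Func, Edx));
  Node->appendInst(InstFakeDef::create(Func, Eax));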
-
-  /// Cvt instruction - wrapper for cvtsX2sY where X and Y are in {s,d,i} as
-  /// appropriate.  s=float, d=double, i=int. X and Y are determined from
-  /// dest/src types. Sign and zero extension on the integer operand needs to be
-  /// done separately.
-  class InstX86Cvt final : public InstX86Base {
-    InstX86Cvt() = delete;
-    InstX86Cvt(const InstX86Cvt &) = delete;
-    InstX86Cvt &operator=(const InstX86Cvt &) = delete;
-
-  public:
-    enum CvtVariant { Si2ss, Tss2si, Ss2si, Float2float, Dq2ps, Tps2dq, Ps2dq };
-    static InstX86Cvt *create(Cfg *Func, Variable *Dest, Operand *Source,
-                              CvtVariant Variant) {
-      return new (Func->allocate<InstX86Cvt>())
-          InstX86Cvt(Func, Dest, Source, Variant);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Cvt);
-    }
-    bool isTruncating() const { return Variant == Tss2si || Variant == Tps2dq; }
-
-  private:
-    CvtVariant Variant;
-    InstX86Cvt(Cfg *Func, Variable *Dest, Operand *Source, CvtVariant Variant);
-  };
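One plausible reading of how these variants map onto concrete mnemonics, with the s/d suffix chosen from the float/double operand type; the table is illustrative rather than lifted from the emitter:

  // Si2ss       -> cvtsi2ss / cvtsi2sd    (scalar int to float/double)
  // Tss2si      -> cvttss2si / cvttsd2si  (truncating scalar float to int)
  // Ss2si       -> cvtss2si / cvtsd2si    (rounding scalar float to int)
  // Float2float -> cvtss2sd / cvtsd2ss    (float <-> double)
  // Dq2ps       -> cvtdq2ps               (packed int to packed float)
  // Tps2dq      -> cvttps2dq              (truncating packed float to int)
  // Ps2dq       -> cvtps2dq               (packed float to packed int)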
-
-  /// Round instruction
-  class InstX86Round final
-      : public InstX86BaseThreeAddressop<InstX86Base::Round> {
-  public:
-    static InstX86Round *create(Cfg *Func, Variable *Dest, Operand *Source,
-                                Operand *Imm) {
-      return new (Func->allocate<InstX86Round>())
-          InstX86Round(Func, Dest, Source, Imm);
-    }
-
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-
-  private:
-    InstX86Round(Cfg *Func, Variable *Dest, Operand *Source, Operand *Imm)
-        : InstX86BaseThreeAddressop<InstX86Base::Round>(Func, Dest, Source,
-                                                        Imm) {}
-  };
-
-  /// cmp - Integer compare instruction.
-  class InstX86Icmp final : public InstX86Base {
-    InstX86Icmp() = delete;
-    InstX86Icmp(const InstX86Icmp &) = delete;
-    InstX86Icmp &operator=(const InstX86Icmp &) = delete;
-
-  public:
-    static InstX86Icmp *create(Cfg *Func, Operand *Src1, Operand *Src2) {
-      return new (Func->allocate<InstX86Icmp>()) InstX86Icmp(Func, Src1, Src2);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Icmp);
-    }
-
-  private:
-    InstX86Icmp(Cfg *Func, Operand *Src1, Operand *Src2);
-  };
-
-  /// ucomiss/ucomisd - floating-point compare instruction.
-  class InstX86Ucomiss final : public InstX86Base {
-    InstX86Ucomiss() = delete;
-    InstX86Ucomiss(const InstX86Ucomiss &) = delete;
-    InstX86Ucomiss &operator=(const InstX86Ucomiss &) = delete;
-
-  public:
-    static InstX86Ucomiss *create(Cfg *Func, Operand *Src1, Operand *Src2) {
-      return new (Func->allocate<InstX86Ucomiss>())
-          InstX86Ucomiss(Func, Src1, Src2);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Ucomiss);
-    }
-
-  private:
-    InstX86Ucomiss(Cfg *Func, Operand *Src1, Operand *Src2);
-  };
-
-  /// UD2 instruction.
-  class InstX86UD2 final : public InstX86Base {
-    InstX86UD2() = delete;
-    InstX86UD2(const InstX86UD2 &) = delete;
-    InstX86UD2 &operator=(const InstX86UD2 &) = delete;
-
-  public:
-    static InstX86UD2 *create(Cfg *Func) {
-      return new (Func->allocate<InstX86UD2>()) InstX86UD2(Func);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::UD2);
-    }
-
-  private:
-    explicit InstX86UD2(Cfg *Func);
-  };
-
-  /// Int3 instruction.
-  class InstX86Int3 final : public InstX86Base {
-    InstX86Int3() = delete;
-    InstX86Int3(const InstX86Int3 &) = delete;
-    InstX86Int3 &operator=(const InstX86Int3 &) = delete;
-
-  public:
-    static InstX86Int3 *create(Cfg *Func) {
-      return new (Func->allocate<InstX86Int3>()) InstX86Int3(Func);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Int3);
-    }
-
-  private:
-    explicit InstX86Int3(Cfg *Func);
-  };
-
-  /// Test instruction.
-  class InstX86Test final : public InstX86Base {
-    InstX86Test() = delete;
-    InstX86Test(const InstX86Test &) = delete;
-    InstX86Test &operator=(const InstX86Test &) = delete;
-
-  public:
-    static InstX86Test *create(Cfg *Func, Operand *Source1, Operand *Source2) {
-      return new (Func->allocate<InstX86Test>())
-          InstX86Test(Func, Source1, Source2);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Test);
-    }
-
-  private:
-    InstX86Test(Cfg *Func, Operand *Source1, Operand *Source2);
-  };
-
-  /// Mfence instruction.
-  class InstX86Mfence final : public InstX86Base {
-    InstX86Mfence() = delete;
-    InstX86Mfence(const InstX86Mfence &) = delete;
-    InstX86Mfence &operator=(const InstX86Mfence &) = delete;
-
-  public:
-    static InstX86Mfence *create(Cfg *Func) {
-      return new (Func->allocate<InstX86Mfence>()) InstX86Mfence(Func);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Mfence);
-    }
-
-  private:
-    explicit InstX86Mfence(Cfg *Func);
-  };
-
-  /// This is essentially a "mov" instruction with anX86OperandMem operand
-  /// instead of Variable as the destination. It's important for liveness that
-  /// there is no Dest operand.
-  class InstX86Store final : public InstX86Base {
-    InstX86Store() = delete;
-    InstX86Store(const InstX86Store &) = delete;
-    InstX86Store &operator=(const InstX86Store &) = delete;
-
-  public:
-    static InstX86Store *create(Cfg *Func, Operand *Value, X86Operand *Mem) {
-      return new (Func->allocate<InstX86Store>())
-          InstX86Store(Func, Value, Mem);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Store);
-    }
-
-  private:
-    InstX86Store(Cfg *Func, Operand *Value, X86Operand *Mem);
-  };
-
-  /// This is essentially a vector "mov" instruction with an X86OperandMem
-  /// operand instead of a Variable as the destination. It's important for
-  /// liveness that there is no Dest operand. The source must be an Xmm
-  /// register, since Dest is mem.
-  class InstX86StoreP final : public InstX86Base {
-    InstX86StoreP() = delete;
-    InstX86StoreP(const InstX86StoreP &) = delete;
-    InstX86StoreP &operator=(const InstX86StoreP &) = delete;
-
-  public:
-    static InstX86StoreP *create(Cfg *Func, Variable *Value,
-                                 X86OperandMem *Mem) {
-      return new (Func->allocate<InstX86StoreP>())
-          InstX86StoreP(Func, Value, Mem);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::StoreP);
-    }
-
-  private:
-    InstX86StoreP(Cfg *Func, Variable *Value, X86OperandMem *Mem);
-  };
-
-  class InstX86StoreQ final : public InstX86Base {
-    InstX86StoreQ() = delete;
-    InstX86StoreQ(const InstX86StoreQ &) = delete;
-    InstX86StoreQ &operator=(const InstX86StoreQ &) = delete;
-
-  public:
-    static InstX86StoreQ *create(Cfg *Func, Operand *Value,
-                                 X86OperandMem *Mem) {
-      return new (Func->allocate<InstX86StoreQ>())
-          InstX86StoreQ(Func, Value, Mem);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::StoreQ);
-    }
-
-  private:
-    InstX86StoreQ(Cfg *Func, Operand *Value, X86OperandMem *Mem);
-  };
-
-  class InstX86StoreD final : public InstX86Base {
-    InstX86StoreD() = delete;
-    InstX86StoreD(const InstX86StoreD &) = delete;
-    InstX86StoreD &operator=(const InstX86StoreD &) = delete;
-
-  public:
-    static InstX86StoreD *create(Cfg *Func, Operand *Value,
-                                 X86OperandMem *Mem) {
-      return new (Func->allocate<InstX86StoreD>())
-          InstX86StoreD(Func, Value, Mem);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::StoreD);
-    }
-
-  private:
-    InstX86StoreD(Cfg *Func, Operand *Value, X86OperandMem *Mem);
-  };
-
-  /// Nop instructions of varying length
-  class InstX86Nop final : public InstX86Base {
-    InstX86Nop() = delete;
-    InstX86Nop(const InstX86Nop &) = delete;
-    InstX86Nop &operator=(const InstX86Nop &) = delete;
-
-  public:
-    // TODO: Replace with enum.
-    using NopVariant = unsigned;
-
-    static InstX86Nop *create(Cfg *Func, NopVariant Variant) {
-      return new (Func->allocate<InstX86Nop>()) InstX86Nop(Func, Variant);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Nop);
-    }
-
-  private:
-    InstX86Nop(Cfg *Func, NopVariant Variant);
-
-    NopVariant Variant;
-  };
-
-  class InstX86Pop final : public InstX86Base {
-    InstX86Pop() = delete;
-    InstX86Pop(const InstX86Pop &) = delete;
-    InstX86Pop &operator=(const InstX86Pop &) = delete;
-
-  public:
-    static InstX86Pop *create(Cfg *Func, Variable *Dest) {
-      return new (Func->allocate<InstX86Pop>()) InstX86Pop(Func, Dest);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Pop);
-    }
-
-  private:
-    InstX86Pop(Cfg *Func, Variable *Dest);
-  };
-
-  class InstX86Push final : public InstX86Base {
-    InstX86Push() = delete;
-    InstX86Push(const InstX86Push &) = delete;
-    InstX86Push &operator=(const InstX86Push &) = delete;
-
-  public:
-    static InstX86Push *create(Cfg *Func, InstX86Label *Label) {
-      return new (Func->allocate<InstX86Push>()) InstX86Push(Func, Label);
-    }
-    static InstX86Push *create(Cfg *Func, Operand *Source) {
-      return new (Func->allocate<InstX86Push>()) InstX86Push(Func, Source);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Push);
-    }
-
-  private:
-    InstX86Label *Label = nullptr;
-
-    InstX86Push(Cfg *Func, Operand *Source);
-    InstX86Push(Cfg *Func, InstX86Label *Label);
-  };
-
-  /// Ret instruction. Currently only supports the "ret" version that does not
-  /// pop arguments. This instruction takes a Source operand (for non-void
-  /// returning functions) for liveness analysis, though a FakeUse before the
-  /// ret would do just as well.
-  class InstX86Ret final : public InstX86Base {
-    InstX86Ret() = delete;
-    InstX86Ret(const InstX86Ret &) = delete;
-    InstX86Ret &operator=(const InstX86Ret &) = delete;
-
-  public:
-    static InstX86Ret *create(Cfg *Func, Variable *Source = nullptr) {
-      return new (Func->allocate<InstX86Ret>()) InstX86Ret(Func, Source);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Ret);
-    }
-
-  private:
-    InstX86Ret(Cfg *Func, Variable *Source);
-  };
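The two equivalent forms the comment mentions, sketched with assumed names (ReturnReg carries the return value):

  // Form 1: hand the source straight to ret for liveness purposes.
  Node->appendInst(InstX86Ret::create(Func, ReturnReg));
  // Form 2: a bare ret preceded by a FakeUse of the return register.
  Node->appendInst(InstFakeUse::create(Func, ReturnReg));
  Node->appendInst(InstX86Ret::create(Func));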
-
-  /// Conditional set-byte instruction.
-  class InstX86Setcc final : public InstX86Base {
-    InstX86Setcc() = delete;
-    InstX86Setcc(const InstX86Setcc &) = delete;
-    InstX86Setcc &operator=(const InstX86Setcc &) = delete;
-
-  public:
-    static InstX86Setcc *create(Cfg *Func, Variable *Dest, BrCond Cond) {
-      return new (Func->allocate<InstX86Setcc>())
-          InstX86Setcc(Func, Dest, Cond);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Setcc);
-    }
-
-  private:
-    InstX86Setcc(Cfg *Func, Variable *Dest, BrCond Cond);
-
-    const BrCond Condition;
-  };
-
-  /// Exchanging Add instruction. Exchanges the first operand (destination
-  /// operand) with the second operand (source operand), then loads the sum of
-  /// the two values into the destination operand. The destination may be a
-  /// register or memory, while the source must be a register.
-  ///
-  /// Both the dest and source are updated. The caller should then insert a
-  /// FakeDef to reflect the second update.
-  class InstX86Xadd final : public InstX86BaseLockable {
-    InstX86Xadd() = delete;
-    InstX86Xadd(const InstX86Xadd &) = delete;
-    InstX86Xadd &operator=(const InstX86Xadd &) = delete;
-
-  public:
-    static InstX86Xadd *create(Cfg *Func, Operand *Dest, Variable *Source,
-                               bool Locked) {
-      return new (Func->allocate<InstX86Xadd>())
-          InstX86Xadd(Func, Dest, Source, Locked);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Xadd);
-    }
-
-  private:
-    InstX86Xadd(Cfg *Func, Operand *Dest, Variable *Source, bool Locked);
-  };
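A plain C++ model of the xadd dataflow described above; it is not compiler code, just the exchange-then-add behavior that motivates the extra FakeDef:

  void xaddModel(int &Dest, int &Source) {
    const int OldDest = Dest;
    Dest = OldDest + Source; // Destination ends up holding the sum.
    Source = OldDest;        // Source ends up holding the old destination.
  }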
-
-  /// Exchange instruction. Exchanges the first operand (destination operand)
-  /// with the second operand (source operand). At least one of the operands
-  /// must be a register (and the other can be reg or mem). Both the Dest and
-  /// Source are updated. If there is a memory operand, then the instruction is
-  /// automatically "locked" without the need for a lock prefix.
-  class InstX86Xchg final : public InstX86Base {
-    InstX86Xchg() = delete;
-    InstX86Xchg(const InstX86Xchg &) = delete;
-    InstX86Xchg &operator=(const InstX86Xchg &) = delete;
-
-  public:
-    static InstX86Xchg *create(Cfg *Func, Operand *Dest, Variable *Source) {
-      return new (Func->allocate<InstX86Xchg>())
-          InstX86Xchg(Func, Dest, Source);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::Xchg);
-    }
-
-  private:
-    InstX86Xchg(Cfg *Func, Operand *Dest, Variable *Source);
-  };
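And the corresponding model for xchg; the lock on a memory operand is an architectural guarantee of the instruction itself, which is why no prefix is emitted:

  void xchgModel(int &Dest, int &Source) {
    // Both operands are written, so both need liveness updates in the
    // lowering that emits this.
    const int OldDest = Dest;
    Dest = Source;
    Source = OldDest;
  }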
-
-  /// Start marker for the Intel Architecture Code Analyzer. This is not an
-  /// executable instruction and must only be used for analysis.
-  class InstX86IacaStart final : public InstX86Base {
-    InstX86IacaStart() = delete;
-    InstX86IacaStart(const InstX86IacaStart &) = delete;
-    InstX86IacaStart &operator=(const InstX86IacaStart &) = delete;
-
-  public:
-    static InstX86IacaStart *create(Cfg *Func) {
-      return new (Func->allocate<InstX86IacaStart>()) InstX86IacaStart(Func);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::IacaStart);
-    }
-
-  private:
-    explicit InstX86IacaStart(Cfg *Func);
-  };
-
-  /// End marker for the Intel Architecture Code Analyzer. This is not an
-  /// executable instruction and must only be used for analysis.
-  class InstX86IacaEnd final : public InstX86Base {
-    InstX86IacaEnd() = delete;
-    InstX86IacaEnd(const InstX86IacaEnd &) = delete;
-    InstX86IacaEnd &operator=(const InstX86IacaEnd &) = delete;
-
-  public:
-    static InstX86IacaEnd *create(Cfg *Func) {
-      return new (Func->allocate<InstX86IacaEnd>()) InstX86IacaEnd(Func);
-    }
-    void emit(const Cfg *Func) const override;
-    void emitIAS(const Cfg *Func) const override;
-    void dump(const Cfg *Func) const override;
-    static bool classof(const Inst *Instr) {
-      return InstX86Base::isClassof(Instr, InstX86Base::IacaEnd);
-    }
-
-  private:
-    explicit InstX86IacaEnd(Cfg *Func);
-  };
-
-  class InstX86Pshufb
-      : public InstX86BaseBinopXmm<InstX86Base::Pshufb, false,
-                                   InstX86Base::SseSuffix::None> {
-  public:
-    static InstX86Pshufb *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Pshufb>())
-          InstX86Pshufb(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Pshufb(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Pshufb, false,
-                              InstX86Base::SseSuffix::None>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Punpckl
-      : public InstX86BaseBinopXmm<InstX86Base::Punpckl, false,
-                                   InstX86Base::SseSuffix::Unpack> {
-  public:
-    static InstX86Punpckl *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Punpckl>())
-          InstX86Punpckl(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Punpckl(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Punpckl, false,
-                              InstX86Base::SseSuffix::Unpack>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Punpckh
-      : public InstX86BaseBinopXmm<InstX86Base::Punpckh, false,
-                                   InstX86Base::SseSuffix::Unpack> {
-  public:
-    static InstX86Punpckh *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Punpckh>())
-          InstX86Punpckh(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Punpckh(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Punpckh, false,
-                              InstX86Base::SseSuffix::Unpack>(Func, Dest,
-                                                              Source) {}
-  };
-
-  class InstX86Packss
-      : public InstX86BaseBinopXmm<InstX86Base::Packss, false,
-                                   InstX86Base::SseSuffix::Pack> {
-  public:
-    static InstX86Packss *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Packss>())
-          InstX86Packss(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Packss(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Packss, false,
-                              InstX86Base::SseSuffix::Pack>(Func, Dest,
-                                                            Source) {}
-  };
-
-  class InstX86Packus
-      : public InstX86BaseBinopXmm<InstX86Base::Packus, false,
-                                   InstX86Base::SseSuffix::Pack> {
-  public:
-    static InstX86Packus *create(Cfg *Func, Variable *Dest, Operand *Source) {
-      return new (Func->allocate<InstX86Packus>())
-          InstX86Packus(Func, Dest, Source);
-    }
-
-  private:
-    InstX86Packus(Cfg *Func, Variable *Dest, Operand *Source)
-        : InstX86BaseBinopXmm<InstX86Base::Packus, false,
-                              InstX86Base::SseSuffix::Pack>(Func, Dest,
-                                                            Source) {}
-  };
-
-}; // struct InstImpl
-
-/// struct Insts is a template that can be used to instantiate all the X86
-/// instructions for a target with a simple
-///
-/// using Insts = ::Ice::X8664::Insts<TraitsType>;
-template <typename TraitsType> struct Insts {
-  using FakeRMW = typename InstImpl<TraitsType>::InstX86FakeRMW;
-  using Label = typename InstImpl<TraitsType>::InstX86Label;
-
-  using Call = typename InstImpl<TraitsType>::InstX86Call;
-
-  using Br = typename InstImpl<TraitsType>::InstX86Br;
-  using Jmp = typename InstImpl<TraitsType>::InstX86Jmp;
-  using Bswap = typename InstImpl<TraitsType>::InstX86Bswap;
-  using Neg = typename InstImpl<TraitsType>::InstX86Neg;
-  using Bsf = typename InstImpl<TraitsType>::InstX86Bsf;
-  using Bsr = typename InstImpl<TraitsType>::InstX86Bsr;
-  using Lea = typename InstImpl<TraitsType>::InstX86Lea;
-  using Cbwdq = typename InstImpl<TraitsType>::InstX86Cbwdq;
-  using Movsx = typename InstImpl<TraitsType>::InstX86Movsx;
-  using Movzx = typename InstImpl<TraitsType>::InstX86Movzx;
-  using Movd = typename InstImpl<TraitsType>::InstX86Movd;
-  using Movmsk = typename InstImpl<TraitsType>::InstX86Movmsk;
-  using Sqrt = typename InstImpl<TraitsType>::InstX86Sqrt;
-  using Mov = typename InstImpl<TraitsType>::InstX86Mov;
-  using Movp = typename InstImpl<TraitsType>::InstX86Movp;
-  using Movq = typename InstImpl<TraitsType>::InstX86Movq;
-  using Add = typename InstImpl<TraitsType>::InstX86Add;
-  using AddRMW = typename InstImpl<TraitsType>::InstX86AddRMW;
-  using Addps = typename InstImpl<TraitsType>::InstX86Addps;
-  using Adc = typename InstImpl<TraitsType>::InstX86Adc;
-  using AdcRMW = typename InstImpl<TraitsType>::InstX86AdcRMW;
-  using Addss = typename InstImpl<TraitsType>::InstX86Addss;
-  using Andnps = typename InstImpl<TraitsType>::InstX86Andnps;
-  using Andps = typename InstImpl<TraitsType>::InstX86Andps;
-  using Padd = typename InstImpl<TraitsType>::InstX86Padd;
-  using Padds = typename InstImpl<TraitsType>::InstX86Padds;
-  using Paddus = typename InstImpl<TraitsType>::InstX86Paddus;
-  using Sub = typename InstImpl<TraitsType>::InstX86Sub;
-  using SubRMW = typename InstImpl<TraitsType>::InstX86SubRMW;
-  using Subps = typename InstImpl<TraitsType>::InstX86Subps;
-  using Subss = typename InstImpl<TraitsType>::InstX86Subss;
-  using Sbb = typename InstImpl<TraitsType>::InstX86Sbb;
-  using SbbRMW = typename InstImpl<TraitsType>::InstX86SbbRMW;
-  using Psub = typename InstImpl<TraitsType>::InstX86Psub;
-  using Psubs = typename InstImpl<TraitsType>::InstX86Psubs;
-  using Psubus = typename InstImpl<TraitsType>::InstX86Psubus;
-  using And = typename InstImpl<TraitsType>::InstX86And;
-  using AndRMW = typename InstImpl<TraitsType>::InstX86AndRMW;
-  using Pand = typename InstImpl<TraitsType>::InstX86Pand;
-  using Pandn = typename InstImpl<TraitsType>::InstX86Pandn;
-  using Or = typename InstImpl<TraitsType>::InstX86Or;
-  using Orps = typename InstImpl<TraitsType>::InstX86Orps;
-  using OrRMW = typename InstImpl<TraitsType>::InstX86OrRMW;
-  using Por = typename InstImpl<TraitsType>::InstX86Por;
-  using Xor = typename InstImpl<TraitsType>::InstX86Xor;
-  using Xorps = typename InstImpl<TraitsType>::InstX86Xorps;
-  using XorRMW = typename InstImpl<TraitsType>::InstX86XorRMW;
-  using Pxor = typename InstImpl<TraitsType>::InstX86Pxor;
-  using Maxss = typename InstImpl<TraitsType>::InstX86Maxss;
-  using Minss = typename InstImpl<TraitsType>::InstX86Minss;
-  using Maxps = typename InstImpl<TraitsType>::InstX86Maxps;
-  using Minps = typename InstImpl<TraitsType>::InstX86Minps;
-  using Imul = typename InstImpl<TraitsType>::InstX86Imul;
-  using ImulImm = typename InstImpl<TraitsType>::InstX86ImulImm;
-  using Mulps = typename InstImpl<TraitsType>::InstX86Mulps;
-  using Mulss = typename InstImpl<TraitsType>::InstX86Mulss;
-  using Pmull = typename InstImpl<TraitsType>::InstX86Pmull;
-  using Pmulhw = typename InstImpl<TraitsType>::InstX86Pmulhw;
-  using Pmulhuw = typename InstImpl<TraitsType>::InstX86Pmulhuw;
-  using Pmaddwd = typename InstImpl<TraitsType>::InstX86Pmaddwd;
-  using Pmuludq = typename InstImpl<TraitsType>::InstX86Pmuludq;
-  using Divps = typename InstImpl<TraitsType>::InstX86Divps;
-  using Divss = typename InstImpl<TraitsType>::InstX86Divss;
-  using Rol = typename InstImpl<TraitsType>::InstX86Rol;
-  using Shl = typename InstImpl<TraitsType>::InstX86Shl;
-  using Psll = typename InstImpl<TraitsType>::InstX86Psll;
-  using Psrl = typename InstImpl<TraitsType>::InstX86Psrl;
-  using Shr = typename InstImpl<TraitsType>::InstX86Shr;
-  using Sar = typename InstImpl<TraitsType>::InstX86Sar;
-  using Psra = typename InstImpl<TraitsType>::InstX86Psra;
-  using Pcmpeq = typename InstImpl<TraitsType>::InstX86Pcmpeq;
-  using Pcmpgt = typename InstImpl<TraitsType>::InstX86Pcmpgt;
-  using MovssRegs = typename InstImpl<TraitsType>::InstX86MovssRegs;
-  using Idiv = typename InstImpl<TraitsType>::InstX86Idiv;
-  using Div = typename InstImpl<TraitsType>::InstX86Div;
-  using Insertps = typename InstImpl<TraitsType>::InstX86Insertps;
-  using Pinsr = typename InstImpl<TraitsType>::InstX86Pinsr;
-  using Shufps = typename InstImpl<TraitsType>::InstX86Shufps;
-  using Blendvps = typename InstImpl<TraitsType>::InstX86Blendvps;
-  using Pblendvb = typename InstImpl<TraitsType>::InstX86Pblendvb;
-  using Pextr = typename InstImpl<TraitsType>::InstX86Pextr;
-  using Pshufd = typename InstImpl<TraitsType>::InstX86Pshufd;
-  using Lockable = typename InstImpl<TraitsType>::InstX86BaseLockable;
-  using Mul = typename InstImpl<TraitsType>::InstX86Mul;
-  using Shld = typename InstImpl<TraitsType>::InstX86Shld;
-  using Shrd = typename InstImpl<TraitsType>::InstX86Shrd;
-  using Cmov = typename InstImpl<TraitsType>::InstX86Cmov;
-  using Cmpps = typename InstImpl<TraitsType>::InstX86Cmpps;
-  using Cmpxchg = typename InstImpl<TraitsType>::InstX86Cmpxchg;
-  using Cmpxchg8b = typename InstImpl<TraitsType>::InstX86Cmpxchg8b;
-  using Cvt = typename InstImpl<TraitsType>::InstX86Cvt;
-  using Round = typename InstImpl<TraitsType>::InstX86Round;
-  using Icmp = typename InstImpl<TraitsType>::InstX86Icmp;
-  using Ucomiss = typename InstImpl<TraitsType>::InstX86Ucomiss;
-  using UD2 = typename InstImpl<TraitsType>::InstX86UD2;
-  using Int3 = typename InstImpl<TraitsType>::InstX86Int3;
-  using Test = typename InstImpl<TraitsType>::InstX86Test;
-  using Mfence = typename InstImpl<TraitsType>::InstX86Mfence;
-  using Store = typename InstImpl<TraitsType>::InstX86Store;
-  using StoreP = typename InstImpl<TraitsType>::InstX86StoreP;
-  using StoreQ = typename InstImpl<TraitsType>::InstX86StoreQ;
-  using StoreD = typename InstImpl<TraitsType>::InstX86StoreD;
-  using Nop = typename InstImpl<TraitsType>::InstX86Nop;
-  using Pop = typename InstImpl<TraitsType>::InstX86Pop;
-  using Push = typename InstImpl<TraitsType>::InstX86Push;
-  using Ret = typename InstImpl<TraitsType>::InstX86Ret;
-  using Setcc = typename InstImpl<TraitsType>::InstX86Setcc;
-  using Xadd = typename InstImpl<TraitsType>::InstX86Xadd;
-  using Xchg = typename InstImpl<TraitsType>::InstX86Xchg;
-
-  using IacaStart = typename InstImpl<TraitsType>::InstX86IacaStart;
-  using IacaEnd = typename InstImpl<TraitsType>::InstX86IacaEnd;
-
-  using Pshufb = typename InstImpl<TraitsType>::InstX86Pshufb;
-  using Punpckl = typename InstImpl<TraitsType>::InstX86Punpckl;
-  using Punpckh = typename InstImpl<TraitsType>::InstX86Punpckh;
-  using Packss = typename InstImpl<TraitsType>::InstX86Packss;
-  using Packus = typename InstImpl<TraitsType>::InstX86Packus;
-};
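A short usage sketch of the (now removed) alias pattern described in the comment above the struct; Func, Dest, Src, and Node are assumed lowering-context names:

  using Insts = ::Ice::X8664::Insts<TargetX8664Traits>;
  // The alias lets lowering code name instructions without spelling out
  // InstImpl<TraitsType>::InstX86Add each time.
  auto *Add = Insts::Add::create(Func, Dest, Src);
  Node->appendInst(Add);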
-
-/// X86 instructions have static data (particularly, opcodes and instruction
-/// emitters). Each X86 target needs to declare and define all of these, so the
-/// macros below are provided so that, if something changes, all X86
-/// targets will be updated automatically.
-#define X86INSTS_DEFINE_STATIC_DATA(TraitsType)                                \
-  namespace Ice {                                                              \
-  namespace X8664 {                                                            \
-  /* In-place ops */                                                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Bswap::Base::Opcode =     \
-      "bswap";                                                                 \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Neg::Base::Opcode =       \
-      "neg";                                                                   \
-  /* Unary ops */                                                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Bsf::Base::Opcode =       \
-      "bsf";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Bsr::Base::Opcode =       \
-      "bsr";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Lea::Base::Opcode =       \
-      "lea";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Movd::Base::Opcode =      \
-      "movd";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Movsx::Base::Opcode =     \
-      "movs";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Movzx::Base::Opcode =     \
-      "movz";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Sqrt::Base::Opcode =      \
-      "sqrt";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Cbwdq::Base::Opcode =     \
-      "cbw/cwd/cdq";                                                           \
-  /* Mov-like ops */                                                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Mov::Base::Opcode =       \
-      "mov";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Movp::Base::Opcode =      \
-      "movups";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Movq::Base::Opcode =      \
-      "movq";                                                                  \
-  /* Binary ops */                                                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Add::Base::Opcode =       \
-      "add";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86AddRMW::Base::Opcode =    \
-      "add";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Addps::Base::Opcode =     \
-      "add";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Adc::Base::Opcode =       \
-      "adc";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86AdcRMW::Base::Opcode =    \
-      "adc";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Addss::Base::Opcode =     \
-      "add";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Andnps::Base::Opcode =    \
-      "andn";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Andps::Base::Opcode =     \
-      "and";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Maxss::Base::Opcode =     \
-      "max";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Minss::Base::Opcode =     \
-      "min";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Maxps::Base::Opcode =     \
-      "max";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Minps::Base::Opcode =     \
-      "min";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Padd::Base::Opcode =      \
-      "padd";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Padds::Base::Opcode =     \
-      "padds";                                                                 \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Paddus::Base::Opcode =    \
-      "paddus";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Sub::Base::Opcode =       \
-      "sub";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86SubRMW::Base::Opcode =    \
-      "sub";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Subps::Base::Opcode =     \
-      "sub";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Subss::Base::Opcode =     \
-      "sub";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Sbb::Base::Opcode =       \
-      "sbb";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86SbbRMW::Base::Opcode =    \
-      "sbb";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Psub::Base::Opcode =      \
-      "psub";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Psubs::Base::Opcode =     \
-      "psubs";                                                                 \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Psubus::Base::Opcode =    \
-      "psubus";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86And::Base::Opcode =       \
-      "and";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86AndRMW::Base::Opcode =    \
-      "and";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pand::Base::Opcode =      \
-      "pand";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pandn::Base::Opcode =     \
-      "pandn";                                                                 \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Or::Base::Opcode = "or";  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Orps::Base::Opcode =      \
-      "or";                                                                    \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86OrRMW::Base::Opcode =     \
-      "or";                                                                    \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Por::Base::Opcode =       \
-      "por";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Xor::Base::Opcode =       \
-      "xor";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Xorps::Base::Opcode =     \
-      "xor";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86XorRMW::Base::Opcode =    \
-      "xor";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pxor::Base::Opcode =      \
-      "pxor";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Imul::Base::Opcode =      \
-      "imul";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86ImulImm::Base::Opcode =   \
-      "imul";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Mulps::Base::Opcode =     \
-      "mul";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Mulss::Base::Opcode =     \
-      "mul";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pmull::Base::Opcode =     \
-      "pmull";                                                                 \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pmulhw::Base::Opcode =    \
-      "pmulhw";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pmulhuw::Base::Opcode =   \
-      "pmulhuw";                                                               \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pmaddwd::Base::Opcode =   \
-      "pmaddwd";                                                               \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pmuludq::Base::Opcode =   \
-      "pmuludq";                                                               \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Div::Base::Opcode =       \
-      "div";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Divps::Base::Opcode =     \
-      "div";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Divss::Base::Opcode =     \
-      "div";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Idiv::Base::Opcode =      \
-      "idiv";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Rol::Base::Opcode =       \
-      "rol";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Shl::Base::Opcode =       \
-      "shl";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Psll::Base::Opcode =      \
-      "psll";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Shr::Base::Opcode =       \
-      "shr";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Sar::Base::Opcode =       \
-      "sar";                                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Psra::Base::Opcode =      \
-      "psra";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Psrl::Base::Opcode =      \
-      "psrl";                                                                  \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pcmpeq::Base::Opcode =    \
-      "pcmpeq";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pcmpgt::Base::Opcode =    \
-      "pcmpgt";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86MovssRegs::Base::Opcode = \
-      "movss";                                                                 \
-  /* Ternary ops */                                                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Insertps::Base::Opcode =  \
-      "insertps";                                                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Round::Base::Opcode =     \
-      "round";                                                                 \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Shufps::Base::Opcode =    \
-      "shufps";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pinsr::Base::Opcode =     \
-      "pinsr";                                                                 \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Blendvps::Base::Opcode =  \
-      "blendvps";                                                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pblendvb::Base::Opcode =  \
-      "pblendvb";                                                              \
-  /* Three address ops */                                                      \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pextr::Base::Opcode =     \
-      "pextr";                                                                 \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pshufd::Base::Opcode =    \
-      "pshufd";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Pshufb::Base::Opcode =    \
-      "pshufb";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Punpckl::Base::Opcode =   \
-      "punpckl";                                                               \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Punpckh::Base::Opcode =   \
-      "punpckh";                                                               \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Packss::Base::Opcode =    \
-      "packss";                                                                \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr const char *InstImpl<TraitsType>::InstX86Packus::Base::Opcode =    \
-      "packus";                                                                \
-  /* Inplace GPR ops */                                                        \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterOneOp                   \
-      InstImpl<TraitsType>::InstX86Bswap::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::bswap,                             \
-          nullptr /* only a reg form exists */                                 \
-  };                                                                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterOneOp                   \
-      InstImpl<TraitsType>::InstX86Neg::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::neg,                               \
-          &InstImpl<TraitsType>::Assembler::neg};                              \
-                                                                               \
-  /* Unary GPR ops */                                                          \
-  template <>                                                                  \
-  template <> /* uses specialized emitter. */                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Cbwdq::Base::Emitter = {nullptr, nullptr,   \
-                                                           nullptr};           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Bsf::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::bsf,                               \
-          &InstImpl<TraitsType>::Assembler::bsf, nullptr};                     \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Bsr::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::bsr,                               \
-          &InstImpl<TraitsType>::Assembler::bsr, nullptr};                     \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Lea::Base::Emitter = {                      \
-          /* reg/reg and reg/imm are illegal */ nullptr,                       \
-          &InstImpl<TraitsType>::Assembler::lea, nullptr};                     \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Movsx::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::movsx,                             \
-          &InstImpl<TraitsType>::Assembler::movsx, nullptr};                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Movzx::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::movzx,                             \
-          &InstImpl<TraitsType>::Assembler::movzx, nullptr};                   \
-                                                                               \
-  /* Unary XMM ops */                                                          \
-  template <>                                                                  \
-  template <> /* uses specialized emitter. */                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Movd::Base::Emitter = {nullptr, nullptr};   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Sqrt::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::sqrt,                              \
-          &InstImpl<TraitsType>::Assembler::sqrt};                             \
-                                                                               \
-  /* Binary GPR ops */                                                         \
-  template <>                                                                  \
-  template <> /* uses specialized emitter. */                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Imul::Base::Emitter = {nullptr, nullptr,    \
-                                                          nullptr};            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Add::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::add,                               \
-          &InstImpl<TraitsType>::Assembler::add,                               \
-          &InstImpl<TraitsType>::Assembler::add};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterAddrOp                  \
-      InstImpl<TraitsType>::InstX86AddRMW::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::add,                               \
-          &InstImpl<TraitsType>::Assembler::add};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Adc::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::adc,                               \
-          &InstImpl<TraitsType>::Assembler::adc,                               \
-          &InstImpl<TraitsType>::Assembler::adc};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterAddrOp                  \
-      InstImpl<TraitsType>::InstX86AdcRMW::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::adc,                               \
-          &InstImpl<TraitsType>::Assembler::adc};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86And::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::And,                               \
-          &InstImpl<TraitsType>::Assembler::And,                               \
-          &InstImpl<TraitsType>::Assembler::And};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterAddrOp                  \
-      InstImpl<TraitsType>::InstX86AndRMW::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::And,                               \
-          &InstImpl<TraitsType>::Assembler::And};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Or::Base::Emitter = {                       \
-          &InstImpl<TraitsType>::Assembler::Or,                                \
-          &InstImpl<TraitsType>::Assembler::Or,                                \
-          &InstImpl<TraitsType>::Assembler::Or};                               \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterAddrOp                  \
-      InstImpl<TraitsType>::InstX86OrRMW::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::Or,                                \
-          &InstImpl<TraitsType>::Assembler::Or};                               \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Sbb::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::sbb,                               \
-          &InstImpl<TraitsType>::Assembler::sbb,                               \
-          &InstImpl<TraitsType>::Assembler::sbb};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterAddrOp                  \
-      InstImpl<TraitsType>::InstX86SbbRMW::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::sbb,                               \
-          &InstImpl<TraitsType>::Assembler::sbb};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Sub::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::sub,                               \
-          &InstImpl<TraitsType>::Assembler::sub,                               \
-          &InstImpl<TraitsType>::Assembler::sub};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterAddrOp                  \
-      InstImpl<TraitsType>::InstX86SubRMW::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::sub,                               \
-          &InstImpl<TraitsType>::Assembler::sub};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Xor::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::Xor,                               \
-          &InstImpl<TraitsType>::Assembler::Xor,                               \
-          &InstImpl<TraitsType>::Assembler::Xor};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterAddrOp                  \
-      InstImpl<TraitsType>::InstX86XorRMW::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::Xor,                               \
-          &InstImpl<TraitsType>::Assembler::Xor};                              \
-                                                                               \
-  /* Binary Shift GPR ops */                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterShiftOp                 \
-      InstImpl<TraitsType>::InstX86Rol::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::rol,                               \
-          &InstImpl<TraitsType>::Assembler::rol};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterShiftOp                 \
-      InstImpl<TraitsType>::InstX86Sar::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::sar,                               \
-          &InstImpl<TraitsType>::Assembler::sar};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterShiftOp                 \
-      InstImpl<TraitsType>::InstX86Shl::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::shl,                               \
-          &InstImpl<TraitsType>::Assembler::shl};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::GPREmitterShiftOp                 \
-      InstImpl<TraitsType>::InstX86Shr::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::shr,                               \
-          &InstImpl<TraitsType>::Assembler::shr};                              \
-                                                                               \
-  /* Binary XMM ops */                                                         \
-  template <>                                                                  \
-  template <> /* uses specialized emitter. */                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86MovssRegs::Base::Emitter = {nullptr,        \
-                                                               nullptr};       \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Addss::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::addss,                             \
-          &InstImpl<TraitsType>::Assembler::addss};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Addps::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::addps,                             \
-          &InstImpl<TraitsType>::Assembler::addps};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Divss::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::divss,                             \
-          &InstImpl<TraitsType>::Assembler::divss};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Divps::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::divps,                             \
-          &InstImpl<TraitsType>::Assembler::divps};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Mulss::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::mulss,                             \
-          &InstImpl<TraitsType>::Assembler::mulss};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Mulps::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::mulps,                             \
-          &InstImpl<TraitsType>::Assembler::mulps};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Padd::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::padd,                              \
-          &InstImpl<TraitsType>::Assembler::padd};                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Padds::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::padds,                             \
-          &InstImpl<TraitsType>::Assembler::padds};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Paddus::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::paddus,                            \
-          &InstImpl<TraitsType>::Assembler::paddus};                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pand::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::pand,                              \
-          &InstImpl<TraitsType>::Assembler::pand};                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pandn::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::pandn,                             \
-          &InstImpl<TraitsType>::Assembler::pandn};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pcmpeq::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::pcmpeq,                            \
-          &InstImpl<TraitsType>::Assembler::pcmpeq};                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pcmpgt::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::pcmpgt,                            \
-          &InstImpl<TraitsType>::Assembler::pcmpgt};                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pmull::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::pmull,                             \
-          &InstImpl<TraitsType>::Assembler::pmull};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pmulhw::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::pmulhw,                            \
-          &InstImpl<TraitsType>::Assembler::pmulhw};                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pmulhuw::Base::Emitter = {                  \
-          &InstImpl<TraitsType>::Assembler::pmulhuw,                           \
-          &InstImpl<TraitsType>::Assembler::pmulhuw};                          \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pmaddwd::Base::Emitter = {                  \
-          &InstImpl<TraitsType>::Assembler::pmaddwd,                           \
-          &InstImpl<TraitsType>::Assembler::pmaddwd};                          \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pmuludq::Base::Emitter = {                  \
-          &InstImpl<TraitsType>::Assembler::pmuludq,                           \
-          &InstImpl<TraitsType>::Assembler::pmuludq};                          \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Por::Base::Emitter = {                      \
-          &InstImpl<TraitsType>::Assembler::por,                               \
-          &InstImpl<TraitsType>::Assembler::por};                              \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Psub::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::psub,                              \
-          &InstImpl<TraitsType>::Assembler::psub};                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Psubs::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::psubs,                             \
-          &InstImpl<TraitsType>::Assembler::psubs};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Psubus::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::psubus,                            \
-          &InstImpl<TraitsType>::Assembler::psubus};                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pxor::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::pxor,                              \
-          &InstImpl<TraitsType>::Assembler::pxor};                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Subss::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::subss,                             \
-          &InstImpl<TraitsType>::Assembler::subss};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Subps::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::subps,                             \
-          &InstImpl<TraitsType>::Assembler::subps};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Andnps::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::andnps,                            \
-          &InstImpl<TraitsType>::Assembler::andnps};                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Andps::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::andps,                             \
-          &InstImpl<TraitsType>::Assembler::andps};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Maxss::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::maxss,                             \
-          &InstImpl<TraitsType>::Assembler::maxss};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Minss::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::minss,                             \
-          &InstImpl<TraitsType>::Assembler::minss};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Maxps::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::maxps,                             \
-          &InstImpl<TraitsType>::Assembler::maxps};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Minps::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::minps,                             \
-          &InstImpl<TraitsType>::Assembler::minps};                            \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Orps::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::orps,                              \
-          &InstImpl<TraitsType>::Assembler::orps};                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Xorps::Base::Emitter = {                    \
-          &InstImpl<TraitsType>::Assembler::xorps,                             \
-          &InstImpl<TraitsType>::Assembler::xorps};                            \
-                                                                               \
-  /* Binary XMM Shift ops */                                                   \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterShiftOp                 \
-      InstImpl<TraitsType>::InstX86Psll::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::psll,                              \
-          &InstImpl<TraitsType>::Assembler::psll,                              \
-          &InstImpl<TraitsType>::Assembler::psll};                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterShiftOp                 \
-      InstImpl<TraitsType>::InstX86Psra::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::psra,                              \
-          &InstImpl<TraitsType>::Assembler::psra,                              \
-          &InstImpl<TraitsType>::Assembler::psra};                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterShiftOp                 \
-      InstImpl<TraitsType>::InstX86Psrl::Base::Emitter = {                     \
-          &InstImpl<TraitsType>::Assembler::psrl,                              \
-          &InstImpl<TraitsType>::Assembler::psrl,                              \
-          &InstImpl<TraitsType>::Assembler::psrl};                             \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Pshufb::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::pshufb,                            \
-          &InstImpl<TraitsType>::Assembler::pshufb};                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Punpckl::Base::Emitter = {                  \
-          &InstImpl<TraitsType>::Assembler::punpckl,                           \
-          &InstImpl<TraitsType>::Assembler::punpckl};                          \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Punpckh::Base::Emitter = {                  \
-          &InstImpl<TraitsType>::Assembler::punpckh,                           \
-          &InstImpl<TraitsType>::Assembler::punpckh};                          \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Packss::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::packss,                            \
-          &InstImpl<TraitsType>::Assembler::packss};                           \
-  template <>                                                                  \
-  template <>                                                                  \
-  constexpr InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                   \
-      InstImpl<TraitsType>::InstX86Packus::Base::Emitter = {                   \
-          &InstImpl<TraitsType>::Assembler::packus,                            \
-          &InstImpl<TraitsType>::Assembler::packus};                           \
-  }                                                                            \
-  }
-
-} // namespace X8664
-} // end of namespace Ice
-
-#include "IceInstX8664BaseImpl.h"
-
-#endif // SUBZERO_SRC_ICEINSTX8664BASE_H
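
With the traits de-templatized, each "template <> template <>" pair above
collapses into a plain out-of-class definition in the concrete X8664 code.
A minimal sketch of the resulting form, mirroring the IceInstX8632.cpp
pattern earlier in this change (the exact file and surrounding code are
assumptions, not part of this diff):

  namespace Ice {
  namespace X8664 {

  // Opcode strings become ordinary definitions of the static members.
  constexpr const char *InstX86Add::Base::Opcode = "add";

  // Emitter tables keep their shape. For GPREmitterRegOp the three slots
  // are the reg/reg, reg/addr, and reg/imm forms of an assembler method;
  // nullptr marks forms that are illegal or that use a specialized
  // emitter (as with Cbwdq, Movd, and MovssRegs above).
  constexpr AssemblerX8664::GPREmitterRegOp InstX86Add::Base::Emitter = {
      &AssemblerX8664::add, &AssemblerX8664::add, &AssemblerX8664::add};

  } // namespace X8664
  } // end of namespace Ice
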
diff --git a/third_party/subzero/src/IceInstX8664BaseImpl.h b/third_party/subzero/src/IceInstX8664BaseImpl.h
deleted file mode 100644
index 4870be5..0000000
--- a/third_party/subzero/src/IceInstX8664BaseImpl.h
+++ /dev/null
@@ -1,2963 +0,0 @@
-//===- subzero/src/IceInstX8664BaseImpl.h - Generic X86 instructions -*- C++ -*-===//
-//
-//                        The Subzero Code Generator
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// \brief Implements the InstX86Base class and its descendants.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef SUBZERO_SRC_ICEINSTX8664BASEIMPL_H
-#define SUBZERO_SRC_ICEINSTX8664BASEIMPL_H
-
-#include "IceInstX8664Base.h"
-
-#include "IceAssemblerX8664.h"
-#include "IceCfg.h"
-#include "IceCfgNode.h"
-#include "IceDefs.h"
-#include "IceInst.h"
-#include "IceOperand.h"
-#include "IceTargetLowering.h"
-#include "IceTargetLoweringX8664Base.h"
-
-namespace Ice {
-namespace X8664 {
-
-template <typename TraitsType>
-const char *InstImpl<TraitsType>::InstX86Base::getWidthString(Type Ty) {
-  return Traits::TypeAttributes[Ty].WidthString;
-}
-
-template <typename TraitsType>
-const char *InstImpl<TraitsType>::InstX86Base::getFldString(Type Ty) {
-  return Traits::TypeAttributes[Ty].FldString;
-}
-
-template <typename TraitsType>
-typename InstImpl<TraitsType>::Cond::BrCond
-InstImpl<TraitsType>::InstX86Base::getOppositeCondition(BrCond Cond) {
-  return Traits::InstBrAttributes[Cond].Opposite;
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86FakeRMW::InstX86FakeRMW(Cfg *Func, Operand *Data,
-                                                     Operand *Addr,
-                                                     InstArithmetic::OpKind Op,
-                                                     Variable *Beacon)
-    : InstX86Base(Func, InstX86Base::FakeRMW, 3, nullptr), Op(Op) {
-  this->addSource(Data);
-  this->addSource(Addr);
-  this->addSource(Beacon);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Mul::InstX86Mul(Cfg *Func, Variable *Dest,
-                                             Variable *Source1,
-                                             Operand *Source2)
-    : InstX86Base(Func, InstX86Base::Mul, 2, Dest) {
-  this->addSource(Source1);
-  this->addSource(Source2);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Shld::InstX86Shld(Cfg *Func, Variable *Dest,
-                                               Variable *Source1,
-                                               Operand *Source2)
-    : InstX86Base(Func, InstX86Base::Shld, 3, Dest) {
-  this->addSource(Dest);
-  this->addSource(Source1);
-  this->addSource(Source2);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Shrd::InstX86Shrd(Cfg *Func, Variable *Dest,
-                                               Variable *Source1,
-                                               Operand *Source2)
-    : InstX86Base(Func, InstX86Base::Shrd, 3, Dest) {
-  this->addSource(Dest);
-  this->addSource(Source1);
-  this->addSource(Source2);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Label::InstX86Label(Cfg *Func,
-                                                 TargetLowering *Target)
-    : InstX86Base(Func, InstX86Base::Label, 0, nullptr),
-      LabelNumber(Target->makeNextLabelNumber()) {
-  if (BuildDefs::dump()) {
-    Name = GlobalString::createWithString(
-        Func->getContext(), ".L" + Func->getFunctionName() + "$local$__" +
-                                std::to_string(LabelNumber));
-  } else {
-    Name = GlobalString::createWithoutString(Func->getContext());
-  }
-}
-
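
Editor's note: for orientation, the dump-build branch above yields label names of the following shape (function name and counter purely illustrative); non-dump builds allocate an empty name via createWithoutString.

//   .Lfoo$local$__7:      // Func "foo", LabelNumber 7
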
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Br::InstX86Br(Cfg *Func, const CfgNode *TargetTrue,
-                                           const CfgNode *TargetFalse,
-                                           const InstX86Label *Label,
-                                           BrCond Condition, Mode Kind)
-    : InstX86Base(Func, InstX86Base::Br, 0, nullptr), Condition(Condition),
-      TargetTrue(TargetTrue), TargetFalse(TargetFalse), Label(Label),
-      Kind(Kind) {}
-
-template <typename TraitsType>
-bool InstImpl<TraitsType>::InstX86Br::optimizeBranch(const CfgNode *NextNode) {
-  // If there is no next block, then there can be no fallthrough to optimize.
-  if (NextNode == nullptr)
-    return false;
-  // Intra-block conditional branches can't be optimized.
-  if (Label)
-    return false;
-  // If there is no fallthrough node, such as a non-default case label for a
-  // switch instruction, then there is no opportunity to optimize.
-  if (getTargetFalse() == nullptr)
-    return false;
-
-  // Unconditional branch to the next node can be removed.
-  if (Condition == Cond::Br_None && getTargetFalse() == NextNode) {
-    assert(getTargetTrue() == nullptr);
-    this->setDeleted();
-    return true;
-  }
-  // If the fallthrough is to the next node, set fallthrough to nullptr to
-  // indicate an implicit fallthrough.
-  if (getTargetFalse() == NextNode) {
-    TargetFalse = nullptr;
-    return true;
-  }
-  // If TargetTrue is the next node, and TargetFalse is not nullptr (which was
-  // already tested above), then invert the branch condition, swap the targets,
-  // and set new fallthrough to nullptr.
-  if (getTargetTrue() == NextNode) {
-    assert(Condition != Cond::Br_None);
-    Condition = this->getOppositeCondition(Condition);
-    TargetTrue = getTargetFalse();
-    TargetFalse = nullptr;
-    return true;
-  }
-  return false;
-}
-
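
Editor's note: a minimal sketch of the three rewrites optimizeBranch performs, in pseudo-assembly (labels hypothetical); case 3 is what getOppositeCondition exists for.

// 1. jmp .Lnext                ->  (deleted; .Lnext is the fallthrough)
// 2. jge .Ltrue ; jmp .Lnext   ->  jge .Ltrue     (TargetFalse = nullptr)
// 3. jge .Lnext ; jmp .Lother  ->  jl  .Lother    (condition inverted,
//                                                   targets swapped)
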
-template <typename TraitsType>
-bool InstImpl<TraitsType>::InstX86Br::repointEdges(CfgNode *OldNode,
-                                                   CfgNode *NewNode) {
-  bool Found = false;
-  if (TargetFalse == OldNode) {
-    TargetFalse = NewNode;
-    Found = true;
-  }
-  if (TargetTrue == OldNode) {
-    TargetTrue = NewNode;
-    Found = true;
-  }
-  return Found;
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Jmp::InstX86Jmp(Cfg *Func, Operand *Target)
-    : InstX86Base(Func, InstX86Base::Jmp, 1, nullptr) {
-  this->addSource(Target);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Call::InstX86Call(Cfg *Func, Variable *Dest,
-                                               Operand *CallTarget)
-    : InstX86Base(Func, InstX86Base::Call, 1, Dest) {
-  this->HasSideEffects = true;
-  this->addSource(CallTarget);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Movmsk::InstX86Movmsk(Cfg *Func, Variable *Dest,
-                                                   Operand *Source)
-    : InstX86Base(Func, InstX86Base::Movmsk, 1, Dest) {
-  this->addSource(Source);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Cmov::InstX86Cmov(Cfg *Func, Variable *Dest,
-                                               Operand *Source,
-                                               BrCond Condition)
-    : InstX86Base(Func, InstX86Base::Cmov, 2, Dest), Condition(Condition) {
-  // The final result is either the original Dest or Source, so mark both as
-  // sources.
-  this->addSource(Dest);
-  this->addSource(Source);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Cmpps::InstX86Cmpps(Cfg *Func, Variable *Dest,
-                                                 Operand *Source,
-                                                 CmppsCond Condition)
-    : InstX86Base(Func, InstX86Base::Cmpps, 2, Dest), Condition(Condition) {
-  this->addSource(Dest);
-  this->addSource(Source);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Cmpxchg::InstX86Cmpxchg(Cfg *Func,
-                                                     Operand *DestOrAddr,
-                                                     Variable *Eax,
-                                                     Variable *Desired,
-                                                     bool Locked)
-    : InstImpl<TraitsType>::InstX86BaseLockable(
-          Func, InstX86Base::Cmpxchg, 3, llvm::dyn_cast<Variable>(DestOrAddr),
-          Locked) {
-  constexpr uint16_t Encoded_rAX = 0;
-  (void)Encoded_rAX;
-  assert(Traits::getEncodedGPR(Eax->getRegNum()) == Encoded_rAX);
-  this->addSource(DestOrAddr);
-  this->addSource(Eax);
-  this->addSource(Desired);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Cmpxchg8b::InstX86Cmpxchg8b(
-    Cfg *Func, X86OperandMem *Addr, Variable *Edx, Variable *Eax, Variable *Ecx,
-    Variable *Ebx, bool Locked)
-    : InstImpl<TraitsType>::InstX86BaseLockable(Func, InstX86Base::Cmpxchg, 5,
-                                                nullptr, Locked) {
-  assert(Edx->getRegNum() == RegisterSet::Reg_edx);
-  assert(Eax->getRegNum() == RegisterSet::Reg_eax);
-  assert(Ecx->getRegNum() == RegisterSet::Reg_ecx);
-  assert(Ebx->getRegNum() == RegisterSet::Reg_ebx);
-  this->addSource(Addr);
-  this->addSource(Edx);
-  this->addSource(Eax);
-  this->addSource(Ecx);
-  this->addSource(Ebx);
-}
-
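
Editor's note: as a hedged reminder of why the constructor pins exactly these registers, every cmpxchg8b operand besides the address is implicit in the encoding. Roughly:

//   lock cmpxchg8b [mem]:
//     if (edx:eax == *mem) { *mem = ecx:ebx; ZF = 1; }
//     else                 { edx:eax = *mem; ZF = 0; }
// The asserts above only verify that register allocation honored the
// pre-coloring; they do not establish it.
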
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Cvt::InstX86Cvt(Cfg *Func, Variable *Dest,
-                                             Operand *Source,
-                                             CvtVariant Variant)
-    : InstX86Base(Func, InstX86Base::Cvt, 1, Dest), Variant(Variant) {
-  this->addSource(Source);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Icmp::InstX86Icmp(Cfg *Func, Operand *Src0,
-                                               Operand *Src1)
-    : InstX86Base(Func, InstX86Base::Icmp, 2, nullptr) {
-  this->addSource(Src0);
-  this->addSource(Src1);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Ucomiss::InstX86Ucomiss(Cfg *Func, Operand *Src0,
-                                                     Operand *Src1)
-    : InstX86Base(Func, InstX86Base::Ucomiss, 2, nullptr) {
-  this->addSource(Src0);
-  this->addSource(Src1);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86UD2::InstX86UD2(Cfg *Func)
-    : InstX86Base(Func, InstX86Base::UD2, 0, nullptr) {}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Int3::InstX86Int3(Cfg *Func)
-    : InstX86Base(Func, InstX86Base::Int3, 0, nullptr) {}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Test::InstX86Test(Cfg *Func, Operand *Src1,
-                                               Operand *Src2)
-    : InstX86Base(Func, InstX86Base::Test, 2, nullptr) {
-  this->addSource(Src1);
-  this->addSource(Src2);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Mfence::InstX86Mfence(Cfg *Func)
-    : InstX86Base(Func, InstX86Base::Mfence, 0, nullptr) {
-  this->HasSideEffects = true;
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Store::InstX86Store(Cfg *Func, Operand *Value,
-                                                 X86Operand *Mem)
-    : InstX86Base(Func, InstX86Base::Store, 2, nullptr) {
-  this->addSource(Value);
-  this->addSource(Mem);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86StoreP::InstX86StoreP(Cfg *Func, Variable *Value,
-                                                   X86OperandMem *Mem)
-    : InstX86Base(Func, InstX86Base::StoreP, 2, nullptr) {
-  this->addSource(Value);
-  this->addSource(Mem);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86StoreQ::InstX86StoreQ(Cfg *Func, Operand *Value,
-                                                   X86OperandMem *Mem)
-    : InstX86Base(Func, InstX86Base::StoreQ, 2, nullptr) {
-  this->addSource(Value);
-  this->addSource(Mem);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86StoreD::InstX86StoreD(Cfg *Func, Operand *Value,
-                                                   X86OperandMem *Mem)
-    : InstX86Base(Func, InstX86Base::StoreD, 2, nullptr) {
-  this->addSource(Value);
-  this->addSource(Mem);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Nop::InstX86Nop(Cfg *Func, NopVariant Variant)
-    : InstX86Base(Func, InstX86Base::Nop, 0, nullptr), Variant(Variant) {}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Pop::InstX86Pop(Cfg *Func, Variable *Dest)
-    : InstX86Base(Func, InstX86Base::Pop, 0, Dest) {
-  // A pop instruction affects the stack pointer and so it should not be
-  // allowed to be automatically dead-code eliminated. (The corresponding push
-  // instruction doesn't need this treatment because it has no dest variable
-  // and therefore won't be dead-code eliminated.) This is needed for
-  // late-stage liveness analysis (e.g. asm-verbose mode).
-  this->HasSideEffects = true;
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Push::InstX86Push(Cfg *Func, Operand *Source)
-    : InstX86Base(Func, InstX86Base::Push, 1, nullptr) {
-  this->addSource(Source);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Push::InstX86Push(Cfg *Func, InstX86Label *L)
-    : InstX86Base(Func, InstX86Base::Push, 0, nullptr), Label(L) {}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Ret::InstX86Ret(Cfg *Func, Variable *Source)
-    : InstX86Base(Func, InstX86Base::Ret, Source ? 1 : 0, nullptr) {
-  if (Source)
-    this->addSource(Source);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Setcc::InstX86Setcc(Cfg *Func, Variable *Dest,
-                                                 BrCond Cond)
-    : InstX86Base(Func, InstX86Base::Setcc, 0, Dest), Condition(Cond) {}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Xadd::InstX86Xadd(Cfg *Func, Operand *Dest,
-                                               Variable *Source, bool Locked)
-    : InstImpl<TraitsType>::InstX86BaseLockable(
-          Func, InstX86Base::Xadd, 2, llvm::dyn_cast<Variable>(Dest), Locked) {
-  this->addSource(Dest);
-  this->addSource(Source);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86Xchg::InstX86Xchg(Cfg *Func, Operand *Dest,
-                                               Variable *Source)
-    : InstX86Base(Func, InstX86Base::Xchg, 2, llvm::dyn_cast<Variable>(Dest)) {
-  this->addSource(Dest);
-  this->addSource(Source);
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86IacaStart::InstX86IacaStart(Cfg *Func)
-    : InstX86Base(Func, InstX86Base::IacaStart, 0, nullptr) {
-  assert(getFlags().getAllowIacaMarks());
-}
-
-template <typename TraitsType>
-InstImpl<TraitsType>::InstX86IacaEnd::InstX86IacaEnd(Cfg *Func)
-    : InstX86Base(Func, InstX86Base::IacaEnd, 0, nullptr) {
-  assert(getFlags().getAllowIacaMarks());
-}
-
-// ======================== Dump routines ======================== //
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Base::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "[" << Traits::TargetName << "] ";
-  Inst::dump(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86FakeRMW::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Type Ty = getData()->getType();
-  Str << "rmw " << InstArithmetic::getOpName(getOp()) << " " << Ty << " *";
-  getAddr()->dump(Func);
-  Str << ", ";
-  getData()->dump(Func);
-  Str << ", beacon=";
-  getBeacon()->dump(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Label::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Str << getLabelName() << ":";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Label::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Asm->bindLocalLabel(LabelNumber);
-  if (OffsetReloc != nullptr) {
-    Asm->bindRelocOffset(OffsetReloc);
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Label::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << getLabelName() << ":";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Br::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Str << "\t";
-
-  if (Condition == Cond::Br_None) {
-    Str << "jmp";
-  } else {
-    Str << Traits::InstBrAttributes[Condition].EmitString;
-  }
-
-  if (Label) {
-    Str << "\t" << Label->getLabelName();
-  } else {
-    if (Condition == Cond::Br_None) {
-      Str << "\t" << getTargetFalse()->getAsmName();
-    } else {
-      Str << "\t" << getTargetTrue()->getAsmName();
-      if (getTargetFalse()) {
-        Str << "\n\t"
-               "jmp\t"
-            << getTargetFalse()->getAsmName();
-      }
-    }
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Br::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  if (Label) {
-    auto *L = Asm->getOrCreateLocalLabel(Label->getLabelNumber());
-    if (Condition == Cond::Br_None) {
-      Asm->jmp(L, isNear());
-    } else {
-      Asm->j(Condition, L, isNear());
-    }
-  } else {
-    if (Condition == Cond::Br_None) {
-      auto *L = Asm->getOrCreateCfgNodeLabel(getTargetFalse()->getIndex());
-      assert(!getTargetTrue());
-      Asm->jmp(L, isNear());
-    } else {
-      auto *L = Asm->getOrCreateCfgNodeLabel(getTargetTrue()->getIndex());
-      Asm->j(Condition, L, isNear());
-      if (getTargetFalse()) {
-        auto *L2 = Asm->getOrCreateCfgNodeLabel(getTargetFalse()->getIndex());
-        Asm->jmp(L2, isNear());
-      }
-    }
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Br::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "br ";
-
-  if (Condition == Cond::Br_None) {
-    if (Label) {
-      Str << "label %" << Label->getLabelName();
-    } else {
-      Str << "label %" << getTargetFalse()->getName();
-    }
-    return;
-  }
-
-  Str << Traits::InstBrAttributes[Condition].DisplayString;
-  if (Label) {
-    Str << ", label %" << Label->getLabelName();
-  } else {
-    Str << ", label %" << getTargetTrue()->getName();
-    if (getTargetFalse()) {
-      Str << ", label %" << getTargetFalse()->getName();
-    }
-  }
-
-  Str << " // (" << (isNear() ? "near" : "far") << " jump)";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Jmp::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  const Operand *Src = this->getSrc(0);
-  if (Traits::Is64Bit) {
-    if (const auto *CR = llvm::dyn_cast<ConstantRelocatable>(Src)) {
-      Str << "\t"
-             "jmp"
-             "\t"
-          << CR->getName();
-      return;
-    }
-  }
-  Str << "\t"
-         "jmp"
-         "\t*";
-  getJmpTarget()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Jmp::emitIAS(const Cfg *Func) const {
-  // Note: Adapted (mostly copied) from
-  // InstImpl<TraitsType>::InstX86Call::emitIAS().
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Operand *Target = getJmpTarget();
-  if (const auto *Var = llvm::dyn_cast<Variable>(Target)) {
-    if (Var->hasReg()) {
-      Asm->jmp(Traits::getEncodedGPR(Var->getRegNum()));
-    } else {
-      // A jmp with a memory operand would be encodable, but it isn't a valid
-      // sandboxed instruction, and jumping through a scratch register instead
-      // poses no register-allocation problem, so we don't bother implementing
-      // it.
-      llvm::report_fatal_error("Assembler can't jmp to memory operand");
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Target)) {
-    (void)Mem;
-    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-    llvm::report_fatal_error("Assembler can't jmp to memory operand");
-  } else if (const auto *CR = llvm::dyn_cast<ConstantRelocatable>(Target)) {
-    Asm->jmp(CR);
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Target)) {
-    // NaCl trampoline calls refer to an address within the sandbox directly.
-    // This is usually only needed for non-IRT builds and is otherwise not very
-    // portable or stable; it is normally done only for calls, not jumps.
-    Asm->jmp(AssemblerImmediate(Imm->getValue()));
-  } else {
-    llvm::report_fatal_error("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Jmp::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "jmp ";
-  getJmpTarget()->dump(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Call::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  Str << "\t"
-         "call\t";
-  Operand *CallTarget = getCallTarget();
-  auto *Target = InstX86Base::getTarget(Func);
-  if (const auto *CI = llvm::dyn_cast<ConstantInteger32>(CallTarget)) {
-    // Emit without a leading '$'.
-    Str << CI->getValue();
-  } else if (const auto DirectCallTarget =
-                 llvm::dyn_cast<ConstantRelocatable>(CallTarget)) {
-    DirectCallTarget->emitWithoutPrefix(Target);
-  } else {
-    Str << "*";
-    CallTarget->emit(Func);
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Call::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Operand *CallTarget = getCallTarget();
-  auto *Target = InstX86Base::getTarget(Func);
-  if (const auto *Var = llvm::dyn_cast<Variable>(CallTarget)) {
-    if (Var->hasReg()) {
-      Asm->call(Traits::getEncodedGPR(Var->getRegNum()));
-    } else {
-      Asm->call(Target->stackVarToAsmOperand(Var));
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(CallTarget)) {
-    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-    Asm->call(Mem->toAsmAddress(Asm, Target));
-  } else if (const auto *CR = llvm::dyn_cast<ConstantRelocatable>(CallTarget)) {
-    Asm->call(CR);
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(CallTarget)) {
-    Asm->call(AssemblerImmediate(Imm->getValue()));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Call::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  if (this->getDest()) {
-    this->dumpDest(Func);
-    Str << " = ";
-  }
-  Str << "call ";
-  getCallTarget()->dump(Func);
-}
-
-// The this->Opcode parameter needs to be char* and not std::string because of
-// template issues.
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Base::emitTwoAddress(
-    const Cfg *Func, const char *Opcode, const char *Suffix) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(getSrcSize() == 2);
-  Operand *Dest = getDest();
-  if (Dest == nullptr)
-    Dest = getSrc(0);
-  assert(Dest == getSrc(0));
-  Operand *Src1 = getSrc(1);
-  Str << "\t" << Opcode << Suffix
-      << InstX86Base::getWidthString(Dest->getType()) << "\t";
-  Src1->emit(Func);
-  Str << ", ";
-  Dest->emit(Func);
-}
-
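
Editor's note: concretely, for a hypothetical i32 add where Dest (== getSrc(0)) sits in eax and getSrc(1) in ecx, the routine prints AT&T operand order, source first, assuming the usual "l" width attribute for i32:

//   emitTwoAddress(Func, "add")   ->   "\taddl\t%ecx, %eax"
// with the "l" suffix supplied by getWidthString(IceType_i32).
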
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASOpTyGPR(const Cfg *Func, Type Ty,
-                                          const Operand *Op,
-                                          const GPREmitterOneOp &Emitter) {
-  auto *Target = InstX86Base::getTarget(Func);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  if (const auto *Var = llvm::dyn_cast<Variable>(Op)) {
-    if (Var->hasReg()) {
-      // We cheat a little and use GPRRegister even for byte operations.
-      GPRRegister VarReg = Traits::getEncodedGPR(Var->getRegNum());
-      (Asm->*(Emitter.Reg))(Ty, VarReg);
-    } else {
-      Address StackAddr(Target->stackVarToAsmOperand(Var));
-      (Asm->*(Emitter.Addr))(Ty, StackAddr);
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Op)) {
-    Mem->emitSegmentOverride(Asm);
-    (Asm->*(Emitter.Addr))(Ty, Mem->toAsmAddress(Asm, Target));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
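
Editor's note: the GPREmitterOneOp argument, like every Emitter struct in this file, is just a bundle of pointer-to-member functions on the Assembler, selected per operand shape. A standalone sketch of the idiom, with types and names simplified and hypothetical:

#include <cstdio>

struct Asm {
  void negReg(int Ty, int Reg) { std::printf("neg r%d, ty=%d\n", Reg, Ty); }
  void negAddr(int Ty, int Addr) { std::printf("neg [%d], ty=%d\n", Addr, Ty); }
};

struct OneOpEmitter {
  void (Asm::*Reg)(int Ty, int Reg);   // register form
  void (Asm::*Addr)(int Ty, int Addr); // memory form
};

// Same dispatch shape as (Asm->*(Emitter.Reg))(Ty, VarReg) above.
void emitOneOp(Asm *A, int Ty, bool InReg, int Opnd, const OneOpEmitter &E) {
  if (InReg)
    (A->*E.Reg)(Ty, Opnd);
  else
    (A->*E.Addr)(Ty, Opnd);
}

int main() {
  Asm A;
  static const OneOpEmitter Neg = {&Asm::negReg, &Asm::negAddr};
  emitOneOp(&A, 32, /*InReg=*/true, 3, Neg); // prints "neg r3, ty=32"
}
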
-template <typename TraitsType>
-template <bool VarCanBeByte, bool SrcCanBeByte>
-void InstImpl<TraitsType>::emitIASRegOpTyGPR(const Cfg *Func, bool IsLea,
-                                             Type Ty, const Variable *Var,
-                                             const Operand *Src,
-                                             const GPREmitterRegOp &Emitter) {
-  auto *Target = InstX86Base::getTarget(Func);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(Var->hasReg());
-  // We cheat a little and use GPRRegister even for byte operations.
-  GPRRegister VarReg = VarCanBeByte ? Traits::getEncodedGPR(Var->getRegNum())
-                                    : Traits::getEncodedGPR(Var->getRegNum());
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-    if (SrcVar->hasReg()) {
-      GPRRegister SrcReg = SrcCanBeByte
-                               ? Traits::getEncodedGPR(SrcVar->getRegNum())
-                               : Traits::getEncodedGPR(SrcVar->getRegNum());
-      (Asm->*(Emitter.GPRGPR))(Ty, VarReg, SrcReg);
-    } else {
-      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
-      (Asm->*(Emitter.GPRAddr))(Ty, VarReg, SrcStackAddr);
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
-    Mem->emitSegmentOverride(Asm);
-    (Asm->*(Emitter.GPRAddr))(Ty, VarReg,
-                              Mem->toAsmAddress(Asm, Target, IsLea));
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
-    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger64>(Src)) {
-    assert(Traits::Is64Bit);
-    assert(Utils::IsInt(32, Imm->getValue()));
-    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
-  } else if (const auto *Reloc = llvm::dyn_cast<ConstantRelocatable>(Src)) {
-    const auto FixupKind = (Reloc->getName().hasStdString() &&
-                            Reloc->getName().toString() == GlobalOffsetTable)
-                               ? Traits::FK_GotPC
-                               : Traits::TargetLowering::getAbsFixup();
-    AssemblerFixup *Fixup = Asm->createFixup(FixupKind, Reloc);
-    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Fixup));
-  } else if (const auto *Split = llvm::dyn_cast<VariableSplit>(Src)) {
-    (Asm->*(Emitter.GPRAddr))(Ty, VarReg, Split->toAsmAddress(Func));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASAddrOpTyGPR(const Cfg *Func, Type Ty,
-                                              const Address &Addr,
-                                              const Operand *Src,
-                                              const GPREmitterAddrOp &Emitter) {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  // Src can only be Reg or AssemblerImmediate.
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-    assert(SrcVar->hasReg());
-    GPRRegister SrcReg = Traits::getEncodedGPR(SrcVar->getRegNum());
-    (Asm->*(Emitter.AddrGPR))(Ty, Addr, SrcReg);
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
-    (Asm->*(Emitter.AddrImm))(Ty, Addr, AssemblerImmediate(Imm->getValue()));
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger64>(Src)) {
-    assert(Traits::Is64Bit);
-    assert(Utils::IsInt(32, Imm->getValue()));
-    (Asm->*(Emitter.AddrImm))(Ty, Addr, AssemblerImmediate(Imm->getValue()));
-  } else if (const auto *Reloc = llvm::dyn_cast<ConstantRelocatable>(Src)) {
-    const auto FixupKind = (Reloc->getName().hasStdString() &&
-                            Reloc->getName().toString() == GlobalOffsetTable)
-                               ? Traits::FK_GotPC
-                               : Traits::TargetLowering::getAbsFixup();
-    AssemblerFixup *Fixup = Asm->createFixup(FixupKind, Reloc);
-    (Asm->*(Emitter.AddrImm))(Ty, Addr, AssemblerImmediate(Fixup));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASAsAddrOpTyGPR(
-    const Cfg *Func, Type Ty, const Operand *Op0, const Operand *Op1,
-    const GPREmitterAddrOp &Emitter) {
-  auto *Target = InstX86Base::getTarget(Func);
-  if (const auto *Op0Var = llvm::dyn_cast<Variable>(Op0)) {
-    assert(!Op0Var->hasReg());
-    Address StackAddr(Target->stackVarToAsmOperand(Op0Var));
-    emitIASAddrOpTyGPR(Func, Ty, StackAddr, Op1, Emitter);
-  } else if (const auto *Op0Mem = llvm::dyn_cast<X86OperandMem>(Op0)) {
-    Assembler *Asm = Func->getAssembler<Assembler>();
-    Op0Mem->emitSegmentOverride(Asm);
-    emitIASAddrOpTyGPR(Func, Ty, Op0Mem->toAsmAddress(Asm, Target), Op1,
-                       Emitter);
-  } else if (const auto *Split = llvm::dyn_cast<VariableSplit>(Op0)) {
-    emitIASAddrOpTyGPR(Func, Ty, Split->toAsmAddress(Func), Op1, Emitter);
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASGPRShift(const Cfg *Func, Type Ty,
-                                           const Variable *Var,
-                                           const Operand *Src,
-                                           const GPREmitterShiftOp &Emitter) {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  // Technically, the Dest Var can be mem as well, but we only use Reg. We can
-  // extend this to check Dest if we decide to use that form.
-  assert(Var->hasReg());
-  // We cheat a little and use GPRRegister even for byte operations.
-  GPRRegister VarReg = Traits::getEncodedGPR(Var->getRegNum());
-  // Src must be reg == ECX or an Imm8. This is asserted by the assembler.
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-    assert(SrcVar->hasReg());
-    GPRRegister SrcReg = Traits::getEncodedGPR(SrcVar->getRegNum());
-    (Asm->*(Emitter.GPRGPR))(Ty, VarReg, SrcReg);
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
-    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger64>(Src)) {
-    assert(Traits::Is64Bit);
-    assert(Utils::IsInt(32, Imm->getValue()));
-    (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASGPRShiftDouble(
-    const Cfg *Func, const Variable *Dest, const Operand *Src1Op,
-    const Operand *Src2Op, const GPREmitterShiftD &Emitter) {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  // Dest can be reg or mem, but we only use the reg variant.
-  assert(Dest->hasReg());
-  GPRRegister DestReg = Traits::getEncodedGPR(Dest->getRegNum());
-  // SrcVar1 must be reg.
-  const auto *SrcVar1 = llvm::cast<Variable>(Src1Op);
-  assert(SrcVar1->hasReg());
-  GPRRegister SrcReg = Traits::getEncodedGPR(SrcVar1->getRegNum());
-  Type Ty = SrcVar1->getType();
-  // Src2 can be the implicit CL register or an immediate.
-  if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src2Op)) {
-    (Asm->*(Emitter.GPRGPRImm))(Ty, DestReg, SrcReg,
-                                AssemblerImmediate(Imm->getValue()));
-  } else {
-    assert(llvm::cast<Variable>(Src2Op)->getRegNum() == RegisterSet::Reg_cl);
-    (Asm->*(Emitter.GPRGPR))(Ty, DestReg, SrcReg);
-  }
-}
-
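
Editor's note: the two shapes this helper accepts mirror the hardware forms of shld/shrd (AT&T syntax, registers hypothetical):

//   shldl $3, %ecx, %eax    ; eax = (eax << 3) | (ecx >> 29), immediate count
//   shldl %cl, %ecx, %eax   ; same, with the count in the implicit cl
// hence Src2 must be either a ConstantInteger32 or a variable pre-colored
// to RegisterSet::Reg_cl.
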
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASXmmShift(const Cfg *Func, Type Ty,
-                                           const Variable *Var,
-                                           const Operand *Src,
-                                           const XmmEmitterShiftOp &Emitter) {
-  auto *Target = InstX86Base::getTarget(Func);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(Var->hasReg());
-  XmmRegister VarReg = Traits::getEncodedXmm(Var->getRegNum());
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-    if (SrcVar->hasReg()) {
-      XmmRegister SrcReg = Traits::getEncodedXmm(SrcVar->getRegNum());
-      (Asm->*(Emitter.XmmXmm))(Ty, VarReg, SrcReg);
-    } else {
-      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
-      (Asm->*(Emitter.XmmAddr))(Ty, VarReg, SrcStackAddr);
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
-    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-    (Asm->*(Emitter.XmmAddr))(Ty, VarReg, Mem->toAsmAddress(Asm, Target));
-  } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
-    (Asm->*(Emitter.XmmImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASRegOpTyXMM(const Cfg *Func, Type Ty,
-                                             const Variable *Var,
-                                             const Operand *Src,
-                                             const XmmEmitterRegOp &Emitter) {
-  auto *Target = InstX86Base::getTarget(Func);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(Var->hasReg());
-  XmmRegister VarReg = Traits::getEncodedXmm(Var->getRegNum());
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-    if (SrcVar->hasReg()) {
-      XmmRegister SrcReg = Traits::getEncodedXmm(SrcVar->getRegNum());
-      (Asm->*(Emitter.XmmXmm))(Ty, VarReg, SrcReg);
-    } else {
-      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
-      (Asm->*(Emitter.XmmAddr))(Ty, VarReg, SrcStackAddr);
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
-    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-    (Asm->*(Emitter.XmmAddr))(Ty, VarReg, Mem->toAsmAddress(Asm, Target));
-  } else if (const auto *Imm = llvm::dyn_cast<Constant>(Src)) {
-    (Asm->*(Emitter.XmmAddr))(Ty, VarReg,
-                              Traits::Address::ofConstPool(Asm, Imm));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-template <typename DReg_t, typename SReg_t, DReg_t (*destEnc)(RegNumT),
-          SReg_t (*srcEnc)(RegNumT)>
-void InstImpl<TraitsType>::emitIASCastRegOp(
-    const Cfg *Func, Type DestTy, const Variable *Dest, Type SrcTy,
-    const Operand *Src, const CastEmitterRegOp<DReg_t, SReg_t> &Emitter) {
-  auto *Target = InstX86Base::getTarget(Func);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(Dest->hasReg());
-  DReg_t DestReg = destEnc(Dest->getRegNum());
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-    if (SrcVar->hasReg()) {
-      SReg_t SrcReg = srcEnc(SrcVar->getRegNum());
-      (Asm->*(Emitter.RegReg))(DestTy, DestReg, SrcTy, SrcReg);
-    } else {
-      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
-      (Asm->*(Emitter.RegAddr))(DestTy, DestReg, SrcTy, SrcStackAddr);
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
-    Mem->emitSegmentOverride(Asm);
-    (Asm->*(Emitter.RegAddr))(DestTy, DestReg, SrcTy,
-                              Mem->toAsmAddress(Asm, Target));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-template <typename DReg_t, typename SReg_t, DReg_t (*destEnc)(RegNumT),
-          SReg_t (*srcEnc)(RegNumT)>
-void InstImpl<TraitsType>::emitIASThreeOpImmOps(
-    const Cfg *Func, Type DispatchTy, const Variable *Dest, const Operand *Src0,
-    const Operand *Src1, const ThreeOpImmEmitter<DReg_t, SReg_t> Emitter) {
-  auto *Target = InstX86Base::getTarget(Func);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  // This only handles Dest being a register, and Src1 being an immediate.
-  assert(Dest->hasReg());
-  DReg_t DestReg = destEnc(Dest->getRegNum());
-  AssemblerImmediate Imm(llvm::cast<ConstantInteger32>(Src1)->getValue());
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src0)) {
-    if (SrcVar->hasReg()) {
-      SReg_t SrcReg = srcEnc(SrcVar->getRegNum());
-      (Asm->*(Emitter.RegRegImm))(DispatchTy, DestReg, SrcReg, Imm);
-    } else {
-      Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
-      (Asm->*(Emitter.RegAddrImm))(DispatchTy, DestReg, SrcStackAddr, Imm);
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src0)) {
-    Mem->emitSegmentOverride(Asm);
-    (Asm->*(Emitter.RegAddrImm))(DispatchTy, DestReg,
-                                 Mem->toAsmAddress(Asm, Target), Imm);
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
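
Editor's note: because destEnc/srcEnc are non-type template parameters, each instantiation bakes its register encoder in at compile time; InstX86ImulImm and InstX86Insertps below instantiate GPR and XMM flavors respectively. A self-contained sketch of the pattern, all names hypothetical:

enum class GPR { eax, ecx };
enum class XMM { xmm0, xmm1 };
struct RegNum { int Num; };

GPR encodeGPR(RegNum R) { return static_cast<GPR>(R.Num); }
XMM encodeXMM(RegNum R) { return static_cast<XMM>(R.Num); }

// The encoder is part of the template signature, mirroring
// emitIASThreeOpImmOps<DReg_t, SReg_t, destEnc, srcEnc>(...).
template <typename DReg, typename SReg, DReg (*DestEnc)(RegNum),
          SReg (*SrcEnc)(RegNum)>
void threeOpImm(RegNum Dest, RegNum Src, int Imm) {
  DReg D = DestEnc(Dest); // resolved statically, no indirection at runtime
  SReg S = SrcEnc(Src);
  (void)D; (void)S; (void)Imm;
}

void examples() {
  threeOpImm<GPR, GPR, encodeGPR, encodeGPR>({0}, {1}, 42); // imul-imm shape
  threeOpImm<XMM, XMM, encodeXMM, encodeXMM>({0}, {1}, 3);  // insertps shape
}
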
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASMovlikeXMM(const Cfg *Func,
-                                             const Variable *Dest,
-                                             const Operand *Src,
-                                             const XmmEmitterMovOps Emitter) {
-  auto *Target = InstX86Base::getTarget(Func);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  if (Dest->hasReg()) {
-    XmmRegister DestReg = Traits::getEncodedXmm(Dest->getRegNum());
-    if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-      if (SrcVar->hasReg()) {
-        (Asm->*(Emitter.XmmXmm))(DestReg,
-                                 Traits::getEncodedXmm(SrcVar->getRegNum()));
-      } else {
-        Address StackAddr(Target->stackVarToAsmOperand(SrcVar));
-        (Asm->*(Emitter.XmmAddr))(DestReg, StackAddr);
-      }
-    } else if (const auto *SrcMem = llvm::dyn_cast<X86OperandMem>(Src)) {
-      assert(SrcMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-      (Asm->*(Emitter.XmmAddr))(DestReg, SrcMem->toAsmAddress(Asm, Target));
-    } else {
-      llvm_unreachable("Unexpected operand type");
-    }
-  } else {
-    Address StackAddr(Target->stackVarToAsmOperand(Dest));
-    // Src must be a register in this case.
-    const auto *SrcVar = llvm::cast<Variable>(Src);
-    assert(SrcVar->hasReg());
-    (Asm->*(Emitter.AddrXmm))(StackAddr,
-                              Traits::getEncodedXmm(SrcVar->getRegNum()));
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movmsk::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  this->dumpDest(Func);
-  Str << " = movmsk." << this->getSrc(0)->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movmsk::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  Type SrcTy = this->getSrc(0)->getType();
-  assert(isVectorType(SrcTy));
-  switch (SrcTy) {
-  case IceType_v16i8:
-    Str << "\t"
-           "pmovmskb"
-           "\t";
-    break;
-  case IceType_v4i32:
-  case IceType_v4f32:
-    Str << "\t"
-           "movmskps"
-           "\t";
-    break;
-  default:
-    llvm_unreachable("Unexpected operand type");
-  }
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movmsk::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 1);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  const Variable *Dest = this->getDest();
-  const Variable *Src = llvm::cast<Variable>(this->getSrc(0));
-  const Type DestTy = Dest->getType();
-  (void)DestTy;
-  const Type SrcTy = Src->getType();
-  assert(isVectorType(SrcTy));
-  assert(isScalarIntegerType(DestTy));
-  if (Traits::Is64Bit) {
-    assert(DestTy == IceType_i32 || DestTy == IceType_i64);
-  } else {
-    assert(typeWidthInBytes(DestTy) <= 4);
-  }
-  XmmRegister SrcReg = Traits::getEncodedXmm(Src->getRegNum());
-  GPRRegister DestReg = Traits::getEncodedGPR(Dest->getRegNum());
-  Asm->movmsk(SrcTy, DestReg, SrcReg);
-}
-
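
Editor's note: for reference, the two encodings selected above differ only in lane width; both gather sign bits into the low bits of a GPR (standard x86 semantics):

//   pmovmskb %xmm1, %eax   ; eax bit i = sign bit of byte  i, i = 0..15
//   movmskps %xmm1, %eax   ; eax bit i = sign bit of float i, i = 0..3
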
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Sqrt::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  Type Ty = this->getSrc(0)->getType();
-  assert(isScalarFloatingType(Ty));
-  Str << "\t"
-         "sqrt"
-      << Traits::TypeAttributes[Ty].SpSdString << "\t";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Div::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 3);
-  Operand *Src1 = this->getSrc(1);
-  Str << "\t" << this->Opcode << this->getWidthString(Src1->getType()) << "\t";
-  Src1->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Div::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 3);
-  const Operand *Src = this->getSrc(1);
-  Type Ty = Src->getType();
-  static GPREmitterOneOp Emitter = {&Assembler::div, &Assembler::div};
-  emitIASOpTyGPR(Func, Ty, Src, Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Idiv::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 3);
-  Operand *Src1 = this->getSrc(1);
-  Str << "\t" << this->Opcode << this->getWidthString(Src1->getType()) << "\t";
-  Src1->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Idiv::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 3);
-  const Operand *Src = this->getSrc(1);
-  Type Ty = Src->getType();
-  static const GPREmitterOneOp Emitter = {&Assembler::idiv, &Assembler::idiv};
-  emitIASOpTyGPR(Func, Ty, Src, Emitter);
-}
-
-// pblendvb and blendvps take xmm0 as a final implicit argument.
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitVariableBlendInst(const char *Opcode,
-                                                 const Inst *Instr,
-                                                 const Cfg *Func) {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(Instr->getSrcSize() == 3);
-  assert(llvm::cast<Variable>(Instr->getSrc(2))->getRegNum() ==
-         RegisterSet::Reg_xmm0);
-  Str << "\t" << Opcode << "\t";
-  Instr->getSrc(1)->emit(Func);
-  Str << ", ";
-  Instr->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::emitIASVariableBlendInst(
-    const Inst *Instr, const Cfg *Func, const XmmEmitterRegOp &Emitter) {
-  assert(Instr->getSrcSize() == 3);
-  assert(llvm::cast<Variable>(Instr->getSrc(2))->getRegNum() ==
-         RegisterSet::Reg_xmm0);
-  const Variable *Dest = Instr->getDest();
-  const Operand *Src = Instr->getSrc(1);
-  emitIASRegOpTyXMM(Func, Dest->getType(), Dest, Src, Emitter);
-}
-
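
Editor's note: since the non-VEX encodings of blendvps/pblendvb read their mask from a hard-wired xmm0, only two operands ever appear in the emitted text. A hedged illustration:

//   blendvps %xmm2, %xmm1   ; per lane: xmm1 = (xmm0 sign set) ? xmm2 : xmm1
// xmm0 never appears as written, which is why both helpers assert that
// getSrc(2) was register-allocated to Reg_xmm0.
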
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Blendvps::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  emitVariableBlendInst(this->Opcode, this, Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Blendvps::emitIAS(const Cfg *Func) const {
-  static const XmmEmitterRegOp Emitter = {&Assembler::blendvps,
-                                          &Assembler::blendvps};
-  emitIASVariableBlendInst(this, Func, Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pblendvb::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  emitVariableBlendInst(this->Opcode, this, Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pblendvb::emitIAS(const Cfg *Func) const {
-  static const XmmEmitterRegOp Emitter = {&Assembler::pblendvb,
-                                          &Assembler::pblendvb};
-  emitIASVariableBlendInst(this, Func, Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Imul::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  Variable *Dest = this->getDest();
-  if (isByteSizedArithType(Dest->getType())) {
-    // The 8-bit version of imul only allows the form "imul r/m8".
-    const auto *Src0Var = llvm::dyn_cast<Variable>(this->getSrc(0));
-    (void)Src0Var;
-    assert(Src0Var->getRegNum() == RegisterSet::Reg_al);
-    Str << "\t"
-           "imulb\t";
-    this->getSrc(1)->emit(Func);
-  } else if (llvm::isa<Constant>(this->getSrc(1))) {
-    Str << "\t"
-           "imul"
-        << this->getWidthString(Dest->getType()) << "\t";
-    this->getSrc(1)->emit(Func);
-    Str << ", ";
-    this->getSrc(0)->emit(Func);
-    Str << ", ";
-    Dest->emit(Func);
-  } else {
-    this->emitTwoAddress(Func, this->Opcode);
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Imul::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  const Variable *Var = this->getDest();
-  Type Ty = Var->getType();
-  const Operand *Src = this->getSrc(1);
-  if (isByteSizedArithType(Ty)) {
-    // The 8-bit version of imul only allows the form "imul r/m8".
-    const auto *Src0Var = llvm::dyn_cast<Variable>(this->getSrc(0));
-    (void)Src0Var;
-    assert(Src0Var->getRegNum() == RegisterSet::Reg_al);
-    static const GPREmitterOneOp Emitter = {&Assembler::imul, &Assembler::imul};
-    emitIASOpTyGPR(Func, Ty, this->getSrc(1), Emitter);
-  } else {
-    // The two-address version is used for all remaining cases; the 8-bit
-    // form (which only allows "imul r/m8") was handled above.
-    assert(Var == this->getSrc(0));
-    static const GPREmitterRegOp Emitter = {&Assembler::imul, &Assembler::imul,
-                                            &Assembler::imul};
-    constexpr bool NotLea = false;
-    emitIASRegOpTyGPR(Func, NotLea, Ty, Var, Src, Emitter);
-  }
-}
-
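
Editor's note: taken together, emit/emitIAS above distinguish three hardware forms (AT&T syntax, registers hypothetical; the middle one is the same three-operand shape InstX86ImulImm emits):

//   imulb %bl               ; 8-bit:  ax  = al * bl (al/ax implicit)
//   imull $42, %ecx, %eax   ; by-imm: eax = ecx * 42
//   imull %ecx, %eax        ; 2-addr: eax = eax * ecx
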
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86ImulImm::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  Variable *Dest = this->getDest();
-  assert(Dest->getType() == IceType_i16 || Dest->getType() == IceType_i32);
-  assert(llvm::isa<Constant>(this->getSrc(1)));
-  Str << "\t"
-         "imul"
-      << this->getWidthString(Dest->getType()) << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  Dest->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86ImulImm::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  const Variable *Dest = this->getDest();
-  Type Ty = Dest->getType();
-  assert(llvm::isa<Constant>(this->getSrc(1)));
-  static const ThreeOpImmEmitter<GPRRegister, GPRRegister> Emitter = {
-      &Assembler::imul, &Assembler::imul};
-  emitIASThreeOpImmOps<GPRRegister, GPRRegister, Traits::getEncodedGPR,
-                       Traits::getEncodedGPR>(Func, Ty, Dest, this->getSrc(0),
-                                              this->getSrc(1), Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Insertps::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 3);
-  assert(InstX86Base::getTarget(Func)->getInstructionSet() >= Traits::SSE4_1);
-  const Variable *Dest = this->getDest();
-  assert(Dest == this->getSrc(0));
-  Type Ty = Dest->getType();
-  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
-      &Assembler::insertps, &Assembler::insertps};
-  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
-                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(1),
-                                              this->getSrc(2), Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cbwdq::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  Operand *Src0 = this->getSrc(0);
-  const auto DestReg = this->getDest()->getRegNum();
-  const auto SrcReg = llvm::cast<Variable>(Src0)->getRegNum();
-  (void)DestReg;
-  (void)SrcReg;
-  switch (Src0->getType()) {
-  default:
-    llvm_unreachable("unexpected source type!");
-    break;
-  case IceType_i8:
-    assert(SrcReg == RegisterSet::Reg_al);
-    assert(DestReg == RegisterSet::Reg_ax || DestReg == RegisterSet::Reg_ah);
-    Str << "\t"
-           "cbtw";
-    break;
-  case IceType_i16:
-    assert(SrcReg == RegisterSet::Reg_ax);
-    assert(DestReg == RegisterSet::Reg_dx);
-    Str << "\t"
-           "cwtd";
-    break;
-  case IceType_i32:
-    assert(SrcReg == RegisterSet::Reg_eax);
-    assert(DestReg == RegisterSet::Reg_edx);
-    Str << "\t"
-           "cltd";
-    break;
-  case IceType_i64:
-    assert(Traits::Is64Bit);
-    assert(SrcReg == Traits::getRaxOrDie());
-    assert(DestReg == Traits::getRdxOrDie());
-    Str << "\t"
-           "cqo";
-    break;
-  }
-}
-
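
Editor's note: a compact summary of the four mnemonics above (standard x86 sign-extension behavior, typically emitted ahead of a signed divide):

//   cbtw : ax      = sign_extend(al)
//   cwtd : dx:ax   = sign_extend(ax)
//   cltd : edx:eax = sign_extend(eax)
//   cqo  : rdx:rax = sign_extend(rax)
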
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cbwdq::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(this->getSrcSize() == 1);
-  Operand *Src0 = this->getSrc(0);
-  const auto DestReg = this->getDest()->getRegNum();
-  const auto SrcReg = llvm::cast<Variable>(Src0)->getRegNum();
-  (void)DestReg;
-  (void)SrcReg;
-  switch (Src0->getType()) {
-  default:
-    llvm_unreachable("unexpected source type!");
-    break;
-  case IceType_i8:
-    assert(SrcReg == RegisterSet::Reg_al);
-    assert(DestReg == RegisterSet::Reg_ax || DestReg == RegisterSet::Reg_ah);
-    Asm->cbw();
-    break;
-  case IceType_i16:
-    assert(SrcReg == RegisterSet::Reg_ax);
-    assert(DestReg == RegisterSet::Reg_dx);
-    Asm->cwd();
-    break;
-  case IceType_i32:
-    assert(SrcReg == RegisterSet::Reg_eax);
-    assert(DestReg == RegisterSet::Reg_edx);
-    Asm->cdq();
-    break;
-  case IceType_i64:
-    assert(Traits::Is64Bit);
-    assert(SrcReg == Traits::getRaxOrDie());
-    assert(DestReg == Traits::getRdxOrDie());
-    Asm->cqo();
-    break;
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Mul::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  assert(llvm::isa<Variable>(this->getSrc(0)));
-  assert(llvm::cast<Variable>(this->getSrc(0))->getRegNum() ==
-         RegisterSet::Reg_eax);
-  // TODO: allow edx?
-  assert(this->getDest()->getRegNum() == RegisterSet::Reg_eax);
-  Str << "\t"
-         "mul"
-      << this->getWidthString(this->getDest()->getType()) << "\t";
-  this->getSrc(1)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Mul::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  assert(llvm::isa<Variable>(this->getSrc(0)));
-  assert(llvm::cast<Variable>(this->getSrc(0))->getRegNum() ==
-         RegisterSet::Reg_eax);
-  // TODO: allow edx?
-  assert(this->getDest()->getRegNum() == RegisterSet::Reg_eax);
-  const Operand *Src = this->getSrc(1);
-  Type Ty = Src->getType();
-  static const GPREmitterOneOp Emitter = {&Assembler::mul, &Assembler::mul};
-  emitIASOpTyGPR(Func, Ty, Src, Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Mul::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  this->dumpDest(Func);
-  Str << " = mul." << this->getDest()->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Shld::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Variable *Dest = this->getDest();
-  assert(this->getSrcSize() == 3);
-  assert(Dest == this->getSrc(0));
-  Str << "\t"
-         "shld"
-      << this->getWidthString(Dest->getType()) << "\t";
-  this->getSrc(2)->emit(Func);
-  Str << ", ";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  Dest->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Shld::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 3);
-  assert(this->getDest() == this->getSrc(0));
-  const Variable *Dest = this->getDest();
-  const Operand *Src1 = this->getSrc(1);
-  const Operand *Src2 = this->getSrc(2);
-  static const GPREmitterShiftD Emitter = {&Assembler::shld, &Assembler::shld};
-  emitIASGPRShiftDouble(Func, Dest, Src1, Src2, Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Shld::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  this->dumpDest(Func);
-  Str << " = shld." << this->getDest()->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Shrd::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Variable *Dest = this->getDest();
-  assert(this->getSrcSize() == 3);
-  assert(Dest == this->getSrc(0));
-  Str << "\t"
-         "shrd"
-      << this->getWidthString(Dest->getType()) << "\t";
-  this->getSrc(2)->emit(Func);
-  Str << ", ";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  Dest->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Shrd::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 3);
-  assert(this->getDest() == this->getSrc(0));
-  const Variable *Dest = this->getDest();
-  const Operand *Src1 = this->getSrc(1);
-  const Operand *Src2 = this->getSrc(2);
-  static const GPREmitterShiftD Emitter = {&Assembler::shrd, &Assembler::shrd};
-  emitIASGPRShiftDouble(Func, Dest, Src1, Src2, Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Shrd::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  this->dumpDest(Func);
-  Str << " = shrd." << this->getDest()->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmov::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Variable *Dest = this->getDest();
-  Str << "\t";
-  assert(Condition != Cond::Br_None);
-  assert(this->getDest()->hasReg());
-  Str << "cmov" << Traits::InstBrAttributes[Condition].DisplayString
-      << this->getWidthString(Dest->getType()) << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  Dest->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmov::emitIAS(const Cfg *Func) const {
-  assert(Condition != Cond::Br_None);
-  assert(this->getDest()->hasReg());
-  assert(this->getSrcSize() == 2);
-  Operand *Src = this->getSrc(1);
-  Type SrcTy = Src->getType();
-  assert(SrcTy == IceType_i16 || SrcTy == IceType_i32 || (Traits::Is64Bit));
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  auto *Target = InstX86Base::getTarget(Func);
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-    if (SrcVar->hasReg()) {
-      Asm->cmov(SrcTy, Condition,
-                Traits::getEncodedGPR(this->getDest()->getRegNum()),
-                Traits::getEncodedGPR(SrcVar->getRegNum()));
-    } else {
-      Asm->cmov(SrcTy, Condition,
-                Traits::getEncodedGPR(this->getDest()->getRegNum()),
-                Target->stackVarToAsmOperand(SrcVar));
-    }
-  } else if (const auto *Mem = llvm::dyn_cast<X86OperandMem>(Src)) {
-    assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-    Asm->cmov(SrcTy, Condition,
-              Traits::getEncodedGPR(this->getDest()->getRegNum()),
-              Mem->toAsmAddress(Asm, Target));
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmov::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "cmov" << Traits::InstBrAttributes[Condition].DisplayString << ".";
-  Str << this->getDest()->getType() << " ";
-  this->dumpDest(Func);
-  Str << ", ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpps::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  assert(Condition < Cond::Cmpps_Invalid);
-  Type DestTy = this->Dest->getType();
-  Str << "\t"
-         "cmp"
-      << Traits::InstCmppsAttributes[Condition].EmitString
-      << Traits::TypeAttributes[DestTy].PdPsString << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpps::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(this->getSrcSize() == 2);
-  assert(Condition < Cond::Cmpps_Invalid);
-  // Assuming there isn't any load folding for cmpps, and vector constants are
-  // not allowed in PNaCl.
-  assert(llvm::isa<Variable>(this->getSrc(1)));
-  auto *Target = InstX86Base::getTarget(Func);
-  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(1));
-  if (SrcVar->hasReg()) {
-    Asm->cmpps(this->getDest()->getType(),
-               Traits::getEncodedXmm(this->getDest()->getRegNum()),
-               Traits::getEncodedXmm(SrcVar->getRegNum()), Condition);
-  } else {
-    Address SrcStackAddr = Target->stackVarToAsmOperand(SrcVar);
-    Asm->cmpps(this->getDest()->getType(),
-               Traits::getEncodedXmm(this->getDest()->getRegNum()),
-               SrcStackAddr, Condition);
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpps::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  assert(Condition < Cond::Cmpps_Invalid);
-  this->dumpDest(Func);
-  Str << " = cmp" << Traits::InstCmppsAttributes[Condition].EmitString
-      << "ps"
-         "\t";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpxchg::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 3);
-  if (this->Locked) {
-    Str << "\t"
-           "lock";
-  }
-  Str << "\t"
-         "cmpxchg"
-      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
-  this->getSrc(2)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpxchg::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 3);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Type Ty = this->getSrc(0)->getType();
-  auto *Target = InstX86Base::getTarget(Func);
-  const auto Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
-  assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-  const Address Addr = Mem->toAsmAddress(Asm, Target);
-  const auto *VarReg = llvm::cast<Variable>(this->getSrc(2));
-  assert(VarReg->hasReg());
-  const GPRRegister Reg = Traits::getEncodedGPR(VarReg->getRegNum());
-  Asm->cmpxchg(Ty, Addr, Reg, this->Locked);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpxchg::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  if (this->Locked) {
-    Str << "lock ";
-  }
-  Str << "cmpxchg." << this->getSrc(0)->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpxchg8b::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 5);
-  if (this->Locked) {
-    Str << "\t"
-           "lock";
-  }
-  Str << "\t"
-         "cmpxchg8b\t";
-  this->getSrc(0)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpxchg8b::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 5);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  const auto Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
-  assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-  auto *Target = InstX86Base::getTarget(Func);
-  const Address Addr = Mem->toAsmAddress(Asm, Target);
-  Asm->cmpxchg8b(Addr, this->Locked);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cmpxchg8b::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  if (this->Locked) {
-    Str << "lock ";
-  }
-  Str << "cmpxchg8b ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cvt::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  Str << "\t"
-         "cvt";
-  if (isTruncating())
-    Str << "t";
-  Str << Traits::TypeAttributes[this->getSrc(0)->getType()].CvtString << "2"
-      << Traits::TypeAttributes[this->getDest()->getType()].CvtString << "\t";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cvt::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 1);
-  const Variable *Dest = this->getDest();
-  const Operand *Src = this->getSrc(0);
-  Type DestTy = Dest->getType();
-  Type SrcTy = Src->getType();
-  switch (Variant) {
-  case Si2ss: {
-    assert(isScalarIntegerType(SrcTy));
-    if (!Traits::Is64Bit) {
-      assert(typeWidthInBytes(SrcTy) <= 4);
-    } else {
-      assert(SrcTy == IceType_i32 || SrcTy == IceType_i64);
-    }
-    assert(isScalarFloatingType(DestTy));
-    static const CastEmitterRegOp<XmmRegister, GPRRegister> Emitter = {
-        &Assembler::cvtsi2ss, &Assembler::cvtsi2ss};
-    emitIASCastRegOp<XmmRegister, GPRRegister, Traits::getEncodedXmm,
-                     Traits::getEncodedGPR>(Func, DestTy, Dest, SrcTy, Src,
-                                            Emitter);
-    return;
-  }
-  case Tss2si: {
-    assert(isScalarFloatingType(SrcTy));
-    assert(isScalarIntegerType(DestTy));
-    if (Traits::Is64Bit) {
-      assert(DestTy == IceType_i32 || DestTy == IceType_i64);
-    } else {
-      assert(typeWidthInBytes(DestTy) <= 4);
-    }
-    static const CastEmitterRegOp<GPRRegister, XmmRegister> Emitter = {
-        &Assembler::cvttss2si, &Assembler::cvttss2si};
-    emitIASCastRegOp<GPRRegister, XmmRegister, Traits::getEncodedGPR,
-                     Traits::getEncodedXmm>(Func, DestTy, Dest, SrcTy, Src,
-                                            Emitter);
-    return;
-  }
-  case Ss2si: {
-    assert(isScalarFloatingType(SrcTy));
-    assert(isScalarIntegerType(DestTy));
-    if (Traits::Is64Bit) {
-      assert(DestTy == IceType_i32 || DestTy == IceType_i64);
-    } else {
-      assert(typeWidthInBytes(DestTy) <= 4);
-    }
-    static const CastEmitterRegOp<GPRRegister, XmmRegister> Emitter = {
-        &Assembler::cvtss2si, &Assembler::cvtss2si};
-    emitIASCastRegOp<GPRRegister, XmmRegister, Traits::getEncodedGPR,
-                     Traits::getEncodedXmm>(Func, DestTy, Dest, SrcTy, Src,
-                                            Emitter);
-    return;
-  }
-  case Float2float: {
-    assert(isScalarFloatingType(SrcTy));
-    assert(isScalarFloatingType(DestTy));
-    assert(DestTy != SrcTy);
-    static const XmmEmitterRegOp Emitter = {&Assembler::cvtfloat2float,
-                                            &Assembler::cvtfloat2float};
-    emitIASRegOpTyXMM(Func, SrcTy, Dest, Src, Emitter);
-    return;
-  }
-  case Dq2ps: {
-    assert(isVectorIntegerType(SrcTy));
-    assert(isVectorFloatingType(DestTy));
-    static const XmmEmitterRegOp Emitter = {&Assembler::cvtdq2ps,
-                                            &Assembler::cvtdq2ps};
-    emitIASRegOpTyXMM(Func, DestTy, Dest, Src, Emitter);
-    return;
-  }
-  case Tps2dq: {
-    assert(isVectorFloatingType(SrcTy));
-    assert(isVectorIntegerType(DestTy));
-    static const XmmEmitterRegOp Emitter = {&Assembler::cvttps2dq,
-                                            &Assembler::cvttps2dq};
-    emitIASRegOpTyXMM(Func, DestTy, Dest, Src, Emitter);
-    return;
-  }
-  case Ps2dq: {
-    assert(isVectorFloatingType(SrcTy));
-    assert(isVectorIntegerType(DestTy));
-    static const XmmEmitterRegOp Emitter = {&Assembler::cvtps2dq,
-                                            &Assembler::cvtps2dq};
-    emitIASRegOpTyXMM(Func, DestTy, Dest, Src, Emitter);
-    return;
-  }
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Cvt::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  this->dumpDest(Func);
-  Str << " = cvt";
-  if (isTruncating())
-    Str << "t";
-  Str << Traits::TypeAttributes[this->getSrc(0)->getType()].CvtString << "2"
-      << Traits::TypeAttributes[this->getDest()->getType()].CvtString << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Round::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 3);
-  Str << "\t" << this->Opcode
-      << Traits::TypeAttributes[this->getDest()->getType()].SpSdString << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Round::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  assert(InstX86Base::getTarget(Func)->getInstructionSet() >= Traits::SSE4_1);
-  const Variable *Dest = this->getDest();
-  Type Ty = Dest->getType();
-  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
-      &Assembler::round, &Assembler::round};
-  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
-                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(0),
-                                              this->getSrc(1), Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Icmp::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  Str << "\t"
-         "cmp"
-      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Icmp::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  const Operand *Src0 = this->getSrc(0);
-  const Operand *Src1 = this->getSrc(1);
-  Type Ty = Src0->getType();
-  static const GPREmitterRegOp RegEmitter = {&Assembler::cmp, &Assembler::cmp,
-                                             &Assembler::cmp};
-  static const GPREmitterAddrOp AddrEmitter = {&Assembler::cmp,
-                                               &Assembler::cmp};
-  if (const auto *SrcVar0 = llvm::dyn_cast<Variable>(Src0)) {
-    if (SrcVar0->hasReg()) {
-      constexpr bool NotLea = false;
-      emitIASRegOpTyGPR(Func, NotLea, Ty, SrcVar0, Src1, RegEmitter);
-      return;
-    }
-  }
-  emitIASAsAddrOpTyGPR(Func, Ty, Src0, Src1, AddrEmitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Icmp::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "cmp." << this->getSrc(0)->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Ucomiss::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  Str << "\t"
-         "ucomi"
-      << Traits::TypeAttributes[this->getSrc(0)->getType()].SdSsString << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Ucomiss::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  // Currently src0 is always a variable by convention, to avoid having two
-  // memory operands.
-  assert(llvm::isa<Variable>(this->getSrc(0)));
-  const auto *Src0Var = llvm::cast<Variable>(this->getSrc(0));
-  Type Ty = Src0Var->getType();
-  static const XmmEmitterRegOp Emitter = {&Assembler::ucomiss,
-                                          &Assembler::ucomiss};
-  emitIASRegOpTyXMM(Func, Ty, Src0Var, this->getSrc(1), Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Ucomiss::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "ucomiss." << this->getSrc(0)->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86UD2::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 0);
-  Str << "\t"
-         "ud2";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86UD2::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Asm->ud2();
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86UD2::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "ud2";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Int3::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 0);
-  Str << "\t"
-         "int 3";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Int3::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Asm->int3();
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Int3::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "int 3";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Test::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  Str << "\t"
-         "test"
-      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Test::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  const Operand *Src0 = this->getSrc(0);
-  const Operand *Src1 = this->getSrc(1);
-  Type Ty = Src0->getType();
-  // The Reg/Addr form of test is not encodeable.
-  static const GPREmitterRegOp RegEmitter = {&Assembler::test, nullptr,
-                                             &Assembler::test};
-  static const GPREmitterAddrOp AddrEmitter = {&Assembler::test,
-                                               &Assembler::test};
-  if (const auto *SrcVar0 = llvm::dyn_cast<Variable>(Src0)) {
-    if (SrcVar0->hasReg()) {
-      constexpr bool NotLea = false;
-      emitIASRegOpTyGPR(Func, NotLea, Ty, SrcVar0, Src1, RegEmitter);
-      return;
-    }
-  }
-  emitIASAsAddrOpTyGPR(Func, Ty, Src0, Src1, AddrEmitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Test::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "test." << this->getSrc(0)->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Mfence::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 0);
-  Str << "\t"
-         "mfence";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Mfence::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Asm->mfence();
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Mfence::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "mfence";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Store::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  Type Ty = this->getSrc(0)->getType();
-  Str << "\t"
-         "mov"
-      << this->getWidthString(Ty) << Traits::TypeAttributes[Ty].SdSsString
-      << "\t";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getSrc(1)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Store::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  const Operand *Dest = this->getSrc(1);
-  const Operand *Src = this->getSrc(0);
-  Type DestTy = Dest->getType();
-  if (isScalarFloatingType(DestTy)) {
-    // Src must be a register, since Dest is a Mem operand of some kind.
-    const auto *SrcVar = llvm::cast<Variable>(Src);
-    assert(SrcVar->hasReg());
-    XmmRegister SrcReg = Traits::getEncodedXmm(SrcVar->getRegNum());
-    Assembler *Asm = Func->getAssembler<Assembler>();
-    auto *Target = InstX86Base::getTarget(Func);
-    if (const auto *DestVar = llvm::dyn_cast<Variable>(Dest)) {
-      assert(!DestVar->hasReg());
-      Address StackAddr(Target->stackVarToAsmOperand(DestVar));
-      Asm->movss(DestTy, StackAddr, SrcReg);
-    } else {
-      const auto DestMem = llvm::cast<X86OperandMem>(Dest);
-      assert(DestMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-      Asm->movss(DestTy, DestMem->toAsmAddress(Asm, Target), SrcReg);
-    }
-    return;
-  } else {
-    assert(isScalarIntegerType(DestTy));
-    static const GPREmitterAddrOp GPRAddrEmitter = {&Assembler::mov,
-                                                    &Assembler::mov};
-    emitIASAsAddrOpTyGPR(Func, DestTy, Dest, Src, GPRAddrEmitter);
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Store::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "mov." << this->getSrc(0)->getType() << " ";
-  this->getSrc(1)->dump(Func);
-  Str << ", ";
-  this->getSrc(0)->dump(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreP::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  assert(isVectorType(this->getSrc(1)->getType()));
-  Str << "\t"
-         "movups\t";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getSrc(1)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreP::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(this->getSrcSize() == 2);
-  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(0));
-  const auto DestMem = llvm::cast<X86OperandMem>(this->getSrc(1));
-  assert(DestMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-  assert(SrcVar->hasReg());
-  auto *Target = InstX86Base::getTarget(Func);
-  Asm->movups(DestMem->toAsmAddress(Asm, Target),
-              Traits::getEncodedXmm(SrcVar->getRegNum()));
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreP::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "storep." << this->getSrc(0)->getType() << " ";
-  this->getSrc(1)->dump(Func);
-  Str << ", ";
-  this->getSrc(0)->dump(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreQ::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  assert(this->getSrc(1)->getType() == IceType_i64 ||
-         this->getSrc(1)->getType() == IceType_f64 ||
-         isVectorType(this->getSrc(1)->getType()));
-  Str << "\t"
-         "movq\t";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getSrc(1)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreQ::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(this->getSrcSize() == 2);
-  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(0));
-  const auto DestMem = llvm::cast<X86OperandMem>(this->getSrc(1));
-  assert(DestMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-  assert(SrcVar->hasReg());
-  auto *Target = InstX86Base::getTarget(Func);
-  Asm->movq(DestMem->toAsmAddress(Asm, Target),
-            Traits::getEncodedXmm(SrcVar->getRegNum()));
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreQ::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "storeq." << this->getSrc(0)->getType() << " ";
-  this->getSrc(1)->dump(Func);
-  Str << ", ";
-  this->getSrc(0)->dump(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreD::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  assert(this->getSrc(1)->getType() == IceType_i64 ||
-         this->getSrc(1)->getType() == IceType_f64 ||
-         isVectorType(this->getSrc(1)->getType()));
-  Str << "\t"
-         "movd\t";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getSrc(1)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreD::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(this->getSrcSize() == 2);
-  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(0));
-  const auto DestMem = llvm::cast<X86OperandMem>(this->getSrc(1));
-  assert(DestMem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-  assert(SrcVar->hasReg());
-  auto *Target = InstX86Base::getTarget(Func);
-  Asm->movd(SrcVar->getType(), DestMem->toAsmAddress(Asm, Target),
-            Traits::getEncodedXmm(SrcVar->getRegNum()));
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86StoreD::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "stored." << this->getSrc(0)->getType() << " ";
-  this->getSrc(1)->dump(Func);
-  Str << ", ";
-  this->getSrc(0)->dump(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Lea::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  if (auto *Add = this->deoptLeaToAddOrNull(Func)) {
-    Add->emit(Func);
-    return;
-  }
-
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  assert(this->getDest()->hasReg());
-  Str << "\t"
-         "lea"
-      << this->getWidthString(this->getDest()->getType()) << "\t";
-  Operand *Src0 = this->getSrc(0);
-  if (const auto *Src0Var = llvm::dyn_cast<Variable>(Src0)) {
-    Type Ty = Src0Var->getType();
-    // lea on x86-32 doesn't accept mem128 operands, so cast Src0Var to an
-    // acceptable type.
-    Src0Var->asType(Func, isVectorType(Ty) ? IceType_i32 : Ty, RegNumT())
-        ->emit(Func);
-  } else {
-    Src0->emit(Func);
-  }
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Mov::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  Operand *Src = this->getSrc(0);
-  Type SrcTy = Src->getType();
-  Type DestTy = this->getDest()->getType();
-  if (Traits::Is64Bit && DestTy == IceType_i64 &&
-      llvm::isa<ConstantInteger64>(Src) &&
-      !Utils::IsInt(32, llvm::cast<ConstantInteger64>(Src)->getValue())) {
-    Str << "\t"
-           "movabs"
-           "\t";
-  } else {
-    Str << "\t"
-           "mov"
-        << (!isScalarFloatingType(DestTy)
-                ? this->getWidthString(DestTy)
-                : Traits::TypeAttributes[DestTy].SdSsString)
-        << "\t";
-  }
-  // For an integer truncation operation, src is wider than dest. In this case,
-  // we use a mov instruction whose data width matches the narrower dest.
-  // TODO: This assert disallows usages such as copying a floating
-  // point value between a vector and a scalar (which movss is used for). Clean
-  // this up.
-  assert(InstX86Base::getTarget(Func)->typeWidthInBytesOnStack(DestTy) ==
-         InstX86Base::getTarget(Func)->typeWidthInBytesOnStack(SrcTy));
-  const Operand *NewSrc = Src;
-  if (auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
-    RegNumT NewRegNum;
-    if (SrcVar->hasReg())
-      NewRegNum = Traits::getGprForType(DestTy, SrcVar->getRegNum());
-    if (SrcTy != DestTy)
-      NewSrc = SrcVar->asType(Func, DestTy, NewRegNum);
-  }
-  NewSrc->emit(Func);
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Mov::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 1);
-  const Variable *Dest = this->getDest();
-  const Operand *Src = this->getSrc(0);
-  Type DestTy = Dest->getType();
-  Type SrcTy = Src->getType();
-  // Mov can be used for GPRs or XMM registers. Also, the type does not
-  // necessarily match (Mov can be used for bitcasts). However, when the type
-  // does not match, one of the operands must be a register. Thus, the strategy
-  // is to find out whether Src or Dest is a register, then use that register's
-  // type to decide on which emitter set to use. The emitter set will include
-  // reg-reg movs, but that case should be unused when the types don't match.
-  static const XmmEmitterRegOp XmmRegEmitter = {&Assembler::movss,
-                                                &Assembler::movss};
-  static const GPREmitterRegOp GPRRegEmitter = {
-      &Assembler::mov, &Assembler::mov, &Assembler::mov};
-  static const GPREmitterAddrOp GPRAddrEmitter = {&Assembler::mov,
-                                                  &Assembler::mov};
-  // For an integer truncation operation, src is wider than dest. In this case,
-  // we use a mov instruction whose data width matches the narrower dest.
-  // TODO: This assert disallows usages such as copying a floating
-  // point value between a vector and a scalar (which movss is used for). Clean
-  // this up.
-  auto *Target = InstX86Base::getTarget(Func);
-  assert(Target->typeWidthInBytesOnStack(this->getDest()->getType()) ==
-         Target->typeWidthInBytesOnStack(Src->getType()));
-  if (Dest->hasReg()) {
-    if (isScalarFloatingType(DestTy)) {
-      emitIASRegOpTyXMM(Func, DestTy, Dest, Src, XmmRegEmitter);
-      return;
-    } else {
-      assert(isScalarIntegerType(DestTy));
-      // Widen DestTy for truncation (see above note). We should only do this
-      // when both Src and Dest are integer types.
-      if (Traits::Is64Bit && DestTy == IceType_i64) {
-        if (const auto *C64 = llvm::dyn_cast<ConstantInteger64>(Src)) {
-          Func->getAssembler<Assembler>()->movabs(
-              Traits::getEncodedGPR(Dest->getRegNum()), C64->getValue());
-          return;
-        }
-      }
-      if (isScalarIntegerType(SrcTy)) {
-        SrcTy = DestTy;
-      }
-      constexpr bool NotLea = false;
-      emitIASRegOpTyGPR(Func, NotLea, DestTy, Dest, Src, GPRRegEmitter);
-      return;
-    }
-  } else {
-    // Dest must be Stack and Src *could* be a register. Use Src's type to
-    // decide on the emitters.
-    Address StackAddr(Target->stackVarToAsmOperand(Dest));
-    if (isScalarFloatingType(SrcTy)) {
-      // Src must be a register.
-      const auto *SrcVar = llvm::cast<Variable>(Src);
-      assert(SrcVar->hasReg());
-      Assembler *Asm = Func->getAssembler<Assembler>();
-      Asm->movss(SrcTy, StackAddr, Traits::getEncodedXmm(SrcVar->getRegNum()));
-      return;
-    } else if (isVectorType(SrcTy)) {
-      // Src must be a register
-      const auto *SrcVar = llvm::cast<Variable>(Src);
-      assert(SrcVar->hasReg());
-      Assembler *Asm = Func->getAssembler<Assembler>();
-      Asm->movups(StackAddr, Traits::getEncodedXmm(SrcVar->getRegNum()));
-    } else {
-      // Src can be a register or immediate.
-      assert(isScalarIntegerType(SrcTy));
-      emitIASAddrOpTyGPR(Func, SrcTy, StackAddr, Src, GPRAddrEmitter);
-      return;
-    }
-    return;
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movd::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  assert(this->getSrcSize() == 1);
-  Variable *Dest = this->getDest();
-  Operand *Src = this->getSrc(0);
-
-  if (Dest->getType() == IceType_i64 || Src->getType() == IceType_i64) {
-    assert(Dest->getType() == IceType_f64 || Src->getType() == IceType_f64);
-    assert(Dest->getType() != Src->getType());
-    Ostream &Str = Func->getContext()->getStrEmit();
-    Str << "\t"
-           "movq"
-           "\t";
-    Src->emit(Func);
-    Str << ", ";
-    Dest->emit(Func);
-    return;
-  }
-
-  InstX86BaseUnaryopXmm<InstX86Base::Movd>::emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movd::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  assert(this->getSrcSize() == 1);
-  const Variable *Dest = this->getDest();
-  auto *Target = InstX86Base::getTarget(Func);
-  // For insert/extract element (one of Src/Dest is an Xmm vector and the other
-  // is an int type).
-  if (const auto *SrcVar = llvm::dyn_cast<Variable>(this->getSrc(0))) {
-    if (SrcVar->getType() == IceType_i32 ||
-        (Traits::Is64Bit && SrcVar->getType() == IceType_i64)) {
-      assert(isVectorType(Dest->getType()) ||
-             (isScalarFloatingType(Dest->getType()) &&
-              typeWidthInBytes(SrcVar->getType()) ==
-                  typeWidthInBytes(Dest->getType())));
-      assert(Dest->hasReg());
-      XmmRegister DestReg = Traits::getEncodedXmm(Dest->getRegNum());
-      if (SrcVar->hasReg()) {
-        Asm->movd(SrcVar->getType(), DestReg,
-                  Traits::getEncodedGPR(SrcVar->getRegNum()));
-      } else {
-        Address StackAddr(Target->stackVarToAsmOperand(SrcVar));
-        Asm->movd(SrcVar->getType(), DestReg, StackAddr);
-      }
-    } else {
-      assert(isVectorType(SrcVar->getType()) ||
-             (isScalarFloatingType(SrcVar->getType()) &&
-              typeWidthInBytes(SrcVar->getType()) ==
-                  typeWidthInBytes(Dest->getType())));
-      assert(SrcVar->hasReg());
-      assert(Dest->getType() == IceType_i32 ||
-             (Traits::Is64Bit && Dest->getType() == IceType_i64));
-      XmmRegister SrcReg = Traits::getEncodedXmm(SrcVar->getRegNum());
-      if (Dest->hasReg()) {
-        Asm->movd(Dest->getType(), Traits::getEncodedGPR(Dest->getRegNum()),
-                  SrcReg);
-      } else {
-        Address StackAddr(Target->stackVarToAsmOperand(Dest));
-        Asm->movd(Dest->getType(), StackAddr, SrcReg);
-      }
-    }
-  } else {
-    assert(Dest->hasReg());
-    XmmRegister DestReg = Traits::getEncodedXmm(Dest->getRegNum());
-    auto *Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
-    Asm->movd(Mem->getType(), DestReg, Mem->toAsmAddress(Asm, Target));
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movp::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  // TODO(wala,stichnot): movups works with all vector operands, but there
-  // exist other instructions (movaps, movdqa, movdqu) that may perform better,
-  // depending on the data type and alignment of the operands.
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  Str << "\t"
-         "movups\t";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movp::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 1);
-  assert(isVectorType(this->getDest()->getType()));
-  const Variable *Dest = this->getDest();
-  const Operand *Src = this->getSrc(0);
-  static const XmmEmitterMovOps Emitter = {
-      &Assembler::movups, &Assembler::movups, &Assembler::movups};
-  emitIASMovlikeXMM(Func, Dest, Src, Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movq::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 1);
-  assert(this->getDest()->getType() == IceType_i64 ||
-         this->getDest()->getType() == IceType_f64);
-  Str << "\t"
-         "movq"
-         "\t";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movq::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 1);
-  assert(this->getDest()->getType() == IceType_i64 ||
-         this->getDest()->getType() == IceType_f64 ||
-         isVectorType(this->getDest()->getType()));
-  const Variable *Dest = this->getDest();
-  const Operand *Src = this->getSrc(0);
-  static const XmmEmitterMovOps Emitter = {&Assembler::movq, &Assembler::movq,
-                                           &Assembler::movq};
-  emitIASMovlikeXMM(Func, Dest, Src, Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86MovssRegs::emitIAS(const Cfg *Func) const {
-  // This Binop variant is only intended to be used for reg-reg moves where
-  // part of the Dest register is untouched.
-  assert(this->getSrcSize() == 2);
-  const Variable *Dest = this->getDest();
-  assert(Dest == this->getSrc(0));
-  const auto *SrcVar = llvm::cast<Variable>(this->getSrc(1));
-  assert(Dest->hasReg() && SrcVar->hasReg());
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Asm->movss(IceType_f32, Traits::getEncodedXmm(Dest->getRegNum()),
-             Traits::getEncodedXmm(SrcVar->getRegNum()));
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movsx::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 1);
-  const Variable *Dest = this->getDest();
-  const Operand *Src = this->getSrc(0);
-  // Dest must be a > 8-bit register, but Src can be 8-bit. In practice we just
-  // use the full register for Dest to avoid having an OperandSizeOverride
-  // prefix. It also allows us to only dispatch on SrcTy.
-  Type SrcTy = Src->getType();
-  assert(typeWidthInBytes(Dest->getType()) > 1);
-  assert(typeWidthInBytes(Dest->getType()) > typeWidthInBytes(SrcTy));
-  constexpr bool NotLea = false;
-  emitIASRegOpTyGPR<false, true>(Func, NotLea, SrcTy, Dest, Src, this->Emitter);
-}
-
-template <typename TraitsType>
-bool InstImpl<TraitsType>::InstX86Movzx::mayBeElided(
-    const Variable *Dest, const Operand *SrcOpnd) const {
-  assert(Traits::Is64Bit);
-  const auto *Src = llvm::dyn_cast<Variable>(SrcOpnd);
-
-  // Src is not a Variable, so it does not have a register. Movzx can't be
-  // elided.
-  if (Src == nullptr)
-    return false;
-
-  // Movzx to/from memory can't be elided.
-  if (!Src->hasReg() || !Dest->hasReg())
-    return false;
-
-  // Reg/reg move with different source and dest can't be elided.
-  if (Traits::getEncodedGPR(Src->getRegNum()) !=
-      Traits::getEncodedGPR(Dest->getRegNum()))
-    return false;
-
-  // A must-keep movzx 32- to 64-bit is sometimes needed in x86-64 sandboxing.
-  return !MustKeep;
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movzx::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  if (Traits::Is64Bit) {
-    // There's no movzx %eXX, %rXX. To zero extend 32- to 64-bits, we emit a
-    // mov %eXX, %eXX. The processor will still do a movzx[bw]q.
-    assert(this->getSrcSize() == 1);
-    const Operand *Src = this->getSrc(0);
-    const Variable *Dest = this->Dest;
-    if (Src->getType() == IceType_i32 && Dest->getType() == IceType_i64) {
-      Ostream &Str = Func->getContext()->getStrEmit();
-      if (mayBeElided(Dest, Src)) {
-        Str << "\t/* elided movzx */";
-      } else {
-        Str << "\t"
-               "mov"
-               "\t";
-        Src->emit(Func);
-        Str << ", ";
-        Dest->asType(Func, IceType_i32,
-                     Traits::getGprForType(IceType_i32, Dest->getRegNum()))
-            ->emit(Func);
-        Str << " /* movzx */";
-      }
-      return;
-    }
-  }
-  InstX86BaseUnaryopGPR<InstX86Base::Movzx>::emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Movzx::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 1);
-  const Variable *Dest = this->getDest();
-  const Operand *Src = this->getSrc(0);
-  Type SrcTy = Src->getType();
-  assert(typeWidthInBytes(Dest->getType()) > 1);
-  assert(typeWidthInBytes(Dest->getType()) > typeWidthInBytes(SrcTy));
-  if (Traits::Is64Bit) {
-    if (Src->getType() == IceType_i32 && Dest->getType() == IceType_i64 &&
-        mayBeElided(Dest, Src)) {
-      return;
-    }
-  }
-  constexpr bool NotLea = false;
-  emitIASRegOpTyGPR<false, true>(Func, NotLea, SrcTy, Dest, Src, this->Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Nop::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  // TODO: Emit the right code for each variant.
-  Str << "\t"
-         "nop\t/* variant = "
-      << Variant << " */";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Nop::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  // TODO: Emit the right code for the variant.
-  Asm->nop();
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Nop::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "nop (variant = " << Variant << ")";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pextr::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 2);
-  // pextrb and pextrd are SSE4.1 instructions.
-  Str << "\t" << this->Opcode
-      << Traits::TypeAttributes[this->getSrc(0)->getType()].IntegralString
-      << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-  Str << ", ";
-  Variable *Dest = this->getDest();
-  // pextrw must take a register dest. There is an SSE4.1 version that takes a
-  // memory dest, but we aren't using it. For uniformity, just restrict them
-  // all to have a register dest for now.
-  assert(Dest->hasReg());
-  Dest->asType(Func, IceType_i32, Dest->getRegNum())->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pextr::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  // pextrb and pextrd are SSE4.1 instructions.
-  const Variable *Dest = this->getDest();
-  Type DispatchTy = Traits::getInVectorElementType(this->getSrc(0)->getType());
-  // pextrw must take a register dest. There is an SSE4.1 version that takes a
-  // memory dest, but we aren't using it. For uniformity, just restrict them
-  // all to have a register dest for now.
-  assert(Dest->hasReg());
-  // pextrw's Src(0) must be a register (both SSE4.1 and SSE2).
-  assert(llvm::cast<Variable>(this->getSrc(0))->hasReg());
-  static const ThreeOpImmEmitter<GPRRegister, XmmRegister> Emitter = {
-      &Assembler::pextr, nullptr};
-  emitIASThreeOpImmOps<GPRRegister, XmmRegister, Traits::getEncodedGPR,
-                       Traits::getEncodedXmm>(
-      Func, DispatchTy, Dest, this->getSrc(0), this->getSrc(1), Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pinsr::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 3);
-  Str << "\t" << this->Opcode
-      << Traits::TypeAttributes[this->getDest()->getType()].IntegralString
-      << "\t";
-  this->getSrc(2)->emit(Func);
-  Str << ", ";
-  Operand *Src1 = this->getSrc(1);
-  if (const auto *Src1Var = llvm::dyn_cast<Variable>(Src1)) {
-    // If src1 is a register, it should always be r32.
-    if (Src1Var->hasReg()) {
-      const auto NewRegNum = Traits::getBaseReg(Src1Var->getRegNum());
-      const Variable *NewSrc = Src1Var->asType(Func, IceType_i32, NewRegNum);
-      NewSrc->emit(Func);
-    } else {
-      Src1Var->emit(Func);
-    }
-  } else {
-    Src1->emit(Func);
-  }
-  Str << ", ";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pinsr::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 3);
-  assert(this->getDest() == this->getSrc(0));
-  // pinsrb and pinsrd are SSE4.1 instructions.
-  const Operand *Src0 = this->getSrc(1);
-  Type DispatchTy = Src0->getType();
-  // If src1 is a register, it should always be r32 (this should fall out from
-  // the encodings for ByteRegs overlapping the encodings for r32), but we have
-  // to make sure the register allocator didn't choose an 8-bit high register
-  // like "ah".
-  if (BuildDefs::asserts()) {
-    if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0)) {
-      if (Src0Var->hasReg()) {
-        const auto RegNum = Src0Var->getRegNum();
-        const auto BaseRegNum = Traits::getBaseReg(RegNum);
-        (void)BaseRegNum;
-        assert(Traits::getEncodedGPR(RegNum) ==
-               Traits::getEncodedGPR(BaseRegNum));
-      }
-    }
-  }
-  static const ThreeOpImmEmitter<XmmRegister, GPRRegister> Emitter = {
-      &Assembler::pinsr, &Assembler::pinsr};
-  emitIASThreeOpImmOps<XmmRegister, GPRRegister, Traits::getEncodedXmm,
-                       Traits::getEncodedGPR>(Func, DispatchTy, this->getDest(),
-                                              Src0, this->getSrc(2), Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pshufd::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  const Variable *Dest = this->getDest();
-  Type Ty = Dest->getType();
-  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
-      &Assembler::pshufd, &Assembler::pshufd};
-  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
-                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(0),
-                                              this->getSrc(1), Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Shufps::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 3);
-  const Variable *Dest = this->getDest();
-  assert(Dest == this->getSrc(0));
-  Type Ty = Dest->getType();
-  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
-      &Assembler::shufps, &Assembler::shufps};
-  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
-                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(1),
-                                              this->getSrc(2), Emitter);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pop::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(this->getSrcSize() == 0);
-  Str << "\t"
-         "pop\t";
-  this->getDest()->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pop::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 0);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  if (this->getDest()->hasReg()) {
-    Asm->popl(Traits::getEncodedGPR(this->getDest()->getRegNum()));
-  } else {
-    auto *Target = InstX86Base::getTarget(Func);
-    Asm->popl(Target->stackVarToAsmOperand(this->getDest()));
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Pop::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  this->dumpDest(Func);
-  Str << " = pop." << this->getDest()->getType() << " ";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Push::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Str << "\t"
-         "push"
-         "\t";
-  assert(this->getSrcSize() == 1);
-  const Operand *Src = this->getSrc(0);
-  Src->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Push::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-
-  assert(this->getSrcSize() == 1);
-  const Operand *Src = this->getSrc(0);
-
-  if (const auto *Var = llvm::dyn_cast<Variable>(Src)) {
-    Asm->pushl(Traits::getEncodedGPR(Var->getRegNum()));
-  } else if (const auto *Const32 = llvm::dyn_cast<ConstantInteger32>(Src)) {
-    Asm->pushl(AssemblerImmediate(Const32->getValue()));
-  } else if (auto *CR = llvm::dyn_cast<ConstantRelocatable>(Src)) {
-    Asm->pushl(CR);
-  } else {
-    llvm_unreachable("Unexpected operand type");
-  }
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Push::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "push." << this->getSrc(0)->getType() << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Ret::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Str << "\t"
-         "ret";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Ret::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Asm->ret();
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Ret::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Type Ty =
-      (this->getSrcSize() == 0 ? IceType_void : this->getSrc(0)->getType());
-  Str << "ret." << Ty << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Setcc::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Str << "\t"
-         "set"
-      << Traits::InstBrAttributes[Condition].DisplayString << "\t";
-  this->Dest->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Setcc::emitIAS(const Cfg *Func) const {
-  assert(Condition != Cond::Br_None);
-  assert(this->getDest()->getType() == IceType_i1);
-  assert(this->getSrcSize() == 0);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  auto *Target = InstX86Base::getTarget(Func);
-  if (this->getDest()->hasReg())
-    Asm->setcc(Condition,
-               Traits::getEncodedByteReg(this->getDest()->getRegNum()));
-  else
-    Asm->setcc(Condition, Target->stackVarToAsmOperand(this->getDest()));
-  return;
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Setcc::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "setcc." << Traits::InstBrAttributes[Condition].DisplayString << " ";
-  this->dumpDest(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Xadd::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  if (this->Locked) {
-    Str << "\t"
-           "lock";
-  }
-  Str << "\t"
-         "xadd"
-      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Xadd::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Type Ty = this->getSrc(0)->getType();
-  const auto Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
-  assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-  auto *Target = InstX86Base::getTarget(Func);
-  const Address Addr = Mem->toAsmAddress(Asm, Target);
-  const auto *VarReg = llvm::cast<Variable>(this->getSrc(1));
-  assert(VarReg->hasReg());
-  const GPRRegister Reg = Traits::getEncodedGPR(VarReg->getRegNum());
-  Asm->xadd(Ty, Addr, Reg, this->Locked);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Xadd::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  if (this->Locked) {
-    Str << "lock ";
-  }
-  Type Ty = this->getSrc(0)->getType();
-  Str << "xadd." << Ty << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Xchg::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Str << "\t"
-         "xchg"
-      << this->getWidthString(this->getSrc(0)->getType()) << "\t";
-  this->getSrc(1)->emit(Func);
-  Str << ", ";
-  this->getSrc(0)->emit(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Xchg::emitIAS(const Cfg *Func) const {
-  assert(this->getSrcSize() == 2);
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Type Ty = this->getSrc(0)->getType();
-  const auto *VarReg1 = llvm::cast<Variable>(this->getSrc(1));
-  assert(VarReg1->hasReg());
-  const GPRRegister Reg1 = Traits::getEncodedGPR(VarReg1->getRegNum());
-
-  if (const auto *VarReg0 = llvm::dyn_cast<Variable>(this->getSrc(0))) {
-    assert(VarReg0->hasReg());
-    const GPRRegister Reg0 = Traits::getEncodedGPR(VarReg0->getRegNum());
-    Asm->xchg(Ty, Reg0, Reg1);
-    return;
-  }
-
-  const auto *Mem = llvm::cast<X86OperandMem>(this->getSrc(0));
-  assert(Mem->getSegmentRegister() == X86OperandMem::DefaultSegment);
-  auto *Target = InstX86Base::getTarget(Func);
-  const Address Addr = Mem->toAsmAddress(Asm, Target);
-  Asm->xchg(Ty, Addr, Reg1);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86Xchg::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Type Ty = this->getSrc(0)->getType();
-  Str << "xchg." << Ty << " ";
-  this->dumpSources(Func);
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86IacaStart::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Str << "\t# IACA_START\n"
-         "\t.byte 0x0F, 0x0B\n"
-         "\t"
-         "movl\t$111, %ebx\n"
-         "\t.byte 0x64, 0x67, 0x90";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86IacaStart::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Asm->iaca_start();
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86IacaStart::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "IACA_START";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86IacaEnd::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  Str << "\t# IACA_END\n"
-         "\t"
-         "movl\t$222, %ebx\n"
-         "\t.byte 0x64, 0x67, 0x90\n"
-         "\t.byte 0x0F, 0x0B";
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86IacaEnd::emitIAS(const Cfg *Func) const {
-  Assembler *Asm = Func->getAssembler<Assembler>();
-  Asm->iaca_end();
-}
-
-template <typename TraitsType>
-void InstImpl<TraitsType>::InstX86IacaEnd::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "IACA_END";
-}
-
-} // end of namespace X8664
-} // end of namespace Ice
-
-#endif // SUBZERO_SRC_ICEINSTX8664BASEIMPL_H
diff --git a/third_party/subzero/src/IceTargetLoweringX86.h b/third_party/subzero/src/IceTargetLoweringX86.h
index 4521f2e..75893c5 100644
--- a/third_party/subzero/src/IceTargetLoweringX86.h
+++ b/third_party/subzero/src/IceTargetLoweringX86.h
@@ -13,11 +13,61 @@
 ///
 //===----------------------------------------------------------------------===//
 
+#ifndef SUBZERO_SRC_ICETARGETLOWERINGX86_H
+#define SUBZERO_SRC_ICETARGETLOWERINGX86_H
+
+#include "IceCfg.h"
+#include "IceTargetLowering.h"
+
 #include <inttypes.h>
 
 namespace Ice {
 namespace X86 {
 
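+// Instruction-set levels for X86, kept in the same order as the
+// target-independent TargetInstructionSet range so the two can be mapped
+// onto each other by a fixed offset (see the TargetX86 constructor).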
+enum InstructionSetX86 {
+  Begin,
+  // SSE2 is the baseline instruction set.
+  SSE2 = Begin,
+  SSE4_1,
+  End
+};
+
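+// Common base class for the X86 target lowerings. It carries the selected
+// instruction-set level, which previously lived in the per-target traits.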
+class TargetX86 : public ::Ice::TargetLowering {
+  TargetX86() = delete;
+  TargetX86(const TargetX86 &) = delete;
+  TargetX86 &operator=(const TargetX86 &) = delete;
+
+public:
+  ~TargetX86() override = default;
+
+  InstructionSetX86 getInstructionSet() const { return InstructionSet; }
+
+protected:
+  explicit TargetX86(Cfg *Func) : TargetLowering(Func) {
+    static_assert(
+        (InstructionSetX86::End - InstructionSetX86::Begin) ==
+            (TargetInstructionSet::X86InstructionSet_End -
+             TargetInstructionSet::X86InstructionSet_Begin),
+        "Traits::InstructionSet range different from TargetInstructionSet");
+    if (getFlags().getTargetInstructionSet() !=
+        TargetInstructionSet::BaseInstructionSet) {
+      InstructionSet = static_cast<InstructionSetX86>(
+          (getFlags().getTargetInstructionSet() -
+           TargetInstructionSet::X86InstructionSet_Begin) +
+          InstructionSetX86::Begin);
+    }
+  }
+
+  InstructionSetX86 InstructionSet = InstructionSetX86::Begin;
+
+private:
+  ENABLE_MAKE_UNIQUE;
+};
+
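+// Convenience accessor: the Cfg's target lowering is known to derive from
+// TargetX86, so it can be downcast to query the active instruction set.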
+inline InstructionSetX86 getInstructionSet(const Cfg *Func) {
+  return reinterpret_cast<TargetX86 *>(Func->getTarget())->getInstructionSet();
+}
+
 template <typename T> struct PoolTypeConverter {};
 
 template <> struct PoolTypeConverter<float> {
@@ -70,3 +120,5 @@
 
 } // end of namespace X86
 } // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICETARGETLOWERINGX86_H
diff --git a/third_party/subzero/src/IceTargetLoweringX8632.cpp b/third_party/subzero/src/IceTargetLoweringX8632.cpp
index 3e680eb..5092a70 100644
--- a/third_party/subzero/src/IceTargetLoweringX8632.cpp
+++ b/third_party/subzero/src/IceTargetLoweringX8632.cpp
@@ -15,21 +15,38 @@
 
 #include "IceTargetLoweringX8632.h"
 
+#include "IceCfg.h"
+#include "IceCfgNode.h"
+#include "IceClFlags.h"
+#include "IceDefs.h"
+#include "IceELFObjectWriter.h"
+#include "IceGlobalInits.h"
+#include "IceInstVarIter.h"
+#include "IceInstX8632.h"
+#include "IceLiveness.h"
+#include "IceOperand.h"
+#include "IcePhiLoweringImpl.h"
 #include "IceTargetLoweringX8632Traits.h"
+#include "IceUtils.h"
+#include "IceVariableSplitting.h"
+
+#include "llvm/Support/MathExtras.h"
+
+#include <stack>
 
 #if defined(_WIN32)
 extern "C" void _chkstk();
 #endif
 
 namespace X8632 {
+
 std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) {
   return ::Ice::X8632::TargetX8632::create(Func);
 }
 
 std::unique_ptr<::Ice::TargetDataLowering>
 createTargetDataLowering(::Ice::GlobalContext *Ctx) {
-  return ::Ice::X8632::TargetDataX86<::Ice::X8632::TargetX8632Traits>::create(
-      Ctx);
+  return ::Ice::X8632::TargetDataX8632::create(Ctx);
 }
 
 std::unique_ptr<::Ice::TargetHeaderLowering>
@@ -54,6 +71,7994 @@
 namespace Ice {
 namespace X8632 {
 
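+// Converter traits describing how constants of each primitive type are
+// pooled: the type name, the asm directive, and the printf format used
+// when emitting the pool entries.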
+template <typename T> struct PoolTypeConverter {};
+
+template <> struct PoolTypeConverter<float> {
+  using PrimitiveIntType = uint32_t;
+  using IceType = ConstantFloat;
+  static const Type Ty = IceType_f32;
+  static const char *TypeName;
+  static const char *AsmTag;
+  static const char *PrintfString;
+};
+
+template <> struct PoolTypeConverter<double> {
+  using PrimitiveIntType = uint64_t;
+  using IceType = ConstantDouble;
+  static const Type Ty = IceType_f64;
+  static const char *TypeName;
+  static const char *AsmTag;
+  static const char *PrintfString;
+};
+
+// Add converter for int type constant pooling
+template <> struct PoolTypeConverter<uint32_t> {
+  using PrimitiveIntType = uint32_t;
+  using IceType = ConstantInteger32;
+  static const Type Ty = IceType_i32;
+  static const char *TypeName;
+  static const char *AsmTag;
+  static const char *PrintfString;
+};
+
+// Add converter for int type constant pooling
+template <> struct PoolTypeConverter<uint16_t> {
+  using PrimitiveIntType = uint32_t;
+  using IceType = ConstantInteger32;
+  static const Type Ty = IceType_i16;
+  static const char *TypeName;
+  static const char *AsmTag;
+  static const char *PrintfString;
+};
+
+// Add converter for int type constant pooling
+template <> struct PoolTypeConverter<uint8_t> {
+  using PrimitiveIntType = uint32_t;
+  using IceType = ConstantInteger32;
+  static const Type Ty = IceType_i8;
+  static const char *TypeName;
+  static const char *AsmTag;
+  static const char *PrintfString;
+};
+
+const char *PoolTypeConverter<float>::TypeName = "float";
+const char *PoolTypeConverter<float>::AsmTag = ".long";
+const char *PoolTypeConverter<float>::PrintfString = "0x%x";
+
+const char *PoolTypeConverter<double>::TypeName = "double";
+const char *PoolTypeConverter<double>::AsmTag = ".quad";
+const char *PoolTypeConverter<double>::PrintfString = "0x%llx";
+
+const char *PoolTypeConverter<uint32_t>::TypeName = "i32";
+const char *PoolTypeConverter<uint32_t>::AsmTag = ".long";
+const char *PoolTypeConverter<uint32_t>::PrintfString = "0x%x";
+
+const char *PoolTypeConverter<uint16_t>::TypeName = "i16";
+const char *PoolTypeConverter<uint16_t>::AsmTag = ".short";
+const char *PoolTypeConverter<uint16_t>::PrintfString = "0x%x";
+
+const char *PoolTypeConverter<uint8_t>::TypeName = "i8";
+const char *PoolTypeConverter<uint8_t>::AsmTag = ".byte";
+const char *PoolTypeConverter<uint8_t>::PrintfString = "0x%x";
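+
+// Illustrative sketch (not part of this change): these attributes drive
+// constant pool emission; a pooled f32 value of 1.0f, for example, would be
+// emitted roughly as
+//   .long 0x3f800000 # float
+// with AsmTag selecting the directive and PrintfString formatting the bits.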
+
+// The Microsoft x64 ABI requires the caller to allocate a minimum of 32 bytes
+// as a "shadow store" (aka "home space") so that the callee may copy the four
+// register args to it.
+SizeT getShadowStoreSize() {
+#if defined(_WIN64)
+  static const SizeT ShadowStoreSize =
+      Traits::Is64Bit ? 4 * typeWidthInBytes(Traits::WordType) : 0;
+  return ShadowStoreSize;
+#else
+  return 0;
+#endif
+}
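+
+// Illustrative: under the Microsoft x64 ABI, every call site reserves this
+// space just below the return address, e.g.
+//   sub rsp, 40    ; 32-byte shadow store + 8 bytes for alignment
+//   call callee    ; callee may spill rcx/rdx/r8/r9 to the shadow store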
+
+BoolFoldingEntry::BoolFoldingEntry(Inst *I)
+    : Instr(I), IsComplex(BoolFolding::hasComplexLowering(I)) {}
+
+typename BoolFolding::BoolFoldingProducerKind
+BoolFolding::getProducerKind(const Inst *Instr) {
+  if (llvm::isa<InstIcmp>(Instr)) {
+    if (Traits::Is64Bit || Instr->getSrc(0)->getType() != IceType_i64)
+      return PK_Icmp32;
+    return PK_Icmp64;
+  }
+  if (llvm::isa<InstFcmp>(Instr))
+    return PK_Fcmp;
+  if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
+    if (Traits::Is64Bit || Arith->getSrc(0)->getType() != IceType_i64) {
+      switch (Arith->getOp()) {
+      default:
+        return PK_None;
+      case InstArithmetic::And:
+      case InstArithmetic::Or:
+        return PK_Arith;
+      }
+    }
+  }
+  // TODO(stichnot): remove this early return; it disables the Trunc
+  // producer below.
+  return PK_None;
+
+  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
+    switch (Cast->getCastKind()) {
+    default:
+      return PK_None;
+    case InstCast::Trunc:
+      return PK_Trunc;
+    }
+  }
+  return PK_None;
+}
+
+typename BoolFolding::BoolFoldingConsumerKind
+BoolFolding::getConsumerKind(const Inst *Instr) {
+  if (llvm::isa<InstBr>(Instr))
+    return CK_Br;
+  if (llvm::isa<InstSelect>(Instr))
+    return CK_Select;
+  // TODO(stichnot): remove this early return; it disables the Sext/Zext
+  // consumers below.
+  return CK_None;
+
+  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
+    switch (Cast->getCastKind()) {
+    default:
+      return CK_None;
+    case InstCast::Sext:
+      return CK_Sext;
+    case InstCast::Zext:
+      return CK_Zext;
+    }
+  }
+  return CK_None;
+}
+
+/// Returns true if the producing instruction has a "complex" lowering sequence.
+/// This generally means that its lowering sequence requires more than one
+/// conditional branch, namely 64-bit integer compares and some floating-point
+/// compares. When this is true and there is more than one consumer, we prefer
+/// to disable the folding optimization, since folding the producer into each
+/// consumer would duplicate those branches.
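+///
+/// Illustrative sketch (assumed lowering, not taken from this change): an
+/// fcmp whose TableFcmp entry has a second condition (C2) lowers to two
+/// conditional branches, e.g. for "one" (ordered not-equal):
+///   ucomiss xmm0, xmm1
+///   jp  .Lunordered    ; unordered => false
+///   jne .Ltrue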
+
+bool BoolFolding::hasComplexLowering(const Inst *Instr) {
+  switch (getProducerKind(Instr)) {
+  default:
+    return false;
+  case PK_Icmp64:
+    return !Traits::Is64Bit;
+  case PK_Fcmp:
+    return Traits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()].C2 !=
+           CondX86::Br_None;
+  }
+}
+
+bool BoolFolding::isValidFolding(
+    typename BoolFolding::BoolFoldingProducerKind ProducerKind,
+    typename BoolFolding::BoolFoldingConsumerKind ConsumerKind) {
+  switch (ProducerKind) {
+  default:
+    return false;
+  case PK_Icmp32:
+  case PK_Icmp64:
+  case PK_Fcmp:
+    return (ConsumerKind == CK_Br) || (ConsumerKind == CK_Select);
+  case PK_Arith:
+    return ConsumerKind == CK_Br;
+  }
+}
+
+void BoolFolding::init(CfgNode *Node) {
+  Producers.clear();
+  for (Inst &Instr : Node->getInsts()) {
+    if (Instr.isDeleted())
+      continue;
+    invalidateProducersOnStore(&Instr);
+    // Check whether Instr is a valid producer.
+    Variable *Var = Instr.getDest();
+    if (Var) { // only consider instructions with an actual dest var
+      if (isBooleanType(Var->getType())) {        // only bool-type dest vars
+        if (getProducerKind(&Instr) != PK_None) { // white-listed instructions
+          Producers[Var->getIndex()] = BoolFoldingEntry(&Instr);
+        }
+      }
+    }
+    // Check each src variable against the map.
+    FOREACH_VAR_IN_INST(Var, Instr) {
+      SizeT VarNum = Var->getIndex();
+      if (!containsValid(VarNum))
+        continue;
+      // All valid consumers use Var as the first source operand
+      if (IndexOfVarOperandInInst(Var) != 0) {
+        setInvalid(VarNum);
+        continue;
+      }
+      // Consumer instructions must be white-listed
+      typename BoolFolding::BoolFoldingConsumerKind ConsumerKind =
+          getConsumerKind(&Instr);
+      if (ConsumerKind == CK_None) {
+        setInvalid(VarNum);
+        continue;
+      }
+      typename BoolFolding::BoolFoldingProducerKind ProducerKind =
+          getProducerKind(Producers[VarNum].Instr);
+      if (!isValidFolding(ProducerKind, ConsumerKind)) {
+        setInvalid(VarNum);
+        continue;
+      }
+      // Avoid creating multiple copies of complex producer instructions.
+      if (Producers[VarNum].IsComplex && Producers[VarNum].NumUses > 0) {
+        setInvalid(VarNum);
+        continue;
+      }
+      ++Producers[VarNum].NumUses;
+      if (Instr.isLastUse(Var)) {
+        Producers[VarNum].IsLiveOut = false;
+      }
+    }
+  }
+  for (auto &I : Producers) {
+    // Ignore entries previously marked invalid.
+    if (I.second.Instr == nullptr)
+      continue;
+    // Disable the producer if its dest may be live beyond this block.
+    if (I.second.IsLiveOut) {
+      setInvalid(I.first);
+      continue;
+    }
+    // Mark as "dead" rather than outright deleting. This is so that other
+    // peephole style optimizations during or before lowering have access to
+    // this instruction in undeleted form. See for example
+    // tryOptimizedCmpxchgCmpBr().
+    I.second.Instr->setDead();
+  }
+}
+
+const Inst *BoolFolding::getProducerFor(const Operand *Opnd) const {
+  auto *Var = llvm::dyn_cast<const Variable>(Opnd);
+  if (Var == nullptr)
+    return nullptr;
+  SizeT VarNum = Var->getIndex();
+  auto Element = Producers.find(VarNum);
+  if (Element == Producers.end())
+    return nullptr;
+  return Element->second.Instr;
+}
+
+void BoolFolding::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump() || !Func->isVerbose(IceV_Folding))
+    return;
+  OstreamLocker L(Func->getContext());
+  Ostream &Str = Func->getContext()->getStrDump();
+  for (auto &I : Producers) {
+    if (I.second.Instr == nullptr)
+      continue;
+    Str << "Found foldable producer:\n  ";
+    I.second.Instr->dump(Func);
+    Str << "\n";
+  }
+}
+
+/// If the given instruction has potential memory side effects (e.g. store, rmw,
+/// or a call instruction with potential memory side effects), then we must not
+/// allow a pre-store Producer instruction with memory operands to be folded
+/// into a post-store Consumer instruction.  If this is detected, the Producer
+/// is invalidated.
+///
+/// We use the Producer's IsLiveOut field to determine whether any potential
+/// Consumers come after this store instruction.  The IsLiveOut field is
+/// initialized to true, and BoolFolding::init() sets IsLiveOut to false when it
+/// sees the variable's definitive last use (indicating the variable is not in
+/// the node's live-out set).  Thus if we see here that IsLiveOut is false, we
+/// know that there can be no consumers after the store, and therefore we know
+/// the folding is safe despite the store instruction.
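+///
+/// Illustrative example: the fold below must be blocked, since the store may
+/// alias the producer's memory operand:
+///   b = icmp eq i32 [addr], 0   ; producer with a memory operand
+///   store i32 1, [addr]         ; potential memory side effect
+///   br b, L1, L2                ; consumer after the store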
+
+void BoolFolding::invalidateProducersOnStore(const Inst *Instr) {
+  if (!Instr->isMemoryWrite())
+    return;
+  for (auto &ProducerPair : Producers) {
+    if (!ProducerPair.second.IsLiveOut)
+      continue;
+    Inst *PInst = ProducerPair.second.Instr;
+    if (PInst == nullptr)
+      continue;
+    bool HasMemOperand = false;
+    const SizeT SrcSize = PInst->getSrcSize();
+    for (SizeT I = 0; I < SrcSize; ++I) {
+      if (llvm::isa<typename Traits::X86OperandMem>(PInst->getSrc(I))) {
+        HasMemOperand = true;
+        break;
+      }
+    }
+    if (!HasMemOperand)
+      continue;
+    setInvalid(ProducerPair.first);
+  }
+}
+
+void TargetX8632::initNodeForLowering(CfgNode *Node) {
+  FoldingInfo.init(Node);
+  FoldingInfo.dump(Func);
+}
+
+TargetX8632::TargetX8632(Cfg *Func) : TargetX86(Func) {}
+
+void TargetX8632::staticInit(GlobalContext *Ctx) {
+  RegNumT::setLimit(Traits::RegisterSet::Reg_NUM);
+  Traits::initRegisterSet(getFlags(), &TypeToRegisterSet, &RegisterAliases);
+  for (size_t i = 0; i < TypeToRegisterSet.size(); ++i)
+    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
+  filterTypeToRegisterSet(Ctx, Traits::RegisterSet::Reg_NUM,
+                          TypeToRegisterSet.data(), TypeToRegisterSet.size(),
+                          Traits::getRegName, getRegClassName);
+}
+
+bool TargetX8632::shouldBePooled(const Constant *C) {
+  if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(C)) {
+    return !Utils::isPositiveZero(ConstFloat->getValue());
+  }
+  if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(C)) {
+    return !Utils::isPositiveZero(ConstDouble->getValue());
+  }
+  return false;
+}
+
+::Ice::Type TargetX8632::getPointerType() {
+  if (!Traits::Is64Bit) {
+    return ::Ice::IceType_i32;
+  }
+  return ::Ice::IceType_i64;
+}
+
+void TargetX8632::translateO2() {
+  TimerMarker T(TimerStack::TT_O2, Func);
+
+  genTargetHelperCalls();
+  Func->dump("After target helper call insertion");
+
+  // Merge Alloca instructions, and lay out the stack.
+  static constexpr bool SortAndCombineAllocas = true;
+  Func->processAllocas(SortAndCombineAllocas);
+  Func->dump("After Alloca processing");
+
+  // Run this early so it can be used to focus optimizations on potentially hot
+  // code.
+  // TODO(stichnot,ascull): currently only used for regalloc, not for the
+  // expensive high-level optimizations that could be focused on potentially
+  // hot code.
+  Func->generateLoopInfo();
+  Func->dump("After loop analysis");
+  if (getFlags().getLoopInvariantCodeMotion()) {
+    Func->loopInvariantCodeMotion();
+    Func->dump("After LICM");
+  }
+
+  if (getFlags().getLocalCSE() != Ice::LCSE_Disabled) {
+    Func->localCSE(getFlags().getLocalCSE() == Ice::LCSE_EnabledSSA);
+    Func->dump("After Local CSE");
+    Func->floatConstantCSE();
+  }
+  if (getFlags().getEnableShortCircuit()) {
+    Func->shortCircuitJumps();
+    Func->dump("After Short Circuiting");
+  }
+
+  if (!getFlags().getEnablePhiEdgeSplit()) {
+    // Lower Phi instructions.
+    Func->placePhiLoads();
+    if (Func->hasError())
+      return;
+    Func->placePhiStores();
+    if (Func->hasError())
+      return;
+    Func->deletePhis();
+    if (Func->hasError())
+      return;
+    Func->dump("After Phi lowering");
+  }
+
+  // Address mode optimization.
+  Func->getVMetadata()->init(VMK_SingleDefs);
+  Func->doAddressOpt();
+  Func->materializeVectorShuffles();
+
+  // Find read-modify-write opportunities. Do this after address mode
+  // optimization so that doAddressOpt() doesn't need to be applied to RMW
+  // instructions as well.
+  findRMW();
+  Func->dump("After RMW transform");
+
+  // Argument lowering
+  Func->doArgLowering();
+
+  // Target lowering. This requires liveness analysis for some parts of the
+  // lowering decisions, such as compare/branch fusing. If non-lightweight
+  // liveness analysis is used, the instructions need to be renumbered first.
+  // TODO: This renumbering should only be necessary if we're actually
+  // calculating live intervals, which we only do for register allocation.
+  Func->renumberInstructions();
+  if (Func->hasError())
+    return;
+
+  // TODO: It should be sufficient to use the fastest liveness calculation,
+  // i.e. livenessLightweight(). However, for some reason that slows down the
+  // rest of the translation. Investigate.
+  Func->liveness(Liveness_Basic);
+  if (Func->hasError())
+    return;
+  Func->dump("After x86 address mode opt");
+
+  doLoadOpt();
+
+  Func->genCode();
+  if (Func->hasError())
+    return;
+  Func->dump("After x86 codegen");
+  splitBlockLocalVariables(Func);
+
+  // Register allocation. This requires instruction renumbering and full
+  // liveness analysis. Loops must be identified before liveness so variable
+  // use weights are correct.
+  Func->renumberInstructions();
+  if (Func->hasError())
+    return;
+  Func->liveness(Liveness_Intervals);
+  if (Func->hasError())
+    return;
+  // The post-codegen dump is done here, after liveness analysis and associated
+  // cleanup, to make the dump cleaner and more useful.
+  Func->dump("After initial x86 codegen");
+  // Validate the live range computations. The expensive validation call is
+  // deliberately only made when assertions are enabled.
+  assert(Func->validateLiveness());
+  Func->getVMetadata()->init(VMK_All);
+  regAlloc(RAK_Global);
+  if (Func->hasError())
+    return;
+  Func->dump("After linear scan regalloc");
+
+  if (getFlags().getEnablePhiEdgeSplit()) {
+    Func->advancedPhiLowering();
+    Func->dump("After advanced Phi lowering");
+  }
+
+  // Stack frame mapping.
+  Func->genFrame();
+  if (Func->hasError())
+    return;
+  Func->dump("After stack frame mapping");
+
+  Func->contractEmptyNodes();
+  Func->reorderNodes();
+
+  // Branch optimization.  This needs to be done just before code emission. In
+  // particular, no transformations that insert or reorder CfgNodes should be
+  // done after branch optimization. We go ahead and do it before nop insertion
+  // to reduce the amount of work needed for searching for opportunities.
+  Func->doBranchOpt();
+  Func->dump("After branch optimization");
+}
+
+void TargetX8632::translateOm1() {
+  TimerMarker T(TimerStack::TT_Om1, Func);
+
+  genTargetHelperCalls();
+
+  // Lay out the stack. Ideally Alloca instructions would not be merged in
+  // Om1, but merging is currently forced on to work around a Win32 bug.
+  static constexpr bool SortAndCombineAllocas =
+      true; // TODO(b/171222930): Fix Win32 bug when this is false
+  Func->processAllocas(SortAndCombineAllocas);
+  Func->dump("After Alloca processing");
+
+  Func->placePhiLoads();
+  if (Func->hasError())
+    return;
+  Func->placePhiStores();
+  if (Func->hasError())
+    return;
+  Func->deletePhis();
+  if (Func->hasError())
+    return;
+  Func->dump("After Phi lowering");
+
+  Func->doArgLowering();
+  Func->genCode();
+  if (Func->hasError())
+    return;
+  Func->dump("After initial x86 codegen");
+
+  regAlloc(RAK_InfOnly);
+  if (Func->hasError())
+    return;
+  Func->dump("After regalloc of infinite-weight variables");
+
+  Func->genFrame();
+  if (Func->hasError())
+    return;
+  Func->dump("After stack frame mapping");
+}
+
+inline bool canRMW(const InstArithmetic *Arith) {
+  Type Ty = Arith->getDest()->getType();
+  // X86 vector instructions write to a register and have no RMW option.
+  if (isVectorType(Ty))
+    return false;
+  bool isI64 = Ty == IceType_i64;
+
+  switch (Arith->getOp()) {
+  // Not handled for lack of simple lowering:
+  //   shift on i64
+  //   mul, udiv, urem, sdiv, srem, frem
+  // Not handled for lack of RMW instructions:
+  //   fadd, fsub, fmul, fdiv (also vector types)
+  default:
+    return false;
+  case InstArithmetic::Add:
+  case InstArithmetic::Sub:
+  case InstArithmetic::And:
+  case InstArithmetic::Or:
+  case InstArithmetic::Xor:
+    return true;
+  case InstArithmetic::Shl:
+  case InstArithmetic::Lshr:
+  case InstArithmetic::Ashr:
+    return false; // TODO(stichnot): implement
+    return !isI64;
+  }
+}
+
+bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
+  if (A == B)
+    return true;
+  if (auto *MemA = llvm::dyn_cast<typename TargetX8632::X86OperandMem>(A)) {
+    if (auto *MemB = llvm::dyn_cast<typename TargetX8632::X86OperandMem>(B)) {
+      return MemA->getBase() == MemB->getBase() &&
+             MemA->getOffset() == MemB->getOffset() &&
+             MemA->getIndex() == MemB->getIndex() &&
+             MemA->getShift() == MemB->getShift() &&
+             MemA->getSegmentRegister() == MemB->getSegmentRegister();
+    }
+  }
+  return false;
+}
+
+void TargetX8632::findRMW() {
+  TimerMarker _(TimerStack::TT_findRMW, Func);
+  Func->dump("Before RMW");
+  if (Func->isVerbose(IceV_RMW))
+    Func->getContext()->lockStr();
+  for (CfgNode *Node : Func->getNodes()) {
+    // Walk through the instructions, considering each sequence of 3
+    // instructions, and look for the particular RMW pattern. Note that this
+    // search can be "broken" (false negatives) if there are intervening
+    // deleted instructions, or intervening instructions that could be safely
+    // moved out of the way to reveal an RMW pattern.
+    auto E = Node->getInsts().end();
+    auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
+    for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
+      // Make I3 skip over deleted instructions.
+      while (I3 != E && I3->isDeleted())
+        ++I3;
+      if (I1 == E || I2 == E || I3 == E)
+        continue;
+      assert(!I1->isDeleted());
+      assert(!I2->isDeleted());
+      assert(!I3->isDeleted());
+      auto *Load = llvm::dyn_cast<InstLoad>(I1);
+      auto *Arith = llvm::dyn_cast<InstArithmetic>(I2);
+      auto *Store = llvm::dyn_cast<InstStore>(I3);
+      if (!Load || !Arith || !Store)
+        continue;
+      // Look for:
+      //   a = Load addr
+      //   b = <op> a, other
+      //   Store b, addr
+      // Change to:
+      //   a = Load addr
+      //   b = <op> a, other
+      //   x = FakeDef
+      //   RMW <op>, addr, other, x
+      //   b = Store b, addr, x
+      // Note that inferTwoAddress() makes sure setDestRedefined() gets called
+      // on the updated Store instruction, to avoid liveness problems later.
+      //
+      // With this transformation, the Store instruction acquires a Dest
+      // variable and is now subject to dead code elimination if there are no
+      // more uses of "b".  Variable "x" is a beacon for determining whether the
+      // Store instruction gets dead-code eliminated.  If the Store instruction
+      // is eliminated, then it must be the case that the RMW instruction ends
+      // x's live range, and therefore the RMW instruction will be retained and
+      // later lowered.  On the other hand, if the RMW instruction does not end
+      // x's live range, then the Store instruction must still be present, and
+      // therefore the RMW instruction is ignored during lowering because it is
+      // redundant with the Store instruction.
+      //
+      // Note that if "a" has further uses, the RMW transformation may still
+      // trigger, resulting in two loads and one store, which is worse than the
+      // original one load and one store.  However, this is probably rare, and
+      // caching probably keeps it just as fast.
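+      //
+      // For reference (illustrative, not from this change): when the Store is
+      // dead-code eliminated, the surviving RMW is later lowered to a single
+      // memory-form instruction, e.g. "add DWORD PTR [addr], other" rather
+      // than a load/add/store trio.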
+      if (!isSameMemAddressOperand(Load->getLoadAddress(),
+                                   Store->getStoreAddress()))
+        continue;
+      Operand *ArithSrcFromLoad = Arith->getSrc(0);
+      Operand *ArithSrcOther = Arith->getSrc(1);
+      if (ArithSrcFromLoad != Load->getDest()) {
+        if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
+          continue;
+        std::swap(ArithSrcFromLoad, ArithSrcOther);
+      }
+      if (Arith->getDest() != Store->getData())
+        continue;
+      if (!canRMW(Arith))
+        continue;
+      if (Func->isVerbose(IceV_RMW)) {
+        Ostream &Str = Func->getContext()->getStrDump();
+        Str << "Found RMW in " << Func->getFunctionName() << ":\n  ";
+        Load->dump(Func);
+        Str << "\n  ";
+        Arith->dump(Func);
+        Str << "\n  ";
+        Store->dump(Func);
+        Str << "\n";
+      }
+      Variable *Beacon = Func->makeVariable(IceType_i32);
+      Beacon->setMustNotHaveReg();
+      Store->setRmwBeacon(Beacon);
+      auto *BeaconDef = InstFakeDef::create(Func, Beacon);
+      Node->getInsts().insert(I3, BeaconDef);
+      auto *RMW =
+          InstX86FakeRMW::create(Func, ArithSrcOther, Store->getStoreAddress(),
+                                 Beacon, Arith->getOp());
+      Node->getInsts().insert(I3, RMW);
+    }
+  }
+  if (Func->isVerbose(IceV_RMW))
+    Func->getContext()->unlockStr();
+}
+
+// Converts a ConstantInteger32 operand into its constant value, or
+// MemoryOrderInvalid if the operand is not a ConstantInteger32.
+inline uint64_t getConstantMemoryOrder(Operand *Opnd) {
+  if (auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
+    return Integer->getValue();
+  return Intrinsics::MemoryOrderInvalid;
+}
+
+/// Determines whether the dest of a Load instruction can be folded into one of
+/// the src operands of a 2-operand instruction. This is true as long as the
+/// load dest matches exactly one of the binary instruction's src operands.
+/// Replaces Src0 or Src1 with LoadSrc if the answer is true.
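+///
+/// Illustrative: in doLoadOpt() below, this enables folds such as
+///   a = load [mem]
+///   t = add b, a
+/// becoming "t = add b, [mem]" when the add ends a's live range.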
+inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
+                                      Operand *&Src0, Operand *&Src1) {
+  if (Src0 == LoadDest && Src1 != LoadDest) {
+    Src0 = LoadSrc;
+    return true;
+  }
+  if (Src0 != LoadDest && Src1 == LoadDest) {
+    Src1 = LoadSrc;
+    return true;
+  }
+  return false;
+}
+
+void TargetX8632::doLoadOpt() {
+  TimerMarker _(TimerStack::TT_loadOpt, Func);
+  for (CfgNode *Node : Func->getNodes()) {
+    Context.init(Node);
+    while (!Context.atEnd()) {
+      Variable *LoadDest = nullptr;
+      Operand *LoadSrc = nullptr;
+      Inst *CurInst = iteratorToInst(Context.getCur());
+      Inst *Next = Context.getNextInst();
+      // Determine whether the current instruction is a Load instruction or
+      // equivalent.
+      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
+        // An InstLoad qualifies unless it uses a 64-bit absolute address,
+        // which requires legalization to insert a copy to register.
+        // TODO(b/148272103): Fold these after legalization.
+        if (!Traits::Is64Bit || !llvm::isa<Constant>(Load->getLoadAddress())) {
+          LoadDest = Load->getDest();
+          constexpr bool DoLegalize = false;
+          LoadSrc = formMemoryOperand(Load->getLoadAddress(),
+                                      LoadDest->getType(), DoLegalize);
+        }
+      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsic>(CurInst)) {
+        // An AtomicLoad intrinsic qualifies as long as it has a valid memory
+        // ordering, and can be implemented in a single instruction (i.e., not
+        // i64 on x86-32).
+        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicID();
+        if (ID == Intrinsics::AtomicLoad &&
+            (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) &&
+            Intrinsics::isMemoryOrderValid(
+                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
+          LoadDest = Intrin->getDest();
+          constexpr bool DoLegalize = false;
+          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
+                                      DoLegalize);
+        }
+      }
+      // A Load instruction can be folded into the following instruction only
+      // if the following instruction ends the Load's Dest variable's live
+      // range.
+      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
+        assert(LoadSrc);
+        Inst *NewInst = nullptr;
+        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
+          Operand *Src0 = Arith->getSrc(0);
+          Operand *Src1 = Arith->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstArithmetic::create(Func, Arith->getOp(),
+                                             Arith->getDest(), Src0, Src1);
+          }
+        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
+          Operand *Src0 = Icmp->getSrc(0);
+          Operand *Src1 = Icmp->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
+                                       Icmp->getDest(), Src0, Src1);
+          }
+        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
+          Operand *Src0 = Fcmp->getSrc(0);
+          Operand *Src1 = Fcmp->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
+                                       Fcmp->getDest(), Src0, Src1);
+          }
+        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
+          Operand *Src0 = Select->getTrueOperand();
+          Operand *Src1 = Select->getFalseOperand();
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstSelect::create(Func, Select->getDest(),
+                                         Select->getCondition(), Src0, Src1);
+          }
+        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
+          // The load dest can always be folded into a Cast instruction.
+          auto *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
+          if (Src0 == LoadDest) {
+            NewInst = InstCast::create(Func, Cast->getCastKind(),
+                                       Cast->getDest(), LoadSrc);
+          }
+        }
+        if (NewInst) {
+          CurInst->setDeleted();
+          Next->setDeleted();
+          Context.insert(NewInst);
+          // Update NewInst->LiveRangesEnded so that target lowering may
+          // benefit. Also update NewInst->HasSideEffects.
+          NewInst->spliceLivenessInfo(Next, CurInst);
+        }
+      }
+      Context.advanceCur();
+      Context.advanceNext();
+    }
+  }
+  Func->dump("After load optimization");
+}
+
+bool TargetX8632::doBranchOpt(Inst *I, const CfgNode *NextNode) {
+  if (auto *Br = llvm::dyn_cast<InstX86Br>(I)) {
+    return Br->optimizeBranch(NextNode);
+  }
+  return false;
+}
+
+Variable *TargetX8632::getPhysicalRegister(RegNumT RegNum, Type Ty) {
+  if (Ty == IceType_void)
+    Ty = IceType_i32;
+  if (PhysicalRegisters[Ty].empty())
+    PhysicalRegisters[Ty].resize(Traits::RegisterSet::Reg_NUM);
+  assert(unsigned(RegNum) < PhysicalRegisters[Ty].size());
+  Variable *Reg = PhysicalRegisters[Ty][RegNum];
+  if (Reg == nullptr) {
+    Reg = Func->makeVariable(Ty);
+    Reg->setRegNum(RegNum);
+    PhysicalRegisters[Ty][RegNum] = Reg;
+    // Specially mark a named physical register as an "argument" so that it is
+    // considered live upon function entry.  Otherwise it's possible to get
+    // liveness validation errors for saving callee-save registers.
+    Func->addImplicitArg(Reg);
+    // Don't bother tracking the live range of a named physical register.
+    Reg->setIgnoreLiveness();
+  }
+  assert(Traits::getGprForType(Ty, RegNum) == RegNum);
+  return Reg;
+}
+
+const char *TargetX8632::getRegName(RegNumT RegNum, Type Ty) const {
+  return Traits::getRegName(Traits::getGprForType(Ty, RegNum));
+}
+
+void TargetX8632::emitVariable(const Variable *Var) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  if (Var->hasReg()) {
+    Str << "%" << getRegName(Var->getRegNum(), Var->getType());
+    return;
+  }
+  if (Var->mustHaveReg()) {
+    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
+                             ") has no register assigned - function " +
+                             Func->getFunctionName());
+  }
+  const int32_t Offset = Var->getStackOffset();
+  auto BaseRegNum = Var->getBaseRegNum();
+  if (BaseRegNum.hasNoValue())
+    BaseRegNum = getFrameOrStackReg();
+
+  // Print in the form "Offset(%reg)", omitting Offset when it is 0.
+  if (getFlags().getDecorateAsm()) {
+    Str << Var->getSymbolicStackOffset();
+  } else if (Offset != 0) {
+    Str << Offset;
+  }
+  const Type FrameSPTy = Traits::WordType;
+  Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
+}
+
+typename TargetX8632::X86Address
+TargetX8632::stackVarToAsmOperand(const Variable *Var) const {
+  if (Var->hasReg())
+    llvm::report_fatal_error("Stack Variable has a register assigned");
+  if (Var->mustHaveReg()) {
+    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
+                             ") has no register assigned - function " +
+                             Func->getFunctionName());
+  }
+  int32_t Offset = Var->getStackOffset();
+  auto BaseRegNum = Var->getBaseRegNum();
+  if (Var->getBaseRegNum().hasNoValue()) {
+    // If the stack pointer needs alignment, we must use the frame pointer for
+    // arguments. For locals, getFrameOrStackReg will return the stack pointer
+    // in this case.
+    if (needsStackPointerAlignment() && Var->getIsArg()) {
+      assert(hasFramePointer());
+      BaseRegNum = getFrameReg();
+    } else {
+      BaseRegNum = getFrameOrStackReg();
+    }
+  }
+  return X86Address(Traits::getEncodedGPR(BaseRegNum), Offset,
+                    AssemblerFixup::NoFixup);
+}
+
+void TargetX8632::addProlog(CfgNode *Node) {
+  // Stack frame layout:
+  //
+  // +------------------------+  ^ +
+  // | 1. return address      |  |
+  // +------------------------+  v -
+  // | 2. preserved registers |
+  // +------------------------+ <--- BasePointer (if used)
+  // | 3. padding             |
+  // +------------------------+
+  // | 4. global spill area   |
+  // +------------------------+
+  // | 5. padding             |
+  // +------------------------+
+  // | 6. local spill area    |
+  // +------------------------+
+  // | 7. padding             |
+  // +------------------------+
+  // | 7.5 shadow (WinX64)    |
+  // +------------------------+
+  // | 8. allocas             |
+  // +------------------------+
+  // | 9. padding             |
+  // +------------------------+
+  // | 10. out args           |
+  // +------------------------+ <--- StackPointer
+  //
+  // The following variables record the size in bytes of the given areas:
+  //  * X86_RET_IP_SIZE_BYTES:   area 1
+  //  * PreservedRegsSizeBytes:  area 2
+  //  * SpillAreaPaddingBytes:   area 3
+  //  * GlobalsSize:             area 4
+  //  * LocalsSlotsPaddingBytes: area 5
+  //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
+  //  * LocalsSpillAreaSize:     area 6
+  //  * FixedAllocaSizeBytes:    areas 7 - 8
+  //  * SpillAreaSizeBytes:      areas 3 - 10
+  //  * maxOutArgsSizeBytes():   areas 9 - 10
+
+  // Determine stack frame offsets for each Variable without a register
+  // assignment. This can be done as one variable per stack slot. Or, do
+  // coalescing by running the register allocator again with an infinite set of
+  // registers (as a side effect, this gives variables a second chance at
+  // physical register assignment).
+  //
+  // A middle ground approach is to leverage sparsity and allocate one block of
+  // space on the frame for globals (variables with multi-block lifetime), and
+  // one block to share for locals (single-block lifetime).
+
+  const SizeT ShadowStoreSize = getShadowStoreSize();
+
+  // StackPointer: points just past the return address pushed by the caller
+
+  Context.init(Node);
+  Context.setInsertPoint(Context.getCur());
+
+  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
+  RegsUsed = SmallBitVector(CalleeSaves.size());
+  VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
+  size_t GlobalsSize = 0;
+  // If there is a separate locals area, this represents that area. Otherwise
+  // it counts any variable not counted by GlobalsSize.
+  SpillAreaSizeBytes = 0;
+  // If there is a separate locals area, this specifies the alignment for it.
+  uint32_t LocalsSlotsAlignmentBytes = 0;
+  // The entire spill locations area gets aligned to largest natural alignment
+  // of the variables that have a spill slot.
+  uint32_t SpillAreaAlignmentBytes = 0;
+  // A spill slot linked to a variable with a stack slot should reuse that
+  // stack slot.
+  std::function<bool(Variable *)> TargetVarHook =
+      [&VariablesLinkedToSpillSlots](Variable *Var) {
+        // TODO(stichnot): Refactor this into the base class.
+        Variable *Root = Var->getLinkedToStackRoot();
+        if (Root != nullptr) {
+          assert(!Root->hasReg());
+          if (!Root->hasReg()) {
+            VariablesLinkedToSpillSlots.push_back(Var);
+            return true;
+          }
+        }
+        return false;
+      };
+
+  // Compute the list of spilled variables and bounds for GlobalsSize, etc.
+  getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
+                        &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
+                        &LocalsSlotsAlignmentBytes, TargetVarHook);
+  uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
+  SpillAreaSizeBytes += GlobalsSize;
+
+  // Add push instructions for preserved registers.
+  uint32_t NumCallee = 0;
+  size_t PreservedRegsSizeBytes = 0;
+  SmallBitVector Pushed(CalleeSaves.size());
+  for (RegNumT i : RegNumBVIter(CalleeSaves)) {
+    const auto Canonical = Traits::getBaseReg(i);
+    assert(Canonical == Traits::getBaseReg(Canonical));
+    if (RegsUsed[i]) {
+      Pushed[Canonical] = true;
+    }
+  }
+  for (RegNumT RegNum : RegNumBVIter(Pushed)) {
+    assert(RegNum == Traits::getBaseReg(RegNum));
+    ++NumCallee;
+    if (Traits::isXmm(RegNum)) {
+      PreservedRegsSizeBytes += 16;
+    } else {
+      PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
+    }
+    _push_reg(RegNum);
+  }
+  Ctx->statsUpdateRegistersSaved(NumCallee);
+
+  // StackPointer: points past preserved registers at start of spill area
+
+  // Generate "push frameptr; mov frameptr, stackptr"
+  if (IsEbpBasedFrame) {
+    assert(
+        (RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None)).count() ==
+        0);
+    PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
+    _link_bp();
+  }
+
+  // Align the variables area. SpillAreaPaddingBytes is the size of the region
+  // after the preserved registers and before the spill areas.
+  // LocalsSlotsPaddingBytes is the amount of padding between the globals and
+  // locals area if they are separate.
+  assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
+  uint32_t SpillAreaPaddingBytes = 0;
+  uint32_t LocalsSlotsPaddingBytes = 0;
+  alignStackSpillAreas(Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
+                       SpillAreaAlignmentBytes, GlobalsSize,
+                       LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
+                       &LocalsSlotsPaddingBytes);
+  SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
+  uint32_t GlobalsAndSubsequentPaddingSize =
+      GlobalsSize + LocalsSlotsPaddingBytes;
+
+  // Functions returning scalar floating point types may need to convert values
+  // from an in-register xmm value to the top of the x87 floating point stack.
+  // This is done by a movp[sd] and an fld[sd].  Ensure there is enough scratch
+  // space on the stack for this.
+  const Type ReturnType = Func->getReturnType();
+  if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
+    if (isScalarFloatingType(ReturnType)) {
+      // Avoid misaligned double-precision load/store.
+      RequiredStackAlignment = std::max<size_t>(
+          RequiredStackAlignment, Traits::X86_STACK_ALIGNMENT_BYTES);
+      SpillAreaSizeBytes =
+          std::max(typeWidthInBytesOnStack(ReturnType), SpillAreaSizeBytes);
+    }
+  }
+
+  RequiredStackAlignment =
+      std::max<size_t>(RequiredStackAlignment, SpillAreaAlignmentBytes);
+
+  if (PrologEmitsFixedAllocas) {
+    RequiredStackAlignment =
+        std::max(RequiredStackAlignment, FixedAllocaAlignBytes);
+  }
+
+  // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
+  // fixed allocations in the prolog.
+  if (PrologEmitsFixedAllocas)
+    SpillAreaSizeBytes += FixedAllocaSizeBytes;
+
+  // Win64 ABI: add space for shadow store (aka home space)
+  SpillAreaSizeBytes += ShadowStoreSize;
+
+  // Entering the function has made the stack pointer unaligned. Re-align it by
+  // adjusting the stack size.
+  // Note that StackOffset does not include the spill area. It's the offset
+  // from the base stack pointer (ebp), whether we set it or not, to the first
+  // stack arg (if any). StackSize, on the other hand, does include the spill
+  // area.
+  const uint32_t StackOffset =
+      ShadowStoreSize + Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
+  uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
+                                             RequiredStackAlignment);
+  StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
+                                    RequiredStackAlignment);
+  SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any
+
+  if (SpillAreaSizeBytes) {
+    auto *Func = Node->getCfg();
+    if (SpillAreaSizeBytes > Func->getStackSizeLimit()) {
+      Func->setError("Stack size limit exceeded");
+    }
+
+    emitStackProbe(SpillAreaSizeBytes);
+
+    // Generate "sub stackptr, SpillAreaSizeBytes"
+    _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
+  }
+
+  // StackPointer: points just past the spill area (end of stack frame)
+
+  // If the required alignment is greater than the stack pointer's guaranteed
+  // alignment, align the stack pointer accordingly.
+  if (RequiredStackAlignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
+    assert(IsEbpBasedFrame);
+    _and(getPhysicalRegister(getStackReg(), Traits::WordType),
+         Ctx->getConstantInt32(-RequiredStackAlignment));
+  }
+
+  // StackPointer: may have just been offset for alignment
+
+  // Account for known-frame-offset alloca instructions that were not already
+  // combined into the prolog.
+  if (!PrologEmitsFixedAllocas)
+    SpillAreaSizeBytes += FixedAllocaSizeBytes;
+
+  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
+
+  // Fill in stack offsets for stack args, and copy args into registers for
+  // those that were register-allocated. Args are pushed right to left, so
+  // Arg[0] is closest to the stack/frame pointer.
+  RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
+  Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, Traits::WordType);
+  size_t BasicFrameOffset = StackOffset;
+  if (!IsEbpBasedFrame)
+    BasicFrameOffset += SpillAreaSizeBytes;
+
+  const VarList &Args = Func->getArgs();
+  size_t InArgsSizeBytes = 0;
+  unsigned NumXmmArgs = 0;
+  unsigned NumGPRArgs = 0;
+  for (SizeT i = 0, NumArgs = Args.size(); i < NumArgs; ++i) {
+    Variable *Arg = Args[i];
+    // Skip arguments passed in registers.
+    if (isVectorType(Arg->getType())) {
+      if (Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
+              .hasValue()) {
+        ++NumXmmArgs;
+        continue;
+      }
+    } else if (isScalarFloatingType(Arg->getType())) {
+      if (Traits::X86_PASS_SCALAR_FP_IN_XMM &&
+          Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
+              .hasValue()) {
+        ++NumXmmArgs;
+        continue;
+      }
+    } else {
+      assert(isScalarIntegerType(Arg->getType()));
+      if (Traits::getRegisterForGprArgNum(Traits::WordType,
+                                          Traits::getArgIndex(i, NumGPRArgs))
+              .hasValue()) {
+        ++NumGPRArgs;
+        continue;
+      }
+    }
+    // For esp-based frames where the allocas are done outside the prolog, the
+    // esp value may not stabilize to its home value until after all the
+    // fixed-size alloca instructions have executed.  In this case, a stack
+    // adjustment is needed when accessing in-args in order to copy them into
+    // registers.
+    size_t StackAdjBytes = 0;
+    if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas)
+      StackAdjBytes -= FixedAllocaSizeBytes;
+    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
+                           InArgsSizeBytes);
+  }
+
+  // Fill in stack offsets for locals.
+  assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
+                      SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
+                      IsEbpBasedFrame && !needsStackPointerAlignment());
+  // Assign stack offsets to variables that have been linked to spilled
+  // variables.
+  for (Variable *Var : VariablesLinkedToSpillSlots) {
+    const Variable *Root = Var->getLinkedToStackRoot();
+    assert(Root != nullptr);
+    Var->setStackOffset(Root->getStackOffset());
+
+    // If the stack root variable is an arg, make this variable an arg too so
+    // that stackVarToAsmOperand uses the correct base pointer (e.g. ebp on
+    // x86).
+    Var->setIsArg(Root->getIsArg());
+  }
+  this->HasComputedFrame = true;
+
+  if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
+    OstreamLocker L(Func->getContext());
+    Ostream &Str = Func->getContext()->getStrDump();
+
+    Str << "Stack layout:\n";
+    uint32_t EspAdjustmentPaddingSize =
+        SpillAreaSizeBytes - LocalsSpillAreaSize -
+        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
+        maxOutArgsSizeBytes();
+    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
+        << " return address = " << Traits::X86_RET_IP_SIZE_BYTES << " bytes\n"
+        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
+        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
+        << " globals spill area = " << GlobalsSize << " bytes\n"
+        << " globals-locals spill areas intermediate padding = "
+        << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
+        << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
+        << " esp alignment padding = " << EspAdjustmentPaddingSize
+        << " bytes\n";
+
+    Str << "Stack details:\n"
+        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
+        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
+        << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n"
+        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
+        << " bytes\n"
+        << " is ebp based = " << IsEbpBasedFrame << "\n";
+  }
+}
+
+/// Helper function for addProlog().
+///
+/// This assumes Arg is an argument passed on the stack. This sets the frame
+/// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
+/// I64 arg that has been split into Lo and Hi components, it calls itself
+/// recursively on the components, taking care to handle Lo first because of the
+/// little-endian architecture. Lastly, this function generates an instruction
+/// to copy Arg into its assigned register if applicable.
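+///
+/// Illustrative: on x86-32, an i64 stack arg at offset N is handled as two
+/// recursive calls, assigning Lo to offset N and Hi to offset N+4
+/// (little-endian), with InArgsSizeBytes advancing by 4 bytes each time.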
+
+void TargetX8632::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
+                                         size_t BasicFrameOffset,
+                                         size_t StackAdjBytes,
+                                         size_t &InArgsSizeBytes) {
+  if (!Traits::Is64Bit) {
+    if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
+      Variable *Lo = Arg64On32->getLo();
+      Variable *Hi = Arg64On32->getHi();
+      finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, StackAdjBytes,
+                             InArgsSizeBytes);
+      finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, StackAdjBytes,
+                             InArgsSizeBytes);
+      return;
+    }
+  }
+  Type Ty = Arg->getType();
+  if (isVectorType(Ty)) {
+    InArgsSizeBytes = Traits::applyStackAlignment(InArgsSizeBytes);
+  }
+  Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
+  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
+  if (Arg->hasReg()) {
+    assert(Ty != IceType_i64 || Traits::Is64Bit);
+    auto *Mem = X86OperandMem::create(
+        Func, Ty, FramePtr,
+        Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes));
+    if (isVectorType(Arg->getType())) {
+      _movp(Arg, Mem);
+    } else {
+      _mov(Arg, Mem);
+    }
+    // This argument-copying instruction uses an explicit X86OperandMem
+    // operand instead of a Variable, so its fill-from-stack operation has to
+    // be tracked separately for statistics.
+    Ctx->statsUpdateFills();
+  }
+}
+
+void TargetX8632::addEpilog(CfgNode *Node) {
+  InstList &Insts = Node->getInsts();
+  InstList::reverse_iterator RI, E;
+  for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
+    if (llvm::isa<Insts::Ret>(*RI))
+      break;
+  }
+  if (RI == E)
+    return;
+
+  // Convert the reverse_iterator position into its corresponding (forward)
+  // iterator position.
+  InstList::iterator InsertPoint = reverseToForwardIterator(RI);
+  --InsertPoint;
+  Context.init(Node);
+  Context.setInsertPoint(InsertPoint);
+
+  if (IsEbpBasedFrame) {
+    _unlink_bp();
+  } else {
+    // add stackptr, SpillAreaSizeBytes
+    if (SpillAreaSizeBytes != 0) {
+      _add_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
+    }
+  }
+
+  // Add pop instructions for preserved registers.
+  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
+  SmallBitVector Popped(CalleeSaves.size());
+  for (int32_t i = CalleeSaves.size() - 1; i >= 0; --i) {
+    const auto RegNum = RegNumT::fromInt(i);
+    if (RegNum == getFrameReg() && IsEbpBasedFrame)
+      continue;
+    const RegNumT Canonical = Traits::getBaseReg(RegNum);
+    if (CalleeSaves[i] && RegsUsed[i]) {
+      Popped[Canonical] = true;
+    }
+  }
+  for (int32_t i = Popped.size() - 1; i >= 0; --i) {
+    if (!Popped[i])
+      continue;
+    const auto RegNum = RegNumT::fromInt(i);
+    assert(RegNum == Traits::getBaseReg(RegNum));
+    _pop_reg(RegNum);
+  }
+}
+
+Type TargetX8632::stackSlotType() { return Traits::WordType; }
+
+template <typename T>
+typename std::enable_if<!T::Is64Bit, Operand>::type *
+TargetX8632::loOperand(Operand *Operand) {
+  assert(Operand->getType() == IceType_i64 ||
+         Operand->getType() == IceType_f64);
+  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
+    return Operand;
+  if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
+    return Var64On32->getLo();
+  if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
+    auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
+        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue())));
+    // Check if we need to blind/pool the constant.
+    return legalize(ConstInt);
+  }
+  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
+    auto *MemOperand = X86OperandMem::create(
+        Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getIndex(),
+        Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
+    // If the offset should be randomized or pooled, create the mem operand
+    // with the blinded/pooled constant; otherwise, return it as an ordinary
+    // mem operand.
+    return legalize(MemOperand);
+  }
+  llvm_unreachable("Unsupported operand type");
+  return nullptr;
+}
+
+template <typename T>
+typename std::enable_if<!T::Is64Bit, Operand>::type *
+TargetX8632::hiOperand(Operand *Operand) {
+  assert(Operand->getType() == IceType_i64 ||
+         Operand->getType() == IceType_f64);
+  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
+    return Operand;
+  if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
+    return Var64On32->getHi();
+  if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
+    auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
+        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue() >> 32)));
+    // Check if we need to blind/pool the constant.
+    return legalize(ConstInt);
+  }
+  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
+    Constant *Offset = Mem->getOffset();
+    if (Offset == nullptr) {
+      Offset = Ctx->getConstantInt32(4);
+    } else if (auto *IntOffset = llvm::dyn_cast<ConstantInteger32>(Offset)) {
+      Offset = Ctx->getConstantInt32(4 + IntOffset->getValue());
+    } else if (auto *SymOffset = llvm::dyn_cast<ConstantRelocatable>(Offset)) {
+      assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4));
+      Offset =
+          Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName());
+    }
+    auto *MemOperand = X86OperandMem::create(
+        Func, IceType_i32, Mem->getBase(), Offset, Mem->getIndex(),
+        Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
+    // Test if the Offset is an eligible i32 constant for randomization and
+    // pooling. Blind/pool it if it is; otherwise return an ordinary mem
+    // operand.
+    return legalize(MemOperand);
+  }
+  llvm_unreachable("Unsupported operand type");
+  return nullptr;
+}
+
+SmallBitVector TargetX8632::getRegisterSet(RegSetMask Include,
+                                           RegSetMask Exclude) const {
+  return Traits::getRegisterSet(getFlags(), Include, Exclude);
+}
+
+void TargetX8632::lowerAlloca(const InstAlloca *Instr) {
+  // Conservatively require the stack to be aligned. Some stack adjustment
+  // operations implemented below assume that the stack is aligned before the
+  // alloca. All the alloca code ensures that the stack alignment is preserved
+  // after the alloca. The stack alignment restriction can be relaxed in some
+  // cases.
+  RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
+                                            Traits::X86_STACK_ALIGNMENT_BYTES);
+
+  // For default align=0, set it to the real value 1, to avoid any
+  // bit-manipulation problems below.
+  const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes());
+
+  // LLVM enforces power of 2 alignment.
+  assert(llvm::isPowerOf2_32(AlignmentParam));
+  assert(llvm::isPowerOf2_32(Traits::X86_STACK_ALIGNMENT_BYTES));
+
+  const uint32_t Alignment =
+      std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES);
+  const bool OverAligned = Alignment > Traits::X86_STACK_ALIGNMENT_BYTES;
+  const bool OptM1 = Func->getOptLevel() == Opt_m1;
+  const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset();
+  const bool UseFramePointer =
+      hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;
+
+  if (UseFramePointer)
+    setHasFramePointer();
+
+  Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
+  if (OverAligned) {
+    _and(esp, Ctx->getConstantInt32(-Alignment));
+  }
+
+  Variable *Dest = Instr->getDest();
+  Operand *TotalSize = legalize(Instr->getSizeInBytes());
+
+  if (const auto *ConstantTotalSize =
+          llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
+    const uint32_t Value =
+        Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
+    if (UseFramePointer) {
+      _sub_sp(Ctx->getConstantInt32(Value));
+    } else {
+      // If we don't need a frame pointer, this alloca has a known offset from
+      // the stack pointer. We don't need to adjust the stack pointer, nor
+      // assign any value to Dest, as Dest is rematerializable.
+      assert(Dest->isRematerializable());
+      FixedAllocaSizeBytes += Value;
+      Context.insert<InstFakeDef>(Dest);
+    }
+  } else {
+    // Non-constant sizes need to be adjusted to the next highest multiple of
+    // the required alignment at runtime.
+    Variable *T = nullptr;
+    if (Traits::Is64Bit && TotalSize->getType() != IceType_i64) {
+      T = makeReg(IceType_i64);
+      _movzx(T, TotalSize);
+    } else {
+      T = makeReg(IceType_i32);
+      _mov(T, TotalSize);
+    }
+    _add(T, Ctx->getConstantInt32(Alignment - 1));
+    _and(T, Ctx->getConstantInt32(-Alignment));
+    _sub_sp(T);
+  }
+  // Add enough to the returned address to account for the out args area.
+  uint32_t OutArgsSize = maxOutArgsSizeBytes();
+  if (OutArgsSize > 0) {
+    Variable *T = makeReg(Dest->getType());
+    auto *CalculateOperand = X86OperandMem::create(
+        Func, IceType_void, esp, Ctx->getConstantInt(IceType_i32, OutArgsSize));
+    _lea(T, CalculateOperand);
+    _mov(Dest, T);
+  } else {
+    _mov(Dest, esp);
+  }
+}
+
+void TargetX8632::lowerArguments() {
+  const bool OptM1 = Func->getOptLevel() == Opt_m1;
+  VarList &Args = Func->getArgs();
+  unsigned NumXmmArgs = 0;
+  bool XmmSlotsRemain = true;
+  unsigned NumGprArgs = 0;
+  bool GprSlotsRemain = true;
+
+  Context.init(Func->getEntryNode());
+  Context.setInsertPoint(Context.getCur());
+
+  for (SizeT i = 0, End = Args.size();
+       i < End && (XmmSlotsRemain || GprSlotsRemain); ++i) {
+    Variable *Arg = Args[i];
+    Type Ty = Arg->getType();
+    Variable *RegisterArg = nullptr;
+    RegNumT RegNum;
+    if (isVectorType(Ty)) {
+      RegNum =
+          Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
+      if (RegNum.hasNoValue()) {
+        XmmSlotsRemain = false;
+        continue;
+      }
+      ++NumXmmArgs;
+      RegisterArg = Func->makeVariable(Ty);
+    } else if (isScalarFloatingType(Ty)) {
+      if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
+        continue;
+      }
+      RegNum =
+          Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
+      if (RegNum.hasNoValue()) {
+        XmmSlotsRemain = false;
+        continue;
+      }
+      ++NumXmmArgs;
+      RegisterArg = Func->makeVariable(Ty);
+    } else if (isScalarIntegerType(Ty)) {
+      RegNum = Traits::getRegisterForGprArgNum(
+          Ty, Traits::getArgIndex(i, NumGprArgs));
+      if (RegNum.hasNoValue()) {
+        GprSlotsRemain = false;
+        continue;
+      }
+      ++NumGprArgs;
+      RegisterArg = Func->makeVariable(Ty);
+    }
+    assert(RegNum.hasValue());
+    assert(RegisterArg != nullptr);
+    // Replace Arg in the argument list with the home register. Then generate
+    // an instruction in the prolog to copy the home register to the assigned
+    // location of Arg.
+    if (BuildDefs::dump())
+      RegisterArg->setName(Func, "home_reg:" + Arg->getName());
+    RegisterArg->setRegNum(RegNum);
+    RegisterArg->setIsArg();
+    Arg->setIsArg(false);
+
+    Args[i] = RegisterArg;
+    // When not Om1, do the assignment through a temporary, instead of directly
+    // from the pre-colored variable, so that a subsequent availabilityGet()
+    // call has a chance to work.  (In Om1, don't bother creating extra
+    // instructions with extra variables to register-allocate.)
+    if (OptM1) {
+      Context.insert<InstAssign>(Arg, RegisterArg);
+    } else {
+      Variable *Tmp = makeReg(RegisterArg->getType());
+      Context.insert<InstAssign>(Tmp, RegisterArg);
+      Context.insert<InstAssign>(Arg, Tmp);
+    }
+  }
+  if (!OptM1)
+    Context.availabilityUpdate();
+}
+
+/// Strength-reduce scalar integer multiplication by a constant (for i32 or
+/// narrower) for certain constants. The lea instruction can be used to multiply
+/// by 3, 5, or 9, and the shl instruction can be used to multiply by powers of
+/// 2. These can be combined such that e.g. multiplying by 100 can be done as 2
+/// lea-based multiplies by 5, combined with left-shifting by 2.
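+///
+/// Illustrative sketch of the multiply-by-100 case (100 = 5 * 5 * 4),
+/// matching the Count5/Count2 loops below:
+///   lea t, [t + 4*t]   ; t *= 5
+///   lea t, [t + 4*t]   ; t *= 5
+///   shl t, 2           ; t *= 4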
+
+bool TargetX8632::optimizeScalarMul(Variable *Dest, Operand *Src0,
+                                    int32_t Src1) {
+  // Disable this optimization for Om1 and O0, just to keep things simple
+  // there.
+  if (Func->getOptLevel() < Opt_1)
+    return false;
+  Type Ty = Dest->getType();
+  if (Src1 == -1) {
+    Variable *T = nullptr;
+    _mov(T, Src0);
+    _neg(T);
+    _mov(Dest, T);
+    return true;
+  }
+  if (Src1 == 0) {
+    _mov(Dest, Ctx->getConstantZero(Ty));
+    return true;
+  }
+  if (Src1 == 1) {
+    Variable *T = nullptr;
+    _mov(T, Src0);
+    _mov(Dest, T);
+    return true;
+  }
+  // Don't bother with the edge case where Src1 == MININT.
+  if (Src1 == -Src1)
+    return false;
+  const bool Src1IsNegative = Src1 < 0;
+  if (Src1IsNegative)
+    Src1 = -Src1;
+  uint32_t Count9 = 0;
+  uint32_t Count5 = 0;
+  uint32_t Count3 = 0;
+  uint32_t Count2 = 0;
+  uint32_t CountOps = 0;
+  while (Src1 > 1) {
+    if (Src1 % 9 == 0) {
+      ++CountOps;
+      ++Count9;
+      Src1 /= 9;
+    } else if (Src1 % 5 == 0) {
+      ++CountOps;
+      ++Count5;
+      Src1 /= 5;
+    } else if (Src1 % 3 == 0) {
+      ++CountOps;
+      ++Count3;
+      Src1 /= 3;
+    } else if (Src1 % 2 == 0) {
+      if (Count2 == 0)
+        ++CountOps;
+      ++Count2;
+      Src1 /= 2;
+    } else {
+      return false;
+    }
+  }
+  // The lea optimization only works for i32 (and i64 when available), so
+  // bail out if lea would be needed for a narrower type.
+  if (Ty != IceType_i32 && !(Traits::Is64Bit && Ty == IceType_i64) &&
+      (Count3 || Count5 || Count9))
+    return false;
+  // Limit the number of lea/shl operations for a single multiply, to a
+  // somewhat arbitrary choice of 3.
+  constexpr uint32_t MaxOpsForOptimizedMul = 3;
+  if (CountOps > MaxOpsForOptimizedMul)
+    return false;
+  Variable *T = makeReg(Traits::WordType);
+  if (typeWidthInBytes(Src0->getType()) < typeWidthInBytes(T->getType())) {
+    Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    _movzx(T, Src0RM);
+  } else {
+    _mov(T, Src0);
+  }
+  Constant *Zero = Ctx->getConstantZero(IceType_i32);
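+  // lea with T as both base and scaled index computes T * (1 + 2^Shift); e.g.
+  // "lea T, [T + T*8]" multiplies T by 9 (Shift = 3), and Shift = 2 or 1
+  // gives *5 or *3.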
+  for (uint32_t i = 0; i < Count9; ++i) {
+    constexpr uint16_t Shift = 3; // log2(9-1)
+    _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
+  }
+  for (uint32_t i = 0; i < Count5; ++i) {
+    constexpr uint16_t Shift = 2; // log2(5-1)
+    _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
+  }
+  for (uint32_t i = 0; i < Count3; ++i) {
+    constexpr uint16_t Shift = 1; // log2(3-1)
+    _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
+  }
+  if (Count2) {
+    _shl(T, Ctx->getConstantInt(Ty, Count2));
+  }
+  if (Src1IsNegative)
+    _neg(T);
+  _mov(Dest, T);
+  return true;
+}
+
+void TargetX8632::lowerShift64(InstArithmetic::OpKind Op, Operand *Src0Lo,
+                               Operand *Src0Hi, Operand *Src1Lo,
+                               Variable *DestLo, Variable *DestHi) {
+  // TODO: Refactor the similarities between Shl, Lshr, and Ashr.
+  Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
+  Constant *Zero = Ctx->getConstantZero(IceType_i32);
+  Constant *SignExtend = Ctx->getConstantInt32(0x1f);
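+  // The 64-bit value lives in a lo/hi register pair, and constant shift
+  // amounts fall into three regimes: greater than 32, exactly 32, and less
+  // than 32. Non-constant amounts are handled further below with shld/shrd
+  // plus a branch on bit 5 of the count.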
+  if (auto *ConstantShiftAmount = llvm::dyn_cast<ConstantInteger32>(Src1Lo)) {
+    uint32_t ShiftAmount = ConstantShiftAmount->getValue();
+    if (ShiftAmount > 32) {
+      Constant *ReducedShift = Ctx->getConstantInt32(ShiftAmount - 32);
+      switch (Op) {
+      default:
+        assert(0 && "non-shift op");
+        break;
+      case InstArithmetic::Shl: {
+        // a=b<<c ==>
+        //   t2 = b.lo
+        //   t2 = shl t2, ShiftAmount-32
+        //   a.hi = t2
+        //   a.lo = 0
+        _mov(T_2, Src0Lo);
+        _shl(T_2, ReducedShift);
+        _mov(DestHi, T_2);
+        _mov(DestLo, Zero);
+      } break;
+      case InstArithmetic::Lshr: {
+        // a=b>>c (unsigned) ==>
+        //   t2 = b.hi
+        //   t2 = shr t2, ShiftAmount-32
+        //   a.lo = t2
+        //   a.hi = 0
+        _mov(T_2, Src0Hi);
+        _shr(T_2, ReducedShift);
+        _mov(DestLo, T_2);
+        _mov(DestHi, Zero);
+      } break;
+      case InstArithmetic::Ashr: {
+        // a=b>>c (signed) ==>
+        //   t3 = b.hi
+        //   t3 = sar t3, 0x1f
+        //   t2 = b.hi
+        //   t2 = shrd t2, t3, ShiftAmount-32
+        //   a.lo = t2
+        //   a.hi = t3
+        _mov(T_3, Src0Hi);
+        _sar(T_3, SignExtend);
+        _mov(T_2, Src0Hi);
+        _shrd(T_2, T_3, ReducedShift);
+        _mov(DestLo, T_2);
+        _mov(DestHi, T_3);
+      } break;
+      }
+    } else if (ShiftAmount == 32) {
+      switch (Op) {
+      default:
+        assert(0 && "non-shift op");
+        break;
+      case InstArithmetic::Shl: {
+        // a=b<<c ==>
+        //   t2 = b.lo
+        //   a.hi = t2
+        //   a.lo = 0
+        _mov(T_2, Src0Lo);
+        _mov(DestHi, T_2);
+        _mov(DestLo, Zero);
+      } break;
+      case InstArithmetic::Lshr: {
+        // a=b>>c (unsigned) ==>
+        //   t2 = b.hi
+        //   a.lo = t2
+        //   a.hi = 0
+        _mov(T_2, Src0Hi);
+        _mov(DestLo, T_2);
+        _mov(DestHi, Zero);
+      } break;
+      case InstArithmetic::Ashr: {
+        // a=b>>c (signed) ==>
+        //   t2 = b.hi
+        //   a.lo = t2
+        //   t3 = b.hi
+        //   t3 = sar t3, 0x1f
+        //   a.hi = t3
+        _mov(T_2, Src0Hi);
+        _mov(DestLo, T_2);
+        _mov(T_3, Src0Hi);
+        _sar(T_3, SignExtend);
+        _mov(DestHi, T_3);
+      } break;
+      }
+    } else {
+      // COMMON PREFIX OF: a=b SHIFT_OP c ==>
+      //   t2 = b.lo
+      //   t3 = b.hi
+      _mov(T_2, Src0Lo);
+      _mov(T_3, Src0Hi);
+      switch (Op) {
+      default:
+        assert(0 && "non-shift op");
+        break;
+      case InstArithmetic::Shl: {
+        // a=b<<c ==>
+        //   t3 = shld t3, t2, ShiftAmount
+        //   t2 = shl t2, ShiftAmount
+        _shld(T_3, T_2, ConstantShiftAmount);
+        _shl(T_2, ConstantShiftAmount);
+      } break;
+      case InstArithmetic::Lshr: {
+        // a=b>>c (unsigned) ==>
+        //   t2 = shrd t2, t3, ShiftAmount
+        //   t3 = shr t3, ShiftAmount
+        _shrd(T_2, T_3, ConstantShiftAmount);
+        _shr(T_3, ConstantShiftAmount);
+      } break;
+      case InstArithmetic::Ashr: {
+        // a=b>>c (signed) ==>
+        //   t2 = shrd t2, t3, ShiftAmount
+        //   t3 = sar t3, ShiftAmount
+        _shrd(T_2, T_3, ConstantShiftAmount);
+        _sar(T_3, ConstantShiftAmount);
+      } break;
+      }
+      // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
+      //   a.lo = t2
+      //   a.hi = t3
+      _mov(DestLo, T_2);
+      _mov(DestHi, T_3);
+    }
+  } else {
+    // NON-CONSTANT CASES.
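+    // The hardware masks 32-bit shift counts to 5 bits, so shld/shrd and the
+    // plain shifts alone only handle counts below 32; testing bit 0x20 of the
+    // count selects the fix-up path for counts in [32, 64).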
+    Constant *BitTest = Ctx->getConstantInt32(0x20);
+    InstX86Label *Label = InstX86Label::create(Func, this);
+    // COMMON PREFIX OF: a=b SHIFT_OP c ==>
+    //   t1:ecx = c.lo & 0xff
+    //   t2 = b.lo
+    //   t3 = b.hi
+    T_1 = copyToReg8(Src1Lo, Traits::RegisterSet::Reg_cl);
+    _mov(T_2, Src0Lo);
+    _mov(T_3, Src0Hi);
+    switch (Op) {
+    default:
+      assert(0 && "non-shift op");
+      break;
+    case InstArithmetic::Shl: {
+      // a=b<<c ==>
+      //   t3 = shld t3, t2, t1
+      //   t2 = shl t2, t1
+      //   test t1, 0x20
+      //   je L1
+      //   use(t3)
+      //   t3 = t2
+      //   t2 = 0
+      _shld(T_3, T_2, T_1);
+      _shl(T_2, T_1);
+      _test(T_1, BitTest);
+      _br(CondX86::Br_e, Label);
+      // T_2 and T_3 are being assigned again because of the intra-block control
+      // flow, so we need to use _redefined to avoid liveness problems.
+      _redefined(_mov(T_3, T_2));
+      _redefined(_mov(T_2, Zero));
+    } break;
+    case InstArithmetic::Lshr: {
+      // a=b>>c (unsigned) ==>
+      //   t2 = shrd t2, t3, t1
+      //   t3 = shr t3, t1
+      //   test t1, 0x20
+      //   je L1
+      //   use(t2)
+      //   t2 = t3
+      //   t3 = 0
+      _shrd(T_2, T_3, T_1);
+      _shr(T_3, T_1);
+      _test(T_1, BitTest);
+      _br(CondX86::Br_e, Label);
+      // T_2 and T_3 are being assigned again because of the intra-block control
+      // flow, so we need to use _redefined to avoid liveness problems.
+      _redefined(_mov(T_2, T_3));
+      _redefined(_mov(T_3, Zero));
+    } break;
+    case InstArithmetic::Ashr: {
+      // a=b>>c (signed) ==>
+      //   t2 = shrd t2, t3, t1
+      //   t3 = sar t3, t1
+      //   test t1, 0x20
+      //   je L1
+      //   use(t2)
+      //   t2 = t3
+      //   t3 = sar t3, 0x1f
+      _shrd(T_2, T_3, T_1);
+      _sar(T_3, T_1);
+      _test(T_1, BitTest);
+      _br(CondX86::Br_e, Label);
+      // T_2 and T_3 are being assigned again because of the intra-block control
+      // flow, so T_2 needs to use _redefined to avoid liveness problems. T_3
+      // doesn't need special treatment because it is reassigned via _sar
+      // instead of _mov.
+      _redefined(_mov(T_2, T_3));
+      _sar(T_3, SignExtend);
+    } break;
+    }
+    // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
+    // L1:
+    //   a.lo = t2
+    //   a.hi = t3
+    Context.insert(Label);
+    _mov(DestLo, T_2);
+    _mov(DestHi, T_3);
+  }
+}
+
+void TargetX8632::lowerArithmetic(const InstArithmetic *Instr) {
+  Variable *Dest = Instr->getDest();
+  if (Dest->isRematerializable()) {
+    Context.insert<InstFakeDef>(Dest);
+    return;
+  }
+  Type Ty = Dest->getType();
+  Operand *Src0 = legalize(Instr->getSrc(0));
+  Operand *Src1 = legalize(Instr->getSrc(1));
+  if (Instr->isCommutative()) {
+    uint32_t SwapCount = 0;
+    if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) {
+      std::swap(Src0, Src1);
+      ++SwapCount;
+    }
+    if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) {
+      std::swap(Src0, Src1);
+      ++SwapCount;
+    }
+    // Improve two-address code patterns by avoiding a copy to the dest
+    // register when one of the source operands ends its lifetime here.
+    if (!Instr->isLastUse(Src0) && Instr->isLastUse(Src1)) {
+      std::swap(Src0, Src1);
+      ++SwapCount;
+    }
+    assert(SwapCount <= 1);
+    (void)SwapCount;
+  }
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
+    // The x86-32 helper-call-involved instructions are checked in this
+    // separate switch because loOperand() and hiOperand() may insert redundant
+    // instructions for constant blinding and pooling, and such redundant
+    // instructions would fail liveness analysis under -Om1. Besides, these
+    // operands do not need to be processed by loOperand() and hiOperand()
+    // anyway.
+    switch (Instr->getOp()) {
+    case InstArithmetic::Udiv:
+    case InstArithmetic::Sdiv:
+    case InstArithmetic::Urem:
+    case InstArithmetic::Srem:
+      llvm::report_fatal_error("Helper call was expected");
+      return;
+    default:
+      break;
+    }
+
+    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    Operand *Src0Lo = loOperand(Src0);
+    Operand *Src0Hi = hiOperand(Src0);
+    Operand *Src1Lo = loOperand(Src1);
+    Operand *Src1Hi = hiOperand(Src1);
+    Variable *T_Lo = nullptr, *T_Hi = nullptr;
+    switch (Instr->getOp()) {
+    case InstArithmetic::_num:
+      llvm_unreachable("Unknown arithmetic operator");
+      break;
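+    // The integer cases below operate on the lo/hi halves pairwise; Add and
+    // Sub propagate the carry/borrow from the low half through adc/sbb.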
+    case InstArithmetic::Add:
+      _mov(T_Lo, Src0Lo);
+      _add(T_Lo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _mov(T_Hi, Src0Hi);
+      _adc(T_Hi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::And:
+      _mov(T_Lo, Src0Lo);
+      _and(T_Lo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _mov(T_Hi, Src0Hi);
+      _and(T_Hi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::Or:
+      _mov(T_Lo, Src0Lo);
+      _or(T_Lo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _mov(T_Hi, Src0Hi);
+      _or(T_Hi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::Xor:
+      _mov(T_Lo, Src0Lo);
+      _xor(T_Lo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _mov(T_Hi, Src0Hi);
+      _xor(T_Hi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::Sub:
+      _mov(T_Lo, Src0Lo);
+      _sub(T_Lo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _mov(T_Hi, Src0Hi);
+      _sbb(T_Hi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::Mul: {
+      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
+      Variable *T_4Lo = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
+      Variable *T_4Hi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
+      // gcc does the following:
+      // a=b*c ==>
+      //   t1 = b.hi; t1 *=(imul) c.lo
+      //   t2 = c.hi; t2 *=(imul) b.lo
+      //   t3:eax = b.lo
+      //   t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo
+      //   a.lo = t4.lo
+      //   t4.hi += t1
+      //   t4.hi += t2
+      //   a.hi = t4.hi
+      // The mul instruction cannot take an immediate operand.
+      Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
+      _mov(T_1, Src0Hi);
+      _imul(T_1, Src1Lo);
+      _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax);
+      _mul(T_4Lo, T_3, Src1Lo);
+      // The mul instruction produces two dest variables, edx:eax. We create a
+      // fake definition of edx to account for this.
+      Context.insert<InstFakeDef>(T_4Hi, T_4Lo);
+      Context.insert<InstFakeUse>(T_4Hi);
+      _mov(DestLo, T_4Lo);
+      _add(T_4Hi, T_1);
+      _mov(T_2, Src1Hi);
+      Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
+      _imul(T_2, Src0Lo);
+      _add(T_4Hi, T_2);
+      _mov(DestHi, T_4Hi);
+    } break;
+    case InstArithmetic::Shl:
+    case InstArithmetic::Lshr:
+    case InstArithmetic::Ashr:
+      lowerShift64(Instr->getOp(), Src0Lo, Src0Hi, Src1Lo, DestLo, DestHi);
+      break;
+    case InstArithmetic::Fadd:
+    case InstArithmetic::Fsub:
+    case InstArithmetic::Fmul:
+    case InstArithmetic::Fdiv:
+    case InstArithmetic::Frem:
+      llvm_unreachable("FP instruction with i64 type");
+      break;
+    case InstArithmetic::Udiv:
+    case InstArithmetic::Sdiv:
+    case InstArithmetic::Urem:
+    case InstArithmetic::Srem:
+      llvm_unreachable("Call-helper-involved instruction for i64 type \
+                       should have already been handled before");
+      break;
+    }
+    return;
+  }
+  if (isVectorType(Ty)) {
+    // TODO: Trap on integer divide and integer modulo by zero. See:
+    // https://code.google.com/p/nativeclient/issues/detail?id=3899
+    if (llvm::isa<X86OperandMem>(Src1))
+      Src1 = legalizeToReg(Src1);
+    switch (Instr->getOp()) {
+    case InstArithmetic::_num:
+      llvm_unreachable("Unknown arithmetic operator");
+      break;
+    case InstArithmetic::Add: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _padd(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::And: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _pand(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Or: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _por(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Xor: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _pxor(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Sub: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _psub(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Mul: {
+      bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16;
+      bool InstructionSetIsValidForPmull =
+          Ty == IceType_v8i16 || InstructionSet >= SSE4_1;
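+      // pmullw (v8i16) is available from SSE2 onward, while pmulld (v4i32)
+      // requires SSE4.1, hence the instruction-set check above.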
+      if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
+        Variable *T = makeReg(Ty);
+        _movp(T, Src0);
+        _pmull(T, Src0 == Src1 ? T : Src1);
+        _movp(Dest, T);
+      } else if (Ty == IceType_v4i32) {
+        // Lowering sequence:
+        // Note: The mask arguments have index 0 on the left.
+        //
+        // movups  T1, Src0
+        // pshufd  T2, Src0, {1,0,3,0}
+        // pshufd  T3, Src1, {1,0,3,0}
+        // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
+        // pmuludq T1, Src1
+        // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
+        // pmuludq T2, T3
+        // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
+        // shufps  T1, T2, {0,2,0,2}
+        // pshufd  T4, T1, {0,2,1,3}
+        // movups  Dest, T4
+
+        // Mask that directs pshufd to create a vector with entries
+        // Src[1, 0, 3, 0]
+        constexpr unsigned Constant1030 = 0x31;
+        Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
+        // Mask that directs shufps to create a vector with entries
+        // Dest[0, 2], Src[0, 2]
+        constexpr unsigned Mask0202 = 0x88;
+        // Mask that directs pshufd to create a vector with entries
+        // Src[0, 2, 1, 3]
+        constexpr unsigned Mask0213 = 0xd8;
+        Variable *T1 = makeReg(IceType_v4i32);
+        Variable *T2 = makeReg(IceType_v4i32);
+        Variable *T3 = makeReg(IceType_v4i32);
+        Variable *T4 = makeReg(IceType_v4i32);
+        _movp(T1, Src0);
+        _pshufd(T2, Src0, Mask1030);
+        _pshufd(T3, Src1, Mask1030);
+        _pmuludq(T1, Src1);
+        _pmuludq(T2, T3);
+        _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
+        _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
+        _movp(Dest, T4);
+      } else if (Ty == IceType_v16i8) {
+        llvm::report_fatal_error("Scalarized operation was expected");
+      } else {
+        llvm::report_fatal_error("Invalid vector multiply type");
+      }
+    } break;
+    case InstArithmetic::Shl: {
+      assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _psll(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Lshr: {
+      assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _psrl(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Ashr: {
+      assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _psra(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Udiv:
+    case InstArithmetic::Urem:
+    case InstArithmetic::Sdiv:
+    case InstArithmetic::Srem:
+      llvm::report_fatal_error("Scalarized operation was expected");
+      break;
+    case InstArithmetic::Fadd: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _addps(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Fsub: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _subps(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Fmul: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _mulps(T, Src0 == Src1 ? T : Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Fdiv: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _divps(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Frem:
+      llvm::report_fatal_error("Scalarized operation was expected");
+      break;
+    }
+    return;
+  }
+  Variable *T_edx = nullptr;
+  Variable *T = nullptr;
+  switch (Instr->getOp()) {
+  case InstArithmetic::_num:
+    llvm_unreachable("Unknown arithmetic operator");
+    break;
+  case InstArithmetic::Add: {
+    const bool ValidType =
+        Ty == IceType_i32 || (Ty == IceType_i64 && Traits::Is64Bit);
+    auto *Const = llvm::dyn_cast<Constant>(Instr->getSrc(1));
+    const bool ValidKind =
+        Const != nullptr && (llvm::isa<ConstantInteger32>(Const) ||
+                             llvm::isa<ConstantRelocatable>(Const));
+    if (getFlags().getAggressiveLea() && ValidType && ValidKind) {
+      auto *Var = legalizeToReg(Src0);
+      auto *Mem = Traits::X86OperandMem::create(Func, IceType_void, Var, Const);
+      T = makeReg(Ty);
+      _lea(T, Mem);
+      _mov(Dest, T);
+      break;
+    }
+    _mov(T, Src0);
+    _add(T, Src1);
+    _mov(Dest, T);
+  } break;
+  case InstArithmetic::And:
+    _mov(T, Src0);
+    _and(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Or:
+    _mov(T, Src0);
+    _or(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Xor:
+    _mov(T, Src0);
+    _xor(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Sub:
+    _mov(T, Src0);
+    _sub(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Mul:
+    if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+      if (optimizeScalarMul(Dest, Src0, C->getValue()))
+        return;
+    }
+    // The 8-bit version of imul only allows the form "imul r/m8" where T must
+    // be in al.
+    if (isByteSizedArithType(Ty)) {
+      _mov(T, Src0, Traits::RegisterSet::Reg_al);
+      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+      _imul(T, Src0 == Src1 ? T : Src1);
+      _mov(Dest, T);
+    } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+      T = makeReg(Ty);
+      Src0 = legalize(Src0, Legal_Reg | Legal_Mem);
+      _imul_imm(T, Src0, ImmConst);
+      _mov(Dest, T);
+    } else {
+      _mov(T, Src0);
+      // No need to legalize Src1 to Reg | Mem because the Imm case is handled
+      // already by the ConstantInteger32 case above.
+      _imul(T, Src0 == Src1 ? T : Src1);
+      _mov(Dest, T);
+    }
+    break;
+  case InstArithmetic::Shl:
+    _mov(T, Src0);
+    if (!llvm::isa<ConstantInteger32>(Src1) &&
+        !llvm::isa<ConstantInteger64>(Src1))
+      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
+    _shl(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Lshr:
+    _mov(T, Src0);
+    if (!llvm::isa<ConstantInteger32>(Src1) &&
+        !llvm::isa<ConstantInteger64>(Src1))
+      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
+    _shr(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Ashr:
+    _mov(T, Src0);
+    if (!llvm::isa<ConstantInteger32>(Src1) &&
+        !llvm::isa<ConstantInteger64>(Src1))
+      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
+    _sar(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Udiv: {
+    // div and idiv are the few arithmetic operators that do not allow
+    // immediates as the operand.
+    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    RegNumT Eax;
+    RegNumT Edx;
+    switch (Ty) {
+    default:
+      llvm::report_fatal_error("Bad type for udiv");
+    case IceType_i64:
+      Eax = Traits::getRaxOrDie();
+      Edx = Traits::getRdxOrDie();
+      break;
+    case IceType_i32:
+      Eax = Traits::RegisterSet::Reg_eax;
+      Edx = Traits::RegisterSet::Reg_edx;
+      break;
+    case IceType_i16:
+      Eax = Traits::RegisterSet::Reg_ax;
+      Edx = Traits::RegisterSet::Reg_dx;
+      break;
+    case IceType_i8:
+      Eax = Traits::RegisterSet::Reg_al;
+      Edx = Traits::RegisterSet::Reg_ah;
+      break;
+    }
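+    // Unsigned divide takes its dividend in Edx:Eax (dx:ax for i16, ah:al for
+    // i8); zeroing the high half provides the unsigned widening. The quotient
+    // is left in Eax and the remainder in Edx.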
+    T_edx = makeReg(Ty, Edx);
+    _mov(T, Src0, Eax);
+    _mov(T_edx, Ctx->getConstantZero(Ty));
+    _div(T_edx, Src1, T);
+    _redefined(Context.insert<InstFakeDef>(T, T_edx));
+    _mov(Dest, T);
+  } break;
+  case InstArithmetic::Sdiv:
+    // TODO(stichnot): Enable this after doing better performance and cross
+    // testing.
+    if (false && Func->getOptLevel() >= Opt_1) {
+      // Optimize division by constant power of 2, but not for Om1 or O0, just
+      // to keep things simple there.
+      if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+        const int32_t Divisor = C->getValue();
+        const uint32_t UDivisor = Divisor;
+        if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
+          uint32_t LogDiv = llvm::Log2_32(UDivisor);
+          // LLVM does the following for dest=src/(1<<log):
+          //   t=src
+          //   sar t,typewidth-1 // -1 if src is negative, 0 if not
+          //   shr t,typewidth-log
+          //   add t,src
+          //   sar t,log
+          //   dest=t
+          uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
+          _mov(T, Src0);
+          // If for some reason we are dividing by 1, just treat it like an
+          // assignment.
+          if (LogDiv > 0) {
+            // The initial sar is unnecessary when dividing by 2.
+            if (LogDiv > 1)
+              _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
+            _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
+            _add(T, Src0);
+            _sar(T, Ctx->getConstantInt(Ty, LogDiv));
+          }
+          _mov(Dest, T);
+          return;
+        }
+      }
+    }
+    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    switch (Ty) {
+    default:
+      llvm::report_fatal_error("Bad type for sdiv");
+    case IceType_i64:
+      T_edx = makeReg(Ty, Traits::getRdxOrDie());
+      _mov(T, Src0, Traits::getRaxOrDie());
+      break;
+    case IceType_i32:
+      T_edx = makeReg(Ty, Traits::RegisterSet::Reg_edx);
+      _mov(T, Src0, Traits::RegisterSet::Reg_eax);
+      break;
+    case IceType_i16:
+      T_edx = makeReg(Ty, Traits::RegisterSet::Reg_dx);
+      _mov(T, Src0, Traits::RegisterSet::Reg_ax);
+      break;
+    case IceType_i8:
+      T_edx = makeReg(IceType_i16, Traits::RegisterSet::Reg_ax);
+      _mov(T, Src0, Traits::RegisterSet::Reg_al);
+      break;
+    }
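+    // cbw/cwd/cdq sign-extends the dividend into the high half, after which
+    // idiv leaves the quotient in Eax and the remainder in Edx (or their
+    // narrower equivalents).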
+    _cbwdq(T_edx, T);
+    _idiv(T_edx, Src1, T);
+    _redefined(Context.insert<InstFakeDef>(T, T_edx));
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Urem: {
+    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    RegNumT Eax;
+    RegNumT Edx;
+    switch (Ty) {
+    default:
+      llvm::report_fatal_error("Bad type for urem");
+    case IceType_i64:
+      Eax = Traits::getRaxOrDie();
+      Edx = Traits::getRdxOrDie();
+      break;
+    case IceType_i32:
+      Eax = Traits::RegisterSet::Reg_eax;
+      Edx = Traits::RegisterSet::Reg_edx;
+      break;
+    case IceType_i16:
+      Eax = Traits::RegisterSet::Reg_ax;
+      Edx = Traits::RegisterSet::Reg_dx;
+      break;
+    case IceType_i8:
+      Eax = Traits::RegisterSet::Reg_al;
+      Edx = Traits::RegisterSet::Reg_ah;
+      break;
+    }
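+    // As in the udiv case, the dividend is zero-extended into Edx:Eax; after
+    // the divide, the remainder (the result we want here) is left in Edx.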
+    T_edx = makeReg(Ty, Edx);
+    _mov(T_edx, Ctx->getConstantZero(Ty));
+    _mov(T, Src0, Eax);
+    _div(T, Src1, T_edx);
+    _redefined(Context.insert<InstFakeDef>(T_edx, T));
+    if (Ty == IceType_i8) {
+      // Register ah must be moved into one of {al,bl,cl,dl} before it can be
+      // moved into a general 8-bit register.
+      auto *T_AhRcvr = makeReg(Ty);
+      T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
+      _mov(T_AhRcvr, T_edx);
+      T_edx = T_AhRcvr;
+    }
+    _mov(Dest, T_edx);
+  } break;
+  case InstArithmetic::Srem: {
+    // TODO(stichnot): Enable this after doing better performance and cross
+    // testing.
+    if (false && Func->getOptLevel() >= Opt_1) {
+      // Optimize mod by constant power of 2, but not for Om1 or O0, just to
+      // keep things simple there.
+      if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+        const int32_t Divisor = C->getValue();
+        const uint32_t UDivisor = Divisor;
+        if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
+          uint32_t LogDiv = llvm::Log2_32(UDivisor);
+          // LLVM does the following for dest=src%(1<<log):
+          //   t=src
+          //   sar t,typewidth-1 // -1 if src is negative, 0 if not
+          //   shr t,typewidth-log
+          //   add t,src
+          //   and t, -(1<<log)
+          //   sub t,src
+          //   neg t
+          //   dest=t
+          uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
+          // If for some reason we are dividing by 1, just assign 0.
+          if (LogDiv == 0) {
+            _mov(Dest, Ctx->getConstantZero(Ty));
+            return;
+          }
+          _mov(T, Src0);
+          // The initial sar is unnecessary when dividing by 2.
+          if (LogDiv > 1)
+            _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
+          _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
+          _add(T, Src0);
+          _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
+          _sub(T, Src0);
+          _neg(T);
+          _mov(Dest, T);
+          return;
+        }
+      }
+    }
+    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    RegNumT Eax;
+    RegNumT Edx;
+    switch (Ty) {
+    default:
+      llvm::report_fatal_error("Bad type for srem");
+    case IceType_i64:
+      Eax = Traits::getRaxOrDie();
+      Edx = Traits::getRdxOrDie();
+      break;
+    case IceType_i32:
+      Eax = Traits::RegisterSet::Reg_eax;
+      Edx = Traits::RegisterSet::Reg_edx;
+      break;
+    case IceType_i16:
+      Eax = Traits::RegisterSet::Reg_ax;
+      Edx = Traits::RegisterSet::Reg_dx;
+      break;
+    case IceType_i8:
+      Eax = Traits::RegisterSet::Reg_al;
+      Edx = Traits::RegisterSet::Reg_ah;
+      break;
+    }
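+    // Sign-extend the dividend with cbw/cwd/cdq, divide, and take the
+    // remainder from Edx (ah for the i8 case, handled below).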
+    T_edx = makeReg(Ty, Edx);
+    _mov(T, Src0, Eax);
+    _cbwdq(T_edx, T);
+    _idiv(T, Src1, T_edx);
+    _redefined(Context.insert<InstFakeDef>(T_edx, T));
+    if (Ty == IceType_i8) {
+      // Register ah must be moved into one of {al,bl,cl,dl} before it can be
+      // moved into a general 8-bit register.
+      auto *T_AhRcvr = makeReg(Ty);
+      T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
+      _mov(T_AhRcvr, T_edx);
+      T_edx = T_AhRcvr;
+    }
+    _mov(Dest, T_edx);
+  } break;
+  case InstArithmetic::Fadd:
+    _mov(T, Src0);
+    _addss(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Fsub:
+    _mov(T, Src0);
+    _subss(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Fmul:
+    _mov(T, Src0);
+    _mulss(T, Src0 == Src1 ? T : Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Fdiv:
+    _mov(T, Src0);
+    _divss(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Frem:
+    llvm::report_fatal_error("Helper call was expected");
+    break;
+  }
+}
+
+void TargetX8632::lowerAssign(const InstAssign *Instr) {
+  Variable *Dest = Instr->getDest();
+  if (Dest->isRematerializable()) {
+    Context.insert<InstFakeDef>(Dest);
+    return;
+  }
+  Operand *Src = Instr->getSrc(0);
+  assert(Dest->getType() == Src->getType());
+  lowerMove(Dest, Src, false);
+}
+
+void TargetX8632::lowerBr(const InstBr *Br) {
+  if (Br->isUnconditional()) {
+    _br(Br->getTargetUnconditional());
+    return;
+  }
+  Operand *Cond = Br->getCondition();
+
+  // Handle folding opportunities.
+  if (const Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
+    assert(Producer->isDeleted());
+    switch (BoolFolding::getProducerKind(Producer)) {
+    default:
+      break;
+    case BoolFolding::PK_Icmp32:
+    case BoolFolding::PK_Icmp64: {
+      lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Br);
+      return;
+    }
+    case BoolFolding::PK_Fcmp: {
+      lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Br);
+      return;
+    }
+    case BoolFolding::PK_Arith: {
+      lowerArithAndConsumer(llvm::cast<InstArithmetic>(Producer), Br);
+      return;
+    }
+    }
+  }
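+  // No folding opportunity: materialize the condition and branch on it being
+  // non-zero.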
+  Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
+  Constant *Zero = Ctx->getConstantZero(IceType_i32);
+  _cmp(Src0, Zero);
+  _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
+}
+
+// constexprMax returns a (constexpr) max(S0, S1), and it is used for defining
+// OperandList in lowerCall. std::max() cannot be used here because it is not
+// constexpr until C++14.
+inline constexpr SizeT constexprMax(SizeT S0, SizeT S1) {
+  return S0 < S1 ? S1 : S0;
+}
+
+void TargetX8632::lowerCall(const InstCall *Instr) {
+  // Common x86 calling convention lowering:
+  //
+  // * At the point before the call, the stack must be aligned to 16 bytes.
+  //
+  // * Non-register arguments are pushed onto the stack in right-to-left order,
+  // such that the left-most argument ends up on the top of the stack at the
+  // lowest memory address.
+  //
+  // * Stack arguments of vector type are aligned to start at the next highest
+  // multiple of 16 bytes. Other stack arguments are aligned to the next word
+  // size boundary (4 or 8 bytes on 32-bit and 64-bit targets, respectively).
+  RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
+                                            Traits::X86_STACK_ALIGNMENT_BYTES);
+
+  constexpr SizeT MaxOperands =
+      constexprMax(Traits::X86_MAX_XMM_ARGS, Traits::X86_MAX_GPR_ARGS);
+  using OperandList = llvm::SmallVector<Operand *, MaxOperands>;
+
+  OperandList XmmArgs;
+  llvm::SmallVector<SizeT, MaxOperands> XmmArgIndices;
+  CfgVector<std::pair<const Type, Operand *>> GprArgs;
+  CfgVector<SizeT> GprArgIndices;
+  OperandList StackArgs, StackArgLocations;
+  uint32_t ParameterAreaSizeBytes = 0;
+
+  ParameterAreaSizeBytes += getShadowStoreSize();
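+  // getShadowStoreSize() reserves any ABI-mandated shadow (home) space for
+  // register arguments, as e.g. the Win64 convention requires; for the
+  // default x86-32 convention this is presumably zero.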
+
+  // Classify each argument operand according to the location where the argument
+  // is passed.
+  for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
+    Operand *Arg = Instr->getArg(i);
+    const Type Ty = Arg->getType();
+    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
+    assert(typeWidthInBytes(Ty) >= 4);
+    if (isVectorType(Ty) &&
+        Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgs.size()))
+            .hasValue()) {
+      XmmArgs.push_back(Arg);
+      XmmArgIndices.push_back(i);
+    } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
+               Traits::getRegisterForXmmArgNum(
+                   Traits::getArgIndex(i, XmmArgs.size()))
+                   .hasValue()) {
+      XmmArgs.push_back(Arg);
+      XmmArgIndices.push_back(i);
+    } else if (isScalarIntegerType(Ty) &&
+               Traits::getRegisterForGprArgNum(
+                   Ty, Traits::getArgIndex(i, GprArgs.size()))
+                   .hasValue()) {
+      GprArgs.emplace_back(Ty, Arg);
+      GprArgIndices.push_back(i);
+    } else {
+      // Place on stack.
+      StackArgs.push_back(Arg);
+      if (isVectorType(Arg->getType())) {
+        ParameterAreaSizeBytes =
+            Traits::applyStackAlignment(ParameterAreaSizeBytes);
+      }
+      Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
+      Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
+      StackArgLocations.push_back(
+          Traits::X86OperandMem::create(Func, Ty, esp, Loc));
+      ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
+    }
+  }
+  // Ensure there is enough space for the fstp/movs for floating returns.
+  Variable *Dest = Instr->getDest();
+  const Type DestTy = Dest ? Dest->getType() : IceType_void;
+  if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
+    if (isScalarFloatingType(DestTy)) {
+      ParameterAreaSizeBytes =
+          std::max(static_cast<size_t>(ParameterAreaSizeBytes),
+                   typeWidthInBytesOnStack(DestTy));
+    }
+  }
+  // Adjust the parameter area so that the stack is aligned. It is assumed that
+  // the stack is already aligned at the start of the calling sequence.
+  ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes);
+  assert(ParameterAreaSizeBytes <= maxOutArgsSizeBytes());
+  // Copy arguments that are passed on the stack to the appropriate stack
+  // locations.  We make sure legalize() is called on each argument at this
+  // point, to allow availabilityGet() to work.
+  for (SizeT i = 0, NumStackArgs = StackArgs.size(); i < NumStackArgs; ++i) {
+    lowerStore(
+        InstStore::create(Func, legalize(StackArgs[i]), StackArgLocations[i]));
+  }
+  // Copy arguments to be passed in registers to the appropriate registers.
+  for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
+    XmmArgs[i] = legalizeToReg(legalize(XmmArgs[i]),
+                               Traits::getRegisterForXmmArgNum(
+                                   Traits::getArgIndex(XmmArgIndices[i], i)));
+  }
+  // Materialize moves for arguments passed in GPRs.
+  for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
+    const Type SignatureTy = GprArgs[i].first;
+    Operand *Arg =
+        legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable);
+    GprArgs[i].second = legalizeToReg(
+        Arg, Traits::getRegisterForGprArgNum(
+                 Arg->getType(), Traits::getArgIndex(GprArgIndices[i], i)));
+    assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32);
+    assert(SignatureTy == Arg->getType());
+    (void)SignatureTy;
+  }
+  // Generate a FakeUse of register arguments so that they do not get dead code
+  // eliminated as a result of the FakeKill of scratch registers after the call.
+  // These need to be right before the call instruction.
+  for (auto *Arg : XmmArgs) {
+    Context.insert<InstFakeUse>(llvm::cast<Variable>(Arg));
+  }
+  for (auto &ArgPair : GprArgs) {
+    Context.insert<InstFakeUse>(llvm::cast<Variable>(ArgPair.second));
+  }
+  // Generate the call instruction. Assign its result to a temporary with high
+  // register allocation weight.
+  // ReturnReg doubles as ReturnRegLo as necessary.
+  Variable *ReturnReg = nullptr;
+  Variable *ReturnRegHi = nullptr;
+  if (Dest) {
+    switch (DestTy) {
+    case IceType_NUM:
+    case IceType_void:
+    case IceType_i1:
+    case IceType_i8:
+    case IceType_i16:
+      llvm::report_fatal_error("Invalid Call dest type");
+      break;
+    case IceType_i32:
+      ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_eax);
+      break;
+    case IceType_i64:
+      if (Traits::Is64Bit) {
+        ReturnReg = makeReg(IceType_i64, Traits::getRaxOrDie());
+      } else {
+        ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
+        ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
+      }
+      break;
+    case IceType_f32:
+    case IceType_f64:
+      if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
+        // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with
+        // the fstp instruction.
+        break;
+      }
+    // Fallthrough intended.
+    case IceType_v4i1:
+    case IceType_v8i1:
+    case IceType_v16i1:
+    case IceType_v16i8:
+    case IceType_v8i16:
+    case IceType_v4i32:
+    case IceType_v4f32:
+      ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_xmm0);
+      break;
+    }
+  }
+  // Emit the call to the function.
+  Operand *CallTarget =
+      legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm | Legal_AddrAbs);
+  size_t NumVariadicFpArgs = Instr->isVariadic() ? XmmArgs.size() : 0;
+  Inst *NewCall = emitCallToTarget(CallTarget, ReturnReg, NumVariadicFpArgs);
+  // Keep the upper return register live on 32-bit platforms.
+  if (ReturnRegHi)
+    Context.insert<InstFakeDef>(ReturnRegHi);
+  // Mark the call as killing all the caller-save registers.
+  Context.insert<InstFakeKill>(NewCall);
+  // Handle x86-32 floating point returns.
+  if (Dest != nullptr && isScalarFloatingType(DestTy) &&
+      !Traits::X86_PASS_SCALAR_FP_IN_XMM) {
+    // Special treatment for an FP function which returns its result in st(0).
+    // If Dest ends up being a physical xmm register, the fstp emit code will
+    // route st(0) through the space reserved in the function argument area
+    // we allocated.
+    _fstp(Dest);
+    // Create a fake use of Dest in case it actually isn't used, because st(0)
+    // still needs to be popped.
+    Context.insert<InstFakeUse>(Dest);
+  }
+  // Generate a FakeUse to keep the call live if necessary.
+  if (Instr->hasSideEffects() && ReturnReg) {
+    Context.insert<InstFakeUse>(ReturnReg);
+  }
+  // Process the return value, if any.
+  if (Dest == nullptr)
+    return;
+  // Assign the result of the call to Dest.  Route it through a temporary so
+  // that the local register availability peephole can be subsequently used.
+  Variable *Tmp = nullptr;
+  if (isVectorType(DestTy)) {
+    assert(ReturnReg && "Vector type requires a return register");
+    Tmp = makeReg(DestTy);
+    _movp(Tmp, ReturnReg);
+    _movp(Dest, Tmp);
+  } else if (isScalarFloatingType(DestTy)) {
+    if (Traits::X86_PASS_SCALAR_FP_IN_XMM) {
+      assert(ReturnReg && "FP type requires a return register");
+      _mov(Tmp, ReturnReg);
+      _mov(Dest, Tmp);
+    }
+  } else {
+    assert(isScalarIntegerType(DestTy));
+    assert(ReturnReg && "Integer type requires a return register");
+    if (DestTy == IceType_i64 && !Traits::Is64Bit) {
+      assert(ReturnRegHi && "64-bit type requires two return registers");
+      auto *Dest64On32 = llvm::cast<Variable64On32>(Dest);
+      Variable *DestLo = Dest64On32->getLo();
+      Variable *DestHi = Dest64On32->getHi();
+      _mov(Tmp, ReturnReg);
+      _mov(DestLo, Tmp);
+      Variable *TmpHi = nullptr;
+      _mov(TmpHi, ReturnRegHi);
+      _mov(DestHi, TmpHi);
+    } else {
+      _mov(Tmp, ReturnReg);
+      _mov(Dest, Tmp);
+    }
+  }
+}
+
+void TargetX8632::lowerCast(const InstCast *Instr) {
+  // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
+  InstCast::OpKind CastKind = Instr->getCastKind();
+  Variable *Dest = Instr->getDest();
+  Type DestTy = Dest->getType();
+  switch (CastKind) {
+  default:
+    Func->setError("Cast type not supported");
+    return;
+  case InstCast::Sext: {
+    // Src0RM is the source operand legalized to physical register or memory,
+    // but not immediate, since the relevant x86 native instructions don't
+    // allow an immediate operand. If the operand is an immediate, we could
+    // consider computing the strength-reduced result at translation time, but
+    // we're unlikely to see something like that in the bitcode that the
+    // optimizer wouldn't have already taken care of.
+    Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
+    if (isVectorType(DestTy)) {
+      if (DestTy == IceType_v16i8) {
+        // onemask = materialize(1,1,...); dst = (src & onemask) > 0
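+        // SSE2 has no per-byte shift instructions, so the shift-based sign
+        // extension used below for wider element types is unavailable for
+        // v16i8; instead, mask to the low bit and compare greater-than-zero.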
+        Variable *OneMask = makeVectorOfOnes(DestTy);
+        Variable *T = makeReg(DestTy);
+        _movp(T, Src0RM);
+        _pand(T, OneMask);
+        Variable *Zeros = makeVectorOfZeros(DestTy);
+        _pcmpgt(T, Zeros);
+        _movp(Dest, T);
+      } else {
+        // width = width(elty) - 1; dest = (src << width) >> width
+        SizeT ShiftAmount =
+            Traits::X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) -
+            1;
+        Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
+        Variable *T = makeReg(DestTy);
+        _movp(T, Src0RM);
+        _psll(T, ShiftConstant);
+        _psra(T, ShiftConstant);
+        _movp(Dest, T);
+      }
+    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
+      // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
+      Constant *Shift = Ctx->getConstantInt32(31);
+      auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+      auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      Variable *T_Lo = makeReg(DestLo->getType());
+      if (Src0RM->getType() == IceType_i32) {
+        _mov(T_Lo, Src0RM);
+      } else if (Src0RM->getType() == IceType_i1) {
+        _movzx(T_Lo, Src0RM);
+        _shl(T_Lo, Shift);
+        _sar(T_Lo, Shift);
+      } else {
+        _movsx(T_Lo, Src0RM);
+      }
+      _mov(DestLo, T_Lo);
+      Variable *T_Hi = nullptr;
+      _mov(T_Hi, T_Lo);
+      if (Src0RM->getType() != IceType_i1)
+        // For i1, the sar instruction is already done above.
+        _sar(T_Hi, Shift);
+      _mov(DestHi, T_Hi);
+    } else if (Src0RM->getType() == IceType_i1) {
+      // t1 = src
+      // shl t1, dst_bitwidth - 1
+      // sar t1, dst_bitwidth - 1
+      // dst = t1
+      size_t DestBits = Traits::X86_CHAR_BIT * typeWidthInBytes(DestTy);
+      Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
+      Variable *T = makeReg(DestTy);
+      if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) {
+        _mov(T, Src0RM);
+      } else {
+        // Widen the source using movsx or movzx. (It doesn't matter which one,
+        // since the following shl/sar overwrite the bits.)
+        _movzx(T, Src0RM);
+      }
+      _shl(T, ShiftAmount);
+      _sar(T, ShiftAmount);
+      _mov(Dest, T);
+    } else {
+      // t1 = movsx src; dst = t1
+      Variable *T = makeReg(DestTy);
+      _movsx(T, Src0RM);
+      _mov(Dest, T);
+    }
+    break;
+  }
+  case InstCast::Zext: {
+    Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
+    if (isVectorType(DestTy)) {
+      // onemask = materialize(1,1,...); dest = onemask & src
+      Variable *OneMask = makeVectorOfOnes(DestTy);
+      Variable *T = makeReg(DestTy);
+      _movp(T, Src0RM);
+      _pand(T, OneMask);
+      _movp(Dest, T);
+    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
+      // t1=movzx src; dst.lo=t1; dst.hi=0
+      Constant *Zero = Ctx->getConstantZero(IceType_i32);
+      auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+      auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      Variable *Tmp = makeReg(DestLo->getType());
+      if (Src0RM->getType() == IceType_i32) {
+        _mov(Tmp, Src0RM);
+      } else {
+        _movzx(Tmp, Src0RM);
+      }
+      _mov(DestLo, Tmp);
+      _mov(DestHi, Zero);
+    } else if (Src0RM->getType() == IceType_i1) {
+      // t = Src0RM; Dest = t
+      Variable *T = nullptr;
+      if (DestTy == IceType_i8) {
+        _mov(T, Src0RM);
+      } else {
+        assert(DestTy != IceType_i1);
+        assert(Traits::Is64Bit || DestTy != IceType_i64);
+        // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
+        // In x86-64 we need to widen T to 64 bits to ensure that T, if
+        // written to the stack (i.e., in -Om1), is fully zero-extended.
+        T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32);
+        _movzx(T, Src0RM);
+      }
+      _mov(Dest, T);
+    } else {
+      // t1 = movzx src; dst = t1
+      Variable *T = makeReg(DestTy);
+      _movzx(T, Src0RM);
+      _mov(Dest, T);
+    }
+    break;
+  }
+  case InstCast::Trunc: {
+    if (isVectorType(DestTy)) {
+      // onemask = materialize(1,1,...); dst = src & onemask
+      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
+      Type Src0Ty = Src0RM->getType();
+      Variable *OneMask = makeVectorOfOnes(Src0Ty);
+      Variable *T = makeReg(DestTy);
+      _movp(T, Src0RM);
+      _pand(T, OneMask);
+      _movp(Dest, T);
+    } else if (DestTy == IceType_i1 || DestTy == IceType_i8) {
+      // Make sure we truncate from and into valid registers.
+      Operand *Src0 = legalizeUndef(Instr->getSrc(0));
+      if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
+        Src0 = loOperand(Src0);
+      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      Variable *T = copyToReg8(Src0RM);
+      if (DestTy == IceType_i1)
+        _and(T, Ctx->getConstantInt1(1));
+      _mov(Dest, T);
+    } else {
+      Operand *Src0 = legalizeUndef(Instr->getSrc(0));
+      if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
+        Src0 = loOperand(Src0);
+      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      // t1 = trunc Src0RM; Dest = t1
+      Variable *T = makeReg(DestTy);
+      _mov(T, Src0RM);
+      _mov(Dest, T);
+    }
+    break;
+  }
+  case InstCast::Fptrunc:
+  case InstCast::Fpext: {
+    Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
+    // t1 = cvt Src0RM; Dest = t1
+    Variable *T = makeReg(DestTy);
+    _cvt(T, Src0RM, Insts::Cvt::Float2float);
+    _mov(Dest, T);
+    break;
+  }
+  case InstCast::Fptosi:
+    if (isVectorType(DestTy)) {
+      assert(DestTy == IceType_v4i32);
+      assert(Instr->getSrc(0)->getType() == IceType_v4f32);
+      Operand *Src0R = legalizeToReg(Instr->getSrc(0));
+      Variable *T = makeReg(DestTy);
+      _cvt(T, Src0R, Insts::Cvt::Tps2dq);
+      _movp(Dest, T);
+    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
+      llvm::report_fatal_error("Helper call was expected");
+    } else {
+      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
+      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && DestTy == IceType_i64) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(DestTy != IceType_i64);
+        T_1 = makeReg(IceType_i32);
+      }
+      // cvt() requires its integer argument to be a GPR.
+      Variable *T_2 = makeReg(DestTy);
+      if (isByteSizedType(DestTy)) {
+        assert(T_1->getType() == IceType_i32);
+        T_1->setRegClass(RCX86_Is32To8);
+        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
+      }
+      _cvt(T_1, Src0RM, Insts::Cvt::Tss2si);
+      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
+      if (DestTy == IceType_i1)
+        _and(T_2, Ctx->getConstantInt1(1));
+      _mov(Dest, T_2);
+    }
+    break;
+  case InstCast::Fptoui:
+    if (isVectorType(DestTy)) {
+      llvm::report_fatal_error("Helper call was expected");
+    } else if (DestTy == IceType_i64 ||
+               (!Traits::Is64Bit && DestTy == IceType_i32)) {
+      llvm::report_fatal_error("Helper call was expected");
+    } else {
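+      // There is no SSE scalar float-to-unsigned convert instruction, so the
+      // widest unsigned results go through helper calls above; narrower
+      // results fit safely in a wider signed cvttss2si.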
+      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
+      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
+      assert(DestTy != IceType_i64);
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && DestTy == IceType_i32) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(DestTy != IceType_i32);
+        T_1 = makeReg(IceType_i32);
+      }
+      Variable *T_2 = makeReg(DestTy);
+      if (isByteSizedType(DestTy)) {
+        assert(T_1->getType() == IceType_i32);
+        T_1->setRegClass(RCX86_Is32To8);
+        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
+      }
+      _cvt(T_1, Src0RM, Insts::Cvt::Tss2si);
+      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
+      if (DestTy == IceType_i1)
+        _and(T_2, Ctx->getConstantInt1(1));
+      _mov(Dest, T_2);
+    }
+    break;
+  case InstCast::Sitofp:
+    if (isVectorType(DestTy)) {
+      assert(DestTy == IceType_v4f32);
+      assert(Instr->getSrc(0)->getType() == IceType_v4i32);
+      Operand *Src0R = legalizeToReg(Instr->getSrc(0));
+      Variable *T = makeReg(DestTy);
+      _cvt(T, Src0R, Insts::Cvt::Dq2ps);
+      _movp(Dest, T);
+    } else if (!Traits::Is64Bit && Instr->getSrc(0)->getType() == IceType_i64) {
+      llvm::report_fatal_error("Helper call was expected");
+    } else {
+      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
+      // Sign-extend the operand.
+      // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && Src0RM->getType() == IceType_i64) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(Src0RM->getType() != IceType_i64);
+        T_1 = makeReg(IceType_i32);
+      }
+      Variable *T_2 = makeReg(DestTy);
+      if (Src0RM->getType() == T_1->getType())
+        _mov(T_1, Src0RM);
+      else
+        _movsx(T_1, Src0RM);
+      _cvt(T_2, T_1, Insts::Cvt::Si2ss);
+      _mov(Dest, T_2);
+    }
+    break;
+  case InstCast::Uitofp: {
+    Operand *Src0 = Instr->getSrc(0);
+    if (isVectorType(Src0->getType())) {
+      llvm::report_fatal_error("Helper call was expected");
+    } else if (Src0->getType() == IceType_i64 ||
+               (!Traits::Is64Bit && Src0->getType() == IceType_i32)) {
+      llvm::report_fatal_error("Helper call was expected");
+    } else {
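+      // Likewise there is no unsigned-to-float convert; zero-extend into a
+      // wider signed type and use the signed cvtsi2ss instead.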
+      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      // Zero-extend the operand.
+      // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && Src0RM->getType() == IceType_i32) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(Src0RM->getType() != IceType_i64);
+        assert(Traits::Is64Bit || Src0RM->getType() != IceType_i32);
+        T_1 = makeReg(IceType_i32);
+      }
+      Variable *T_2 = makeReg(DestTy);
+      if (Src0RM->getType() == T_1->getType())
+        _mov(T_1, Src0RM);
+      else
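+        // The zero-extension must be kept even if it looks removable: the
+        // cleared upper bits are what make the signed convert produce the
+        // unsigned value.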
+        _movzx(T_1, Src0RM)->setMustKeep();
+      _cvt(T_2, T_1, Insts::Cvt::Si2ss);
+      _mov(Dest, T_2);
+    }
+    break;
+  }
+  case InstCast::Bitcast: {
+    Operand *Src0 = Instr->getSrc(0);
+    if (DestTy == Src0->getType()) {
+      auto *Assign = InstAssign::create(Func, Dest, Src0);
+      lowerAssign(Assign);
+      return;
+    }
+    switch (DestTy) {
+    default:
+      llvm_unreachable("Unexpected Bitcast dest type");
+    case IceType_i8: {
+      llvm::report_fatal_error("Helper call was expected");
+    } break;
+    case IceType_i16: {
+      llvm::report_fatal_error("Helper call was expected");
+    } break;
+    case IceType_i32:
+    case IceType_f32: {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *T = makeReg(DestTy);
+      _movd(T, Src0R);
+      _mov(Dest, T);
+    } break;
+    case IceType_i64: {
+      assert(Src0->getType() == IceType_f64);
+      if (Traits::Is64Bit) {
+        Variable *Src0R = legalizeToReg(Src0);
+        Variable *T = makeReg(IceType_i64);
+        _movd(T, Src0R);
+        _mov(Dest, T);
+      } else {
+        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+        // a.i64 = bitcast b.f64 ==>
+        //   s.f64 = spill b.f64
+        //   t_lo.i32 = lo(s.f64)
+        //   a_lo.i32 = t_lo.i32
+        //   t_hi.i32 = hi(s.f64)
+        //   a_hi.i32 = t_hi.i32
+        Operand *SpillLo, *SpillHi;
+        if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
+          Variable *Spill = Func->makeVariable(IceType_f64);
+          Spill->setLinkedTo(Src0Var);
+          Spill->setMustNotHaveReg();
+          _movq(Spill, Src0RM);
+          SpillLo = Traits::VariableSplit::create(Func, Spill,
+                                                  Traits::VariableSplit::Low);
+          SpillHi = Traits::VariableSplit::create(Func, Spill,
+                                                  Traits::VariableSplit::High);
+        } else {
+          SpillLo = loOperand(Src0RM);
+          SpillHi = hiOperand(Src0RM);
+        }
+
+        auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+        auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+        Variable *T_Lo = makeReg(IceType_i32);
+        Variable *T_Hi = makeReg(IceType_i32);
+
+        _mov(T_Lo, SpillLo);
+        _mov(DestLo, T_Lo);
+        _mov(T_Hi, SpillHi);
+        _mov(DestHi, T_Hi);
+      }
+    } break;
+    case IceType_f64: {
+      assert(Src0->getType() == IceType_i64);
+      if (Traits::Is64Bit) {
+        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+        Variable *T = makeReg(IceType_f64);
+        _movd(T, Src0RM);
+        _mov(Dest, T);
+      } else {
+        Src0 = legalize(Src0);
+        if (llvm::isa<X86OperandMem>(Src0)) {
+          Variable *T = makeReg(DestTy);
+          _movq(T, Src0);
+          _movq(Dest, T);
+          break;
+        }
+        // a.f64 = bitcast b.i64 ==>
+        //   t_lo.i32 = b_lo.i32
+        //   FakeDef(s.f64)
+        //   lo(s.f64) = t_lo.i32
+        //   t_hi.i32 = b_hi.i32
+        //   hi(s.f64) = t_hi.i32
+        //   a.f64 = s.f64
+        Variable *Spill = Func->makeVariable(IceType_f64);
+        Spill->setLinkedTo(Dest);
+        Spill->setMustNotHaveReg();
+
+        Variable *T_Lo = nullptr, *T_Hi = nullptr;
+        auto *SpillLo = Traits::VariableSplit::create(
+            Func, Spill, Traits::VariableSplit::Low);
+        auto *SpillHi = Traits::VariableSplit::create(
+            Func, Spill, Traits::VariableSplit::High);
+        _mov(T_Lo, loOperand(Src0));
+        // Technically, the Spill is defined after the _store happens, but
+        // SpillLo is considered a "use" of Spill so define Spill before it is
+        // used.
+        Context.insert<InstFakeDef>(Spill);
+        _store(T_Lo, SpillLo);
+        _mov(T_Hi, hiOperand(Src0));
+        _store(T_Hi, SpillHi);
+        _movq(Dest, Spill);
+      }
+    } break;
+    case IceType_v8i1: {
+      llvm::report_fatal_error("Helper call was expected");
+    } break;
+    case IceType_v16i1: {
+      llvm::report_fatal_error("Helper call was expected");
+    } break;
+    case IceType_v8i16:
+    case IceType_v16i8:
+    case IceType_v4i32:
+    case IceType_v4f32: {
+      if (Src0->getType() == IceType_i32) {
+        // Bitcast requires equal type sizes, which isn't strictly the case
+        // between scalars and vectors, but to emulate v4i8 vectors one has to
+        // use v16i8 vectors.
+        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+        Variable *T = makeReg(DestTy);
+        _movd(T, Src0RM);
+        _mov(Dest, T);
+      } else {
+        _movp(Dest, legalizeToReg(Src0));
+      }
+    } break;
+    }
+    break;
+  }
+  }
+}
+
+void TargetX8632::lowerExtractElement(const InstExtractElement *Instr) {
+  Operand *SourceVectNotLegalized = Instr->getSrc(0);
+  auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(1));
+  // Only constant indices are allowed in PNaCl IR.
+  assert(ElementIndex);
+
+  unsigned Index = ElementIndex->getValue();
+  Type Ty = SourceVectNotLegalized->getType();
+  Type ElementTy = typeElementType(Ty);
+  Type InVectorElementTy = Traits::getInVectorElementType(Ty);
+
+  // TODO(wala): Determine the best lowering sequences for each type.
+  bool CanUsePextr = Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
+                     (InstructionSet >= SSE4_1 && Ty != IceType_v4f32);
+  Variable *ExtractedElementR =
+      makeReg(CanUsePextr ? IceType_i32 : InVectorElementTy);
+  if (CanUsePextr) {
+    // Use pextrb, pextrw, or pextrd.  The "b" and "w" versions clear the upper
+    // bits of the destination register, so we represent this by always
+    // extracting into an i32 register.  The _mov into Dest below will do
+    // truncation as necessary.
+    Constant *Mask = Ctx->getConstantInt32(Index);
+    Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized);
+    _pextr(ExtractedElementR, SourceVectR, Mask);
+  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
+    // Use pshufd and movd/movss.
+    Variable *T = nullptr;
+    if (Index) {
+      // The shuffle only needs to occur if the element to be extracted is not
+      // at the lowest index.
+      Constant *Mask = Ctx->getConstantInt32(Index);
+      T = makeReg(Ty);
+      _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
+    } else {
+      T = legalizeToReg(SourceVectNotLegalized);
+    }
+
+    if (InVectorElementTy == IceType_i32) {
+      _movd(ExtractedElementR, T);
+    } else { // Ty == IceType_f32
+      // TODO(wala): _movss is only used here because _mov does not allow a
+      // vector source and a scalar destination.  _mov should be able to be
+      // used here.
+      // _movss is a binary instruction, so the FakeDef is needed to keep the
+      // live range analysis consistent.
+      Context.insert<InstFakeDef>(ExtractedElementR);
+      _movss(ExtractedElementR, T);
+    }
+  } else {
+    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
+    // Spill the value to a stack slot and do the extraction in memory.
+    //
+    // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
+    // for legalizing to mem is implemented.
+    Variable *Slot = Func->makeVariable(Ty);
+    Slot->setMustNotHaveReg();
+    _movp(Slot, legalizeToReg(SourceVectNotLegalized));
+
+    // Compute the location of the element in memory.
+    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
+    X86OperandMem *Loc =
+        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
+    _mov(ExtractedElementR, Loc);
+  }
+
+  if (ElementTy == IceType_i1) {
+    // Truncate extracted integers to i1s if necessary.
+    Variable *T = makeReg(IceType_i1);
+    InstCast *Cast =
+        InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
+    lowerCast(Cast);
+    ExtractedElementR = T;
+  }
+
+  // Copy the element to the destination.
+  Variable *Dest = Instr->getDest();
+  _mov(Dest, ExtractedElementR);
+}
+
+void TargetX8632::lowerFcmp(const InstFcmp *Fcmp) {
+  Variable *Dest = Fcmp->getDest();
+
+  if (isVectorType(Dest->getType())) {
+    lowerFcmpVector(Fcmp);
+  } else {
+    constexpr Inst *Consumer = nullptr;
+    lowerFcmpAndConsumer(Fcmp, Consumer);
+  }
+}
+
+void TargetX8632::lowerFcmpAndConsumer(const InstFcmp *Fcmp,
+                                       const Inst *Consumer) {
+  Operand *Src0 = Fcmp->getSrc(0);
+  Operand *Src1 = Fcmp->getSrc(1);
+  Variable *Dest = Fcmp->getDest();
+
+  if (Consumer != nullptr) {
+    if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
+      if (lowerOptimizeFcmpSelect(Fcmp, Select))
+        return;
+    }
+  }
+
+  if (isVectorType(Dest->getType())) {
+    lowerFcmp(Fcmp);
+    if (Consumer != nullptr)
+      lowerSelectVector(llvm::cast<InstSelect>(Consumer));
+    return;
+  }
+
+  // Lowering a = fcmp cond, b, c
+  //   ucomiss b, c       /* only if C1 != Br_None */
+  //                      /* but swap b,c order if SwapOperands==true */
+  //   mov a, <default>
+  //   j<C1> label        /* only if C1 != Br_None */
+  //   j<C2> label        /* only if C2 != Br_None */
+  //   FakeUse(a)         /* only if C1 != Br_None */
+  //   mov a, !<default>  /* only if C1 != Br_None */
+  //   label:             /* only if C1 != Br_None */
+  //
+  // setcc lowering when C1 != Br_None && C2 == Br_None:
+  //   ucomiss b, c       /* but swap b,c order if SwapOperands==true */
+  //   setcc a, C1
+  InstFcmp::FCond Condition = Fcmp->getCondition();
+  assert(static_cast<size_t>(Condition) < Traits::TableFcmpSize);
+  if (Traits::TableFcmp[Condition].SwapScalarOperands)
+    std::swap(Src0, Src1);
+  const bool HasC1 = (Traits::TableFcmp[Condition].C1 != CondX86::Br_None);
+  const bool HasC2 = (Traits::TableFcmp[Condition].C2 != CondX86::Br_None);
+  if (HasC1) {
+    Src0 = legalize(Src0);
+    Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    Variable *T = nullptr;
+    _mov(T, Src0);
+    _ucomiss(T, Src1RM);
+    if (!HasC2) {
+      assert(Traits::TableFcmp[Condition].Default);
+      setccOrConsumer(Traits::TableFcmp[Condition].C1, Dest, Consumer);
+      return;
+    }
+  }
+  int32_t IntDefault = Traits::TableFcmp[Condition].Default;
+  if (Consumer == nullptr) {
+    Constant *Default = Ctx->getConstantInt(Dest->getType(), IntDefault);
+    _mov(Dest, Default);
+    if (HasC1) {
+      InstX86Label *Label = InstX86Label::create(Func, this);
+      _br(Traits::TableFcmp[Condition].C1, Label);
+      if (HasC2) {
+        _br(Traits::TableFcmp[Condition].C2, Label);
+      }
+      Constant *NonDefault = Ctx->getConstantInt(Dest->getType(), !IntDefault);
+      _redefined(_mov(Dest, NonDefault));
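+      // Marking the mov as a redefinition lets Dest be defined a second
+      // time along this path without tripping Subzero's liveness checks.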
+      Context.insert(Label);
+    }
+    return;
+  }
+  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
+    CfgNode *TrueSucc = Br->getTargetTrue();
+    CfgNode *FalseSucc = Br->getTargetFalse();
+    if (IntDefault != 0)
+      std::swap(TrueSucc, FalseSucc);
+    if (HasC1) {
+      _br(Traits::TableFcmp[Condition].C1, FalseSucc);
+      if (HasC2) {
+        _br(Traits::TableFcmp[Condition].C2, FalseSucc);
+      }
+      _br(TrueSucc);
+      return;
+    }
+    _br(FalseSucc);
+    return;
+  }
+  if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
+    Operand *SrcT = Select->getTrueOperand();
+    Operand *SrcF = Select->getFalseOperand();
+    Variable *SelectDest = Select->getDest();
+    if (IntDefault != 0)
+      std::swap(SrcT, SrcF);
+    lowerMove(SelectDest, SrcF, false);
+    if (HasC1) {
+      InstX86Label *Label = InstX86Label::create(Func, this);
+      _br(Traits::TableFcmp[Condition].C1, Label);
+      if (HasC2) {
+        _br(Traits::TableFcmp[Condition].C2, Label);
+      }
+      static constexpr bool IsRedefinition = true;
+      lowerMove(SelectDest, SrcT, IsRedefinition);
+      Context.insert(Label);
+    }
+    return;
+  }
+  llvm::report_fatal_error("Unexpected consumer type");
+}
+
+void TargetX8632::lowerFcmpVector(const InstFcmp *Fcmp) {
+  Operand *Src0 = Fcmp->getSrc(0);
+  Operand *Src1 = Fcmp->getSrc(1);
+  Variable *Dest = Fcmp->getDest();
+
+  if (!isVectorType(Dest->getType()))
+    llvm::report_fatal_error("Expected vector compare");
+
+  InstFcmp::FCond Condition = Fcmp->getCondition();
+  assert(static_cast<size_t>(Condition) < Traits::TableFcmpSize);
+
+  if (Traits::TableFcmp[Condition].SwapVectorOperands)
+    std::swap(Src0, Src1);
+
+  Variable *T = nullptr;
+
+  if (Condition == InstFcmp::True) {
+    // makeVectorOfMinusOnes() requires an integer vector type.
+    T = makeVectorOfMinusOnes(IceType_v4i32);
+  } else if (Condition == InstFcmp::False) {
+    T = makeVectorOfZeros(Dest->getType());
+  } else {
+    Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    if (llvm::isa<X86OperandMem>(Src1RM))
+      Src1RM = legalizeToReg(Src1RM);
+
+    switch (Condition) {
+    default: {
+      const CmppsCond Predicate = Traits::TableFcmp[Condition].Predicate;
+      assert(Predicate != CondX86::Cmpps_Invalid);
+      T = makeReg(Src0RM->getType());
+      _movp(T, Src0RM);
+      _cmpps(T, Src1RM, Predicate);
+    } break;
+    case InstFcmp::One: {
+      // Check both unequal and ordered.
+      T = makeReg(Src0RM->getType());
+      Variable *T2 = makeReg(Src0RM->getType());
+      _movp(T, Src0RM);
+      _cmpps(T, Src1RM, CondX86::Cmpps_neq);
+      _movp(T2, Src0RM);
+      _cmpps(T2, Src1RM, CondX86::Cmpps_ord);
+      _pand(T, T2);
+    } break;
+    case InstFcmp::Ueq: {
+      // Check both equal or unordered.
+      T = makeReg(Src0RM->getType());
+      Variable *T2 = makeReg(Src0RM->getType());
+      _movp(T, Src0RM);
+      _cmpps(T, Src1RM, CondX86::Cmpps_eq);
+      _movp(T2, Src0RM);
+      _cmpps(T2, Src1RM, CondX86::Cmpps_unord);
+      _por(T, T2);
+    } break;
+    }
+  }
+
+  assert(T != nullptr);
+  _movp(Dest, T);
+  eliminateNextVectorSextInstruction(Dest);
+}
+
+inline bool isZero(const Operand *Opnd) {
+  if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Opnd))
+    return C64->getValue() == 0;
+  if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(Opnd))
+    return C32->getValue() == 0;
+  return false;
+}
+
+void TargetX8632::lowerIcmpAndConsumer(const InstIcmp *Icmp,
+                                       const Inst *Consumer) {
+  Operand *Src0 = legalize(Icmp->getSrc(0));
+  Operand *Src1 = legalize(Icmp->getSrc(1));
+  Variable *Dest = Icmp->getDest();
+
+  if (isVectorType(Dest->getType())) {
+    lowerIcmp(Icmp);
+    if (Consumer != nullptr)
+      lowerSelectVector(llvm::cast<InstSelect>(Consumer));
+    return;
+  }
+
+  if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
+    lowerIcmp64(Icmp, Consumer);
+    return;
+  }
+
+  // cmp b, c
+  if (isZero(Src1)) {
+    switch (Icmp->getCondition()) {
+    default:
+      break;
+    case InstIcmp::Uge:
+      movOrConsumer(true, Dest, Consumer);
+      return;
+    case InstIcmp::Ult:
+      movOrConsumer(false, Dest, Consumer);
+      return;
+    }
+  }
+  Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
+  _cmp(Src0RM, Src1);
+  setccOrConsumer(Traits::getIcmp32Mapping(Icmp->getCondition()), Dest,
+                  Consumer);
+}
+
+void TargetX8632::lowerIcmpVector(const InstIcmp *Icmp) {
+  Operand *Src0 = legalize(Icmp->getSrc(0));
+  Operand *Src1 = legalize(Icmp->getSrc(1));
+  Variable *Dest = Icmp->getDest();
+
+  if (!isVectorType(Dest->getType()))
+    llvm::report_fatal_error("Expected a vector compare");
+
+  Type Ty = Src0->getType();
+  // Promote i1 vectors to 128 bit integer vector types.
+  if (typeElementType(Ty) == IceType_i1) {
+    Type NewTy = IceType_NUM;
+    switch (Ty) {
+    default:
+      llvm::report_fatal_error("unexpected type");
+      break;
+    case IceType_v4i1:
+      NewTy = IceType_v4i32;
+      break;
+    case IceType_v8i1:
+      NewTy = IceType_v8i16;
+      break;
+    case IceType_v16i1:
+      NewTy = IceType_v16i8;
+      break;
+    }
+    Variable *NewSrc0 = Func->makeVariable(NewTy);
+    Variable *NewSrc1 = Func->makeVariable(NewTy);
+    lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
+    lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
+    Src0 = NewSrc0;
+    Src1 = NewSrc1;
+    Ty = NewTy;
+  }
+
+  InstIcmp::ICond Condition = Icmp->getCondition();
+
+  Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+  Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+
+  // SSE2 only has signed comparison operations. Transform unsigned inputs in
+  // a manner that allows for the use of signed comparison operations by
+  // flipping the high order bits.
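+  // Flipping the sign bit is an order-preserving map from the unsigned onto
+  // the signed range: e.g. 0xFFFFFFFF (unsigned max) becomes 0x7FFFFFFF
+  // (signed max).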
+  if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
+      Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
+    Variable *T0 = makeReg(Ty);
+    Variable *T1 = makeReg(Ty);
+    Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
+    _movp(T0, Src0RM);
+    _pxor(T0, HighOrderBits);
+    _movp(T1, Src1RM);
+    _pxor(T1, HighOrderBits);
+    Src0RM = T0;
+    Src1RM = T1;
+  }
+
+  Variable *T = makeReg(Ty);
+  switch (Condition) {
+  default:
+    llvm_unreachable("unexpected condition");
+    break;
+  case InstIcmp::Eq: {
+    if (llvm::isa<X86OperandMem>(Src1RM))
+      Src1RM = legalizeToReg(Src1RM);
+    _movp(T, Src0RM);
+    _pcmpeq(T, Src1RM);
+  } break;
+  case InstIcmp::Ne: {
+    if (llvm::isa<X86OperandMem>(Src1RM))
+      Src1RM = legalizeToReg(Src1RM);
+    _movp(T, Src0RM);
+    _pcmpeq(T, Src1RM);
+    Variable *MinusOne = makeVectorOfMinusOnes(Ty);
+    _pxor(T, MinusOne);
+  } break;
+  case InstIcmp::Ugt:
+  case InstIcmp::Sgt: {
+    if (llvm::isa<X86OperandMem>(Src1RM))
+      Src1RM = legalizeToReg(Src1RM);
+    _movp(T, Src0RM);
+    _pcmpgt(T, Src1RM);
+  } break;
+  case InstIcmp::Uge:
+  case InstIcmp::Sge: {
+    // !(Src1RM > Src0RM)
+    if (llvm::isa<X86OperandMem>(Src0RM))
+      Src0RM = legalizeToReg(Src0RM);
+    _movp(T, Src1RM);
+    _pcmpgt(T, Src0RM);
+    Variable *MinusOne = makeVectorOfMinusOnes(Ty);
+    _pxor(T, MinusOne);
+  } break;
+  case InstIcmp::Ult:
+  case InstIcmp::Slt: {
+    if (llvm::isa<X86OperandMem>(Src0RM))
+      Src0RM = legalizeToReg(Src0RM);
+    _movp(T, Src1RM);
+    _pcmpgt(T, Src0RM);
+  } break;
+  case InstIcmp::Ule:
+  case InstIcmp::Sle: {
+    // !(Src0RM > Src1RM)
+    if (llvm::isa<X86OperandMem>(Src1RM))
+      Src1RM = legalizeToReg(Src1RM);
+    _movp(T, Src0RM);
+    _pcmpgt(T, Src1RM);
+    Variable *MinusOne = makeVectorOfMinusOnes(Ty);
+    _pxor(T, MinusOne);
+  } break;
+  }
+
+  _movp(Dest, T);
+  eliminateNextVectorSextInstruction(Dest);
+}
+
+void TargetX8632::lowerIcmp64(const InstIcmp *Icmp, const Inst *Consumer) {
+  // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
+  Operand *Src0 = legalize(Icmp->getSrc(0));
+  Operand *Src1 = legalize(Icmp->getSrc(1));
+  Variable *Dest = Icmp->getDest();
+  InstIcmp::ICond Condition = Icmp->getCondition();
+  assert(static_cast<size_t>(Condition) < Traits::TableIcmp64Size);
+  Operand *Src0LoRM = nullptr;
+  Operand *Src0HiRM = nullptr;
+  // Legalize the portions of Src0 that are going to be needed.
+  if (isZero(Src1)) {
+    switch (Condition) {
+    default:
+      llvm_unreachable("unexpected condition");
+      break;
+    // These two are not optimized, so we fall through to the general case,
+    // which needs the upper and lower halves legalized.
+    case InstIcmp::Sgt:
+    case InstIcmp::Sle:
+    // These four compare after performing an "or" of the high and low half, so
+    // they need the upper and lower halves legalized.
+    case InstIcmp::Eq:
+    case InstIcmp::Ule:
+    case InstIcmp::Ne:
+    case InstIcmp::Ugt:
+      Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
+    // These two test only the high half's sign bit, so they need only
+    // the upper half legalized.
+    case InstIcmp::Sge:
+    case InstIcmp::Slt:
+      Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
+      break;
+
+    // These two move constants and hence need no legalization.
+    case InstIcmp::Uge:
+    case InstIcmp::Ult:
+      break;
+    }
+  } else {
+    Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
+    Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
+  }
+  // Optimize comparisons with zero.
+  if (isZero(Src1)) {
+    Constant *SignMask = Ctx->getConstantInt32(0x80000000);
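+    // 0x80000000 isolates the sign bit of the high half, so testing it
+    // determines whether the full i64 value is negative.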
+    Variable *Temp = nullptr;
+    switch (Condition) {
+    default:
+      llvm_unreachable("unexpected condition");
+      break;
+    case InstIcmp::Eq:
+    case InstIcmp::Ule:
+      // Mov Src0HiRM first, because it was legalized most recently, and will
+      // sometimes avoid a move before the OR.
+      _mov(Temp, Src0HiRM);
+      _or(Temp, Src0LoRM);
+      Context.insert<InstFakeUse>(Temp);
+      setccOrConsumer(CondX86::Br_e, Dest, Consumer);
+      return;
+    case InstIcmp::Ne:
+    case InstIcmp::Ugt:
+      // Mov Src0HiRM first, because it was legalized most recently, and will
+      // sometimes avoid a move before the OR.
+      _mov(Temp, Src0HiRM);
+      _or(Temp, Src0LoRM);
+      Context.insert<InstFakeUse>(Temp);
+      setccOrConsumer(CondX86::Br_ne, Dest, Consumer);
+      return;
+    case InstIcmp::Uge:
+      movOrConsumer(true, Dest, Consumer);
+      return;
+    case InstIcmp::Ult:
+      movOrConsumer(false, Dest, Consumer);
+      return;
+    case InstIcmp::Sgt:
+      break;
+    case InstIcmp::Sge:
+      _test(Src0HiRM, SignMask);
+      setccOrConsumer(CondX86::Br_e, Dest, Consumer);
+      return;
+    case InstIcmp::Slt:
+      _test(Src0HiRM, SignMask);
+      setccOrConsumer(CondX86::Br_ne, Dest, Consumer);
+      return;
+    case InstIcmp::Sle:
+      break;
+    }
+  }
+  // Handle general compares.
+  Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
+  Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
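+  // The general case compares the high halves first: condition C1 proves the
+  // result true and C2 proves it false; otherwise the low halves decide via
+  // condition C3.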
+  if (Consumer == nullptr) {
+    Constant *Zero = Ctx->getConstantInt(Dest->getType(), 0);
+    Constant *One = Ctx->getConstantInt(Dest->getType(), 1);
+    InstX86Label *LabelFalse = InstX86Label::create(Func, this);
+    InstX86Label *LabelTrue = InstX86Label::create(Func, this);
+    _mov(Dest, One);
+    _cmp(Src0HiRM, Src1HiRI);
+    if (Traits::TableIcmp64[Condition].C1 != CondX86::Br_None)
+      _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
+    if (Traits::TableIcmp64[Condition].C2 != CondX86::Br_None)
+      _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
+    _cmp(Src0LoRM, Src1LoRI);
+    _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
+    Context.insert(LabelFalse);
+    _redefined(_mov(Dest, Zero));
+    Context.insert(LabelTrue);
+    return;
+  }
+  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
+    _cmp(Src0HiRM, Src1HiRI);
+    if (Traits::TableIcmp64[Condition].C1 != CondX86::Br_None)
+      _br(Traits::TableIcmp64[Condition].C1, Br->getTargetTrue());
+    if (Traits::TableIcmp64[Condition].C2 != CondX86::Br_None)
+      _br(Traits::TableIcmp64[Condition].C2, Br->getTargetFalse());
+    _cmp(Src0LoRM, Src1LoRI);
+    _br(Traits::TableIcmp64[Condition].C3, Br->getTargetTrue(),
+        Br->getTargetFalse());
+    return;
+  }
+  if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
+    Operand *SrcT = Select->getTrueOperand();
+    Operand *SrcF = Select->getFalseOperand();
+    Variable *SelectDest = Select->getDest();
+    InstX86Label *LabelFalse = InstX86Label::create(Func, this);
+    InstX86Label *LabelTrue = InstX86Label::create(Func, this);
+    lowerMove(SelectDest, SrcT, false);
+    _cmp(Src0HiRM, Src1HiRI);
+    if (Traits::TableIcmp64[Condition].C1 != CondX86::Br_None)
+      _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
+    if (Traits::TableIcmp64[Condition].C2 != CondX86::Br_None)
+      _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
+    _cmp(Src0LoRM, Src1LoRI);
+    _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
+    Context.insert(LabelFalse);
+    static constexpr bool IsRedefinition = true;
+    lowerMove(SelectDest, SrcF, IsRedefinition);
+    Context.insert(LabelTrue);
+    return;
+  }
+  llvm::report_fatal_error("Unexpected consumer type");
+}
+
+void TargetX8632::setccOrConsumer(BrCond Condition, Variable *Dest,
+                                  const Inst *Consumer) {
+  if (Consumer == nullptr) {
+    _setcc(Dest, Condition);
+    return;
+  }
+  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
+    _br(Condition, Br->getTargetTrue(), Br->getTargetFalse());
+    return;
+  }
+  if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
+    Operand *SrcT = Select->getTrueOperand();
+    Operand *SrcF = Select->getFalseOperand();
+    Variable *SelectDest = Select->getDest();
+    lowerSelectMove(SelectDest, Condition, SrcT, SrcF);
+    return;
+  }
+  llvm::report_fatal_error("Unexpected consumer type");
+}
+
+void TargetX8632::movOrConsumer(bool IcmpResult, Variable *Dest,
+                                const Inst *Consumer) {
+  if (Consumer == nullptr) {
+    _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
+    return;
+  }
+  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
+    // TODO(sehr,stichnot): This could be done with a single unconditional
+    // branch instruction, but subzero doesn't know how to handle the resulting
+    // control flow graph changes now.  Make it do so to eliminate mov and cmp.
+    _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
+    _cmp(Dest, Ctx->getConstantInt(Dest->getType(), 0));
+    _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
+    return;
+  }
+  if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
+    Operand *Src = nullptr;
+    if (IcmpResult) {
+      Src = legalize(Select->getTrueOperand(), Legal_Reg | Legal_Imm);
+    } else {
+      Src = legalize(Select->getFalseOperand(), Legal_Reg | Legal_Imm);
+    }
+    Variable *SelectDest = Select->getDest();
+    lowerMove(SelectDest, Src, false);
+    return;
+  }
+  llvm::report_fatal_error("Unexpected consumer type");
+}
+
+void TargetX8632::lowerArithAndConsumer(const InstArithmetic *Arith,
+                                        const Inst *Consumer) {
+  Variable *T = nullptr;
+  Operand *Src0 = legalize(Arith->getSrc(0));
+  Operand *Src1 = legalize(Arith->getSrc(1));
+  Variable *Dest = Arith->getDest();
+  switch (Arith->getOp()) {
+  default:
+    llvm_unreachable("arithmetic operator not AND or OR");
+    break;
+  case InstArithmetic::And:
+    _mov(T, Src0);
+    // Test cannot have an address in the second position.  Since T is
+    // guaranteed to be a register and Src1 could be a memory load, ensure
+    // that the second argument is a register.
+    if (llvm::isa<Constant>(Src1))
+      _test(T, Src1);
+    else
+      _test(Src1, T);
+    break;
+  case InstArithmetic::Or:
+    _mov(T, Src0);
+    _or(T, Src1);
+    break;
+  }
+
+  if (Consumer == nullptr) {
+    llvm::report_fatal_error("Expected a consumer instruction");
+  }
+  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
+    Context.insert<InstFakeUse>(T);
+    Context.insert<InstFakeDef>(Dest);
+    _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
+    return;
+  }
+  llvm::report_fatal_error("Unexpected consumer type");
+}
+
+void TargetX8632::lowerInsertElement(const InstInsertElement *Instr) {
+  Operand *SourceVectNotLegalized = Instr->getSrc(0);
+  Operand *ElementToInsertNotLegalized = Instr->getSrc(1);
+  auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(2));
+  // Only constant indices are allowed in PNaCl IR.
+  assert(ElementIndex);
+  unsigned Index = ElementIndex->getValue();
+  assert(Index < typeNumElements(SourceVectNotLegalized->getType()));
+
+  Type Ty = SourceVectNotLegalized->getType();
+  Type ElementTy = typeElementType(Ty);
+  Type InVectorElementTy = Traits::getInVectorElementType(Ty);
+
+  if (ElementTy == IceType_i1) {
+    // Expand the element to the appropriate size for it to be inserted in the
+    // vector.
+    Variable *Expanded = Func->makeVariable(InVectorElementTy);
+    auto *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
+                                  ElementToInsertNotLegalized);
+    lowerCast(Cast);
+    ElementToInsertNotLegalized = Expanded;
+  }
+
+  if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) {
+    // Use insertps, pinsrb, pinsrw, or pinsrd.
+    Operand *ElementRM =
+        legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
+    Operand *SourceVectRM =
+        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
+    Variable *T = makeReg(Ty);
+    _movp(T, SourceVectRM);
+    if (Ty == IceType_v4f32) {
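+      // The insertps immediate selects the destination element in bits
+      // [5:4] (bits [7:6] pick the source element and [3:0] form a zero
+      // mask), hence Index << 4.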
+      _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
+    } else {
+      // For the pinsrb and pinsrw instructions, when the source operand is a
+      // register, it must be a full r32 register like eax, and not ax/al/ah.
+      // For filetype=asm, InstX86Pinsr::emit() compensates for the use of
+      // r16 and r8 by converting them through getBaseReg(), while emitIAS()
+      // validates that the original and base register encodings are the same.
+      if (ElementRM->getType() == IceType_i8 &&
+          llvm::isa<Variable>(ElementRM)) {
+        // Don't use ah/bh/ch/dh for pinsrb.
+        ElementRM = copyToReg8(ElementRM);
+      }
+      _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
+    }
+    _movp(Instr->getDest(), T);
+  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
+    // Use shufps or movss.
+    Variable *ElementR = nullptr;
+    Operand *SourceVectRM =
+        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
+
+    if (InVectorElementTy == IceType_f32) {
+      // ElementR will be in an XMM register since it is floating point.
+      ElementR = legalizeToReg(ElementToInsertNotLegalized);
+    } else {
+      // Copy an integer to an XMM register.
+      Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
+      ElementR = makeReg(Ty);
+      _movd(ElementR, T);
+    }
+
+    if (Index == 0) {
+      Variable *T = makeReg(Ty);
+      _movp(T, SourceVectRM);
+      _movss(T, ElementR);
+      _movp(Instr->getDest(), T);
+      return;
+    }
+
+    // shufps treats the source and destination operands as vectors of four
+    // doublewords. The destination's two high doublewords are selected from
+    // the source operand and the two low doublewords are selected from (the
+    // original value of) the destination operand. An insertelement operation
+    // can be effected with a sequence of two shufps operations with
+    // appropriate masks. In all cases below, Element[0] is being inserted into
+    // SourceVectOperand. Indices are ordered from left to right.
+    //
+    // insertelement into index 1 (result is stored in ElementR):
+    //   ElementR := ElementR[0, 0] SourceVectRM[0, 0]
+    //   ElementR := ElementR[3, 0] SourceVectRM[2, 3]
+    //
+    // insertelement into index 2 (result is stored in T):
+    //   T := SourceVectRM
+    //   ElementR := ElementR[0, 0] T[0, 3]
+    //   T := T[0, 1] ElementR[0, 3]
+    //
+    // insertelement into index 3 (result is stored in T):
+    //   T := SourceVectRM
+    //   ElementR := ElementR[0, 0] T[0, 2]
+    //   T := T[0, 1] ElementR[3, 0]
+    const unsigned char Mask1[3] = {0, 192, 128};
+    const unsigned char Mask2[3] = {227, 196, 52};
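+    // Each mask byte packs four 2-bit element selectors, low field first:
+    // e.g. 192 = 0b11000000 encodes selectors {0, 0, 0, 3}, matching the
+    // pseudo-shuffles above.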
+
+    Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
+    Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);
+
+    if (Index == 1) {
+      _shufps(ElementR, SourceVectRM, Mask1Constant);
+      _shufps(ElementR, SourceVectRM, Mask2Constant);
+      _movp(Instr->getDest(), ElementR);
+    } else {
+      Variable *T = makeReg(Ty);
+      _movp(T, SourceVectRM);
+      _shufps(ElementR, T, Mask1Constant);
+      _shufps(T, ElementR, Mask2Constant);
+      _movp(Instr->getDest(), T);
+    }
+  } else {
+    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
+    // Spill the value to a stack slot and perform the insertion in memory.
+    //
+    // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
+    // for legalizing to mem is implemented.
+    Variable *Slot = Func->makeVariable(Ty);
+    Slot->setMustNotHaveReg();
+    _movp(Slot, legalizeToReg(SourceVectNotLegalized));
+
+    // Compute the location of the position to insert in memory.
+    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
+    X86OperandMem *Loc =
+        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
+    _store(legalizeToReg(ElementToInsertNotLegalized), Loc);
+
+    Variable *T = makeReg(Ty);
+    _movp(T, Slot);
+    _movp(Instr->getDest(), T);
+  }
+}
+
+void TargetX8632::lowerIntrinsic(const InstIntrinsic *Instr) {
+  switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicID()) {
+  case Intrinsics::AtomicCmpxchg: {
+    if (!Intrinsics::isMemoryOrderValid(
+            ID, getConstantMemoryOrder(Instr->getArg(3)),
+            getConstantMemoryOrder(Instr->getArg(4)))) {
+      Func->setError("Unexpected memory ordering for AtomicCmpxchg");
+      return;
+    }
+    Variable *DestPrev = Instr->getDest();
+    Operand *PtrToMem = legalize(Instr->getArg(0));
+    Operand *Expected = legalize(Instr->getArg(1));
+    Operand *Desired = legalize(Instr->getArg(2));
+    if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
+      return;
+    lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
+    return;
+  }
+  case Intrinsics::AtomicFence:
+    if (!Intrinsics::isMemoryOrderValid(
+            ID, getConstantMemoryOrder(Instr->getArg(0)))) {
+      Func->setError("Unexpected memory ordering for AtomicFence");
+      return;
+    }
+    _mfence();
+    return;
+  case Intrinsics::AtomicFenceAll:
+    // NOTE: FenceAll should prevent any load/store from being moved across
+    // the fence (both atomic and non-atomic). The InstX86Mfence instruction
+    // is currently marked coarsely as "HasSideEffects".
+    _mfence();
+    return;
+  case Intrinsics::AtomicIsLockFree: {
+    // X86 is always lock free for 8/16/32/64 bit accesses.
+    // TODO(jvoung): Since the result is constant when given a constant byte
+    // size, this opens up DCE opportunities.
+    Operand *ByteSize = Instr->getArg(0);
+    Variable *Dest = Instr->getDest();
+    if (auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
+      Constant *Result;
+      switch (CI->getValue()) {
+      default:
+        // Some x86-64 processors support the cmpxchg16b instruction, which can
+        // make 16-byte operations lock free (when used with the LOCK prefix).
+        // However, that's not supported in 32-bit mode, so just return 0 even
+        // for large sizes.
+        Result = Ctx->getConstantZero(IceType_i32);
+        break;
+      case 1:
+      case 2:
+      case 4:
+      case 8:
+        Result = Ctx->getConstantInt32(1);
+        break;
+      }
+      _mov(Dest, Result);
+      return;
+    }
+    // The PNaCl ABI requires the byte size to be a compile-time constant.
+    Func->setError("AtomicIsLockFree byte size should be compile-time const");
+    return;
+  }
+  case Intrinsics::AtomicLoad: {
+    // We require the memory address to be naturally aligned. Given that is the
+    // case, then normal loads are atomic.
+    if (!Intrinsics::isMemoryOrderValid(
+            ID, getConstantMemoryOrder(Instr->getArg(1)))) {
+      Func->setError("Unexpected memory ordering for AtomicLoad");
+      return;
+    }
+    Variable *Dest = Instr->getDest();
+    if (!Traits::Is64Bit) {
+      if (auto *Dest64On32 = llvm::dyn_cast<Variable64On32>(Dest)) {
+        // Follow what GCC does and use a movq instead of what lowerLoad()
+        // normally does (split the load into two). Thus, this skips
+        // load/arithmetic op folding. Load/arithmetic folding can't happen
+        // anyway, since this is x86-32 and integer arithmetic only happens on
+        // 32-bit quantities.
+        Variable *T = makeReg(IceType_f64);
+        X86OperandMem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64);
+        _movq(T, Addr);
+        // Then cast the bits back out of the XMM register to the i64 Dest.
+        auto *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
+        lowerCast(Cast);
+        // Make sure that the atomic load isn't elided when unused.
+        Context.insert<InstFakeUse>(Dest64On32->getLo());
+        Context.insert<InstFakeUse>(Dest64On32->getHi());
+        return;
+      }
+    }
+    auto *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
+    lowerLoad(Load);
+    // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
+    // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert
+    // the FakeUse on the last-inserted instruction's dest.
+    Context.insert<InstFakeUse>(Context.getLastInserted()->getDest());
+    return;
+  }
+  case Intrinsics::AtomicRMW:
+    if (!Intrinsics::isMemoryOrderValid(
+            ID, getConstantMemoryOrder(Instr->getArg(3)))) {
+      Func->setError("Unexpected memory ordering for AtomicRMW");
+      return;
+    }
+    lowerAtomicRMW(
+        Instr->getDest(),
+        static_cast<uint32_t>(
+            llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
+        Instr->getArg(1), Instr->getArg(2));
+    return;
+  case Intrinsics::AtomicStore: {
+    if (!Intrinsics::isMemoryOrderValid(
+            ID, getConstantMemoryOrder(Instr->getArg(2)))) {
+      Func->setError("Unexpected memory ordering for AtomicStore");
+      return;
+    }
+    // We require the memory address to be naturally aligned. Given that is the
+    // case, then normal stores are atomic. Add a fence after the store to make
+    // it visible.
+    Operand *Value = Instr->getArg(0);
+    Operand *Ptr = Instr->getArg(1);
+    if (!Traits::Is64Bit && Value->getType() == IceType_i64) {
+      // Use a movq instead of what lowerStore() normally does (split the
+      // store into two), following what GCC does. Cast the bits from i64 into
+      // an XMM register first.
+      Variable *T = makeReg(IceType_f64);
+      auto *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value);
+      lowerCast(Cast);
+      // Then store XMM w/ a movq.
+      X86OperandMem *Addr = formMemoryOperand(Ptr, IceType_f64);
+      _storeq(T, Addr);
+      _mfence();
+      return;
+    }
+    auto *Store = InstStore::create(Func, Value, Ptr);
+    lowerStore(Store);
+    _mfence();
+    return;
+  }
+  case Intrinsics::Bswap: {
+    Variable *Dest = Instr->getDest();
+    Operand *Val = Instr->getArg(0);
+    // In 32-bit mode, bswap only works on 32-bit arguments, and the argument
+    // must be a register. Use rotate left for 16-bit bswap.
+    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
+      Val = legalizeUndef(Val);
+      Variable *T_Lo = legalizeToReg(loOperand(Val));
+      Variable *T_Hi = legalizeToReg(hiOperand(Val));
+      auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+      auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
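+      // A 64-bit bswap is each 32-bit half byte-swapped, with the halves
+      // then exchanged.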
+      _bswap(T_Lo);
+      _bswap(T_Hi);
+      _mov(DestLo, T_Hi);
+      _mov(DestHi, T_Lo);
+    } else if ((Traits::Is64Bit && Val->getType() == IceType_i64) ||
+               Val->getType() == IceType_i32) {
+      Variable *T = legalizeToReg(Val);
+      _bswap(T);
+      _mov(Dest, T);
+    } else {
+      assert(Val->getType() == IceType_i16);
+      Constant *Eight = Ctx->getConstantInt16(8);
+      Variable *T = nullptr;
+      Val = legalize(Val);
+      _mov(T, Val);
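+      // rol by 8 swaps the two bytes of the 16-bit value: 0xAABB -> 0xBBAA.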
+      _rol(T, Eight);
+      _mov(Dest, T);
+    }
+    return;
+  }
+  case Intrinsics::Ctpop: {
+    Variable *Dest = Instr->getDest();
+    Variable *T = nullptr;
+    Operand *Val = Instr->getArg(0);
+    Type ValTy = Val->getType();
+    assert(ValTy == IceType_i32 || ValTy == IceType_i64);
+
+    if (!Traits::Is64Bit) {
+      T = Dest;
+    } else {
+      T = makeReg(IceType_i64);
+      if (ValTy == IceType_i32) {
+        // In x86-64, __popcountsi2 is not defined, so we cheat a bit by
+        // converting the value to 64 bits and using ctpop_i64. _movzx ensures
+        // that no bits are set in Val's upper 32 bits.
+        Variable *V = makeReg(IceType_i64);
+        Operand *ValRM = legalize(Val, Legal_Reg | Legal_Mem);
+        _movzx(V, ValRM);
+        Val = V;
+      }
+      ValTy = IceType_i64;
+    }
+
+    InstCall *Call =
+        makeHelperCall(ValTy == IceType_i32 ? RuntimeHelper::H_call_ctpop_i32
+                                            : RuntimeHelper::H_call_ctpop_i64,
+                       T, 1);
+    Call->addArg(Val);
+    lowerCall(Call);
+    // The popcount helpers always return 32-bit values, while the intrinsic's
+    // signature matches the native POPCNT instruction and fills a 64-bit reg
+    // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
+    // the user doesn't do that in the IR. If the user does that in the IR,
+    // then this zeroing instruction is dead and gets optimized out.
+    if (!Traits::Is64Bit) {
+      assert(T == Dest);
+      if (Val->getType() == IceType_i64) {
+        auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+        Constant *Zero = Ctx->getConstantZero(IceType_i32);
+        _mov(DestHi, Zero);
+      }
+    } else {
+      assert(Val->getType() == IceType_i64);
+      // T is 64 bit. It needs to be copied to dest. We need to:
+      //
+      // T_1.32 = trunc T.64 to i32
+      // T_2.64 = zext T_1.32 to i64
+      // Dest.<<right_size>> = T_2.<<right_size>>
+      //
+      // which ensures the upper 32 bits will always be cleared. Just doing a
+      //
+      // mov Dest.32 = trunc T.32 to i32
+      //
+      // is dangerous because there's a chance the compiler will optimize this
+      // copy out. To use _movzx we need two new registers (one 32-, and
+      // another 64-bit wide.)
+      Variable *T_1 = makeReg(IceType_i32);
+      _mov(T_1, T);
+      Variable *T_2 = makeReg(IceType_i64);
+      _movzx(T_2, T_1);
+      _mov(Dest, T_2);
+    }
+    return;
+  }
+  case Intrinsics::Ctlz: {
+    // The "is zero undef" parameter is ignored and we always return a
+    // well-defined value.
+    Operand *Val = legalize(Instr->getArg(0));
+    Operand *FirstVal;
+    Operand *SecondVal = nullptr;
+    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
+      FirstVal = loOperand(Val);
+      SecondVal = hiOperand(Val);
+    } else {
+      FirstVal = Val;
+    }
+    constexpr bool IsCttz = false;
+    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
+                    SecondVal);
+    return;
+  }
+  case Intrinsics::Cttz: {
+    // The "is zero undef" parameter is ignored and we always return a
+    // well-defined value.
+    Operand *Val = legalize(Instr->getArg(0));
+    Operand *FirstVal;
+    Operand *SecondVal = nullptr;
+    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
+      FirstVal = hiOperand(Val);
+      SecondVal = loOperand(Val);
+    } else {
+      FirstVal = Val;
+    }
+    constexpr bool IsCttz = true;
+    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
+                    SecondVal);
+    return;
+  }
+  case Intrinsics::Fabs: {
+    Operand *Src = legalize(Instr->getArg(0));
+    Type Ty = Src->getType();
+    Variable *Dest = Instr->getDest();
+    Variable *T = makeVectorOfFabsMask(Ty);
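+    // Fabs is implemented by clearing each lane's sign bit; the mask from
+    // makeVectorOfFabsMask presumably has every bit set except the sign
+    // bits, so the pand below preserves the magnitude.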
+    // The pand instruction operates on an m128 memory operand, so if Src is an
+    // f32 or f64, we need to make sure it's in a register.
+    if (isVectorType(Ty)) {
+      if (llvm::isa<X86OperandMem>(Src))
+        Src = legalizeToReg(Src);
+    } else {
+      Src = legalizeToReg(Src);
+    }
+    _pand(T, Src);
+    if (isVectorType(Ty))
+      _movp(Dest, T);
+    else
+      _mov(Dest, T);
+    return;
+  }
+  case Intrinsics::Longjmp: {
+    InstCall *Call = makeHelperCall(RuntimeHelper::H_call_longjmp, nullptr, 2);
+    Call->addArg(Instr->getArg(0));
+    Call->addArg(Instr->getArg(1));
+    lowerCall(Call);
+    return;
+  }
+  case Intrinsics::Memcpy: {
+    lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
+    return;
+  }
+  case Intrinsics::Memmove: {
+    lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
+    return;
+  }
+  case Intrinsics::Memset: {
+    lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
+    return;
+  }
+  case Intrinsics::Setjmp: {
+    InstCall *Call =
+        makeHelperCall(RuntimeHelper::H_call_setjmp, Instr->getDest(), 1);
+    Call->addArg(Instr->getArg(0));
+    lowerCall(Call);
+    return;
+  }
+  case Intrinsics::Sqrt: {
+    Operand *Src = legalize(Instr->getArg(0));
+    Variable *Dest = Instr->getDest();
+    Variable *T = makeReg(Dest->getType());
+    _sqrt(T, Src);
+    if (isVectorType(Dest->getType())) {
+      _movp(Dest, T);
+    } else {
+      _mov(Dest, T);
+    }
+    return;
+  }
+  case Intrinsics::Stacksave: {
+    Variable *esp =
+        Func->getTarget()->getPhysicalRegister(getStackReg(), Traits::WordType);
+    Variable *Dest = Instr->getDest();
+    _mov(Dest, esp);
+    return;
+  }
+  case Intrinsics::Stackrestore: {
+    Operand *Src = Instr->getArg(0);
+    _mov_sp(Src);
+    return;
+  }
+
+  case Intrinsics::Trap:
+    _ud2();
+    return;
+  case Intrinsics::LoadSubVector: {
+    assert(llvm::isa<ConstantInteger32>(Instr->getArg(1)) &&
+           "LoadSubVector second argument must be a constant");
+    Variable *Dest = Instr->getDest();
+    Type Ty = Dest->getType();
+    auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(1));
+    Operand *Addr = Instr->getArg(0);
+    X86OperandMem *Src = formMemoryOperand(Addr, Ty);
+    doMockBoundsCheck(Src);
+
+    if (Dest->isRematerializable()) {
+      Context.insert<InstFakeDef>(Dest);
+      return;
+    }
+
+    auto *T = makeReg(Ty);
+    switch (SubVectorSize->getValue()) {
+    case 4:
+      _movd(T, Src);
+      break;
+    case 8:
+      _movq(T, Src);
+      break;
+    default:
+      Func->setError("Unexpected size for LoadSubVector");
+      return;
+    }
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::StoreSubVector: {
+    assert(llvm::isa<ConstantInteger32>(Instr->getArg(2)) &&
+           "StoreSubVector third argument must be a constant");
+    auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(2));
+    Operand *Value = Instr->getArg(0);
+    Operand *Addr = Instr->getArg(1);
+    X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
+    doMockBoundsCheck(NewAddr);
+
+    Value = legalizeToReg(Value);
+
+    switch (SubVectorSize->getValue()) {
+    case 4:
+      _stored(Value, NewAddr);
+      break;
+    case 8:
+      _storeq(Value, NewAddr);
+      break;
+    default:
+      Func->setError("Unexpected size for StoreSubVector");
+      return;
+    }
+    return;
+  }
+  case Intrinsics::VectorPackSigned: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Src0->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _packss(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::VectorPackUnsigned: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Src0->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _packus(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::SignMask: {
+    Operand *SrcReg = legalizeToReg(Instr->getArg(0));
+    Variable *Dest = Instr->getDest();
+    Variable *T = makeReg(IceType_i32);
+    if (SrcReg->getType() == IceType_v4f32 ||
+        SrcReg->getType() == IceType_v4i32 ||
+        SrcReg->getType() == IceType_v16i8) {
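+      // movmskps/pmovmskb gather the most significant bit of each lane into
+      // the low bits of a GPR.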
+      _movmsk(T, SrcReg);
+    } else {
+      // TODO(capn): We could implement v8i16 sign mask using packsswb/pmovmskb
+      llvm::report_fatal_error("Invalid type for SignMask intrinsic");
+    }
+    _mov(Dest, T);
+    return;
+  }
+  case Intrinsics::MultiplyHighSigned: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Dest->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _pmulhw(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::MultiplyHighUnsigned: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Dest->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _pmulhuw(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::MultiplyAddPairs: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Dest->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _pmaddwd(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::AddSaturateSigned: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Dest->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _padds(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::SubtractSaturateSigned: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Dest->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _psubs(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::AddSaturateUnsigned: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Dest->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _paddus(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::SubtractSaturateUnsigned: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Dest->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _psubus(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::Nearbyint: {
+    Operand *Src = Instr->getArg(0);
+    Variable *Dest = Instr->getDest();
+    Type DestTy = Dest->getType();
+    if (isVectorType(DestTy)) {
+      assert(DestTy == IceType_v4i32);
+      assert(Src->getType() == IceType_v4f32);
+      Operand *Src0R = legalizeToReg(Src);
+      Variable *T = makeReg(DestTy);
+      _cvt(T, Src0R, Insts::Cvt::Ps2dq);
+      _movp(Dest, T);
+    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
+      llvm::report_fatal_error("Helper call was expected");
+    } else {
+      Operand *Src0RM = legalize(Src, Legal_Reg | Legal_Mem);
+      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && DestTy == IceType_i64) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(DestTy != IceType_i64);
+        T_1 = makeReg(IceType_i32);
+      }
+      // cvt() requires its integer argument to be a GPR.
+      Variable *T_2 = makeReg(DestTy);
+      if (isByteSizedType(DestTy)) {
+        assert(T_1->getType() == IceType_i32);
+        T_1->setRegClass(RCX86_Is32To8);
+        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
+      }
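+      // The conversion rounds according to the current MXCSR rounding mode
+      // (round-to-nearest-even by default), giving nearbyint semantics.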
+      _cvt(T_1, Src0RM, Insts::Cvt::Ss2si);
+      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
+      if (DestTy == IceType_i1)
+        _and(T_2, Ctx->getConstantInt1(1));
+      _mov(Dest, T_2);
+    }
+    return;
+  }
+  case Intrinsics::Round: {
+    assert(InstructionSet >= SSE4_1);
+    Variable *Dest = Instr->getDest();
+    Operand *Src = Instr->getArg(0);
+    Operand *Mode = Instr->getArg(1);
+    assert(llvm::isa<ConstantInteger32>(Mode) &&
+           "Round last argument must be a constant");
+    auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
+    int32_t Imm = llvm::cast<ConstantInteger32>(Mode)->getValue();
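+    // The (void) cast keeps Imm "used" when asserts are compiled out.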
+    (void)Imm;
+    assert(Imm >= 0 && Imm < 4 && "Invalid rounding mode");
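+    // Immediates 0-3 select round-to-nearest-even, down, up, and truncate,
+    // per the SSE4.1 round instructions.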
+    auto *T = makeReg(Dest->getType());
+    _round(T, SrcRM, Mode);
+    _movp(Dest, T);
+    return;
+  }
+  default: // UnknownIntrinsic
+    Func->setError("Unexpected intrinsic");
+    return;
+  }
+  return;
+}
+
+void TargetX8632::lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr,
+                                     Operand *Expected, Operand *Desired) {
+  Type Ty = Expected->getType();
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
+    // Reserve the pre-colored registers first, before adding any more
+    // infinite-weight variables from formMemoryOperand's legalization.
+    Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
+    Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
+    Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
+    Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
+    _mov(T_eax, loOperand(Expected));
+    _mov(T_edx, hiOperand(Expected));
+    _mov(T_ebx, loOperand(Desired));
+    _mov(T_ecx, hiOperand(Desired));
+    X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
+    constexpr bool Locked = true;
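+    // cmpxchg8b compares edx:eax with the m64 operand; on a match it stores
+    // ecx:ebx, otherwise it loads the memory value into edx:eax. Either way
+    // eax/edx hold the previous memory contents afterwards.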
+    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
+    auto *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
+    auto *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
+    _mov(DestLo, T_eax);
+    _mov(DestHi, T_edx);
+    return;
+  }
+  RegNumT Eax;
+  switch (Ty) {
+  default:
+    llvm::report_fatal_error("Bad type for cmpxchg");
+  case IceType_i64:
+    Eax = Traits::getRaxOrDie();
+    break;
+  case IceType_i32:
+    Eax = Traits::RegisterSet::Reg_eax;
+    break;
+  case IceType_i16:
+    Eax = Traits::RegisterSet::Reg_ax;
+    break;
+  case IceType_i8:
+    Eax = Traits::RegisterSet::Reg_al;
+    break;
+  }
+  Variable *T_eax = makeReg(Ty, Eax);
+  _mov(T_eax, Expected);
+  X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
+  Variable *DesiredReg = legalizeToReg(Desired);
+  constexpr bool Locked = true;
+  _cmpxchg(Addr, T_eax, DesiredReg, Locked);
+  _mov(DestPrev, T_eax);
+}
+
+bool TargetX8632::tryOptimizedCmpxchgCmpBr(Variable *Dest, Operand *PtrToMem,
+                                           Operand *Expected,
+                                           Operand *Desired) {
+  if (Func->getOptLevel() == Opt_m1)
+    return false;
+  // Peek ahead a few instructions and see how Dest is used.
+  // It's very common to have:
+  //
+  // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
+  // [%y_phi = ...] // list of phi stores
+  // %p = icmp eq i32 %x, %expected
+  // br i1 %p, label %l1, label %l2
+  //
+  // which we can optimize into:
+  //
+  // %x = <cmpxchg code>
+  // [%y_phi = ...] // list of phi stores
+  // br eq, %l1, %l2
+  InstList::iterator I = Context.getCur();
+  // I is currently the InstIntrinsic. Peek past that.
+  // This assumes that the atomic cmpxchg has not been lowered yet, so that
+  // the instructions seen in the scan from "Cur" are simple.
+  assert(llvm::isa<InstIntrinsic>(*I));
+  Inst *NextInst = Context.getNextInst(I);
+  if (!NextInst)
+    return false;
+  // There might be phi assignments right before the compare+branch, since this
+  // could be a backward branch for a loop. This placement of assignments is
+  // determined by placePhiStores().
+  CfgVector<InstAssign *> PhiAssigns;
+  while (auto *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
+    if (PhiAssign->getDest() == Dest)
+      return false;
+    PhiAssigns.push_back(PhiAssign);
+    NextInst = Context.getNextInst(I);
+    if (!NextInst)
+      return false;
+  }
+  if (auto *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
+    if (!(NextCmp->getCondition() == InstIcmp::Eq &&
+          ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
+           (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
+      return false;
+    }
+    NextInst = Context.getNextInst(I);
+    if (!NextInst)
+      return false;
+    if (auto *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
+      if (!NextBr->isUnconditional() &&
+          NextCmp->getDest() == NextBr->getCondition() &&
+          NextBr->isLastUse(NextCmp->getDest())) {
+        lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
+        for (size_t i = 0; i < PhiAssigns.size(); ++i) {
+          // Lower the phi assignments now, before the branch (same placement
+          // as before).
+          InstAssign *PhiAssign = PhiAssigns[i];
+          PhiAssign->setDeleted();
+          lowerAssign(PhiAssign);
+          Context.advanceNext();
+        }
+        _br(CondX86::Br_e, NextBr->getTargetTrue(), NextBr->getTargetFalse());
+        // Skip over the old compare and branch, by deleting them.
+        NextCmp->setDeleted();
+        NextBr->setDeleted();
+        Context.advanceNext();
+        Context.advanceNext();
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+void TargetX8632::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
+                                 Operand *Ptr, Operand *Val) {
+  bool NeedsCmpxchg = false;
+  LowerBinOp Op_Lo = nullptr;
+  LowerBinOp Op_Hi = nullptr;
+  switch (Operation) {
+  default:
+    Func->setError("Unknown AtomicRMW operation");
+    return;
+  case Intrinsics::AtomicAdd: {
+    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+      // All the fall-through paths must set this to true; it is used only
+      // for asserting.
+      NeedsCmpxchg = true;
+      Op_Lo = &TargetX8632::_add;
+      Op_Hi = &TargetX8632::_adc;
+      break;
+    }
+    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
+    constexpr bool Locked = true;
+    Variable *T = nullptr;
+    _mov(T, Val);
+    _xadd(Addr, T, Locked);
+    _mov(Dest, T);
+    return;
+  }
+  case Intrinsics::AtomicSub: {
+    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+      NeedsCmpxchg = true;
+      Op_Lo = &TargetX8632::_sub;
+      Op_Hi = &TargetX8632::_sbb;
+      break;
+    }
+    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
+    constexpr bool Locked = true;
+    Variable *T = nullptr;
+    _mov(T, Val);
+    _neg(T);
+    _xadd(Addr, T, Locked);
+    _mov(Dest, T);
+    return;
+  }
+  case Intrinsics::AtomicOr:
+    // TODO(jvoung): If Dest is null or dead, then some of these
+    // operations do not need an "exchange", but just a locked op.
+    // That appears to be "worth" it for sub, or, and, and xor.
+    // xadd is probably fine vs lock add for add, and xchg is fine
+    // vs an atomic store.
+    NeedsCmpxchg = true;
+    Op_Lo = &TargetX8632::_or;
+    Op_Hi = &TargetX8632::_or;
+    break;
+  case Intrinsics::AtomicAnd:
+    NeedsCmpxchg = true;
+    Op_Lo = &TargetX8632::_and;
+    Op_Hi = &TargetX8632::_and;
+    break;
+  case Intrinsics::AtomicXor:
+    NeedsCmpxchg = true;
+    Op_Lo = &TargetX8632::_xor;
+    Op_Hi = &TargetX8632::_xor;
+    break;
+  case Intrinsics::AtomicExchange:
+    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+      NeedsCmpxchg = true;
+      // NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values
+      // just need to be moved to the ecx and ebx registers.
+      Op_Lo = nullptr;
+      Op_Hi = nullptr;
+      break;
+    }
+    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
+    Variable *T = nullptr;
+    _mov(T, Val);
+    _xchg(Addr, T);
+    _mov(Dest, T);
+    return;
+  }
+  // Otherwise, we need a cmpxchg loop.
+  (void)NeedsCmpxchg;
+  assert(NeedsCmpxchg);
+  expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
+}
+
+void TargetX8632::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo, LowerBinOp Op_Hi,
+                                           Variable *Dest, Operand *Ptr,
+                                           Operand *Val) {
+  // Expand a more complex RMW operation as a cmpxchg loop:
+  // For 64-bit:
+  //   mov     eax, [ptr]
+  //   mov     edx, [ptr + 4]
+  // .LABEL:
+  //   mov     ebx, eax
+  //   <Op_Lo> ebx, <desired_adj_lo>
+  //   mov     ecx, edx
+  //   <Op_Hi> ecx, <desired_adj_hi>
+  //   lock cmpxchg8b [ptr]
+  //   jne     .LABEL
+  //   mov     <dest_lo>, eax
+  //   mov     <dest_hi>, edx
+  //
+  // For 32-bit:
+  //   mov     eax, [ptr]
+  // .LABEL:
+  //   mov     <reg>, eax
+  //   op      <reg>, [desired_adj]
+  //   lock cmpxchg [ptr], <reg>
+  //   jne     .LABEL
+  //   mov     <dest>, eax
+  //
+  // If Op_{Lo,Hi} are nullptr, then just copy the value.
+  Val = legalize(Val);
+  Type Ty = Val->getType();
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
+    Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
+    Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
+    X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
+    _mov(T_eax, loOperand(Addr));
+    _mov(T_edx, hiOperand(Addr));
+    Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
+    Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
+    InstX86Label *Label = InstX86Label::create(Func, this);
+    const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr;
+    if (!IsXchg8b) {
+      Context.insert(Label);
+      _mov(T_ebx, T_eax);
+      (this->*Op_Lo)(T_ebx, loOperand(Val));
+      _mov(T_ecx, T_edx);
+      (this->*Op_Hi)(T_ecx, hiOperand(Val));
+    } else {
+      // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
+      // It just needs the Val loaded into ebx and ecx.
+      // That can also be done before the loop.
+      _mov(T_ebx, loOperand(Val));
+      _mov(T_ecx, hiOperand(Val));
+      Context.insert(Label);
+    }
+    constexpr bool Locked = true;
+    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
+    _br(CondX86::Br_ne, Label);
+    if (!IsXchg8b) {
+      // If Val is a variable, model the extended live range of Val through
+      // the end of the loop, since it will be re-used by the loop.
+      if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
+        auto *ValLo = llvm::cast<Variable>(loOperand(ValVar));
+        auto *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
+        Context.insert<InstFakeUse>(ValLo);
+        Context.insert<InstFakeUse>(ValHi);
+      }
+    } else {
+      // For xchg, the loop is slightly smaller and ebx/ecx are used.
+      Context.insert<InstFakeUse>(T_ebx);
+      Context.insert<InstFakeUse>(T_ecx);
+    }
+    // The address base (if any) is also reused in the loop.
+    if (Variable *Base = Addr->getBase())
+      Context.insert<InstFakeUse>(Base);
+    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    _mov(DestLo, T_eax);
+    _mov(DestHi, T_edx);
+    return;
+  }
+  X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
+  RegNumT Eax;
+  switch (Ty) {
+  default:
+    llvm::report_fatal_error("Bad type for atomicRMW");
+  case IceType_i64:
+    Eax = Traits::getRaxOrDie();
+    break;
+  case IceType_i32:
+    Eax = Traits::RegisterSet::Reg_eax;
+    break;
+  case IceType_i16:
+    Eax = Traits::RegisterSet::Reg_ax;
+    break;
+  case IceType_i8:
+    Eax = Traits::RegisterSet::Reg_al;
+    break;
+  }
+  Variable *T_eax = makeReg(Ty, Eax);
+  _mov(T_eax, Addr);
+  auto *Label = Context.insert<InstX86Label>(this);
+  // We want to pick a different register for T than Eax, so don't use
+  // _mov(T == nullptr, T_eax).
+  Variable *T = makeReg(Ty);
+  _mov(T, T_eax);
+  (this->*Op_Lo)(T, Val);
+  constexpr bool Locked = true;
+  _cmpxchg(Addr, T_eax, T, Locked);
+  _br(CondX86::Br_ne, Label);
+  // If Val is a variable, model the extended live range of Val through
+  // the end of the loop, since it will be re-used by the loop.
+  if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
+    Context.insert<InstFakeUse>(ValVar);
+  }
+  // The address base (if any) is also reused in the loop.
+  if (Variable *Base = Addr->getBase())
+    Context.insert<InstFakeUse>(Base);
+  _mov(Dest, T_eax);
+}
+
+/// Lowers count {trailing, leading} zeros intrinsic.
+///
+/// We could do constant folding here, but that should have
+/// been done by the front-end/middle-end optimizations.
+void TargetX8632::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
+                                  Operand *FirstVal, Operand *SecondVal) {
+  // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
+  // Then the instructions will handle the Val == 0 case much more simply
+  // and won't require conversion from bit position to number of zeros.
+  //
+  // Otherwise:
+  //   bsr IF_NOT_ZERO, Val
+  //   mov T_DEST, ((Ty == i32) ? 63 : 127)
+  //   cmovne T_DEST, IF_NOT_ZERO
+  //   xor T_DEST, ((Ty == i32) ? 31 : 63)
+  //   mov DEST, T_DEST
+  //
+  // NOTE: T_DEST must be a register because cmov requires its dest to be a
+  // register. Also, bsf and bsr require their dest to be a register.
+  //
+  // The xor DEST, C(31|63) converts a bit position to # of leading zeroes.
+  // E.g., for 000... 00001100, bsr will say that the most significant bit
+  // set is at position 3, while the number of leading zeros is 28. Xor is
+  // like (M - N) for N <= M, and converts 63 to 32, and 127 to 64 (for the
+  // all-zeros case).
+  //
+  // X8632 only: Similar for 64-bit, but start w/ speculating that the upper 32
+  // bits are all zero, and compute the result for that case (checking the
+  // lower 32 bits). Then actually compute the result for the upper bits and
+  // cmov in the result from the lower computation if the earlier speculation
+  // was correct.
+  //
+  // Cttz is similar, but uses bsf instead, doesn't require the xor bit
+  // position conversion, and the speculation is reversed.
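+  // E.g., for 000... 00001100, bsf reports bit position 2, which already
+  // equals the trailing-zero count, so Cttz needs no xor fixup.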
+
+  // TODO(jpp): refactor this method.
+  assert(Ty == IceType_i32 || Ty == IceType_i64);
+  const Type DestTy = Traits::Is64Bit ? Dest->getType() : IceType_i32;
+  Variable *T = makeReg(DestTy);
+  Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
+  if (Cttz) {
+    _bsf(T, FirstValRM);
+  } else {
+    _bsr(T, FirstValRM);
+  }
+  Variable *T_Dest = makeReg(DestTy);
+  Constant *_31 = Ctx->getConstantInt32(31);
+  Constant *_32 = Ctx->getConstantInt(DestTy, 32);
+  Constant *_63 = Ctx->getConstantInt(DestTy, 63);
+  Constant *_64 = Ctx->getConstantInt(DestTy, 64);
+  if (Cttz) {
+    if (DestTy == IceType_i64) {
+      _mov(T_Dest, _64);
+    } else {
+      _mov(T_Dest, _32);
+    }
+  } else {
+    Constant *_127 = Ctx->getConstantInt(DestTy, 127);
+    if (DestTy == IceType_i64) {
+      _mov(T_Dest, _127);
+    } else {
+      _mov(T_Dest, _63);
+    }
+  }
+  _cmov(T_Dest, T, CondX86::Br_ne);
+  if (!Cttz) {
+    if (DestTy == IceType_i64) {
+      // Even though there's a _63 available at this point, that constant might
+      // not be an i32, which will cause the xor emission to fail.
+      Constant *_63 = Ctx->getConstantInt32(63);
+      _xor(T_Dest, _63);
+    } else {
+      _xor(T_Dest, _31);
+    }
+  }
+  if (Traits::Is64Bit || Ty == IceType_i32) {
+    _mov(Dest, T_Dest);
+    return;
+  }
+  _add(T_Dest, _32);
+  auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+  auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+  // Will be using "test" on this, so we need a registerized variable.
+  Variable *SecondVar = legalizeToReg(SecondVal);
+  Variable *T_Dest2 = makeReg(IceType_i32);
+  if (Cttz) {
+    _bsf(T_Dest2, SecondVar);
+  } else {
+    _bsr(T_Dest2, SecondVar);
+    _xor(T_Dest2, _31);
+  }
+  _test(SecondVar, SecondVar);
+  _cmov(T_Dest2, T_Dest, CondX86::Br_e);
+  _mov(DestLo, T_Dest2);
+  _mov(DestHi, Ctx->getConstantZero(IceType_i32));
+}
+
+void TargetX8632::typedLoad(Type Ty, Variable *Dest, Variable *Base,
+                            Constant *Offset) {
+  // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
+  // legalize Mem properly.
+  if (Offset)
+    assert(!llvm::isa<ConstantRelocatable>(Offset));
+
+  auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
+
+  if (isVectorType(Ty))
+    _movp(Dest, Mem);
+  else if (Ty == IceType_f64)
+    _movq(Dest, Mem);
+  else
+    _mov(Dest, Mem);
+}
+
+void TargetX8632::typedStore(Type Ty, Variable *Value, Variable *Base,
+                             Constant *Offset) {
+  // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
+  // legalize Mem properly.
+  if (Offset)
+    assert(!llvm::isa<ConstantRelocatable>(Offset));
+
+  auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
+
+  if (isVectorType(Ty))
+    _storep(Value, Mem);
+  else if (Ty == IceType_f64)
+    _storeq(Value, Mem);
+  else
+    _store(Value, Mem);
+}
+
+void TargetX8632::copyMemory(Type Ty, Variable *Dest, Variable *Src,
+                             int32_t OffsetAmt) {
+  Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
+  // TODO(ascull): this or add nullptr test to _movp, _movq
+  Variable *Data = makeReg(Ty);
+
+  typedLoad(Ty, Data, Src, Offset);
+  typedStore(Ty, Data, Dest, Offset);
+}
+
+void TargetX8632::lowerMemcpy(Operand *Dest, Operand *Src, Operand *Count) {
+  // There is a load and store for each chunk in the unroll
+  constexpr uint32_t BytesPerStorep = 16;
+
+  // Check if the operands are constants
+  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
+  const bool IsCountConst = CountConst != nullptr;
+  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
+
+  if (shouldOptimizeMemIntrins() && IsCountConst &&
+      CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) {
+    // Unlikely, but nothing to do if it does happen
+    if (CountValue == 0)
+      return;
+
+    Variable *SrcBase = legalizeToReg(Src);
+    Variable *DestBase = legalizeToReg(Dest);
+
+    // Find the largest type that can be used and use it as much as possible in
+    // reverse order. Then handle any remainder with overlapping copies. Since
+    // the remainder will be at the end, there will be reduced pressure on the
+    // memory unit as the accesses to the same memory are far apart.
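+    // E.g., a 13-byte copy emits one 8-byte copy at offset 0, leaving 5
+    // bytes; assuming firstTypeThatFitsSize rounds 5 up to an 8-byte type,
+    // the remainder becomes a single 8-byte copy at offset 5. Re-copying
+    // bytes [5,8) is harmless since memcpy operands must not overlap.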
+    Type Ty = largestTypeInSize(CountValue);
+    uint32_t TyWidth = typeWidthInBytes(Ty);
+
+    uint32_t RemainingBytes = CountValue;
+    int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
+    while (RemainingBytes >= TyWidth) {
+      copyMemory(Ty, DestBase, SrcBase, Offset);
+      RemainingBytes -= TyWidth;
+      Offset -= TyWidth;
+    }
+
+    if (RemainingBytes == 0)
+      return;
+
+    // Lower the remaining bytes. Adjust to larger types in order to make use
+    // of overlaps in the copies.
+    Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
+    Offset = CountValue - typeWidthInBytes(LeftOverTy);
+    copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
+    return;
+  }
+
+  // Fall back on a function call
+  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memcpy, nullptr, 3);
+  Call->addArg(Dest);
+  Call->addArg(Src);
+  Call->addArg(Count);
+  lowerCall(Call);
+}
+
+void TargetX8632::lowerMemmove(Operand *Dest, Operand *Src, Operand *Count) {
+  // There is a load and store for each chunk in the unroll
+  constexpr uint32_t BytesPerStorep = 16;
+
+  // Check if the operands are constants
+  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
+  const bool IsCountConst = CountConst != nullptr;
+  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
+
+  if (shouldOptimizeMemIntrins() && IsCountConst &&
+      CountValue <= BytesPerStorep * Traits::MEMMOVE_UNROLL_LIMIT) {
+    // Unlikely, but nothing to do if it does happen
+    if (CountValue == 0)
+      return;
+
+    Variable *SrcBase = legalizeToReg(Src);
+    Variable *DestBase = legalizeToReg(Dest);
+
+    std::tuple<Type, Constant *, Variable *>
+        Moves[Traits::MEMMOVE_UNROLL_LIMIT];
+    Constant *Offset;
+    Variable *Reg;
+
+    // Copy the data into registers as the source and destination could overlap
+    // so make sure not to clobber the memory. This also means overlapping
+    // moves can be used as we are taking a safe snapshot of the memory.
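+    // E.g., a 24-byte move loads a 16-byte chunk at offset 0 and an 8-byte
+    // chunk at offset 16 into registers before any store is issued, so no
+    // store can clobber a byte that still needs to be read.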
+    Type Ty = largestTypeInSize(CountValue);
+    uint32_t TyWidth = typeWidthInBytes(Ty);
+
+    uint32_t RemainingBytes = CountValue;
+    int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
+    size_t N = 0;
+    while (RemainingBytes >= TyWidth) {
+      assert(N < Traits::MEMMOVE_UNROLL_LIMIT);
+      Offset = Ctx->getConstantInt32(OffsetAmt);
+      Reg = makeReg(Ty);
+      typedLoad(Ty, Reg, SrcBase, Offset);
+      RemainingBytes -= TyWidth;
+      OffsetAmt -= TyWidth;
+      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
+    }
+
+    if (RemainingBytes != 0) {
+      // Lower the remaining bytes. Adjust to larger types in order to make use
+      // of overlaps in the copies.
+      assert(N < Traits::MEMMOVE_UNROLL_LIMIT);
+      Ty = firstTypeThatFitsSize(RemainingBytes);
+      Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
+      Reg = makeReg(Ty);
+      typedLoad(Ty, Reg, SrcBase, Offset);
+      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
+    }
+
+    // Copy the data out into the destination memory
+    for (size_t i = 0; i < N; ++i) {
+      std::tie(Ty, Offset, Reg) = Moves[i];
+      typedStore(Ty, Reg, DestBase, Offset);
+    }
+
+    return;
+  }
+
+  // Fall back on a function call
+  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memmove, nullptr, 3);
+  Call->addArg(Dest);
+  Call->addArg(Src);
+  Call->addArg(Count);
+  lowerCall(Call);
+}
+
+void TargetX8632::lowerMemset(Operand *Dest, Operand *Val, Operand *Count) {
+  constexpr uint32_t BytesPerStorep = 16;
+  constexpr uint32_t BytesPerStoreq = 8;
+  constexpr uint32_t BytesPerStorei32 = 4;
+  assert(Val->getType() == IceType_i8);
+
+  // Check if the operands are constants
+  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
+  const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
+  const bool IsCountConst = CountConst != nullptr;
+  const bool IsValConst = ValConst != nullptr;
+  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
+  const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
+
+  // Unlikely, but nothing to do if it does happen
+  if (IsCountConst && CountValue == 0)
+    return;
+
+  // TODO(ascull): if the count is constant but val is not it would be possible
+  // to inline by spreading the value across 4 bytes and accessing subregs e.g.
+  // eax, ax and al.
+  if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
+    Variable *Base = nullptr;
+    Variable *VecReg = nullptr;
+    const uint32_t MaskValue = (ValValue & 0xff);
+    const uint32_t SpreadValue =
+        (MaskValue << 24) | (MaskValue << 16) | (MaskValue << 8) | MaskValue;
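+    // E.g., a memset value of 0xAB spreads to 0xABABABAB, so one 32-bit
+    // store writes four bytes of the pattern at once.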
+
+    auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
+                                                        uint32_t OffsetAmt) {
+      assert(Base != nullptr);
+      Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
+
+      // TODO(ascull): is 64-bit better with vector or scalar movq?
+      auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
+      if (isVectorType(Ty)) {
+        assert(VecReg != nullptr);
+        _storep(VecReg, Mem);
+      } else if (Ty == IceType_f64) {
+        assert(VecReg != nullptr);
+        _storeq(VecReg, Mem);
+      } else {
+        assert(Ty != IceType_i64);
+        _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
+      }
+    };
+
+    // Find the largest type that can be used and use it as much as possible in
+    // reverse order. Then handle any remainder with overlapping copies. Since
+    // the remainder will be at the end, there will be reduced pressure on the
+    // memory unit as the accesses to the same memory are far apart.
+    Type Ty = IceType_void;
+    if (ValValue == 0 && CountValue >= BytesPerStoreq &&
+        CountValue <= BytesPerStorep * Traits::MEMSET_UNROLL_LIMIT) {
+      // When the value is zero it can be loaded into a vector register cheaply
+      // using the xor trick.
+      Base = legalizeToReg(Dest);
+      VecReg = makeVectorOfZeros(IceType_v16i8);
+      Ty = largestTypeInSize(CountValue);
+    } else if (CountValue <= BytesPerStorei32 * Traits::MEMSET_UNROLL_LIMIT) {
+      // When the value is non-zero or the count is small we can't use vector
+      // instructions so are limited to 32-bit stores.
+      Base = legalizeToReg(Dest);
+      constexpr uint32_t MaxSize = 4;
+      Ty = largestTypeInSize(CountValue, MaxSize);
+    }
+
+    if (Base) {
+      uint32_t TyWidth = typeWidthInBytes(Ty);
+
+      uint32_t RemainingBytes = CountValue;
+      uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
+      while (RemainingBytes >= TyWidth) {
+        lowerSet(Ty, Offset);
+        RemainingBytes -= TyWidth;
+        Offset -= TyWidth;
+      }
+
+      if (RemainingBytes == 0)
+        return;
+
+      // Lower the remaining bytes. Adjust to larger types in order to make use
+      // of overlaps in the copies.
+      Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
+      Offset = CountValue - typeWidthInBytes(LeftOverTy);
+      lowerSet(LeftOverTy, Offset);
+      return;
+    }
+  }
+
+  // Fall back on calling the memset function. The value operand needs to be
+  // extended to a stack slot size because the PNaCl ABI requires arguments to
+  // be at least 32 bits wide.
+  Operand *ValExt;
+  if (IsValConst) {
+    ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
+  } else {
+    Variable *ValExtVar = Func->makeVariable(stackSlotType());
+    lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val));
+    ValExt = ValExtVar;
+  }
+  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memset, nullptr, 3);
+  Call->addArg(Dest);
+  Call->addArg(ValExt);
+  Call->addArg(Count);
+  lowerCall(Call);
+}
+
+class AddressOptimizer {
+  AddressOptimizer() = delete;
+  AddressOptimizer(const AddressOptimizer &) = delete;
+  AddressOptimizer &operator=(const AddressOptimizer &) = delete;
+
+public:
+  explicit AddressOptimizer(const Cfg *Func)
+      : Func(Func), VMetadata(Func->getVMetadata()) {}
+
+  inline void dumpAddressOpt(const ConstantRelocatable *const Relocatable,
+                             int32_t Offset, const Variable *Base,
+                             const Variable *Index, uint16_t Shift,
+                             const Inst *Reason) const;
+
+  inline const Inst *matchAssign(Variable **Var,
+                                 ConstantRelocatable **Relocatable,
+                                 int32_t *Offset);
+
+  inline const Inst *matchCombinedBaseIndex(Variable **Base, Variable **Index,
+                                            uint16_t *Shift);
+
+  inline const Inst *matchShiftedIndex(Variable **Index, uint16_t *Shift);
+
+  inline const Inst *matchOffsetIndexOrBase(Variable **IndexOrBase,
+                                            const uint16_t Shift,
+                                            ConstantRelocatable **Relocatable,
+                                            int32_t *Offset);
+
+private:
+  const Cfg *const Func;
+  const VariablesMetadata *const VMetadata;
+
+  static bool isAdd(const Inst *Instr) {
+    if (auto *Arith = llvm::dyn_cast_or_null<const InstArithmetic>(Instr)) {
+      return (Arith->getOp() == InstArithmetic::Add);
+    }
+    return false;
+  }
+};
+
+void AddressOptimizer::dumpAddressOpt(
+    const ConstantRelocatable *const Relocatable, int32_t Offset,
+    const Variable *Base, const Variable *Index, uint16_t Shift,
+    const Inst *Reason) const {
+  if (!BuildDefs::dump())
+    return;
+  if (!Func->isVerbose(IceV_AddrOpt))
+    return;
+  OstreamLocker L(Func->getContext());
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "Instruction: ";
+  Reason->dumpDecorated(Func);
+  Str << "  results in Base=";
+  if (Base)
+    Base->dump(Func);
+  else
+    Str << "<null>";
+  Str << ", Index=";
+  if (Index)
+    Index->dump(Func);
+  else
+    Str << "<null>";
+  Str << ", Shift=" << Shift << ", Offset=" << Offset
+      << ", Relocatable=" << Relocatable << "\n";
+}
+
+const Inst *AddressOptimizer::matchAssign(Variable **Var,
+                                          ConstantRelocatable **Relocatable,
+                                          int32_t *Offset) {
+  // Var originates from Var=SrcVar ==> set Var:=SrcVar
+  if (*Var == nullptr)
+    return nullptr;
+  if (const Inst *VarAssign = VMetadata->getSingleDefinition(*Var)) {
+    assert(!VMetadata->isMultiDef(*Var));
+    if (llvm::isa<InstAssign>(VarAssign)) {
+      Operand *SrcOp = VarAssign->getSrc(0);
+      assert(SrcOp);
+      if (auto *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
+        if (!VMetadata->isMultiDef(SrcVar) &&
+            // TODO: ensure SrcVar stays single-BB
+            true) {
+          *Var = SrcVar;
+          return VarAssign;
+        }
+      } else if (auto *Const = llvm::dyn_cast<ConstantInteger32>(SrcOp)) {
+        int32_t MoreOffset = Const->getValue();
+        if (Utils::WouldOverflowAdd(*Offset, MoreOffset))
+          return nullptr;
+        *Var = nullptr;
+        *Offset += MoreOffset;
+        return VarAssign;
+      } else if (auto *AddReloc = llvm::dyn_cast<ConstantRelocatable>(SrcOp)) {
+        if (*Relocatable == nullptr) {
+          // It is always safe to fold a relocatable through assignment -- the
+          // assignment frees a slot in the address operand that can be used to
+          // hold the Sandbox Pointer -- if any.
+          *Var = nullptr;
+          *Relocatable = AddReloc;
+          return VarAssign;
+        }
+      }
+    }
+  }
+  return nullptr;
+}
+
+const Inst *AddressOptimizer::matchCombinedBaseIndex(Variable **Base,
+                                                     Variable **Index,
+                                                     uint16_t *Shift) {
+  // Index==nullptr && Base is Base=Var1+Var2 ==>
+  //   set Base=Var1, Index=Var2, Shift=0
+  if (*Base == nullptr)
+    return nullptr;
+  if (*Index != nullptr)
+    return nullptr;
+  auto *BaseInst = VMetadata->getSingleDefinition(*Base);
+  if (BaseInst == nullptr)
+    return nullptr;
+  assert(!VMetadata->isMultiDef(*Base));
+  if (BaseInst->getSrcSize() < 2)
+    return nullptr;
+  if (auto *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
+    if (VMetadata->isMultiDef(Var1))
+      return nullptr;
+    if (auto *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
+      if (VMetadata->isMultiDef(Var2))
+        return nullptr;
+      if (isAdd(BaseInst) &&
+          // TODO: ensure Var1 and Var2 stay single-BB
+          true) {
+        *Base = Var1;
+        *Index = Var2;
+        *Shift = 0; // should already have been 0
+        return BaseInst;
+      }
+    }
+  }
+  return nullptr;
+}
+
+const Inst *AddressOptimizer::matchShiftedIndex(Variable **Index,
+                                                uint16_t *Shift) {
+  // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
+  //   Index=Var, Shift+=log2(Const)
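+  // E.g., with Shift == 0, Index=Var*4 becomes Index=Var, Shift=2, which
+  // the addressing mode later encodes as [... + Var*4].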
+  if (*Index == nullptr)
+    return nullptr;
+  auto *IndexInst = VMetadata->getSingleDefinition(*Index);
+  if (IndexInst == nullptr)
+    return nullptr;
+  assert(!VMetadata->isMultiDef(*Index));
+
+  // When using an unsigned 32-bit array index on x64, it gets zero-extended
+  // before the shift & add. The explicit zero extension can be eliminated
+  // because x86 32-bit operations automatically get zero-extended into the
+  // corresponding 64-bit register.
+  if (auto *CastInst = llvm::dyn_cast<InstCast>(IndexInst)) {
+    if (CastInst->getCastKind() == InstCast::Zext) {
+      if (auto *Var = llvm::dyn_cast<Variable>(CastInst->getSrc(0))) {
+        if (Var->getType() == IceType_i32 &&
+            CastInst->getDest()->getType() == IceType_i64) {
+          IndexInst = VMetadata->getSingleDefinition(Var);
+        }
+      }
+    }
+  }
+
+  if (IndexInst->getSrcSize() < 2)
+    return nullptr;
+  if (auto *ArithInst = llvm::dyn_cast<InstArithmetic>(IndexInst)) {
+    if (auto *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
+      if (auto *Const =
+              llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
+        if (VMetadata->isMultiDef(Var) || Const->getType() != IceType_i32)
+          return nullptr;
+        switch (ArithInst->getOp()) {
+        default:
+          return nullptr;
+        case InstArithmetic::Mul: {
+          uint32_t Mult = Const->getValue();
+          uint32_t LogMult;
+          switch (Mult) {
+          case 1:
+            LogMult = 0;
+            break;
+          case 2:
+            LogMult = 1;
+            break;
+          case 4:
+            LogMult = 2;
+            break;
+          case 8:
+            LogMult = 3;
+            break;
+          default:
+            return nullptr;
+          }
+          if (*Shift + LogMult <= 3) {
+            *Index = Var;
+            *Shift += LogMult;
+            return IndexInst;
+          }
+          // Avoid falling through into the Shl case when the combined
+          // shift would exceed 3.
+          break;
+        }
+        case InstArithmetic::Shl: {
+          uint32_t ShiftAmount = Const->getValue();
+          switch (ShiftAmount) {
+          case 0:
+          case 1:
+          case 2:
+          case 3:
+            break;
+          default:
+            return nullptr;
+          }
+          if (*Shift + ShiftAmount <= 3) {
+            *Index = Var;
+            *Shift += ShiftAmount;
+            return IndexInst;
+          }
+        }
+        }
+      }
+    }
+  }
+  return nullptr;
+}
+
+const Inst *AddressOptimizer::matchOffsetIndexOrBase(
+    Variable **IndexOrBase, const uint16_t Shift,
+    ConstantRelocatable **Relocatable, int32_t *Offset) {
+  // Base is Base=Var+Const || Base is Base=Const+Var ==>
+  //   set Base=Var, Offset+=Const
+  // Base is Base=Var-Const ==>
+  //   set Base=Var, Offset-=Const
+  // Index is Index=Var+Const ==>
+  //   set Index=Var, Offset+=(Const<<Shift)
+  // Index is Index=Const+Var ==>
+  //   set Index=Var, Offset+=(Const<<Shift)
+  // Index is Index=Var-Const ==>
+  //   set Index=Var, Offset-=(Const<<Shift)
+  // Treat Index=Var Or Const as Index=Var + Const
+  //    when Var = Var' << N and log2(Const) <= N
+  // or when Var = (2^M) * (2^N) and log2(Const) <= (M+N)
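+  // E.g., if Var = X << 2, the two low bits of Var are zero, so Var | 3
+  // equals Var + 3 and the 3 can be folded into the constant offset.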
+
+  if (*IndexOrBase == nullptr) {
+    return nullptr;
+  }
+  const Inst *Definition = VMetadata->getSingleDefinition(*IndexOrBase);
+  if (Definition == nullptr) {
+    return nullptr;
+  }
+  assert(!VMetadata->isMultiDef(*IndexOrBase));
+  if (auto *ArithInst = llvm::dyn_cast<const InstArithmetic>(Definition)) {
+    switch (ArithInst->getOp()) {
+    case InstArithmetic::Add:
+    case InstArithmetic::Sub:
+    case InstArithmetic::Or:
+      break;
+    default:
+      return nullptr;
+    }
+
+    Operand *Src0 = ArithInst->getSrc(0);
+    Operand *Src1 = ArithInst->getSrc(1);
+    auto *Var0 = llvm::dyn_cast<Variable>(Src0);
+    auto *Var1 = llvm::dyn_cast<Variable>(Src1);
+    auto *Const0 = llvm::dyn_cast<ConstantInteger32>(Src0);
+    auto *Const1 = llvm::dyn_cast<ConstantInteger32>(Src1);
+    auto *Reloc0 = llvm::dyn_cast<ConstantRelocatable>(Src0);
+    auto *Reloc1 = llvm::dyn_cast<ConstantRelocatable>(Src1);
+
+    bool IsAdd = false;
+    if (ArithInst->getOp() == InstArithmetic::Or) {
+      Variable *Var = nullptr;
+      ConstantInteger32 *Const = nullptr;
+      if (Var0 && Const1) {
+        Var = Var0;
+        Const = Const1;
+      } else if (Const0 && Var1) {
+        Var = Var1;
+        Const = Const0;
+      } else {
+        return nullptr;
+      }
+      auto *VarDef = llvm::dyn_cast_or_null<InstArithmetic>(
+          VMetadata->getSingleDefinition(Var));
+      if (VarDef == nullptr)
+        return nullptr;
+
+      SizeT ZeroesAvailable = 0;
+      if (VarDef->getOp() == InstArithmetic::Shl) {
+        if (auto *ConstInt =
+                llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
+          ZeroesAvailable = ConstInt->getValue();
+        }
+      } else if (VarDef->getOp() == InstArithmetic::Mul) {
+        SizeT PowerOfTwo = 0;
+        if (auto *MultConst =
+                llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(0))) {
+          if (llvm::isPowerOf2_32(MultConst->getValue())) {
+            PowerOfTwo += MultConst->getValue();
+          }
+        }
+        if (auto *MultConst =
+                llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
+          if (llvm::isPowerOf2_32(MultConst->getValue())) {
+            PowerOfTwo += MultConst->getValue();
+          }
+        }
+        ZeroesAvailable = llvm::Log2_32(PowerOfTwo) + 1;
+      }
+      SizeT ZeroesNeeded = llvm::Log2_32(Const->getValue()) + 1;
+      if (ZeroesNeeded == 0 || ZeroesNeeded > ZeroesAvailable)
+        return nullptr;
+      IsAdd = true; // treat it as an add if the above conditions hold
+    } else {
+      IsAdd = ArithInst->getOp() == InstArithmetic::Add;
+    }
+
+    Variable *NewIndexOrBase = nullptr;
+    int32_t NewOffset = 0;
+    ConstantRelocatable *NewRelocatable = *Relocatable;
+    if (Var0 && Var1)
+      // TODO(sehr): merge base/index splitting into here.
+      return nullptr;
+    if (!IsAdd && Var1)
+      return nullptr;
+    if (Var0)
+      NewIndexOrBase = Var0;
+    else if (Var1)
+      NewIndexOrBase = Var1;
+    // Don't know how to add/subtract two relocatables.
+    if ((*Relocatable && (Reloc0 || Reloc1)) || (Reloc0 && Reloc1))
+      return nullptr;
+    // Don't know how to subtract a relocatable.
+    if (!IsAdd && Reloc1)
+      return nullptr;
+    // Incorporate ConstantRelocatables.
+    if (Reloc0)
+      NewRelocatable = Reloc0;
+    else if (Reloc1)
+      NewRelocatable = Reloc1;
+    // Compute the updated constant offset.
+    if (Const0) {
+      const int32_t MoreOffset =
+          IsAdd ? Const0->getValue() : -Const0->getValue();
+      if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
+        return nullptr;
+      NewOffset += MoreOffset;
+    }
+    if (Const1) {
+      const int32_t MoreOffset =
+          IsAdd ? Const1->getValue() : -Const1->getValue();
+      if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
+        return nullptr;
+      NewOffset += MoreOffset;
+    }
+    if (Utils::WouldOverflowAdd(*Offset, NewOffset << Shift))
+      return nullptr;
+    *IndexOrBase = NewIndexOrBase;
+    *Offset += (NewOffset << Shift);
+    // Shift is always zero if this is called with the base
+    *Relocatable = NewRelocatable;
+    return Definition;
+  }
+  return nullptr;
+}
+
+typename TargetX8632::X86OperandMem *
+TargetX8632::computeAddressOpt(const Inst *Instr, Type MemType, Operand *Addr) {
+  Func->resetCurrentNode();
+  if (Func->isVerbose(IceV_AddrOpt)) {
+    OstreamLocker L(Func->getContext());
+    Ostream &Str = Func->getContext()->getStrDump();
+    Str << "\nStarting computeAddressOpt for instruction:\n  ";
+    Instr->dumpDecorated(Func);
+  }
+
+  OptAddr NewAddr;
+  NewAddr.Base = llvm::dyn_cast<Variable>(Addr);
+  if (NewAddr.Base == nullptr)
+    return nullptr;
+
+  // If the Base has more than one use or is live across multiple blocks, then
+  // don't go further. Alternatively (?), never consider a transformation that
+  // would change a variable that is currently *not* live across basic block
+  // boundaries into one that *is*.
+  if (!getFlags().getLoopInvariantCodeMotion()) {
+    // Need multi block address opt when licm is enabled.
+    // Might make sense to restrict to current node and loop header.
+    if (Func->getVMetadata()->isMultiBlock(
+            NewAddr.Base) /* || Base->getUseCount() > 1*/)
+      return nullptr;
+  }
+  AddressOptimizer AddrOpt(Func);
+  const bool MockBounds = getFlags().getMockBoundsCheck();
+  const Inst *Reason = nullptr;
+  bool AddressWasOptimized = false;
+  // The following unnamed struct identifies the address mode formation steps
+  // that could potentially create an invalid memory operand (i.e., no free
+  // slots for RebasePtr.) We add all those variables to this struct so that we
+  // can use memset() to reset all members to false.
+  struct {
+    bool AssignBase = false;
+    bool AssignIndex = false;
+    bool OffsetFromBase = false;
+    bool OffsetFromIndex = false;
+    bool CombinedBaseIndex = false;
+  } Skip;
+  // NewAddrCheckpoint is used to rollback the address being formed in case an
+  // invalid address is formed.
+  OptAddr NewAddrCheckpoint;
+  Reason = Instr;
+  do {
+    if (Reason) {
+      AddrOpt.dumpAddressOpt(NewAddr.Relocatable, NewAddr.Offset, NewAddr.Base,
+                             NewAddr.Index, NewAddr.Shift, Reason);
+      AddressWasOptimized = true;
+      Reason = nullptr;
+      memset(reinterpret_cast<void *>(&Skip), 0, sizeof(Skip));
+    }
+
+    NewAddrCheckpoint = NewAddr;
+
+    // Update Base and Index to follow through assignments to definitions.
+    if (!Skip.AssignBase &&
+        (Reason = AddrOpt.matchAssign(&NewAddr.Base, &NewAddr.Relocatable,
+                                      &NewAddr.Offset))) {
+      // Assignments of Base from a Relocatable or ConstantInt32 can result
+      // in Base becoming nullptr.  To avoid code duplication in this loop we
+      // prefer that Base be non-nullptr if possible.
+      if ((NewAddr.Base == nullptr) && (NewAddr.Index != nullptr) &&
+          NewAddr.Shift == 0) {
+        std::swap(NewAddr.Base, NewAddr.Index);
+      }
+      continue;
+    }
+    if (!Skip.AssignIndex &&
+        (Reason = AddrOpt.matchAssign(&NewAddr.Index, &NewAddr.Relocatable,
+                                      &NewAddr.Offset))) {
+      continue;
+    }
+
+    if (!MockBounds) {
+      // Transition from:
+      //   <Relocatable + Offset>(Base) to
+      //   <Relocatable + Offset>(Base, Index)
+      if (!Skip.CombinedBaseIndex &&
+          (Reason = AddrOpt.matchCombinedBaseIndex(
+               &NewAddr.Base, &NewAddr.Index, &NewAddr.Shift))) {
+        continue;
+      }
+
+      // Recognize multiply/shift and update Shift amount.
+      // Index becomes Index=Var<<Const && Const+Shift<=3 ==>
+      //   Index=Var, Shift+=Const
+      // Index becomes Index=Const*Var && log2(Const)+Shift<=3 ==>
+      //   Index=Var, Shift+=log2(Const)
+      if ((Reason =
+               AddrOpt.matchShiftedIndex(&NewAddr.Index, &NewAddr.Shift))) {
+        continue;
+      }
+
+      // If Shift is zero, the choice of Base and Index was purely arbitrary.
+      // Recognize multiply/shift and set Shift amount.
+      // Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
+      //   swap(Index,Base)
+      // Similar for Base=Const*Var and Base=Var<<Const
+      if (NewAddr.Shift == 0 &&
+          (Reason = AddrOpt.matchShiftedIndex(&NewAddr.Base, &NewAddr.Shift))) {
+        std::swap(NewAddr.Base, NewAddr.Index);
+        continue;
+      }
+    }
+
+    // Update Offset to reflect additions/subtractions with constants and
+    // relocatables.
+    // TODO: consider overflow issues with respect to Offset.
+    if (!Skip.OffsetFromBase && (Reason = AddrOpt.matchOffsetIndexOrBase(
+                                     &NewAddr.Base, /*Shift =*/0,
+                                     &NewAddr.Relocatable, &NewAddr.Offset))) {
+      continue;
+    }
+    if (!Skip.OffsetFromIndex && (Reason = AddrOpt.matchOffsetIndexOrBase(
+                                      &NewAddr.Index, NewAddr.Shift,
+                                      &NewAddr.Relocatable, &NewAddr.Offset))) {
+      continue;
+    }
+
+    break;
+  } while (Reason);
+
+  if (!AddressWasOptimized) {
+    return nullptr;
+  }
+
+  // Undo any addition of RebasePtr.  It will be added back when the mem
+  // operand is sandboxed.
+  if (NewAddr.Base == RebasePtr) {
+    NewAddr.Base = nullptr;
+  }
+
+  if (NewAddr.Index == RebasePtr) {
+    NewAddr.Index = nullptr;
+    NewAddr.Shift = 0;
+  }
+
+  Constant *OffsetOp = nullptr;
+  if (NewAddr.Relocatable == nullptr) {
+    OffsetOp = Ctx->getConstantInt32(NewAddr.Offset);
+  } else {
+    OffsetOp =
+        Ctx->getConstantSym(NewAddr.Relocatable->getOffset() + NewAddr.Offset,
+                            NewAddr.Relocatable->getName());
+  }
+  // Vanilla ICE load instructions should not use the segment registers, and
+  // computeAddressOpt only works at the level of Variables and Constants, not
+  // other X86OperandMem, so there should be no mention of segment
+  // registers there either.
+  static constexpr auto SegmentReg =
+      X86OperandMem::SegmentRegisters::DefaultSegment;
+
+  return X86OperandMem::create(Func, MemType, NewAddr.Base, OffsetOp,
+                               NewAddr.Index, NewAddr.Shift, SegmentReg);
+}
+
+/// Add a mock bounds check on the memory address before using it as a load or
+/// store operand.  The basic idea is that given a memory operand [reg], we
+/// would first add bounds-check code something like:
+///
+///   cmp reg, <lb>
+///   jl out_of_line_error
+///   cmp reg, <ub>
+///   jg out_of_line_error
+///
+/// In reality, the specific code will depend on how <lb> and <ub> are
+/// represented, e.g. an immediate, a global, or a function argument.
+///
+/// As such, we need to enforce that the memory operand does not have the form
+/// [reg1+reg2], because then there is no simple cmp instruction that would
+/// suffice.  However, we consider [reg+offset] to be OK because the offset is
+/// usually small, and so <ub> could have a safety buffer built in and then we
+/// could instead branch to a custom out_of_line_error that does the precise
+/// check and jumps back if it turns out OK.
+///
+/// For the purpose of mocking the bounds check, we'll do something like this:
+///
+///   cmp reg, 0
+///   je label
+///   cmp reg, 1
+///   je label
+///   label:
+///
+/// Also note that we don't need to add a bounds check to a dereference of a
+/// simple global variable address.
+
+void TargetX8632::doMockBoundsCheck(Operand *Opnd) {
+  if (!getFlags().getMockBoundsCheck())
+    return;
+  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd)) {
+    if (Mem->getIndex()) {
+      llvm::report_fatal_error("doMockBoundsCheck: Opnd contains index reg");
+    }
+    Opnd = Mem->getBase();
+  }
+  // At this point Opnd could be nullptr, or Variable, or Constant, or perhaps
+  // something else.  We only care if it is Variable.
+  auto *Var = llvm::dyn_cast_or_null<Variable>(Opnd);
+  if (Var == nullptr)
+    return;
+  // We use lowerStore() to copy out-args onto the stack.  This creates a memory
+  // operand with the stack pointer as the base register.  Don't do bounds
+  // checks on that.
+  if (Var->getRegNum() == getStackReg())
+    return;
+
+  auto *Label = InstX86Label::create(Func, this);
+  _cmp(Opnd, Ctx->getConstantZero(IceType_i32));
+  _br(CondX86::Br_e, Label);
+  _cmp(Opnd, Ctx->getConstantInt32(1));
+  _br(CondX86::Br_e, Label);
+  Context.insert(Label);
+}
+
+void TargetX8632::lowerLoad(const InstLoad *Load) {
+  // A Load instruction can be treated the same as an Assign instruction, after
+  // the source operand is transformed into an X86OperandMem operand.  Note that
+  // the address mode optimization already creates an X86OperandMem operand, so
+  // it doesn't need another level of transformation.
+  Variable *DestLoad = Load->getDest();
+  Type Ty = DestLoad->getType();
+  Operand *Src0 = formMemoryOperand(Load->getLoadAddress(), Ty);
+  doMockBoundsCheck(Src0);
+  auto *Assign = InstAssign::create(Func, DestLoad, Src0);
+  lowerAssign(Assign);
+}
+
+void TargetX8632::doAddressOptOther() {
+  // Inverts some Icmp instructions which helps doAddressOptLoad later.
+  // TODO(manasijm): Refactor to unify the conditions for Var0 and Var1
+  Inst *Instr = iteratorToInst(Context.getCur());
+  auto *VMetadata = Func->getVMetadata();
+  if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Instr)) {
+    if (llvm::isa<Constant>(Icmp->getSrc(0)) ||
+        llvm::isa<Constant>(Icmp->getSrc(1)))
+      return;
+    auto *Var0 = llvm::dyn_cast<Variable>(Icmp->getSrc(0));
+    if (Var0 == nullptr)
+      return;
+    if (!VMetadata->isTracked(Var0))
+      return;
+    auto *Op0Def = VMetadata->getFirstDefinitionSingleBlock(Var0);
+    if (Op0Def == nullptr || !llvm::isa<InstLoad>(Op0Def))
+      return;
+    if (VMetadata->getLocalUseNode(Var0) != Context.getNode())
+      return;
+
+    auto *Var1 = llvm::dyn_cast<Variable>(Icmp->getSrc(1));
+    if (Var1 != nullptr && VMetadata->isTracked(Var1)) {
+      auto *Op1Def = VMetadata->getFirstDefinitionSingleBlock(Var1);
+      if (Op1Def != nullptr && !VMetadata->isMultiBlock(Var1) &&
+          llvm::isa<InstLoad>(Op1Def)) {
+        return; // Both are loads
+      }
+    }
+    Icmp->reverseConditionAndOperands();
+  }
+}
+
+void TargetX8632::doAddressOptLoad() {
+  Inst *Instr = iteratorToInst(Context.getCur());
+  Operand *Addr = Instr->getSrc(0);
+  Variable *Dest = Instr->getDest();
+  if (auto *OptAddr = computeAddressOpt(Instr, Dest->getType(), Addr)) {
+    Instr->setDeleted();
+    Context.insert<InstLoad>(Dest, OptAddr);
+  }
+}
+
+void TargetX8632::doAddressOptLoadSubVector() {
+  auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
+  Operand *Addr = Intrinsic->getArg(0);
+  Variable *Dest = Intrinsic->getDest();
+  if (auto *OptAddr = computeAddressOpt(Intrinsic, Dest->getType(), Addr)) {
+    Intrinsic->setDeleted();
+    const Ice::Intrinsics::IntrinsicInfo Info = {
+        Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F,
+        Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+    auto *NewLoad = Context.insert<InstIntrinsic>(2, Dest, Info);
+    NewLoad->addArg(OptAddr);
+    NewLoad->addArg(Intrinsic->getArg(1));
+  }
+}
+
+void TargetX8632::lowerPhi(const InstPhi * /*Instr*/) {
+  Func->setError("Phi found in regular instruction list");
+}
+
+void TargetX8632::lowerRet(const InstRet *Instr) {
+  Variable *Reg = nullptr;
+  if (Instr->hasRetValue()) {
+    Operand *RetValue = legalize(Instr->getRetValue());
+    const Type ReturnType = RetValue->getType();
+    assert(isVectorType(ReturnType) || isScalarFloatingType(ReturnType) ||
+           (ReturnType == IceType_i32) || (ReturnType == IceType_i64));
+    Reg = moveReturnValueToRegister(RetValue, ReturnType);
+  }
+  // Add a ret instruction even if sandboxing is enabled, because addEpilog
+  // explicitly looks for a ret instruction as a marker for where to insert the
+  // frame removal instructions.
+  _ret(Reg);
+  // Add a fake use of esp to make sure esp stays alive for the entire
+  // function. Otherwise post-call esp adjustments get dead-code eliminated.
+  keepEspLiveAtExit();
+}
+
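+// Packs one 2-bit source-lane selector per destination lane, lane 0 in the
+// low bits. E.g., reversing a 4-lane vector (indexes 3, 2, 1, 0) yields
+// 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B.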
+inline uint32_t makePshufdMask(SizeT Index0, SizeT Index1, SizeT Index2,
+                               SizeT Index3) {
+  const SizeT Mask = (Index0 & 0x3) | ((Index1 & 0x3) << 2) |
+                     ((Index2 & 0x3) << 4) | ((Index3 & 0x3) << 6);
+  assert(Mask < 256);
+  return Mask;
+}
+
+Variable *TargetX8632::lowerShuffleVector_AllFromSameSrc(
+    Operand *Src, SizeT Index0, SizeT Index1, SizeT Index2, SizeT Index3) {
+  constexpr SizeT SrcBit = 1 << 2;
+  assert((Index0 & SrcBit) == (Index1 & SrcBit));
+  assert((Index0 & SrcBit) == (Index2 & SrcBit));
+  assert((Index0 & SrcBit) == (Index3 & SrcBit));
+  (void)SrcBit;
+
+  const Type SrcTy = Src->getType();
+  auto *T = makeReg(SrcTy);
+  auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
+  auto *Mask =
+      Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
+  _pshufd(T, SrcRM, Mask);
+  return T;
+}
+
+Variable *
+TargetX8632::lowerShuffleVector_TwoFromSameSrc(Operand *Src0, SizeT Index0,
+                                               SizeT Index1, Operand *Src1,
+                                               SizeT Index2, SizeT Index3) {
+  constexpr SizeT SrcBit = 1 << 2;
+  assert((Index0 & SrcBit) == (Index1 & SrcBit) || (Index1 == IGNORE_INDEX));
+  assert((Index2 & SrcBit) == (Index3 & SrcBit) || (Index3 == IGNORE_INDEX));
+  (void)SrcBit;
+
+  const Type SrcTy = Src0->getType();
+  assert(Src1->getType() == SrcTy);
+  auto *T = makeReg(SrcTy);
+  auto *Src0R = legalizeToReg(Src0);
+  auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+  auto *Mask =
+      Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
+  _movp(T, Src0R);
+  _shufps(T, Src1RM, Mask);
+  return T;
+}
+
+Variable *TargetX8632::lowerShuffleVector_UnifyFromDifferentSrcs(Operand *Src0,
+                                                                 SizeT Index0,
+                                                                 Operand *Src1,
+                                                                 SizeT Index1) {
+  return lowerShuffleVector_TwoFromSameSrc(Src0, Index0, IGNORE_INDEX, Src1,
+                                           Index1, IGNORE_INDEX);
+}
+
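+// Builds a 4-bit selector in which bit i is set when destination lane i
+// draws from Src1 (index >= 4) rather than Src0. E.g., indexes (0, 1, 4, 5)
+// produce 0b1100, which matches CASE_SRCS_IN(0, 0, 1, 1) below.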
+inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2,
+                               SizeT Index3) {
+  constexpr SizeT SrcBit = 1 << 2;
+  const SizeT Index0Bits = ((Index0 & SrcBit) == 0) ? 0 : (1 << 0);
+  const SizeT Index1Bits = ((Index1 & SrcBit) == 0) ? 0 : (1 << 1);
+  const SizeT Index2Bits = ((Index2 & SrcBit) == 0) ? 0 : (1 << 2);
+  const SizeT Index3Bits = ((Index3 & SrcBit) == 0) ? 0 : (1 << 3);
+  return Index0Bits | Index1Bits | Index2Bits | Index3Bits;
+}
+
+GlobalString TargetX8632::lowerShuffleVector_NewMaskName() {
+  GlobalString FuncName = Func->getFunctionName();
+  const SizeT Id = PshufbMaskCount++;
+  if (!BuildDefs::dump() || !FuncName.hasStdString()) {
+    return GlobalString::createWithString(
+        Ctx,
+        "$PS" + std::to_string(FuncName.getID()) + "_" + std::to_string(Id));
+  }
+  return GlobalString::createWithString(
+      Ctx, "Pshufb$" + Func->getFunctionName() + "$" + std::to_string(Id));
+}
+
+ConstantRelocatable *TargetX8632::lowerShuffleVector_CreatePshufbMask(
+    int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
+    int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
+    int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
+    int8_t Idx15) {
+  static constexpr uint8_t NumElements = 16;
+  const char Initializer[NumElements] = {
+      Idx0, Idx1, Idx2,  Idx3,  Idx4,  Idx5,  Idx6,  Idx7,
+      Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15,
+  };
+
+  static constexpr Type V4VectorType = IceType_v4i32;
+  const uint32_t MaskAlignment = typeWidthInBytesOnStack(V4VectorType);
+  auto *Mask = VariableDeclaration::create(Func->getGlobalPool());
+  GlobalString MaskName = lowerShuffleVector_NewMaskName();
+  Mask->setIsConstant(true);
+  Mask->addInitializer(VariableDeclaration::DataInitializer::create(
+      Func->getGlobalPool(), Initializer, NumElements));
+  Mask->setName(MaskName);
+  // Mask needs to be 16-byte aligned, or pshufb will seg fault.
+  Mask->setAlignment(MaskAlignment);
+  Func->addGlobal(Mask);
+
+  constexpr RelocOffsetT Offset = 0;
+  return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName));
+}
+
+void TargetX8632::lowerShuffleVector_UsingPshufb(
+    Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1,
+    int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6,
+    int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11,
+    int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) {
+  const Type DestTy = Dest->getType();
+  static constexpr bool NotRebased = false;
+  static constexpr Variable *NoBase = nullptr;
+  // We use void for the memory operand instead of DestTy because using the
+  // latter causes a validation failure: the X86 Inst layer complains that
+  // vector mem operands could be under aligned. Thus, using void we avoid the
+  // validation error. Note that the mask global declaration is aligned, so it
+  // can be used as an XMM mem operand.
+  static constexpr Type MaskType = IceType_void;
+#define IDX_IN_SRC(N, S)                                                       \
+  ((((N) & (1 << 4)) == (S << 4)) ? ((N)&0xf) : CLEAR_ALL_BITS)
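+  // E.g., IDX_IN_SRC(18, 1) selects byte 2 when building the Src1 mask,
+  // while IDX_IN_SRC(18, 0) yields CLEAR_ALL_BITS so that pshufb zeroes the
+  // lane in the Src0 mask; the two results are later merged with por.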
+  auto *Mask0M = X86OperandMem::create(
+      Func, MaskType, NoBase,
+      lowerShuffleVector_CreatePshufbMask(
+          IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0),
+          IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0),
+          IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0),
+          IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0),
+          IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0),
+          IDX_IN_SRC(Idx15, 0)),
+      NotRebased);
+
+  auto *T0 = makeReg(DestTy);
+  auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+  _movp(T0, Src0RM);
+
+  _pshufb(T0, Mask0M);
+
+  if (Idx0 >= 16 || Idx1 >= 16 || Idx2 >= 16 || Idx3 >= 16 || Idx4 >= 16 ||
+      Idx5 >= 16 || Idx6 >= 16 || Idx7 >= 16 || Idx8 >= 16 || Idx9 >= 16 ||
+      Idx10 >= 16 || Idx11 >= 16 || Idx12 >= 16 || Idx13 >= 16 || Idx14 >= 16 ||
+      Idx15 >= 16) {
+    auto *Mask1M = X86OperandMem::create(
+        Func, MaskType, NoBase,
+        lowerShuffleVector_CreatePshufbMask(
+            IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1),
+            IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1),
+            IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1),
+            IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1),
+            IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1),
+            IDX_IN_SRC(Idx15, 1)),
+        NotRebased);
+#undef IDX_IN_SRC
+    auto *T1 = makeReg(DestTy);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T1, Src1RM);
+    _pshufb(T1, Mask1M);
+    _por(T0, T1);
+  }
+
+  _movp(Dest, T0);
+}
+
+void TargetX8632::lowerShuffleVector(const InstShuffleVector *Instr) {
+  auto *Dest = Instr->getDest();
+  const Type DestTy = Dest->getType();
+  auto *Src0 = Instr->getSrc(0);
+  auto *Src1 = Instr->getSrc(1);
+  const SizeT NumElements = typeNumElements(DestTy);
+
+  auto *T = makeReg(DestTy);
+
+  switch (DestTy) {
+  default:
+    llvm::report_fatal_error("Unexpected vector type.");
+  case IceType_v16i1:
+  case IceType_v16i8: {
+    static constexpr SizeT ExpectedNumElements = 16;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    (void)ExpectedNumElements;
+
+    if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckl(T, Src0RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
+                          23)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckl(T, Src1RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
+                          15, 15)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckh(T, Src0RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30,
+                          15, 31)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckh(T, Src1RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (InstructionSet < SSE4_1) {
+      // TODO(jpp): figure out how to lower with sse2.
+      break;
+    }
+
+    const SizeT Index0 = Instr->getIndexValue(0);
+    const SizeT Index1 = Instr->getIndexValue(1);
+    const SizeT Index2 = Instr->getIndexValue(2);
+    const SizeT Index3 = Instr->getIndexValue(3);
+    const SizeT Index4 = Instr->getIndexValue(4);
+    const SizeT Index5 = Instr->getIndexValue(5);
+    const SizeT Index6 = Instr->getIndexValue(6);
+    const SizeT Index7 = Instr->getIndexValue(7);
+    const SizeT Index8 = Instr->getIndexValue(8);
+    const SizeT Index9 = Instr->getIndexValue(9);
+    const SizeT Index10 = Instr->getIndexValue(10);
+    const SizeT Index11 = Instr->getIndexValue(11);
+    const SizeT Index12 = Instr->getIndexValue(12);
+    const SizeT Index13 = Instr->getIndexValue(13);
+    const SizeT Index14 = Instr->getIndexValue(14);
+    const SizeT Index15 = Instr->getIndexValue(15);
+
+    lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
+                                   Index3, Index4, Index5, Index6, Index7,
+                                   Index8, Index9, Index10, Index11, Index12,
+                                   Index13, Index14, Index15);
+    return;
+  }
+  case IceType_v8i1:
+  case IceType_v8i16: {
+    static constexpr SizeT ExpectedNumElements = 8;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    (void)ExpectedNumElements;
+
+    if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckl(T, Src0RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckl(T, Src1RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(4, 4, 5, 5, 6, 6, 7, 7)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckh(T, Src0RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(4, 12, 5, 13, 6, 14, 7, 15)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckh(T, Src1RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (InstructionSet < SSE4_1) {
+      // TODO(jpp): figure out how to lower with sse2.
+      break;
+    }
+
+    const SizeT Index0 = Instr->getIndexValue(0);
+    const SizeT Index1 = Instr->getIndexValue(1);
+    const SizeT Index2 = Instr->getIndexValue(2);
+    const SizeT Index3 = Instr->getIndexValue(3);
+    const SizeT Index4 = Instr->getIndexValue(4);
+    const SizeT Index5 = Instr->getIndexValue(5);
+    const SizeT Index6 = Instr->getIndexValue(6);
+    const SizeT Index7 = Instr->getIndexValue(7);
+
+#define TO_BYTE_INDEX(I) ((I) << 1)
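+    // E.g., 16-bit lane 3 starts at byte 6, so the pair (TO_BYTE_INDEX(3),
+    // TO_BYTE_INDEX(3) + 1) == (6, 7) selects the whole lane.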
+    lowerShuffleVector_UsingPshufb(
+        Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
+        TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2),
+        TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3),
+        TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4),
+        TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5),
+        TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6),
+        TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7),
+        TO_BYTE_INDEX(Index7) + 1);
+#undef TO_BYTE_INDEX
+    return;
+  }
+  case IceType_v4i1:
+  case IceType_v4i32:
+  case IceType_v4f32: {
+    static constexpr SizeT ExpectedNumElements = 4;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    const SizeT Index0 = Instr->getIndexValue(0);
+    const SizeT Index1 = Instr->getIndexValue(1);
+    const SizeT Index2 = Instr->getIndexValue(2);
+    const SizeT Index3 = Instr->getIndexValue(3);
+    Variable *T = nullptr;
+    switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
+#define CASE_SRCS_IN(S0, S1, S2, S3)                                           \
+  case (((S0) << 0) | ((S1) << 1) | ((S2) << 2) | ((S3) << 3))
+      CASE_SRCS_IN(0, 0, 0, 0) : {
+        T = lowerShuffleVector_AllFromSameSrc(Src0, Index0, Index1, Index2,
+                                              Index3);
+      }
+      break;
+      CASE_SRCS_IN(0, 0, 0, 1) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
+                                                                  Src1, Index3);
+        T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
+                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+      }
+      break;
+      CASE_SRCS_IN(0, 0, 1, 0) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
+                                                                  Src0, Index3);
+        T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
+                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+      }
+      break;
+      CASE_SRCS_IN(0, 0, 1, 1) : {
+        T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Src1,
+                                              Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(0, 1, 0, 0) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
+                                                                  Src1, Index1);
+        T = lowerShuffleVector_TwoFromSameSrc(
+            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(0, 1, 0, 1) : {
+        if (Index0 == 0 && (Index1 - ExpectedNumElements) == 0 && Index2 == 1 &&
+            (Index3 - ExpectedNumElements) == 1) {
+          auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+          auto *Src0R = legalizeToReg(Src0);
+          T = makeReg(DestTy);
+          _movp(T, Src0R);
+          _punpckl(T, Src1RM);
+        } else if (Index0 == Index2 && Index1 == Index3) {
+          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index0, Src1, Index1);
+          T = lowerShuffleVector_AllFromSameSrc(
+              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
+              UNIFIED_INDEX_1);
+        } else {
+          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index0, Src1, Index1);
+          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index2, Src1, Index3);
+          T = lowerShuffleVector_TwoFromSameSrc(
+              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
+              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+        }
+      }
+      break;
+      CASE_SRCS_IN(0, 1, 1, 0) : {
+        if (Index0 == Index3 && Index1 == Index2) {
+          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index0, Src1, Index1);
+          T = lowerShuffleVector_AllFromSameSrc(
+              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
+              UNIFIED_INDEX_0);
+        } else {
+          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index0, Src1, Index1);
+          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index2, Src0, Index3);
+          T = lowerShuffleVector_TwoFromSameSrc(
+              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
+              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+        }
+      }
+      break;
+      CASE_SRCS_IN(0, 1, 1, 1) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
+                                                                  Src1, Index1);
+        T = lowerShuffleVector_TwoFromSameSrc(
+            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(1, 0, 0, 0) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
+                                                                  Src0, Index1);
+        T = lowerShuffleVector_TwoFromSameSrc(
+            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(1, 0, 0, 1) : {
+        if (Index0 == Index3 && Index1 == Index2) {
+          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index0, Src0, Index1);
+          T = lowerShuffleVector_AllFromSameSrc(
+              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
+              UNIFIED_INDEX_0);
+        } else {
+          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index0, Src0, Index1);
+          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index2, Src1, Index3);
+          T = lowerShuffleVector_TwoFromSameSrc(
+              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
+              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+        }
+      }
+      break;
+      CASE_SRCS_IN(1, 0, 1, 0) : {
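+        // Mirror of the (0, 1, 0, 1) interleave: indices (N, 0, N+1, 1) are
+        // a punpckl with the roles of Src0 and Src1 reversed, hence the
+        // deliberately swapped operands in the two legalize calls below.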
+        if ((Index0 - ExpectedNumElements) == 0 && Index1 == 0 &&
+            (Index2 - ExpectedNumElements) == 1 && Index3 == 1) {
+          auto *Src1RM = legalize(Src0, Legal_Reg | Legal_Mem);
+          auto *Src0R = legalizeToReg(Src1);
+          T = makeReg(DestTy);
+          _movp(T, Src0R);
+          _punpckl(T, Src1RM);
+        } else if (Index0 == Index2 && Index1 == Index3) {
+          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index0, Src0, Index1);
+          T = lowerShuffleVector_AllFromSameSrc(
+              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
+              UNIFIED_INDEX_1);
+        } else {
+          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index0, Src0, Index1);
+          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index2, Src0, Index3);
+          T = lowerShuffleVector_TwoFromSameSrc(
+              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
+              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+        }
+      }
+      break;
+      CASE_SRCS_IN(1, 0, 1, 1) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
+                                                                  Src0, Index1);
+        T = lowerShuffleVector_TwoFromSameSrc(
+            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(1, 1, 0, 0) : {
+        T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Src0,
+                                              Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(1, 1, 0, 1) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
+                                                                  Src1, Index3);
+        T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
+                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+      }
+      break;
+      CASE_SRCS_IN(1, 1, 1, 0) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
+                                                                  Src0, Index3);
+        T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
+                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+      }
+      break;
+      CASE_SRCS_IN(1, 1, 1, 1) : {
+        T = lowerShuffleVector_AllFromSameSrc(Src1, Index0, Index1, Index2,
+                                              Index3);
+      }
+      break;
+#undef CASE_SRCS_IN
+    }
+
+    assert(T != nullptr);
+    assert(T->getType() == DestTy);
+    _movp(Dest, T);
+    return;
+  }
+  }
+
+  // Unoptimized shuffle. Perform a series of inserts and extracts.
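+  // Sketch of the fallback, for each destination lane I with shuffle index K:
+  //   Elem = K < NumElements ? extract(Src0, K)
+  //                          : extract(Src1, K - NumElements)
+  //   T = insert(T, Elem, I)
+  // The InstFakeDef below gives T an initial definition so liveness analysis
+  // does not see the first insertelement reading an undefined value.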
+  Context.insert<InstFakeDef>(T);
+  const Type ElementType = typeElementType(DestTy);
+  for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
+    auto *Index = Instr->getIndex(I);
+    const SizeT Elem = Index->getValue();
+    auto *ExtElmt = makeReg(ElementType);
+    if (Elem < NumElements) {
+      lowerExtractElement(
+          InstExtractElement::create(Func, ExtElmt, Src0, Index));
+    } else {
+      lowerExtractElement(InstExtractElement::create(
+          Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements)));
+    }
+    auto *NewT = makeReg(DestTy);
+    lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
+                                                 Ctx->getConstantInt32(I)));
+    T = NewT;
+  }
+  _movp(Dest, T);
+}
+
+void TargetX8632::lowerSelect(const InstSelect *Select) {
+  Variable *Dest = Select->getDest();
+
+  Operand *Condition = Select->getCondition();
+  // Handle folding opportunities.
+  if (const Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
+    assert(Producer->isDeleted());
+    switch (BoolFolding::getProducerKind(Producer)) {
+    default:
+      break;
+    case BoolFolding::PK_Icmp32:
+    case BoolFolding::PK_Icmp64: {
+      lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Select);
+      return;
+    }
+    case BoolFolding::PK_Fcmp: {
+      lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Select);
+      return;
+    }
+    }
+  }
+
+  if (isVectorType(Dest->getType())) {
+    lowerSelectVector(Select);
+    return;
+  }
+
+  Operand *CmpResult = legalize(Condition, Legal_Reg | Legal_Mem);
+  Operand *Zero = Ctx->getConstantZero(IceType_i32);
+  _cmp(CmpResult, Zero);
+  Operand *SrcT = Select->getTrueOperand();
+  Operand *SrcF = Select->getFalseOperand();
+  const BrCond Cond = CondX86::Br_ne;
+  lowerSelectMove(Dest, Cond, SrcT, SrcF);
+}
+
+void TargetX8632::lowerSelectMove(Variable *Dest, BrCond Cond, Operand *SrcT,
+                                  Operand *SrcF) {
+  Type DestTy = Dest->getType();
+  if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
+    // The cmov instruction doesn't allow 8-bit or FP operands, so we need
+    // explicit control flow.
+    // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
+    auto *Label = InstX86Label::create(Func, this);
+    SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
+    _mov(Dest, SrcT);
+    _br(Cond, Label);
+    SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
+    _redefined(_mov(Dest, SrcF));
+    Context.insert(Label);
+    return;
+  }
+  // mov t, SrcF; cmov_cond t, SrcT; mov dest, t
+  // But if SrcT is an immediate, we might be able to do better, as the cmov
+  // instruction doesn't allow an immediate operand:
+  // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
+  if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
+    std::swap(SrcT, SrcF);
+    Cond = InstX86Base::getOppositeCondition(Cond);
+  }
+  if (!Traits::Is64Bit && DestTy == IceType_i64) {
+    SrcT = legalizeUndef(SrcT);
+    SrcF = legalizeUndef(SrcF);
+    // Set the low portion.
+    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    lowerSelectIntMove(DestLo, Cond, loOperand(SrcT), loOperand(SrcF));
+    // Set the high portion.
+    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    lowerSelectIntMove(DestHi, Cond, hiOperand(SrcT), hiOperand(SrcF));
+    return;
+  }
+
+  assert(DestTy == IceType_i16 || DestTy == IceType_i32 ||
+         (Traits::Is64Bit && DestTy == IceType_i64));
+  lowerSelectIntMove(Dest, Cond, SrcT, SrcF);
+}
+
+void TargetX8632::lowerSelectIntMove(Variable *Dest, BrCond Cond, Operand *SrcT,
+                                     Operand *SrcF) {
+  Variable *T = nullptr;
+  SrcF = legalize(SrcF);
+  _mov(T, SrcF);
+  SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
+  _cmov(T, SrcT, Cond);
+  _mov(Dest, T);
+}
+
+void TargetX8632::lowerMove(Variable *Dest, Operand *Src, bool IsRedefinition) {
+  assert(Dest->getType() == Src->getType());
+  assert(!Dest->isRematerializable());
+  if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+    Src = legalize(Src);
+    Operand *SrcLo = loOperand(Src);
+    Operand *SrcHi = hiOperand(Src);
+    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    Variable *T_Lo = nullptr, *T_Hi = nullptr;
+    _mov(T_Lo, SrcLo);
+    _redefined(_mov(DestLo, T_Lo), IsRedefinition);
+    _mov(T_Hi, SrcHi);
+    _redefined(_mov(DestHi, T_Hi), IsRedefinition);
+  } else {
+    Operand *SrcLegal;
+    if (Dest->hasReg()) {
+      // If Dest already has a physical register, then only basic legalization
+      // is needed, as the source operand can be a register, immediate, or
+      // memory.
+      SrcLegal = legalize(Src, Legal_Reg, Dest->getRegNum());
+    } else {
+      // If Dest could be a stack operand, then RI must be a physical register
+      // or a scalar integer immediate.
+      SrcLegal = legalize(Src, Legal_Reg | Legal_Imm);
+    }
+    if (isVectorType(Dest->getType())) {
+      _redefined(_movp(Dest, SrcLegal), IsRedefinition);
+    } else {
+      _redefined(_mov(Dest, SrcLegal), IsRedefinition);
+    }
+  }
+}
+
+bool TargetX8632::lowerOptimizeFcmpSelect(const InstFcmp *Fcmp,
+                                          const InstSelect *Select) {
+  Operand *CmpSrc0 = Fcmp->getSrc(0);
+  Operand *CmpSrc1 = Fcmp->getSrc(1);
+  Operand *SelectSrcT = Select->getTrueOperand();
+  Operand *SelectSrcF = Select->getFalseOperand();
+  Variable *SelectDest = Select->getDest();
+
+  // TODO(capn): also handle swapped compare/select operand order.
+  if (CmpSrc0 != SelectSrcT || CmpSrc1 != SelectSrcF)
+    return false;
+
+  // TODO(sehr, stichnot): fcmp/select patterns (e.g., minsd/maxss) go here.
+  InstFcmp::FCond Condition = Fcmp->getCondition();
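+  // minss/maxss (and their packed forms) return the second operand when the
+  // comparison is unordered, which matches the ordered Olt/Ogt predicates:
+  // with T seeded from SrcT, a NaN input makes the predicate false and the
+  // instruction yield SrcF.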
+  switch (Condition) {
+  default:
+    return false;
+  case InstFcmp::True:
+    break;
+  case InstFcmp::False:
+    break;
+  case InstFcmp::Ogt: {
+    Variable *T = makeReg(SelectDest->getType());
+    if (isScalarFloatingType(SelectSrcT->getType())) {
+      _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
+      _maxss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
+      _mov(SelectDest, T);
+    } else {
+      _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
+      _maxps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
+      _movp(SelectDest, T);
+    }
+    return true;
+  }
+  case InstFcmp::Olt: {
+    Variable *T = makeReg(SelectSrcT->getType());
+    if (isScalarFloatingType(SelectSrcT->getType())) {
+      _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
+      _minss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
+      _mov(SelectDest, T);
+    } else {
+      _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
+      _minps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
+      _movp(SelectDest, T);
+    }
+    return true;
+  }
+  }
+  return false;
+}
+
+void TargetX8632::lowerIcmp(const InstIcmp *Icmp) {
+  Variable *Dest = Icmp->getDest();
+  if (isVectorType(Dest->getType())) {
+    lowerIcmpVector(Icmp);
+  } else {
+    constexpr Inst *Consumer = nullptr;
+    lowerIcmpAndConsumer(Icmp, Consumer);
+  }
+}
+
+void TargetX8632::lowerSelectVector(const InstSelect *Instr) {
+  Variable *Dest = Instr->getDest();
+  Type DestTy = Dest->getType();
+  Operand *SrcT = Instr->getTrueOperand();
+  Operand *SrcF = Instr->getFalseOperand();
+  Operand *Condition = Instr->getCondition();
+
+  if (!isVectorType(DestTy))
+    llvm::report_fatal_error("Expected a vector select");
+
+  Type SrcTy = SrcT->getType();
+  Variable *T = makeReg(SrcTy);
+  Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
+  Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
+
+  if (InstructionSet >= SSE4_1) {
+    // TODO(wala): If the condition operand is a constant, use blendps or
+    // pblendw.
+    //
+    // Use blendvps or pblendvb to implement select.
+    if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
+        SrcTy == IceType_v4f32) {
+      Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
+      Variable *xmm0 = makeReg(IceType_v4i32, Traits::RegisterSet::Reg_xmm0);
+      _movp(xmm0, ConditionRM);
+      _psll(xmm0, Ctx->getConstantInt8(31));
+      _movp(T, SrcFRM);
+      _blendvps(T, SrcTRM, xmm0);
+      _movp(Dest, T);
+    } else {
+      assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
+      Type SignExtTy =
+          Condition->getType() == IceType_v8i1 ? IceType_v8i16 : IceType_v16i8;
+      Variable *xmm0 = makeReg(SignExtTy, Traits::RegisterSet::Reg_xmm0);
+      lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
+      _movp(T, SrcFRM);
+      _pblendvb(T, SrcTRM, xmm0);
+      _movp(Dest, T);
+    }
+    return;
+  }
+  // Lower select without SSE4.1:
+  // a=d?b:c ==>
+  //   if elementtype(d) != i1:
+  //      d=sext(d);
+  //   a=(b&d)|(c&~d);
+  Variable *T2 = makeReg(SrcTy);
+  // Sign extend the condition operand if applicable.
+  if (SrcTy == IceType_v4f32) {
+    // The sext operation takes only integer arguments.
+    Variable *T3 = Func->makeVariable(IceType_v4i32);
+    lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
+    _movp(T, T3);
+  } else if (typeElementType(SrcTy) != IceType_i1) {
+    lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
+  } else {
+    Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
+    _movp(T, ConditionRM);
+  }
+  _movp(T2, T);
+  _pand(T, SrcTRM);
+  _pandn(T2, SrcFRM);
+  _por(T, T2);
+  _movp(Dest, T);
+}
+
+void TargetX8632::lowerStore(const InstStore *Instr) {
+  Operand *Value = Instr->getData();
+  Operand *Addr = Instr->getStoreAddress();
+  X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
+  doMockBoundsCheck(NewAddr);
+  Type Ty = NewAddr->getType();
+
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
+    Value = legalizeUndef(Value);
+    Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
+    _store(ValueHi, llvm::cast<X86OperandMem>(hiOperand(NewAddr)));
+    Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
+    _store(ValueLo, llvm::cast<X86OperandMem>(loOperand(NewAddr)));
+  } else if (isVectorType(Ty)) {
+    _storep(legalizeToReg(Value), NewAddr);
+  } else {
+    Value = legalize(Value, Legal_Reg | Legal_Imm);
+    _store(Value, NewAddr);
+  }
+}
+
+void TargetX8632::doAddressOptStore() {
+  auto *Instr = llvm::cast<InstStore>(Context.getCur());
+  Operand *Addr = Instr->getStoreAddress();
+  Operand *Data = Instr->getData();
+  if (auto *OptAddr = computeAddressOpt(Instr, Data->getType(), Addr)) {
+    Instr->setDeleted();
+    auto *NewStore = Context.insert<InstStore>(Data, OptAddr);
+    if (Instr->getDest())
+      NewStore->setRmwBeacon(Instr->getRmwBeacon());
+  }
+}
+
+void TargetX8632::doAddressOptStoreSubVector() {
+  auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
+  Operand *Addr = Intrinsic->getArg(1);
+  Operand *Data = Intrinsic->getArg(0);
+  if (auto *OptAddr = computeAddressOpt(Intrinsic, Data->getType(), Addr)) {
+    Intrinsic->setDeleted();
+    const Ice::Intrinsics::IntrinsicInfo Info = {
+        Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T,
+        Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
+    auto *NewStore = Context.insert<InstIntrinsic>(3, nullptr, Info);
+    NewStore->addArg(Data);
+    NewStore->addArg(OptAddr);
+    NewStore->addArg(Intrinsic->getArg(2));
+  }
+}
+
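+/// Lower a range check on [Min, Max] to a single unsigned compare: rebase by
+/// subtracting Min (skipped when Min is zero), then cmp against Max - Min so
+/// callers can branch above (out of range) or below-or-equal (in range).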
+Operand *TargetX8632::lowerCmpRange(Operand *Comparison, uint64_t Min,
+                                    uint64_t Max) {
+  // TODO(ascull): 64-bit values should not reach here, but only because
+  // 64-bit lowering is not implemented yet; this code should be able to
+  // handle the 64-bit case.
+  assert(Traits::Is64Bit || Comparison->getType() != IceType_i64);
+  // Subtracting 0 is a nop, so don't do it.
+  if (Min != 0) {
+    // Avoid clobbering the comparison by copying it.
+    Variable *T = nullptr;
+    _mov(T, Comparison);
+    _sub(T, Ctx->getConstantInt32(Min));
+    Comparison = T;
+  }
+
+  _cmp(Comparison, Ctx->getConstantInt32(Max - Min));
+
+  return Comparison;
+}
+
+void TargetX8632::lowerCaseCluster(const CaseCluster &Case, Operand *Comparison,
+                                   bool DoneCmp, CfgNode *DefaultTarget) {
+  switch (Case.getKind()) {
+  case CaseCluster::JumpTable: {
+    InstX86Label *SkipJumpTable = nullptr;
+
+    Operand *RangeIndex =
+        lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
+    if (DefaultTarget == nullptr) {
+      // Skip over the jump table logic if the comparison is out of range and
+      // there is no default.
+      SkipJumpTable = InstX86Label::create(Func, this);
+      _br(CondX86::Br_a, SkipJumpTable);
+    } else {
+      _br(CondX86::Br_a, DefaultTarget);
+    }
+
+    InstJumpTable *JumpTable = Case.getJumpTable();
+    Context.insert(JumpTable);
+
+    // Make sure the index is a register of the same width as the base.
+    Variable *Index;
+    const Type PointerType = getPointerType();
+    if (RangeIndex->getType() != PointerType) {
+      Index = makeReg(PointerType);
+      if (RangeIndex->getType() == IceType_i64) {
+        assert(Traits::Is64Bit);
+        _mov(Index, RangeIndex); // trunc
+      } else {
+        Operand *RangeIndexRM = legalize(RangeIndex, Legal_Reg | Legal_Mem);
+        _movzx(Index, RangeIndexRM);
+      }
+    } else {
+      Index = legalizeToReg(RangeIndex);
+    }
+
+    constexpr RelocOffsetT RelocOffset = 0;
+    constexpr Variable *NoBase = nullptr;
+    constexpr Constant *NoOffset = nullptr;
+    auto JTName = GlobalString::createWithString(Ctx, JumpTable->getName());
+    Constant *Offset = Ctx->getConstantSym(RelocOffset, JTName);
+    uint16_t Shift = typeWidthInBytesLog2(PointerType);
+    constexpr auto Segment = X86OperandMem::SegmentRegisters::DefaultSegment;
+
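+    // The branch target is read from the jump table at
+    // JT_base + (Index << Shift), where Shift is log2 of the pointer width;
+    // for a 64-bit pointer type the table base is first materialized with lea.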
+    Variable *Target = nullptr;
+    if (PointerType == IceType_i32) {
+      _mov(Target, X86OperandMem::create(Func, PointerType, NoBase, Offset,
+                                         Index, Shift, Segment));
+    } else {
+      auto *Base = makeReg(IceType_i64);
+      _lea(Base, X86OperandMem::create(Func, IceType_void, NoBase, Offset));
+      _mov(Target, X86OperandMem::create(Func, PointerType, Base, NoOffset,
+                                         Index, Shift, Segment));
+    }
+
+    lowerIndirectJump(Target);
+
+    if (DefaultTarget == nullptr)
+      Context.insert(SkipJumpTable);
+    return;
+  }
+  case CaseCluster::Range: {
+    if (Case.isUnitRange()) {
+      // Single item
+      if (!DoneCmp) {
+        Constant *Value = Ctx->getConstantInt32(Case.getLow());
+        _cmp(Comparison, Value);
+      }
+      _br(CondX86::Br_e, Case.getTarget());
+    } else if (DoneCmp && Case.isPairRange()) {
+      // Range of two items with the first item already compared against.
+      _br(CondX86::Br_e, Case.getTarget());
+      Constant *Value = Ctx->getConstantInt32(Case.getHigh());
+      _cmp(Comparison, Value);
+      _br(CondX86::Br_e, Case.getTarget());
+    } else {
+      // Range
+      lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
+      _br(CondX86::Br_be, Case.getTarget());
+    }
+    if (DefaultTarget != nullptr)
+      _br(DefaultTarget);
+    return;
+  }
+  }
+}
+
+void TargetX8632::lowerSwitch(const InstSwitch *Instr) {
+  // Group cases together and navigate through them with a binary search.
+  CaseClusterArray CaseClusters = CaseCluster::clusterizeSwitch(Func, Instr);
+  Operand *Src0 = Instr->getComparison();
+  CfgNode *DefaultTarget = Instr->getLabelDefault();
+
+  assert(CaseClusters.size() != 0); // Should always be at least one
+
+  if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
+    Src0 = legalize(Src0); // get Base/Index into physical registers
+    Operand *Src0Lo = loOperand(Src0);
+    Operand *Src0Hi = hiOperand(Src0);
+    if (CaseClusters.back().getHigh() > UINT32_MAX) {
+      // TODO(ascull): handle 64-bit case properly (currently naive version)
+      // This might be handled by a higher level lowering of switches.
+      SizeT NumCases = Instr->getNumCases();
+      if (NumCases >= 2) {
+        Src0Lo = legalizeToReg(Src0Lo);
+        Src0Hi = legalizeToReg(Src0Hi);
+      } else {
+        Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
+        Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
+      }
+      for (SizeT I = 0; I < NumCases; ++I) {
+        Constant *ValueLo = Ctx->getConstantInt32(Instr->getValue(I));
+        Constant *ValueHi = Ctx->getConstantInt32(Instr->getValue(I) >> 32);
+        InstX86Label *Label = InstX86Label::create(Func, this);
+        _cmp(Src0Lo, ValueLo);
+        _br(CondX86::Br_ne, Label);
+        _cmp(Src0Hi, ValueHi);
+        _br(CondX86::Br_e, Instr->getLabel(I));
+        Context.insert(Label);
+      }
+      _br(Instr->getLabelDefault());
+      return;
+    } else {
+      // All the values are 32-bit so just check the operand is too and then
+      // fall through to the 32-bit implementation. This is a common case.
+      Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
+      Constant *Zero = Ctx->getConstantInt32(0);
+      _cmp(Src0Hi, Zero);
+      _br(CondX86::Br_ne, DefaultTarget);
+      Src0 = Src0Lo;
+    }
+  }
+
+  // 32-bit lowering
+
+  if (CaseClusters.size() == 1) {
+    // Jump straight to default if needed. Currently a common case as jump
+    // tables occur on their own.
+    constexpr bool DoneCmp = false;
+    lowerCaseCluster(CaseClusters.front(), Src0, DoneCmp, DefaultTarget);
+    return;
+  }
+
+  // The comparison will be used multiple times, so get it into a register
+  // early.
+  Variable *Comparison = legalizeToReg(Src0);
+
+  // A SearchSpan is a contiguous range of case clusters still to be searched.
+  struct SearchSpan {
+    SearchSpan(SizeT Begin, SizeT Size, InstX86Label *Label)
+        : Begin(Begin), Size(Size), Label(Label) {}
+
+    SizeT Begin;
+    SizeT Size;
+    InstX86Label *Label;
+  };
+  // The stack will only grow to the height of the tree, so 12 should be
+  // plenty.
+  std::stack<SearchSpan, llvm::SmallVector<SearchSpan, 12>> SearchSpanStack;
+  SearchSpanStack.emplace(0, CaseClusters.size(), nullptr);
+  bool DoneCmp = false;
+
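+  // Iterative binary search over the sorted clusters: each pivot emits
+  // cmp + jb to the left half's label and falls through into the right half
+  // (which starts at the pivot), so DoneCmp records that the flags already
+  // hold the comparison against the pivot's low value.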
+  while (!SearchSpanStack.empty()) {
+    SearchSpan Span = SearchSpanStack.top();
+    SearchSpanStack.pop();
+
+    if (Span.Label != nullptr)
+      Context.insert(Span.Label);
+
+    switch (Span.Size) {
+    case 0:
+      llvm::report_fatal_error("Invalid SearchSpan size");
+      break;
+
+    case 1:
+      lowerCaseCluster(CaseClusters[Span.Begin], Comparison, DoneCmp,
+                       SearchSpanStack.empty() ? nullptr : DefaultTarget);
+      DoneCmp = false;
+      break;
+
+    case 2: {
+      const CaseCluster *CaseA = &CaseClusters[Span.Begin];
+      const CaseCluster *CaseB = &CaseClusters[Span.Begin + 1];
+
+      // Placing a range last may allow register clobbering during the range
+      // test. That means there is no need to clone the register. If it is a
+      // unit range the comparison may have already been done in the binary
+      // search (DoneCmp) and so it should be placed first. If this is a range
+      // of two items and the comparison with the low value has already been
+      // done, comparing with the other element is cheaper than a range test.
+      // If the low end of the range is zero then there is no subtraction and
+      // nothing to be gained.
+      if (!CaseA->isUnitRange() &&
+          !(CaseA->getLow() == 0 || (DoneCmp && CaseA->isPairRange()))) {
+        std::swap(CaseA, CaseB);
+        DoneCmp = false;
+      }
+
+      lowerCaseCluster(*CaseA, Comparison, DoneCmp);
+      DoneCmp = false;
+      lowerCaseCluster(*CaseB, Comparison, DoneCmp,
+                       SearchSpanStack.empty() ? nullptr : DefaultTarget);
+    } break;
+
+    default:
+      // Pick the middle item and branch below (b) or above-or-equal (ae).
+      SizeT PivotIndex = Span.Begin + (Span.Size / 2);
+      const CaseCluster &Pivot = CaseClusters[PivotIndex];
+      Constant *Value = Ctx->getConstantInt32(Pivot.getLow());
+      InstX86Label *Label = InstX86Label::create(Func, this);
+      _cmp(Comparison, Value);
+      // TODO(ascull): does it always have to be far?
+      _br(CondX86::Br_b, Label, InstX86Br::Far);
+      // Lower the left and (pivot+right) sides, falling through to the right.
+      SearchSpanStack.emplace(Span.Begin, Span.Size / 2, Label);
+      SearchSpanStack.emplace(PivotIndex, Span.Size - (Span.Size / 2), nullptr);
+      DoneCmp = true;
+      break;
+    }
+  }
+
+  _br(DefaultTarget);
+}
+
+/// The following pattern occurs often in lowered C and C++ code:
+///
+///   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
+///   %cmp.ext = sext <n x i1> %cmp to <n x ty>
+///
+/// We can eliminate the sext operation by copying the result of pcmpeqd,
+/// pcmpgtd, or cmpps (which produce sign extended results) to the result of the
+/// sext operation.
+
+void TargetX8632::eliminateNextVectorSextInstruction(
+    Variable *SignExtendedResult) {
+  if (auto *NextCast =
+          llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
+    if (NextCast->getCastKind() == InstCast::Sext &&
+        NextCast->getSrc(0) == SignExtendedResult) {
+      NextCast->setDeleted();
+      _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult));
+      // Skip over the instruction.
+      Context.advanceNext();
+    }
+  }
+}
+
+void TargetX8632::lowerUnreachable(const InstUnreachable * /*Instr*/) {
+  _ud2();
+  // Add a fake use of esp to make sure esp adjustments after the unreachable
+  // do not get dead-code eliminated.
+  keepEspLiveAtExit();
+}
+
+void TargetX8632::lowerBreakpoint(const InstBreakpoint * /*Instr*/) { _int3(); }
+
+void TargetX8632::lowerRMW(const InstX86FakeRMW *RMW) {
+  // If the beacon variable's live range does not end in this instruction, then
+  // it must end in the modified Store instruction that follows. This means
+  // that the original Store instruction is still there, either because the
+  // value being stored is used beyond the Store instruction, or because dead
+  // code elimination did not happen. In either case, we cancel RMW lowering
+  // (and the caller deletes the RMW instruction).
+  if (!RMW->isLastUse(RMW->getBeacon()))
+    return;
+  Operand *Src = RMW->getData();
+  Type Ty = Src->getType();
+  X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
+  doMockBoundsCheck(Addr);
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
+    Src = legalizeUndef(Src);
+    Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm);
+    Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm);
+    auto *AddrLo = llvm::cast<X86OperandMem>(loOperand(Addr));
+    auto *AddrHi = llvm::cast<X86OperandMem>(hiOperand(Addr));
+    switch (RMW->getOp()) {
+    default:
+      // TODO(stichnot): Implement other arithmetic operators.
+      break;
+    case InstArithmetic::Add:
+      _add_rmw(AddrLo, SrcLo);
+      _adc_rmw(AddrHi, SrcHi);
+      return;
+    case InstArithmetic::Sub:
+      _sub_rmw(AddrLo, SrcLo);
+      _sbb_rmw(AddrHi, SrcHi);
+      return;
+    case InstArithmetic::And:
+      _and_rmw(AddrLo, SrcLo);
+      _and_rmw(AddrHi, SrcHi);
+      return;
+    case InstArithmetic::Or:
+      _or_rmw(AddrLo, SrcLo);
+      _or_rmw(AddrHi, SrcHi);
+      return;
+    case InstArithmetic::Xor:
+      _xor_rmw(AddrLo, SrcLo);
+      _xor_rmw(AddrHi, SrcHi);
+      return;
+    }
+  } else {
+    // x86-32: i8, i16, i32
+    // x86-64: i8, i16, i32, i64
+    switch (RMW->getOp()) {
+    default:
+      // TODO(stichnot): Implement other arithmetic operators.
+      break;
+    case InstArithmetic::Add:
+      Src = legalize(Src, Legal_Reg | Legal_Imm);
+      _add_rmw(Addr, Src);
+      return;
+    case InstArithmetic::Sub:
+      Src = legalize(Src, Legal_Reg | Legal_Imm);
+      _sub_rmw(Addr, Src);
+      return;
+    case InstArithmetic::And:
+      Src = legalize(Src, Legal_Reg | Legal_Imm);
+      _and_rmw(Addr, Src);
+      return;
+    case InstArithmetic::Or:
+      Src = legalize(Src, Legal_Reg | Legal_Imm);
+      _or_rmw(Addr, Src);
+      return;
+    case InstArithmetic::Xor:
+      Src = legalize(Src, Legal_Reg | Legal_Imm);
+      _xor_rmw(Addr, Src);
+      return;
+    }
+  }
+  llvm::report_fatal_error("Couldn't lower RMW instruction");
+}
+
+void TargetX8632::lowerOther(const Inst *Instr) {
+  if (const auto *RMW = llvm::dyn_cast<InstX86FakeRMW>(Instr)) {
+    lowerRMW(RMW);
+  } else {
+    TargetLowering::lowerOther(Instr);
+  }
+}
+
+/// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to preserve
+/// integrity of liveness analysis. Undef values are also turned into zeroes,
+/// since loOperand() and hiOperand() don't expect Undef input.
+void TargetX8632::prelowerPhis() {
+  if (Traits::Is64Bit) {
+    // On x86-64 we don't need to prelower phis -- the architecture can handle
+    // 64-bit integers natively.
+    return;
+  }
+
+  PhiLowering::prelowerPhis32Bit<TargetX8632>(this, Context.getNode(), Func);
+}
+
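+/// Replace instructions the target cannot lower inline -- e.g. 64-bit integer
+/// division on x86-32, scalar frem, and certain casts -- with calls to
+/// runtime helper functions, tracking the stack space each resulting call
+/// needs so the maximum out-args area can be sized accordingly.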
+void TargetX8632::genTargetHelperCallFor(Inst *Instr) {
+  uint32_t StackArgumentsSize = 0;
+  if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
+    RuntimeHelper HelperID = RuntimeHelper::H_Num;
+    Variable *Dest = Arith->getDest();
+    Type DestTy = Dest->getType();
+    if (!Traits::Is64Bit && DestTy == IceType_i64) {
+      switch (Arith->getOp()) {
+      default:
+        return;
+      case InstArithmetic::Udiv:
+        HelperID = RuntimeHelper::H_udiv_i64;
+        break;
+      case InstArithmetic::Sdiv:
+        HelperID = RuntimeHelper::H_sdiv_i64;
+        break;
+      case InstArithmetic::Urem:
+        HelperID = RuntimeHelper::H_urem_i64;
+        break;
+      case InstArithmetic::Srem:
+        HelperID = RuntimeHelper::H_srem_i64;
+        break;
+      }
+    } else if (isVectorType(DestTy)) {
+      Variable *Dest = Arith->getDest();
+      Operand *Src0 = Arith->getSrc(0);
+      Operand *Src1 = Arith->getSrc(1);
+      switch (Arith->getOp()) {
+      default:
+        return;
+      case InstArithmetic::Mul:
+        if (DestTy == IceType_v16i8) {
+          scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
+          Arith->setDeleted();
+        }
+        return;
+      case InstArithmetic::Shl:
+      case InstArithmetic::Lshr:
+      case InstArithmetic::Ashr:
+        if (llvm::isa<Constant>(Src1)) {
+          return;
+        }
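+        // Fall through: non-constant shift amounts are scalarized below.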
+      case InstArithmetic::Udiv:
+      case InstArithmetic::Urem:
+      case InstArithmetic::Sdiv:
+      case InstArithmetic::Srem:
+      case InstArithmetic::Frem:
+        scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
+        Arith->setDeleted();
+        return;
+      }
+    } else {
+      switch (Arith->getOp()) {
+      default:
+        return;
+      case InstArithmetic::Frem:
+        if (isFloat32Asserting32Or64(DestTy))
+          HelperID = RuntimeHelper::H_frem_f32;
+        else
+          HelperID = RuntimeHelper::H_frem_f64;
+      }
+    }
+    constexpr SizeT MaxSrcs = 2;
+    InstCall *Call = makeHelperCall(HelperID, Dest, MaxSrcs);
+    Call->addArg(Arith->getSrc(0));
+    Call->addArg(Arith->getSrc(1));
+    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
+    Context.insert(Call);
+    Arith->setDeleted();
+  } else if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
+    InstCast::OpKind CastKind = Cast->getCastKind();
+    Operand *Src0 = Cast->getSrc(0);
+    const Type SrcType = Src0->getType();
+    Variable *Dest = Cast->getDest();
+    const Type DestTy = Dest->getType();
+    RuntimeHelper HelperID = RuntimeHelper::H_Num;
+    Variable *CallDest = Dest;
+    switch (CastKind) {
+    default:
+      return;
+    case InstCast::Fptosi:
+      if (!Traits::Is64Bit && DestTy == IceType_i64) {
+        HelperID = isFloat32Asserting32Or64(SrcType)
+                       ? RuntimeHelper::H_fptosi_f32_i64
+                       : RuntimeHelper::H_fptosi_f64_i64;
+      } else {
+        return;
+      }
+      break;
+    case InstCast::Fptoui:
+      if (isVectorType(DestTy)) {
+        assert(DestTy == IceType_v4i32);
+        assert(SrcType == IceType_v4f32);
+        HelperID = RuntimeHelper::H_fptoui_4xi32_f32;
+      } else if (DestTy == IceType_i64 ||
+                 (!Traits::Is64Bit && DestTy == IceType_i32)) {
+        if (Traits::Is64Bit) {
+          HelperID = isFloat32Asserting32Or64(SrcType)
+                         ? RuntimeHelper::H_fptoui_f32_i64
+                         : RuntimeHelper::H_fptoui_f64_i64;
+        } else if (isInt32Asserting32Or64(DestTy)) {
+          HelperID = isFloat32Asserting32Or64(SrcType)
+                         ? RuntimeHelper::H_fptoui_f32_i32
+                         : RuntimeHelper::H_fptoui_f64_i32;
+        } else {
+          HelperID = isFloat32Asserting32Or64(SrcType)
+                         ? RuntimeHelper::H_fptoui_f32_i64
+                         : RuntimeHelper::H_fptoui_f64_i64;
+        }
+      } else {
+        return;
+      }
+      break;
+    case InstCast::Sitofp:
+      if (!Traits::Is64Bit && SrcType == IceType_i64) {
+        HelperID = isFloat32Asserting32Or64(DestTy)
+                       ? RuntimeHelper::H_sitofp_i64_f32
+                       : RuntimeHelper::H_sitofp_i64_f64;
+      } else {
+        return;
+      }
+      break;
+    case InstCast::Uitofp:
+      if (isVectorType(SrcType)) {
+        assert(DestTy == IceType_v4f32);
+        assert(SrcType == IceType_v4i32);
+        HelperID = RuntimeHelper::H_uitofp_4xi32_4xf32;
+      } else if (SrcType == IceType_i64 ||
+                 (!Traits::Is64Bit && SrcType == IceType_i32)) {
+        if (isInt32Asserting32Or64(SrcType)) {
+          HelperID = isFloat32Asserting32Or64(DestTy)
+                         ? RuntimeHelper::H_uitofp_i32_f32
+                         : RuntimeHelper::H_uitofp_i32_f64;
+        } else {
+          HelperID = isFloat32Asserting32Or64(DestTy)
+                         ? RuntimeHelper::H_uitofp_i64_f32
+                         : RuntimeHelper::H_uitofp_i64_f64;
+        }
+      } else {
+        return;
+      }
+      break;
+    case InstCast::Bitcast: {
+      if (DestTy == Src0->getType())
+        return;
+      switch (DestTy) {
+      default:
+        return;
+      case IceType_i8:
+        assert(Src0->getType() == IceType_v8i1);
+        HelperID = RuntimeHelper::H_bitcast_8xi1_i8;
+        CallDest = Func->makeVariable(IceType_i32);
+        break;
+      case IceType_i16:
+        assert(Src0->getType() == IceType_v16i1);
+        HelperID = RuntimeHelper::H_bitcast_16xi1_i16;
+        CallDest = Func->makeVariable(IceType_i32);
+        break;
+      case IceType_v8i1: {
+        assert(Src0->getType() == IceType_i8);
+        HelperID = RuntimeHelper::H_bitcast_i8_8xi1;
+        Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
+        // Arguments to functions are required to be at least 32 bits wide.
+        Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
+        Src0 = Src0AsI32;
+      } break;
+      case IceType_v16i1: {
+        assert(Src0->getType() == IceType_i16);
+        HelperID = RuntimeHelper::H_bitcast_i16_16xi1;
+        Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
+        // Arguments to functions are required to be at least 32 bits wide.
+        Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
+        Src0 = Src0AsI32;
+      } break;
+      }
+    } break;
+    }
+    constexpr SizeT MaxSrcs = 1;
+    InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs);
+    Call->addArg(Src0);
+    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
+    Context.insert(Call);
+    // The PNaCl ABI disallows i8/i16 return types, so truncate the helper call
+    // result to the appropriate type as necessary.
+    if (CallDest->getType() != Dest->getType())
+      Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest);
+    Cast->setDeleted();
+  } else if (auto *Intrinsic = llvm::dyn_cast<InstIntrinsic>(Instr)) {
+    CfgVector<Type> ArgTypes;
+    Type ReturnType = IceType_void;
+    switch (Intrinsic->getIntrinsicID()) {
+    default:
+      return;
+    case Intrinsics::Ctpop: {
+      Operand *Val = Intrinsic->getArg(0);
+      Type ValTy = Val->getType();
+      if (ValTy == IceType_i64)
+        ArgTypes = {IceType_i64};
+      else
+        ArgTypes = {IceType_i32};
+      ReturnType = IceType_i32;
+    } break;
+    case Intrinsics::Longjmp:
+      ArgTypes = {IceType_i32, IceType_i32};
+      ReturnType = IceType_void;
+      break;
+    case Intrinsics::Memcpy:
+      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
+      ReturnType = IceType_void;
+      break;
+    case Intrinsics::Memmove:
+      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
+      ReturnType = IceType_void;
+      break;
+    case Intrinsics::Memset:
+      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
+      ReturnType = IceType_void;
+      break;
+    case Intrinsics::Setjmp:
+      ArgTypes = {IceType_i32};
+      ReturnType = IceType_i32;
+      break;
+    }
+    StackArgumentsSize = getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
+  } else if (auto *Call = llvm::dyn_cast<InstCall>(Instr)) {
+    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
+  } else if (auto *Ret = llvm::dyn_cast<InstRet>(Instr)) {
+    if (!Ret->hasRetValue())
+      return;
+    Operand *RetValue = Ret->getRetValue();
+    Type ReturnType = RetValue->getType();
+    if (!isScalarFloatingType(ReturnType))
+      return;
+    StackArgumentsSize = typeWidthInBytes(ReturnType);
+  } else {
+    return;
+  }
+  StackArgumentsSize = Traits::applyStackAlignment(StackArgumentsSize);
+  updateMaxOutArgsSizeBytes(StackArgumentsSize);
+}
+
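+/// Compute the stack space used by a call's outgoing arguments: arguments
+/// that land in XMM or GPR argument registers take no stack space, vector
+/// arguments are stack-aligned first, and on x86-32 room is also reserved
+/// for storing an x87 floating-point return value.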
+uint32_t
+TargetX8632::getCallStackArgumentsSizeBytes(const CfgVector<Type> &ArgTypes,
+                                            Type ReturnType) {
+  uint32_t OutArgumentsSizeBytes = 0;
+  uint32_t XmmArgCount = 0;
+  uint32_t GprArgCount = 0;
+  for (SizeT i = 0, NumArgTypes = ArgTypes.size(); i < NumArgTypes; ++i) {
+    Type Ty = ArgTypes[i];
+    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
+    assert(typeWidthInBytes(Ty) >= 4);
+    if (isVectorType(Ty) &&
+        Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgCount))
+            .hasValue()) {
+      ++XmmArgCount;
+    } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
+               Traits::getRegisterForXmmArgNum(
+                   Traits::getArgIndex(i, XmmArgCount))
+                   .hasValue()) {
+      ++XmmArgCount;
+    } else if (isScalarIntegerType(Ty) &&
+               Traits::getRegisterForGprArgNum(
+                   Ty, Traits::getArgIndex(i, GprArgCount))
+                   .hasValue()) {
+      // The 64 bit ABI allows some integers to be passed in GPRs.
+      ++GprArgCount;
+    } else {
+      if (isVectorType(Ty)) {
+        OutArgumentsSizeBytes =
+            Traits::applyStackAlignment(OutArgumentsSizeBytes);
+      }
+      OutArgumentsSizeBytes += typeWidthInBytesOnStack(Ty);
+    }
+  }
+  if (Traits::Is64Bit)
+    return OutArgumentsSizeBytes;
+  // The 32 bit ABI requires floating point values to be returned on the x87 FP
+  // stack. Ensure there is enough space for the fstp/movs for floating returns.
+  if (isScalarFloatingType(ReturnType)) {
+    OutArgumentsSizeBytes =
+        std::max(OutArgumentsSizeBytes,
+                 static_cast<uint32_t>(typeWidthInBytesOnStack(ReturnType)));
+  }
+  return OutArgumentsSizeBytes;
+}
+
+uint32_t TargetX8632::getCallStackArgumentsSizeBytes(const InstCall *Instr) {
+  // Build a vector of the arguments' types.
+  const SizeT NumArgs = Instr->getNumArgs();
+  CfgVector<Type> ArgTypes;
+  ArgTypes.reserve(NumArgs);
+  for (SizeT i = 0; i < NumArgs; ++i) {
+    Operand *Arg = Instr->getArg(i);
+    ArgTypes.emplace_back(Arg->getType());
+  }
+  // Compute the return type (if any).
+  Type ReturnType = IceType_void;
+  Variable *Dest = Instr->getDest();
+  if (Dest != nullptr)
+    ReturnType = Dest->getType();
+  return getShadowStoreSize() +
+         getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
+}
+
+Variable *TargetX8632::makeZeroedRegister(Type Ty, RegNumT RegNum) {
+  Variable *Reg = makeReg(Ty, RegNum);
+  switch (Ty) {
+  case IceType_i1:
+  case IceType_i8:
+  case IceType_i16:
+  case IceType_i32:
+  case IceType_i64:
+    // Conservatively do "mov reg, 0" to avoid modifying FLAGS.
+    _mov(Reg, Ctx->getConstantZero(Ty));
+    break;
+  case IceType_f32:
+  case IceType_f64:
+    Context.insert<InstFakeDef>(Reg);
+    _xorps(Reg, Reg);
+    break;
+  default:
+    // All vector types use the same pxor instruction.
+    assert(isVectorType(Ty));
+    Context.insert<InstFakeDef>(Reg);
+    _pxor(Reg, Reg);
+    break;
+  }
+  return Reg;
+}
+
+// There is no support for loading or emitting vector constants, so the vector
+// values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are
+// initialized with register operations.
+//
+// TODO(wala): Add limited support for vector constants so that complex
+// initialization in registers is unnecessary.
+
+Variable *TargetX8632::makeVectorOfZeros(Type Ty, RegNumT RegNum) {
+  return makeZeroedRegister(Ty, RegNum);
+}
+
+Variable *TargetX8632::makeVectorOfMinusOnes(Type Ty, RegNumT RegNum) {
+  Variable *MinusOnes = makeReg(Ty, RegNum);
+  // Insert a FakeDef so the live range of MinusOnes is not overestimated.
+  Context.insert<InstFakeDef>(MinusOnes);
+  if (Ty == IceType_f64)
+    // Making a vector of minus ones of type f64 is currently only used for the
+    // fabs intrinsic.  To use the f64 type to create this mask with pcmpeqq
+    // requires SSE 4.1.  Since we're just creating a mask, pcmpeqd does the
+    // same job and only requires SSE2.
+    _pcmpeq(MinusOnes, MinusOnes, IceType_f32);
+  else
+    _pcmpeq(MinusOnes, MinusOnes);
+  return MinusOnes;
+}
+
+Variable *TargetX8632::makeVectorOfOnes(Type Ty, RegNumT RegNum) {
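+  // Build <1, 1, ...> as 0 - (-1) per lane: psub of the all-ones vector from
+  // a zeroed register avoids loading a vector constant from memory.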
+  Variable *Dest = makeVectorOfZeros(Ty, RegNum);
+  Variable *MinusOne = makeVectorOfMinusOnes(Ty);
+  _psub(Dest, MinusOne);
+  return Dest;
+}
+
+Variable *TargetX8632::makeVectorOfHighOrderBits(Type Ty, RegNumT RegNum) {
+  assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
+         Ty == IceType_v16i8);
+  if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
+    Variable *Reg = makeVectorOfOnes(Ty, RegNum);
+    SizeT Shift =
+        typeWidthInBytes(typeElementType(Ty)) * Traits::X86_CHAR_BIT - 1;
+    _psll(Reg, Ctx->getConstantInt8(Shift));
+    return Reg;
+  } else {
+    // SSE has no left shift operation for vectors of 8 bit integers.
+    constexpr uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
+    Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
+    Variable *Reg = makeReg(Ty, RegNum);
+    _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
+    _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
+    return Reg;
+  }
+}
+
+/// Construct a mask in a register that can be and'ed with a floating-point
+/// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32
+/// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as vector of
+/// ones logically right shifted one bit.
+// TODO(stichnot): Fix the wala TODO above, to represent vector constants in
+// memory.
+
+Variable *TargetX8632::makeVectorOfFabsMask(Type Ty, RegNumT RegNum) {
+  Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
+  _psrl(Reg, Ctx->getConstantInt8(1));
+  return Reg;
+}
+
+typename TargetX8632::X86OperandMem *
+TargetX8632::getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
+                                          uint32_t Offset) {
+  // Ensure that Loc is a stack slot.
+  assert(Slot->mustNotHaveReg());
+  assert(Slot->getRegNum().hasNoValue());
+  // Compute the location of Loc in memory.
+  // TODO(wala,stichnot): lea should not
+  // be required. The address of the stack slot is known at compile time
+  // (although not until after addProlog()).
+  const Type PointerType = getPointerType();
+  Variable *Loc = makeReg(PointerType);
+  _lea(Loc, Slot);
+  Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
+  return X86OperandMem::create(Func, Ty, Loc, ConstantOffset);
+}
+
+/// Lowering helper to copy a scalar integer source operand into some 8-bit GPR.
+/// Src is assumed to already be legalized.  If the source operand is known to
+/// be a memory or immediate operand, a simple mov will suffice.  But if the
+/// source operand can be a physical register, then it must first be copied into
+/// a physical register that is truncable to 8-bit, then truncated into a
+/// physical register that can receive a truncation, and finally copied into the
+/// result 8-bit register (which in general can be any 8-bit register).  For
+/// example, moving %ebp into %ah may be accomplished as:
+///   movl %ebp, %edx
+///   mov_trunc %edx, %dl  // this redundant assignment is ultimately elided
+///   movb %dl, %ah
+/// On the other hand, moving a memory or immediate operand into ah:
+///   movb 4(%ebp), %ah
+///   movb $my_imm, %ah
+///
+/// Note #1.  On a 64-bit target, the "movb 4(%ebp), %ah" is likely not
+/// encodable, so RegNum=Reg_ah should NOT be given as an argument.  Instead,
+/// use RegNum=RegNumT() and then let the caller do a separate copy into
+/// Reg_ah.
+///
+/// Note #2.  ConstantRelocatable operands are also put through this process
+/// (not truncated directly) because our ELF emitter does R_386_32 relocations
+/// but not R_386_8 relocations.
+///
+/// Note #3.  If Src is a Variable, the result will be an infinite-weight i8
+/// Variable with the RCX86_IsTrunc8Rcvr register class.  As such, this helper
+/// is a convenient way to prevent ah/bh/ch/dh from being an (invalid) argument
+/// to the pinsrb instruction.
+
+Variable *TargetX8632::copyToReg8(Operand *Src, RegNumT RegNum) {
+  Type Ty = Src->getType();
+  assert(isScalarIntegerType(Ty));
+  assert(Ty != IceType_i1);
+  Variable *Reg = makeReg(IceType_i8, RegNum);
+  Reg->setRegClass(RCX86_IsTrunc8Rcvr);
+  if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) {
+    Variable *SrcTruncable = makeReg(Ty);
+    switch (Ty) {
+    case IceType_i64:
+      SrcTruncable->setRegClass(RCX86_Is64To8);
+      break;
+    case IceType_i32:
+      SrcTruncable->setRegClass(RCX86_Is32To8);
+      break;
+    case IceType_i16:
+      SrcTruncable->setRegClass(RCX86_Is16To8);
+      break;
+    default:
+      // i8 - just use default register class
+      break;
+    }
+    Variable *SrcRcvr = makeReg(IceType_i8);
+    SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr);
+    _mov(SrcTruncable, Src);
+    _mov(SrcRcvr, SrcTruncable);
+    Src = SrcRcvr;
+  }
+  _mov(Reg, Src);
+  return Reg;
+}
+
+/// Helper for legalize() to emit the right code to lower an operand to a
+/// register of the appropriate type.
+
+Variable *TargetX8632::copyToReg(Operand *Src, RegNumT RegNum) {
+  Type Ty = Src->getType();
+  Variable *Reg = makeReg(Ty, RegNum);
+  if (isVectorType(Ty)) {
+    _movp(Reg, Src);
+  } else {
+    _mov(Reg, Src);
+  }
+  return Reg;
+}
+
+Operand *TargetX8632::legalize(Operand *From, LegalMask Allowed,
+                               RegNumT RegNum) {
+  const Type Ty = From->getType();
+  // Assert that a physical register is allowed. To date, all calls to
+  // legalize() allow a physical register. If a physical register needs to be
+  // explicitly disallowed, then new code will need to be written to force a
+  // spill.
+  assert(Allowed & Legal_Reg);
+  // If we're asking for a specific physical register, make sure we're not
+  // allowing any other operand kinds. (This could be future work, e.g. allow
+  // the shl shift amount to be either an immediate or in ecx.)
+  assert(RegNum.hasNoValue() || Allowed == Legal_Reg);
+
+  // Substitute with an available infinite-weight variable if possible.  Only do
+  // this when we are not asking for a specific register, and when the
+  // substitution is not locked to a specific register, and when the types
+  // match, in order to capture the vast majority of opportunities and avoid
+  // corner cases in the lowering.
+  if (RegNum.hasNoValue()) {
+    if (Variable *Subst = getContext().availabilityGet(From)) {
+      // At this point we know there is a potential substitution available.
+      if (Subst->mustHaveReg() && !Subst->hasReg()) {
+        // At this point we know the substitution will have a register.
+        if (From->getType() == Subst->getType()) {
+          // At this point we know the substitution's register is compatible.
+          return Subst;
+        }
+      }
+    }
+  }
+
+  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(From)) {
+    // Before doing anything with a Mem operand, we need to ensure that the
+    // Base and Index components are in physical registers.
+    Variable *Base = Mem->getBase();
+    Variable *Index = Mem->getIndex();
+    Constant *Offset = Mem->getOffset();
+    Variable *RegBase = nullptr;
+    Variable *RegIndex = nullptr;
+    uint16_t Shift = Mem->getShift();
+    if (Base) {
+      RegBase = llvm::cast<Variable>(
+          legalize(Base, Legal_Reg | Legal_Rematerializable));
+    }
+    if (Index) {
+      // TODO(jpp): perhaps we should only allow Legal_Reg if
+      // Base->isRematerializable.
+      RegIndex = llvm::cast<Variable>(
+          legalize(Index, Legal_Reg | Legal_Rematerializable));
+    }
+
+    if (Base != RegBase || Index != RegIndex) {
+      Mem = X86OperandMem::create(Func, Ty, RegBase, Offset, RegIndex, Shift,
+                                  Mem->getSegmentRegister());
+    }
+
+    From = Mem;
+
+    if (!(Allowed & Legal_Mem)) {
+      From = copyToReg(From, RegNum);
+    }
+    return From;
+  }
+
+  if (auto *Const = llvm::dyn_cast<Constant>(From)) {
+    if (llvm::isa<ConstantUndef>(Const)) {
+      From = legalizeUndef(Const, RegNum);
+      if (isVectorType(Ty))
+        return From;
+      Const = llvm::cast<Constant>(From);
+    }
+    // There should be no constants of vector type (other than undef).
+    assert(!isVectorType(Ty));
+
+    // If the operand is a 64 bit constant integer we need to legalize it to a
+    // register in x86-64.
+    if (Traits::Is64Bit) {
+      if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Const)) {
+        if (!Utils::IsInt(32, C64->getValue())) {
+          if (RegNum.hasValue()) {
+            assert(Traits::getGprForType(IceType_i64, RegNum) == RegNum);
+          }
+          return copyToReg(Const, RegNum);
+        }
+      }
+    }
+
+    if (!llvm::isa<ConstantRelocatable>(Const)) {
+      if (isScalarFloatingType(Ty)) {
+        // Convert a scalar floating point constant into an explicit memory
+        // operand.
+        if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(Const)) {
+          if (Utils::isPositiveZero(ConstFloat->getValue()))
+            return makeZeroedRegister(Ty, RegNum);
+        } else if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(Const)) {
+          if (Utils::isPositiveZero(ConstDouble->getValue()))
+            return makeZeroedRegister(Ty, RegNum);
+        }
+
+        auto *CFrom = llvm::cast<Constant>(From);
+        assert(CFrom->getShouldBePooled());
+        Constant *Offset = Ctx->getConstantSym(0, CFrom->getLabelName());
+        auto *Mem = X86OperandMem::create(Func, Ty, nullptr, Offset);
+        From = Mem;
+      }
+    }
+
+    bool NeedsReg = false;
+    if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
+      // Immediate specifically not allowed.
+      NeedsReg = true;
+    if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
+      // On x86, FP constants are lowered to mem operands.
+      NeedsReg = true;
+    if (NeedsReg) {
+      From = copyToReg(From, RegNum);
+    }
+    return From;
+  }
+
+  if (auto *Var = llvm::dyn_cast<Variable>(From)) {
+    // Check if the variable is guaranteed a physical register. This can happen
+    // either when the variable is pre-colored or when it is assigned infinite
+    // weight.
+    bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
+    bool MustRematerialize =
+        (Var->isRematerializable() && !(Allowed & Legal_Rematerializable));
+    // We need a new physical register for the operand if:
+    // - Mem is not allowed and Var isn't guaranteed a physical register, or
+    // - RegNum is required and Var->getRegNum() doesn't match, or
+    // - Var is a rematerializable variable and rematerializable pass-through is
+    //   not allowed (in which case we need a lea instruction).
+    if (MustRematerialize) {
+      Variable *NewVar = makeReg(Ty, RegNum);
+      // Since Var is rematerializable, the offset will be added when the lea is
+      // emitted.
+      constexpr Constant *NoOffset = nullptr;
+      auto *Mem = X86OperandMem::create(Func, Ty, Var, NoOffset);
+      _lea(NewVar, Mem);
+      From = NewVar;
+    } else if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
+               (RegNum.hasValue() && RegNum != Var->getRegNum())) {
+      From = copyToReg(From, RegNum);
+    }
+    return From;
+  }
+
+  llvm::report_fatal_error("Unhandled operand kind in legalize()");
+  return From;
+}
+
+/// Provide a trivial wrapper to legalize() for this common usage.
+
+Variable *TargetX8632::legalizeToReg(Operand *From, RegNumT RegNum) {
+  return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
+}
+
+/// Legalize undef values to concrete values.
+
+Operand *TargetX8632::legalizeUndef(Operand *From, RegNumT RegNum) {
+  Type Ty = From->getType();
+  if (llvm::isa<ConstantUndef>(From)) {
+    // Lower undefs to zero.  Another option is to lower undefs to an
+    // uninitialized register; however, using an uninitialized register results
+    // in less predictable code.
+    //
+    // If in the future the implementation is changed to lower undef values to
+    // uninitialized registers, a FakeDef will be needed:
+    //     Context.insert<InstFakeDef>(Reg);
+    // This is in order to ensure that the live range of Reg is not
+    // overestimated.  If the constant being lowered is a 64 bit value, then
+    // the result should be split and the lo and hi components will need to go
+    // in uninitialized registers.
+    if (isVectorType(Ty))
+      return makeVectorOfZeros(Ty, RegNum);
+    return Ctx->getConstantZero(Ty);
+  }
+  return From;
+}
+
+/// For the cmp instruction, if Src1 is an immediate, or known to be a physical
+/// register, we can allow Src0 to be a memory operand. Otherwise, Src0 must be
+/// copied into a physical register. (Actually, either Src0 or Src1 can be
+/// chosen for the physical register, but unfortunately we have to commit to one
+/// or the other before register allocation.)
+
+Operand *TargetX8632::legalizeSrc0ForCmp(Operand *Src0, Operand *Src1) {
+  bool IsSrc1ImmOrReg = false;
+  if (llvm::isa<Constant>(Src1)) {
+    IsSrc1ImmOrReg = true;
+  } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) {
+    if (Var->hasReg())
+      IsSrc1ImmOrReg = true;
+  }
+  return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
+}
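+
+// Illustrative use when lowering an icmp (operand names hypothetical):
+//   Operand *Src1 = legalize(Icmp->getSrc(1));
+//   Operand *Src0 = legalizeSrc0ForCmp(Icmp->getSrc(0), Src1);
+//   _cmp(Src0, Src1);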
+
+TargetX8632::X86OperandMem *
+TargetX8632::formMemoryOperand(Operand *Opnd, Type Ty, bool DoLegalize) {
+  auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd);
+  // It may be the case that address mode optimization already creates an
+  // X86OperandMem, so in that case it wouldn't need another level of
+  // transformation.
+  if (!Mem) {
+    auto *Base = llvm::dyn_cast<Variable>(Opnd);
+    auto *Offset = llvm::dyn_cast<Constant>(Opnd);
+    assert(Base || Offset);
+    if (Offset) {
+      if (!llvm::isa<ConstantRelocatable>(Offset)) {
+        if (llvm::isa<ConstantInteger64>(Offset)) {
+          // Memory operands cannot have 64-bit immediates, so they must be
+          // legalized into a register only.
+          Base = llvm::cast<Variable>(legalize(Offset, Legal_Reg));
+          Offset = nullptr;
+        } else {
+          Offset = llvm::cast<Constant>(legalize(Offset));
+          assert(llvm::isa<ConstantInteger32>(Offset) ||
+                 llvm::isa<ConstantRelocatable>(Offset));
+        }
+      }
+    }
+    Mem = X86OperandMem::create(Func, Ty, Base, Offset);
+  }
+  return llvm::cast<X86OperandMem>(DoLegalize ? legalize(Mem) : Mem);
+}
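+
+// Sketch of the accepted inputs: an existing X86OperandMem passes through
+// unchanged; a Variable becomes the base of a new memory operand; a small
+// constant becomes an absolute-address operand; and a 64-bit immediate is
+// first legalized into a register that then serves as the base.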
+
+Variable *TargetX8632::makeReg(Type Ty, RegNumT RegNum) {
+  // There aren't any 64-bit integer registers for x86-32.
+  assert(Traits::Is64Bit || Ty != IceType_i64);
+  Variable *Reg = Func->makeVariable(Ty);
+  if (RegNum.hasValue())
+    Reg->setRegNum(RegNum);
+  else
+    Reg->setMustHaveReg();
+  return Reg;
+}
+
+const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32, IceType_f64,
+                            IceType_v16i8};
+
+Type TargetX8632::largestTypeInSize(uint32_t Size, uint32_t MaxSize) {
+  assert(Size != 0);
+  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
+  uint32_t MaxIndex = MaxSize == NoSizeLimit
+                          ? llvm::array_lengthof(TypeForSize) - 1
+                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
+  return TypeForSize[std::min(TyIndex, MaxIndex)];
+}
+
+Type TargetX8632::firstTypeThatFitsSize(uint32_t Size, uint32_t MaxSize) {
+  assert(Size != 0);
+  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
+  if (!llvm::isPowerOf2_32(Size))
+    ++TyIndex;
+  uint32_t MaxIndex = MaxSize == NoSizeLimit
+                          ? llvm::array_lengthof(TypeForSize) - 1
+                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
+  return TypeForSize[std::min(TyIndex, MaxIndex)];
+}
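+
+// Worked example: for Size == 6, findLastSet(6) == 2, so largestTypeInSize
+// returns TypeForSize[2] == IceType_i32 (the largest type not exceeding 6
+// bytes). firstTypeThatFitsSize bumps the index to 3 because 6 is not a
+// power of two, returning TypeForSize[3] == IceType_f64 (the smallest type
+// covering 6 bytes).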
+
+void TargetX8632::postLower() {
+  if (Func->getOptLevel() == Opt_m1)
+    return;
+  markRedefinitions();
+  Context.availabilityUpdate();
+}
+
+void TargetX8632::emit(const ConstantInteger32 *C) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  Str << "$" << C->getValue();
+}
+
+void TargetX8632::emit(const ConstantInteger64 *C) const {
+  if (!Traits::Is64Bit) {
+    llvm::report_fatal_error("Not expecting to emit 64-bit integers");
+  } else {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Ctx->getStrEmit();
+    Str << "$" << C->getValue();
+  }
+}
+
+void TargetX8632::emit(const ConstantFloat *C) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  Str << C->getLabelName();
+}
+
+void TargetX8632::emit(const ConstantDouble *C) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  Str << C->getLabelName();
+}
+
+void TargetX8632::emit(const ConstantUndef *) const {
+  llvm::report_fatal_error("undef value encountered by emitter.");
+}
+
+void TargetX8632::emit(const ConstantRelocatable *C) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  Str << "$";
+  emitWithoutPrefix(C);
+}
+
+void TargetX8632::emitJumpTable(const Cfg *,
+                                const InstJumpTable *JumpTable) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  Str << "\t.section\t.rodata." << JumpTable->getSectionName()
+      << ",\"a\",@progbits\n"
+         "\t.align\t"
+      << typeWidthInBytes(getPointerType()) << "\n"
+      << JumpTable->getName() << ":";
+
+  for (SizeT I = 0; I < JumpTable->getNumTargets(); ++I)
+    Str << "\n\t.long\t" << JumpTable->getTarget(I)->getAsmName();
+  Str << "\n";
+}
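+
+// The emitted table looks roughly like this for two targets (section and
+// label names depend on the jump table):
+//   .section .rodata.<fn>$jumptable,"a",@progbits
+//   .align 4
+//   <table label>:
+//   .long <target0 asm name>
+//   .long <target1 asm name>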
+
+template <typename T>
+void TargetDataX8632::emitConstantPool(GlobalContext *Ctx) {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  Type Ty = T::Ty;
+  SizeT Align = typeAlignInBytes(Ty);
+  ConstantList Pool = Ctx->getConstantPool(Ty);
+
+  Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
+      << "\n";
+  Str << "\t.align\t" << Align << "\n";
+
+  for (Constant *C : Pool) {
+    if (!C->getShouldBePooled())
+      continue;
+    auto *Const = llvm::cast<typename T::IceType>(C);
+    typename T::IceType::PrimType Value = Const->getValue();
+    // Use memcpy() to copy bits from Value into RawValue in a way that avoids
+    // breaking strict-aliasing rules.
+    typename T::PrimitiveIntType RawValue;
+    memcpy(&RawValue, &Value, sizeof(Value));
+    char buf[30];
+    int CharsPrinted =
+        snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
+    assert(CharsPrinted >= 0);
+    assert((size_t)CharsPrinted < llvm::array_lengthof(buf));
+    (void)CharsPrinted; // avoid warnings if asserts are disabled
+    Str << Const->getLabelName();
+    Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t/* " << T::TypeName << " "
+        << Value << " */\n";
+  }
+}
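+
+// Illustrative output for a pooled f32 value of 1.0 (label and exact
+// formatting schematic):
+//   .section .rodata.cst4,"aM",@progbits,4
+//   .align 4
+//   <label>:
+//       .long 0x3f800000 /* f32 1.0 */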
+
+void TargetDataX8632::lowerConstants() {
+  if (getFlags().getDisableTranslation())
+    return;
+  switch (getFlags().getOutFileType()) {
+  case FT_Elf: {
+    ELFObjectWriter *Writer = Ctx->getObjectWriter();
+
+    Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
+    Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
+    Writer->writeConstantPool<ConstantInteger32>(IceType_i32);
+
+    Writer->writeConstantPool<ConstantFloat>(IceType_f32);
+    Writer->writeConstantPool<ConstantDouble>(IceType_f64);
+  } break;
+  case FT_Asm:
+  case FT_Iasm: {
+    OstreamLocker L(Ctx);
+
+    emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
+    emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
+    emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);
+
+    emitConstantPool<PoolTypeConverter<float>>(Ctx);
+    emitConstantPool<PoolTypeConverter<double>>(Ctx);
+  } break;
+  }
+}
+
+void TargetDataX8632::lowerJumpTables() {
+  const bool IsPIC = false;
+  switch (getFlags().getOutFileType()) {
+  case FT_Elf: {
+    ELFObjectWriter *Writer = Ctx->getObjectWriter();
+    constexpr FixupKind FK_Abs64 = llvm::ELF::R_X86_64_64;
+    const FixupKind RelocationKind =
+        (getPointerType() == IceType_i32) ? Traits::FK_Abs : FK_Abs64;
+    for (const JumpTableData &JT : Ctx->getJumpTables())
+      Writer->writeJumpTable(JT, RelocationKind, IsPIC);
+  } break;
+  case FT_Asm:
+    // Already emitted from Cfg
+    break;
+  case FT_Iasm: {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Ctx->getStrEmit();
+    const char *Prefix = IsPIC ? ".data.rel.ro." : ".rodata.";
+    for (const JumpTableData &JT : Ctx->getJumpTables()) {
+      Str << "\t.section\t" << Prefix << JT.getSectionName()
+          << ",\"a\",@progbits\n"
+             "\t.align\t"
+          << typeWidthInBytes(getPointerType()) << "\n"
+          << JT.getName().toString() << ":";
+
+      // Pointers are 32-bit on X8632, hence the use of .long.
+      for (intptr_t TargetOffset : JT.getTargetOffsets())
+        Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset;
+      Str << "\n";
+    }
+  } break;
+  }
+}
+
+void TargetDataX8632::lowerGlobals(const VariableDeclarationList &Vars,
+                                   const std::string &SectionSuffix) {
+  const bool IsPIC = false;
+  switch (getFlags().getOutFileType()) {
+  case FT_Elf: {
+    ELFObjectWriter *Writer = Ctx->getObjectWriter();
+    Writer->writeDataSection(Vars, Traits::FK_Abs, SectionSuffix, IsPIC);
+  } break;
+  case FT_Asm:
+  case FT_Iasm: {
+    OstreamLocker L(Ctx);
+    for (const VariableDeclaration *Var : Vars) {
+      if (getFlags().matchTranslateOnly(Var->getName(), 0)) {
+        emitGlobal(*Var, SectionSuffix);
+      }
+    }
+  } break;
+  }
+}
+
 //------------------------------------------------------------------------------
 //      ______   ______     ______     __     ______   ______
 //     /\__  _\ /\  == \   /\  __ \   /\ \   /\__  _\ /\  ___\
@@ -109,18 +8114,13 @@
 #endif
 const char *TargetX8632Traits::TargetName = "X8632";
 
-template <>
-std::array<SmallBitVector, RCX86_NUM>
-    TargetX86Base<X8632::Traits>::TypeToRegisterSet = {{}};
+std::array<SmallBitVector, RCX86_NUM> TargetX8632::TypeToRegisterSet = {{}};
 
-template <>
-std::array<SmallBitVector, RCX86_NUM>
-    TargetX86Base<X8632::Traits>::TypeToRegisterSetUnfiltered = {{}};
+std::array<SmallBitVector, RCX86_NUM> TargetX8632::TypeToRegisterSetUnfiltered =
+    {{}};
 
-template <>
-std::array<SmallBitVector,
-           TargetX86Base<X8632::Traits>::Traits::RegisterSet::Reg_NUM>
-    TargetX86Base<X8632::Traits>::RegisterAliases = {{}};
+std::array<SmallBitVector, TargetX8632::Traits::RegisterSet::Reg_NUM>
+    TargetX8632::RegisterAliases = {{}};
 
 //------------------------------------------------------------------------------
 //     __      ______  __     __  ______  ______  __  __   __  ______
@@ -176,6 +8176,7 @@
   _pop(getPhysicalRegister(RegNum, Traits::WordType));
 }
 
+/// Lower an indirect jump adding sandboxing when needed.
 void TargetX8632::lowerIndirectJump(Variable *JumpTarget) { _jmp(JumpTarget); }
 
 Inst *TargetX8632::emitCallToTarget(Operand *CallTarget, Variable *ReturnReg,
@@ -185,7 +8186,7 @@
   // calls, because floating point arguments are passed via vector registers,
   // whereas for x86-32, all args are passed via the stack.
 
-  return Context.insert<Traits::Insts::Call>(ReturnReg, CallTarget);
+  return Context.insert<Insts::Call>(ReturnReg, CallTarget);
 }
 
 Variable *TargetX8632::moveReturnValueToRegister(Operand *Value,
@@ -340,6 +8341,7 @@
                 "Inconsistency between ICETYPEX86_TABLE and ICETYPE_TABLE");
 ICETYPE_TABLE
 #undef X
+
 } // end of namespace dummy3
 } // end of anonymous namespace
 
diff --git a/third_party/subzero/src/IceTargetLoweringX8632.h b/third_party/subzero/src/IceTargetLoweringX8632.h
index b4bffd3..de77c1b 100644
--- a/third_party/subzero/src/IceTargetLoweringX8632.h
+++ b/third_party/subzero/src/IceTargetLoweringX8632.h
@@ -18,23 +18,1001 @@
 
 #include "IceAssemblerX8632.h"
 #include "IceDefs.h"
+#include "IceInst.h"
 #include "IceInstX8632.h"
 #include "IceRegistersX8632.h"
-#include "IceTargetLowering.h"
-#include "IceTargetLoweringX8632Base.h"
+#include "IceSwitchLowering.h"
+#include "IceTargetLoweringX86.h"
 #include "IceTargetLoweringX8632Traits.h"
+#include "IceTargetLoweringX86RegClass.h"
+#include "IceUtils.h"
+
+#include <array>
+#include <type_traits>
+#include <utility>
 
 namespace Ice {
 namespace X8632 {
 
-class TargetX8632 final : public ::Ice::X8632::TargetX86Base<X8632::Traits> {
+using namespace ::Ice::X86;
+
+class BoolFoldingEntry {
+  BoolFoldingEntry(const BoolFoldingEntry &) = delete;
+
+public:
+  BoolFoldingEntry() = default;
+  explicit BoolFoldingEntry(Inst *I);
+  BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;
+  /// Instr is the instruction producing the i1-type variable of interest.
+  Inst *Instr = nullptr;
+  /// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr).
+  bool IsComplex = false;
+  /// IsLiveOut is initialized conservatively to true, and is set to false when
+  /// we encounter an instruction that ends Var's live range. We disable the
+  /// folding optimization when Var is live beyond this basic block. Note that
+  /// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will
+  /// always be true and the folding optimization will never be performed.
+  bool IsLiveOut = true;
+  /// NumUses counts the number of times Var is used as a source operand in
+  /// the basic block. If IsComplex is true and there is more than one use of
+  /// Var, then the folding optimization is disabled for Var.
+  uint32_t NumUses = 0;
+};
+
+class BoolFolding {
+public:
+  enum BoolFoldingProducerKind {
+    PK_None,
+    // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative.
+    PK_Icmp32,
+    PK_Icmp64,
+    PK_Fcmp,
+    PK_Trunc,
+    PK_Arith // A flag-setting arithmetic instruction.
+  };
+
+  /// Currently the actual enum values are not used (other than CK_None), but we
+  /// go ahead and produce them anyway for symmetry with the
+  /// BoolFoldingProducerKind.
+  enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };
+
+private:
+  BoolFolding(const BoolFolding &) = delete;
+  BoolFolding &operator=(const BoolFolding &) = delete;
+
+public:
+  BoolFolding() = default;
+  static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
+  static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
+  static bool hasComplexLowering(const Inst *Instr);
+  static bool isValidFolding(BoolFoldingProducerKind ProducerKind,
+                             BoolFoldingConsumerKind ConsumerKind);
+  void init(CfgNode *Node);
+  const Inst *getProducerFor(const Operand *Opnd) const;
+  void dump(const Cfg *Func) const;
+
+private:
+  /// Returns true if Producers contains a valid entry for the given VarNum.
+  bool containsValid(SizeT VarNum) const {
+    auto Element = Producers.find(VarNum);
+    return Element != Producers.end() && Element->second.Instr != nullptr;
+  }
+  void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
+  void invalidateProducersOnStore(const Inst *Instr);
+  /// Producers maps Variable::Number to a BoolFoldingEntry.
+  CfgUnorderedMap<SizeT, BoolFoldingEntry> Producers;
+};
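+
+// Illustrative folding (IR schematic): given
+//   %cond = icmp ult i32 %a, %b      ; producer, PK_Icmp32
+//   br i1 %cond, %target1, %target2  ; consumer, CK_Br
+// the flag-setting cmp from the producer can feed the branch directly,
+// avoiding a setcc to materialize %cond in a register.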
+
+/// TargetX8632 is the concrete lowering for the x86-32 target. It was
+/// formerly the CRTP-templated TargetX86Base, which delegated target-specific
+/// lowerings (e.g., call, ret, and intrinsics) to backend subclasses; it now
+/// derives directly from TargetX86 and implements those lowerings itself.
+class TargetX8632 : public TargetX86 {
   TargetX8632() = delete;
   TargetX8632(const TargetX8632 &) = delete;
   TargetX8632 &operator=(const TargetX8632 &) = delete;
 
 public:
-  ~TargetX8632() = default;
+  using Traits = TargetX8632Traits;
+  using TargetLowering = typename Traits::TargetLowering;
 
+  using BrCond = CondX86::BrCond;
+  using CmppsCond = CondX86::CmppsCond;
+
+  using X86Address = typename Traits::Address;
+  using X86Operand = typename Traits::X86Operand;
+  using X86OperandMem = typename Traits::X86OperandMem;
+  using SegmentRegisters = typename Traits::X86OperandMem::SegmentRegisters;
+
+  using InstX86Br = Insts::Br;
+  using InstX86FakeRMW = Insts::FakeRMW;
+  using InstX86Label = Insts::Label;
+
+  ~TargetX8632() override = default;
+
+  static void staticInit(GlobalContext *Ctx);
+  static bool shouldBePooled(const Constant *C);
+  static ::Ice::Type getPointerType();
+
+  static FixupKind getPcRelFixup() { return PcRelFixup; }
+  static FixupKind getAbsFixup() { return AbsFixup; }
+
+  void translateOm1() override;
+  void translateO2() override;
+  void doLoadOpt();
+  bool doBranchOpt(Inst *I, const CfgNode *NextNode) override;
+
+  SizeT getNumRegisters() const override {
+    return Traits::RegisterSet::Reg_NUM;
+  }
+
+  Inst *createLoweredMove(Variable *Dest, Variable *SrcVar) override {
+    if (isVectorType(Dest->getType())) {
+      return Insts::Movp::create(Func, Dest, SrcVar);
+    }
+    return Insts::Mov::create(Func, Dest, SrcVar);
+  }
+
+  Variable *getPhysicalRegister(RegNumT RegNum,
+                                Type Ty = IceType_void) override;
+  const char *getRegName(RegNumT RegNum, Type Ty) const override;
+  static const char *getRegClassName(RegClass C) {
+    auto ClassNum = static_cast<RegClassX86>(C);
+    assert(ClassNum < RCX86_NUM);
+    switch (ClassNum) {
+    default:
+      assert(C < RC_Target);
+      return regClassString(C);
+    case RCX86_Is64To8:
+      return "i64to8"; // 64-bit GPR truncable to i8
+    case RCX86_Is32To8:
+      return "i32to8"; // 32-bit GPR truncable to i8
+    case RCX86_Is16To8:
+      return "i16to8"; // 16-bit GPR truncable to i8
+    case RCX86_IsTrunc8Rcvr:
+      return "i8from"; // 8-bit GPR truncable from wider GPRs
+    case RCX86_IsAhRcvr:
+      return "i8fromah"; // 8-bit GPR that ah can be assigned to
+    }
+  }
+  SmallBitVector getRegisterSet(RegSetMask Include,
+                                RegSetMask Exclude) const override;
+  const SmallBitVector &
+  getRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass();
+    assert(static_cast<RegClassX86>(RC) < RCX86_NUM);
+    return TypeToRegisterSet[RC];
+  }
+
+  const SmallBitVector &
+  getAllRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass();
+    assert(static_cast<RegClassX86>(RC) < RCX86_NUM);
+    return TypeToRegisterSetUnfiltered[RC];
+  }
+
+  const SmallBitVector &getAliasesForRegister(RegNumT Reg) const override {
+    Reg.assertIsValid();
+    return RegisterAliases[Reg];
+  }
+
+  bool hasFramePointer() const override { return IsEbpBasedFrame; }
+  void setHasFramePointer() override { IsEbpBasedFrame = true; }
+  RegNumT getStackReg() const override { return Traits::StackPtr; }
+  RegNumT getFrameReg() const override { return Traits::FramePtr; }
+  RegNumT getFrameOrStackReg() const override {
+    // If the stack pointer needs to be aligned, then the frame pointer is
+    // unaligned, so always use the stack pointer.
+    if (needsStackPointerAlignment())
+      return getStackReg();
+    return IsEbpBasedFrame ? getFrameReg() : getStackReg();
+  }
+  size_t typeWidthInBytesOnStack(Type Ty) const override {
+    // Round up to the next multiple of WordType bytes.
+    const uint32_t WordSizeInBytes = typeWidthInBytes(Traits::WordType);
+    return Utils::applyAlignment(typeWidthInBytes(Ty), WordSizeInBytes);
+  }
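+  // Example: typeWidthInBytesOnStack(IceType_i8) == 4 on x86-32, since the
+  // word type is i32 and sizes are rounded up to word multiples.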
+  uint32_t getStackAlignment() const override {
+    return Traits::X86_STACK_ALIGNMENT_BYTES;
+  }
+  bool needsStackPointerAlignment() const override {
+    // If the ABI's stack alignment is smaller than the vector size (16 bytes),
+    // use the (realigned) stack pointer for addressing any stack variables.
+    return Traits::X86_STACK_ALIGNMENT_BYTES < 16;
+  }
+  void reserveFixedAllocaArea(size_t Size, size_t Align) override {
+    FixedAllocaSizeBytes = Size;
+    assert(llvm::isPowerOf2_32(Align));
+    FixedAllocaAlignBytes = Align;
+    PrologEmitsFixedAllocas = true;
+  }
+  /// Returns the (negative) offset from ebp/rbp where the fixed Allocas start.
+  int32_t getFrameFixedAllocaOffset() const override {
+    return FixedAllocaSizeBytes - (SpillAreaSizeBytes - maxOutArgsSizeBytes());
+  }
+  virtual uint32_t maxOutArgsSizeBytes() const override {
+    return MaxOutArgsSizeBytes;
+  }
+  virtual void updateMaxOutArgsSizeBytes(uint32_t Size) {
+    MaxOutArgsSizeBytes = std::max(MaxOutArgsSizeBytes, Size);
+  }
+
+  bool shouldSplitToVariable64On32(Type Ty) const override {
+    return Traits::Is64Bit ? false : Ty == IceType_i64;
+  }
+
+  SizeT getMinJumpTableSize() const override { return 4; }
+
+  void emitVariable(const Variable *Var) const override;
+
+  void emit(const ConstantInteger32 *C) const final;
+  void emit(const ConstantInteger64 *C) const final;
+  void emit(const ConstantFloat *C) const final;
+  void emit(const ConstantDouble *C) const final;
+  void emit(const ConstantUndef *C) const final;
+  void emit(const ConstantRelocatable *C) const final;
+
+  void initNodeForLowering(CfgNode *Node) override;
+
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, Operand>::type *
+  loOperand(Operand *Operand);
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, Operand>::type *loOperand(Operand *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (loOperand)");
+  }
+
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, Operand>::type *
+  hiOperand(Operand *Operand);
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, Operand>::type *hiOperand(Operand *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (hiOperand)");
+  }
+
+  void addProlog(CfgNode *Node) override;
+  void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
+                              size_t BasicFrameOffset, size_t StackAdjBytes,
+                              size_t &InArgsSizeBytes);
+  void addEpilog(CfgNode *Node) override;
+  X86Address stackVarToAsmOperand(const Variable *Var) const;
+
+  Operand *legalizeUndef(Operand *From, RegNumT RegNum = RegNumT());
+
+protected:
+  void postLower() override;
+
+  void lowerAlloca(const InstAlloca *Instr) override;
+  void lowerArguments() override;
+  void lowerArithmetic(const InstArithmetic *Instr) override;
+  void lowerAssign(const InstAssign *Instr) override;
+  void lowerBr(const InstBr *Instr) override;
+  void lowerBreakpoint(const InstBreakpoint *Instr) override;
+  void lowerCall(const InstCall *Instr) override;
+  void lowerCast(const InstCast *Instr) override;
+  void lowerExtractElement(const InstExtractElement *Instr) override;
+  void lowerFcmp(const InstFcmp *Instr) override;
+  void lowerIcmp(const InstIcmp *Instr) override;
+
+  void lowerIntrinsic(const InstIntrinsic *Instr) override;
+  void lowerInsertElement(const InstInsertElement *Instr) override;
+  void lowerLoad(const InstLoad *Instr) override;
+  void lowerPhi(const InstPhi *Instr) override;
+  void lowerRet(const InstRet *Instr) override;
+  void lowerSelect(const InstSelect *Instr) override;
+  void lowerShuffleVector(const InstShuffleVector *Instr) override;
+  void lowerStore(const InstStore *Instr) override;
+  void lowerSwitch(const InstSwitch *Instr) override;
+  void lowerUnreachable(const InstUnreachable *Instr) override;
+  void lowerOther(const Inst *Instr) override;
+  void lowerRMW(const InstX86FakeRMW *RMW);
+  void prelowerPhis() override;
+  uint32_t getCallStackArgumentsSizeBytes(const CfgVector<Type> &ArgTypes,
+                                          Type ReturnType);
+  uint32_t getCallStackArgumentsSizeBytes(const InstCall *Instr) override;
+  void genTargetHelperCallFor(Inst *Instr) override;
+
+  /// OptAddr wraps all the possible operands that an x86 address might have.
+  struct OptAddr {
+    Variable *Base = nullptr;
+    Variable *Index = nullptr;
+    uint16_t Shift = 0;
+    int32_t Offset = 0;
+    ConstantRelocatable *Relocatable = nullptr;
+  };
+
+  // Builds information for a canonical address expression:
+  //   <Relocatable + Offset>(Base, Index, Shift)
+  X86OperandMem *computeAddressOpt(const Inst *Instr, Type MemType,
+                                   Operand *Addr);
+  void doAddressOptOther() override;
+  void doAddressOptLoad() override;
+  void doAddressOptStore() override;
+  void doAddressOptLoadSubVector() override;
+  void doAddressOptStoreSubVector() override;
+  void doMockBoundsCheck(Operand *Opnd) override;
+
+  /// Naive lowering of cmpxchg.
+  void lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr, Operand *Expected,
+                          Operand *Desired);
+  /// Attempt a more optimized lowering of cmpxchg. Returns true if optimized.
+  bool tryOptimizedCmpxchgCmpBr(Variable *DestPrev, Operand *Ptr,
+                                Operand *Expected, Operand *Desired);
+  void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
+                      Operand *Val);
+  void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal,
+                       Operand *SecondVal);
+  /// Load from memory for a given type.
+  void typedLoad(Type Ty, Variable *Dest, Variable *Base, Constant *Offset);
+  /// Store to memory for a given type.
+  void typedStore(Type Ty, Variable *Value, Variable *Base, Constant *Offset);
+  /// Copy memory of given type from Src to Dest using OffsetAmt on both.
+  void copyMemory(Type Ty, Variable *Dest, Variable *Src, int32_t OffsetAmt);
+  /// Replace some calls to memcpy with inline instructions.
+  void lowerMemcpy(Operand *Dest, Operand *Src, Operand *Count);
+  /// Replace some calls to memmove with inline instructions.
+  void lowerMemmove(Operand *Dest, Operand *Src, Operand *Count);
+  /// Replace some calls to memset with inline instructions.
+  void lowerMemset(Operand *Dest, Operand *Val, Operand *Count);
+
+  /// Lower an indirect jump adding sandboxing when needed.
+  void lowerIndirectJump(Variable *JumpTarget);
+
+  /// Check that the comparison is in [Min,Max]. The flags register will be
+  /// set to:
+  ///   - below or equal, if in range
+  ///   - above, if not in range
+  /// The index into the range is returned.
+  Operand *lowerCmpRange(Operand *Comparison, uint64_t Min, uint64_t Max);
+  /// Lowering of a cluster of switch cases. If the case is not matched control
+  /// will pass to the default label provided. If the default label is nullptr
+  /// then control will fall through to the next instruction. DoneCmp should be
+  /// true if the flags contain the result of a comparison with the Comparison.
+  void lowerCaseCluster(const CaseCluster &Case, Operand *Src0, bool DoneCmp,
+                        CfgNode *DefaultLabel = nullptr);
+
+  using LowerBinOp = void (TargetX8632::*)(Variable *, Operand *);
+  void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi,
+                                Variable *Dest, Operand *Ptr, Operand *Val);
+
+  void eliminateNextVectorSextInstruction(Variable *SignExtendedResult);
+
+  void emitStackProbe(size_t StackSizeBytes);
+
+  /// Emit just the call instruction (without argument or return variable
+  /// processing), sandboxing if needed.
+  Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg,
+                         size_t NumVariadicFpArgs = 0);
+  /// Materialize the moves needed to return a value of the specified type.
+  Variable *moveReturnValueToRegister(Operand *Value, Type ReturnType);
+
+  /// Emit a jump table to the constant pool.
+  void emitJumpTable(const Cfg *Func,
+                     const InstJumpTable *JumpTable) const override;
+
+  /// Emit a fake use of esp to make sure esp stays alive for the entire
+  /// function. Otherwise some esp adjustments get dead-code eliminated.
+  void keepEspLiveAtExit() {
+    Variable *esp =
+        Func->getTarget()->getPhysicalRegister(getStackReg(), Traits::WordType);
+    Context.insert<InstFakeUse>(esp);
+  }
+
+  /// Operand legalization helpers. To deal with address mode constraints, the
+  /// helpers will create a new Operand and emit instructions that guarantee
+  /// that the Operand kind is one of those indicated by the LegalMask (a
+  /// bitmask of allowed kinds). If the input Operand is known to already meet
+  /// the constraints, it may be simply returned as the result, without creating
+  /// any new instructions or operands.
+  enum OperandLegalization {
+    Legal_None = 0,
+    Legal_Reg = 1 << 0, // physical register, not stack location
+    Legal_Imm = 1 << 1,
+    Legal_Mem = 1 << 2, // includes [eax+4*ecx] as well as [esp+12]
+    Legal_Rematerializable = 1 << 3,
+    Legal_AddrAbs = 1 << 4, // ConstantRelocatable doesn't have to add RebasePtr
+    Legal_Default = ~(Legal_Rematerializable | Legal_AddrAbs)
+    // TODO(stichnot): Figure out whether this default works for x86-64.
+  };
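+  // Illustrative mask use (caller hypothetical): allow a register or memory
+  // operand but never an immediate:
+  //   Operand *Src = legalize(From, Legal_Reg | Legal_Mem);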
+  using LegalMask = uint32_t;
+  Operand *legalize(Operand *From, LegalMask Allowed = Legal_Default,
+                    RegNumT RegNum = RegNumT());
+  Variable *legalizeToReg(Operand *From, RegNumT RegNum = RegNumT());
+  /// Legalize the first source operand for use in the cmp instruction.
+  Operand *legalizeSrc0ForCmp(Operand *Src0, Operand *Src1);
+  /// Turn a pointer operand into a memory operand that can be used by a real
+  /// load/store operation. Legalizes the operand as well. This is a nop if the
+  /// operand is already a legal memory operand.
+  X86OperandMem *formMemoryOperand(Operand *Ptr, Type Ty,
+                                   bool DoLegalize = true);
+
+  Variable *makeReg(Type Ty, RegNumT RegNum = RegNumT());
+  static Type stackSlotType();
+
+  static constexpr uint32_t NoSizeLimit = 0;
+  /// Returns the largest type which is equal to or larger than Size bytes. The
+  /// type is suitable for copying memory i.e. a load and store will be a single
+  /// instruction (for example x86 will get f64 not i64).
+  static Type largestTypeInSize(uint32_t Size, uint32_t MaxSize = NoSizeLimit);
+  /// Returns the smallest type which is equal to or larger than Size bytes. If
+  /// one doesn't exist then the largest type smaller than Size bytes is
+  /// returned. The type is suitable for memory copies as described at
+  /// largestTypeInSize.
+  static Type firstTypeThatFitsSize(uint32_t Size,
+                                    uint32_t MaxSize = NoSizeLimit);
+
+  Variable *copyToReg8(Operand *Src, RegNumT RegNum = RegNumT());
+  Variable *copyToReg(Operand *Src, RegNumT RegNum = RegNumT());
+
+  /// Returns a register containing all zeros, without affecting the FLAGS
+  /// register, using the best instruction for the type.
+  Variable *makeZeroedRegister(Type Ty, RegNumT RegNum = RegNumT());
+
+  /// \name Returns a vector in a register with the given constant entries.
+  /// @{
+  Variable *makeVectorOfZeros(Type Ty, RegNumT RegNum = RegNumT());
+  Variable *makeVectorOfOnes(Type Ty, RegNumT RegNum = RegNumT());
+  Variable *makeVectorOfMinusOnes(Type Ty, RegNumT RegNum = RegNumT());
+  Variable *makeVectorOfHighOrderBits(Type Ty, RegNumT RegNum = RegNumT());
+  Variable *makeVectorOfFabsMask(Type Ty, RegNumT RegNum = RegNumT());
+  /// @}
+
+  /// Return a memory operand corresponding to a stack allocated Variable.
+  X86OperandMem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
+                                              uint32_t Offset = 0);
+
+  /// The following are helpers that insert lowered x86 instructions with
+  /// minimal syntactic overhead, so that the lowering code can look as close to
+  /// assembly as practical.
+  void _adc(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Adc>(Dest, Src0);
+  }
+  void _adc_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
+    Context.insert<Insts::AdcRMW>(DestSrc0, Src1);
+  }
+  void _add(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Add>(Dest, Src0);
+  }
+  void _add_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
+    Context.insert<Insts::AddRMW>(DestSrc0, Src1);
+  }
+  void _addps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Addps>(Dest, Src0);
+  }
+  void _addss(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Addss>(Dest, Src0);
+  }
+  void _add_sp(Operand *Adjustment);
+  void _and(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::And>(Dest, Src0);
+  }
+  void _andnps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Andnps>(Dest, Src0);
+  }
+  void _andps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Andps>(Dest, Src0);
+  }
+  void _and_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
+    Context.insert<Insts::AndRMW>(DestSrc0, Src1);
+  }
+  void _blendvps(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Blendvps>(Dest, Src0, Src1);
+  }
+  void _br(BrCond Condition, CfgNode *TargetTrue, CfgNode *TargetFalse) {
+    Context.insert<InstX86Br>(TargetTrue, TargetFalse, Condition,
+                              InstX86Br::Far);
+  }
+  void _br(CfgNode *Target) {
+    Context.insert<InstX86Br>(Target, InstX86Br::Far);
+  }
+  void _br(BrCond Condition, CfgNode *Target) {
+    Context.insert<InstX86Br>(Target, Condition, InstX86Br::Far);
+  }
+  void _br(BrCond Condition, InstX86Label *Label,
+           typename InstX86Br::Mode Kind = InstX86Br::Near) {
+    Context.insert<InstX86Br>(Label, Condition, Kind);
+  }
+  void _bsf(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Bsf>(Dest, Src0);
+  }
+  void _bsr(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Bsr>(Dest, Src0);
+  }
+  void _bswap(Variable *SrcDest) {
+    Context.insert<Insts::Bswap>(SrcDest);
+  }
+  void _cbwdq(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Cbwdq>(Dest, Src0);
+  }
+  void _cmov(Variable *Dest, Operand *Src0, BrCond Condition) {
+    Context.insert<Insts::Cmov>(Dest, Src0, Condition);
+  }
+  void _cmp(Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Icmp>(Src0, Src1);
+  }
+  void _cmpps(Variable *Dest, Operand *Src0, CmppsCond Condition) {
+    Context.insert<Insts::Cmpps>(Dest, Src0, Condition);
+  }
+  void _cmpxchg(Operand *DestOrAddr, Variable *Eax, Variable *Desired,
+                bool Locked) {
+    Context.insert<Insts::Cmpxchg>(DestOrAddr, Eax, Desired, Locked);
+    // Mark eax as possibly modified by cmpxchg.
+    Context.insert<InstFakeDef>(Eax, llvm::dyn_cast<Variable>(DestOrAddr));
+    _set_dest_redefined();
+    Context.insert<InstFakeUse>(Eax);
+  }
+  void _cmpxchg8b(X86OperandMem *Addr, Variable *Edx, Variable *Eax,
+                  Variable *Ecx, Variable *Ebx, bool Locked) {
+    Context.insert<Insts::Cmpxchg8b>(Addr, Edx, Eax, Ecx, Ebx, Locked);
+    // Mark edx and eax as possibly modified by cmpxchg8b.
+    Context.insert<InstFakeDef>(Edx);
+    _set_dest_redefined();
+    Context.insert<InstFakeUse>(Edx);
+    Context.insert<InstFakeDef>(Eax);
+    _set_dest_redefined();
+    Context.insert<InstFakeUse>(Eax);
+  }
+  void _cvt(Variable *Dest, Operand *Src0, Insts::Cvt::CvtVariant Variant) {
+    Context.insert<Insts::Cvt>(Dest, Src0, Variant);
+  }
+  void _round(Variable *Dest, Operand *Src0, Operand *Imm) {
+    Context.insert<Insts::Round>(Dest, Src0, Imm);
+  }
+  void _div(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Div>(Dest, Src0, Src1);
+  }
+  void _divps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Divps>(Dest, Src0);
+  }
+  void _divss(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Divss>(Dest, Src0);
+  }
+  void _fld(Operand *Src0) {
+    Context.insert<Insts::Fld>(Src0);
+  }
+  void _fstp(Variable *Dest) {
+    Context.insert<Insts::Fstp>(Dest);
+  }
+  void _idiv(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Idiv>(Dest, Src0, Src1);
+  }
+  void _imul(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Imul>(Dest, Src0);
+  }
+  void _imul_imm(Variable *Dest, Operand *Src0, Constant *Imm) {
+    Context.insert<Insts::ImulImm>(Dest, Src0, Imm);
+  }
+  void _insertps(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Insertps>(Dest, Src0, Src1);
+  }
+  void _int3() { Context.insert<Insts::Int3>(); }
+  void _jmp(Operand *Target) {
+    Context.insert<Insts::Jmp>(Target);
+  }
+  void _lea(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Lea>(Dest, Src0);
+  }
+  void _link_bp();
+  void _push_reg(RegNumT RegNum);
+  void _pop_reg(RegNumT RegNum);
+  void _mfence() { Context.insert<Insts::Mfence>(); }
+  /// Moves can be used to redefine registers, creating "partial kills" for
+  /// liveness.  Mark where moves are used in this way.
+  void _redefined(Inst *MovInst, bool IsRedefinition = true) {
+    if (IsRedefinition)
+      MovInst->setDestRedefined();
+  }
+  /// If Dest=nullptr is passed in, then a new variable is created, marked as
+  /// infinite register allocation weight, and returned through the in/out Dest
+  /// argument.
+  Insts::Mov *_mov(Variable *&Dest, Operand *Src0,
+                   RegNumT RegNum = RegNumT()) {
+    if (Dest == nullptr)
+      Dest = makeReg(Src0->getType(), RegNum);
+    return Context.insert<Insts::Mov>(Dest, Src0);
+  }
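+  // Illustrative call: passing Dest == nullptr makes _mov create a fresh
+  // infinite-weight variable and return it through the in/out parameter:
+  //   Variable *T = nullptr;
+  //   _mov(T, Src0); // T now holds a register-allocated copy of Src0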
+  void _mov_sp(Operand *NewValue);
+  Insts::Movp *_movp(Variable *Dest, Operand *Src0) {
+    return Context.insert<Insts::Movp>(Dest, Src0);
+  }
+  void _movd(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Movd>(Dest, Src0);
+  }
+  void _movq(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Movq>(Dest, Src0);
+  }
+  void _movss(Variable *Dest, Variable *Src0) {
+    Context.insert<Insts::MovssRegs>(Dest, Src0);
+  }
+  void _movsx(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Movsx>(Dest, Src0);
+  }
+  Insts::Movzx *_movzx(Variable *Dest, Operand *Src0) {
+    return Context.insert<Insts::Movzx>(Dest, Src0);
+  }
+  void _maxss(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Maxss>(Dest, Src0);
+  }
+  void _minss(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Minss>(Dest, Src0);
+  }
+  void _maxps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Maxps>(Dest, Src0);
+  }
+  void _minps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Minps>(Dest, Src0);
+  }
+  void _mul(Variable *Dest, Variable *Src0, Operand *Src1) {
+    Context.insert<Insts::Mul>(Dest, Src0, Src1);
+  }
+  void _mulps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Mulps>(Dest, Src0);
+  }
+  void _mulss(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Mulss>(Dest, Src0);
+  }
+  void _neg(Variable *SrcDest) {
+    Context.insert<Insts::Neg>(SrcDest);
+  }
+  void _nop(SizeT Variant) {
+    Context.insert<Insts::Nop>(Variant);
+  }
+  void _or(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Or>(Dest, Src0);
+  }
+  void _orps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Orps>(Dest, Src0);
+  }
+  void _or_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
+    Context.insert<Insts::OrRMW>(DestSrc0, Src1);
+  }
+  void _padd(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Padd>(Dest, Src0);
+  }
+  void _padds(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Padds>(Dest, Src0);
+  }
+  void _paddus(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Paddus>(Dest, Src0);
+  }
+  void _pand(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pand>(Dest, Src0);
+  }
+  void _pandn(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pandn>(Dest, Src0);
+  }
+  void _pblendvb(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Pblendvb>(Dest, Src0, Src1);
+  }
+  void _pcmpeq(Variable *Dest, Operand *Src0,
+               Type ArithmeticTypeOverride = IceType_void) {
+    Context.insert<Insts::Pcmpeq>(Dest, Src0, ArithmeticTypeOverride);
+  }
+  void _pcmpgt(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pcmpgt>(Dest, Src0);
+  }
+  void _pextr(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Pextr>(Dest, Src0, Src1);
+  }
+  void _pinsr(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Pinsr>(Dest, Src0, Src1);
+  }
+  void _pmull(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pmull>(Dest, Src0);
+  }
+  void _pmulhw(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pmulhw>(Dest, Src0);
+  }
+  void _pmulhuw(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pmulhuw>(Dest, Src0);
+  }
+  void _pmaddwd(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pmaddwd>(Dest, Src0);
+  }
+  void _pmuludq(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pmuludq>(Dest, Src0);
+  }
+  void _pop(Variable *Dest) {
+    Context.insert<Insts::Pop>(Dest);
+  }
+  void _por(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Por>(Dest, Src0);
+  }
+  void _punpckl(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Punpckl>(Dest, Src0);
+  }
+  void _punpckh(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Punpckh>(Dest, Src0);
+  }
+  void _packss(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Packss>(Dest, Src0);
+  }
+  void _packus(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Packus>(Dest, Src0);
+  }
+  void _pshufb(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pshufb>(Dest, Src0);
+  }
+  void _pshufd(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Pshufd>(Dest, Src0, Src1);
+  }
+  void _psll(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Psll>(Dest, Src0);
+  }
+  void _psra(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Psra>(Dest, Src0);
+  }
+  void _psrl(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Psrl>(Dest, Src0);
+  }
+  void _psub(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Psub>(Dest, Src0);
+  }
+  void _psubs(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Psubs>(Dest, Src0);
+  }
+  void _psubus(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Psubus>(Dest, Src0);
+  }
+  void _push(Operand *Src0) {
+    Context.insert<Insts::Push>(Src0);
+  }
+  void _pxor(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pxor>(Dest, Src0);
+  }
+  void _ret(Variable *Src0 = nullptr) {
+    Context.insert<Insts::Ret>(Src0);
+  }
+  void _rol(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Rol>(Dest, Src0);
+  }
+  void _round(Variable *Dest, Operand *Src, Constant *Imm) {
+    Context.insert<Insts::Round>(Dest, Src, Imm);
+  }
+  void _sar(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Sar>(Dest, Src0);
+  }
+  void _sbb(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Sbb>(Dest, Src0);
+  }
+  void _sbb_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
+    Context.insert<Insts::SbbRMW>(DestSrc0, Src1);
+  }
+  void _setcc(Variable *Dest, BrCond Condition) {
+    Context.insert<Insts::Setcc>(Dest, Condition);
+  }
+  void _shl(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Shl>(Dest, Src0);
+  }
+  void _shld(Variable *Dest, Variable *Src0, Operand *Src1) {
+    Context.insert<Insts::Shld>(Dest, Src0, Src1);
+  }
+  void _shr(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Shr>(Dest, Src0);
+  }
+  void _shrd(Variable *Dest, Variable *Src0, Operand *Src1) {
+    Context.insert<Insts::Shrd>(Dest, Src0, Src1);
+  }
+  void _shufps(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Shufps>(Dest, Src0, Src1);
+  }
+  void _movmsk(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Movmsk>(Dest, Src0);
+  }
+  void _sqrt(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Sqrt>(Dest, Src0);
+  }
+  void _store(Operand *Value, X86Operand *Mem) {
+    Context.insert<Insts::Store>(Value, Mem);
+  }
+  void _storep(Variable *Value, X86OperandMem *Mem) {
+    Context.insert<Insts::StoreP>(Value, Mem);
+  }
+  void _storeq(Operand *Value, X86OperandMem *Mem) {
+    Context.insert<Insts::StoreQ>(Value, Mem);
+  }
+  void _stored(Operand *Value, X86OperandMem *Mem) {
+    Context.insert<Insts::StoreD>(Value, Mem);
+  }
+  void _sub(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Sub>(Dest, Src0);
+  }
+  void _sub_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
+    Context.insert<Insts::SubRMW>(DestSrc0, Src1);
+  }
+  void _sub_sp(Operand *Adjustment);
+  void _subps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Subps>(Dest, Src0);
+  }
+  void _subss(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Subss>(Dest, Src0);
+  }
+  void _test(Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Test>(Src0, Src1);
+  }
+  void _ucomiss(Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Ucomiss>(Src0, Src1);
+  }
+  void _ud2() { Context.insert<Insts::UD2>(); }
+  void _unlink_bp();
+  void _xadd(Operand *Dest, Variable *Src, bool Locked) {
+    Context.insert<Insts::Xadd>(Dest, Src, Locked);
+    // The xadd exchanges Dest and Src (modifying Src). Model that update with
+    // a FakeDef followed by a FakeUse.
+    Context.insert<InstFakeDef>(Src, llvm::dyn_cast<Variable>(Dest));
+    _set_dest_redefined();
+    Context.insert<InstFakeUse>(Src);
+  }
+  void _xchg(Operand *Dest, Variable *Src) {
+    Context.insert<Insts::Xchg>(Dest, Src);
+    // The xchg modifies Dest and Src -- model that update with a
+    // FakeDef/FakeUse.
+    Context.insert<InstFakeDef>(Src, llvm::dyn_cast<Variable>(Dest));
+    _set_dest_redefined();
+    Context.insert<InstFakeUse>(Src);
+  }
+  void _xor(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Xor>(Dest, Src0);
+  }
+  void _xorps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Xorps>(Dest, Src0);
+  }
+  void _xor_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
+    Context.insert<Insts::XorRMW>(DestSrc0, Src1);
+  }
+
+  void _iaca_start() {
+    if (!BuildDefs::minimal())
+      Context.insert<Insts::IacaStart>();
+  }
+  void _iaca_end() {
+    if (!BuildDefs::minimal())
+      Context.insert<Insts::IacaEnd>();
+  }
+
+  /// This class helps wrap IACA markers around the code generated by the
+  /// current scope. It means you don't need to put an end before each return.
+  class ScopedIacaMark {
+    ScopedIacaMark(const ScopedIacaMark &) = delete;
+    ScopedIacaMark &operator=(const ScopedIacaMark &) = delete;
+
+  public:
+    ScopedIacaMark(TargetX8632 *Lowering) : Lowering(Lowering) {
+      Lowering->_iaca_start();
+    }
+    ~ScopedIacaMark() { end(); }
+    void end() {
+      if (!Lowering)
+        return;
+      Lowering->_iaca_end();
+      Lowering = nullptr;
+    }
+
+  private:
+    TargetX8632 *Lowering;
+  };
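+
+  // Typical use (sketch): the mark brackets everything lowered in the
+  // enclosing scope, and the destructor emits the end marker on every
+  // return path:
+  //   {
+  //     ScopedIacaMark Mark(this);
+  //     /* lowering to be measured by IACA */
+  //   } // _iaca_end() emitted here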
+
+  bool optimizeScalarMul(Variable *Dest, Operand *Src0, int32_t Src1);
+  void findRMW();
+
+  bool IsEbpBasedFrame = false;
+  size_t RequiredStackAlignment = sizeof(Traits::WordType);
+  size_t SpillAreaSizeBytes = 0;
+  size_t FixedAllocaSizeBytes = 0;
+  size_t FixedAllocaAlignBytes = 0;
+  bool PrologEmitsFixedAllocas = false;
+  uint32_t MaxOutArgsSizeBytes = 0;
+  static std::array<SmallBitVector, RCX86_NUM> TypeToRegisterSet;
+  static std::array<SmallBitVector, RCX86_NUM> TypeToRegisterSetUnfiltered;
+  static std::array<SmallBitVector, Traits::RegisterSet::Reg_NUM>
+      RegisterAliases;
+  SmallBitVector RegsUsed;
+  std::array<VarList, IceType_NUM> PhysicalRegisters;
+  // RebasePtr is a Variable that holds the Rebasing pointer (if any) for the
+  // current sandboxing type.
+  Variable *RebasePtr = nullptr;
+
+private:
+  void lowerShift64(InstArithmetic::OpKind Op, Operand *Src0Lo, Operand *Src0Hi,
+                    Operand *Src1Lo, Variable *DestLo, Variable *DestHi);
+
+  /// Emit the code for a combined operation and consumer instruction, or set
+  /// the destination variable of the operation if Consumer == nullptr.
+  void lowerIcmpAndConsumer(const InstIcmp *Icmp, const Inst *Consumer);
+  void lowerFcmpAndConsumer(const InstFcmp *Fcmp, const Inst *Consumer);
+  void lowerArithAndConsumer(const InstArithmetic *Arith, const Inst *Consumer);
+
+  /// Emit a setcc instruction if Consumer == nullptr; otherwise emit a
+  /// specialized version of Consumer.
+  void setccOrConsumer(BrCond Condition, Variable *Dest, const Inst *Consumer);
+
+  /// Emit a mov [1|0] instruction if Consumer == nullptr; otherwise emit a
+  /// specialized version of Consumer.
+  void movOrConsumer(bool IcmpResult, Variable *Dest, const Inst *Consumer);
+
+  /// Emit the code for instructions with a vector type.
+  void lowerIcmpVector(const InstIcmp *Icmp);
+  void lowerFcmpVector(const InstFcmp *Fcmp);
+  void lowerSelectVector(const InstSelect *Instr);
+
+  /// Helpers for select lowering.
+  void lowerSelectMove(Variable *Dest, BrCond Cond, Operand *SrcT,
+                       Operand *SrcF);
+  void lowerSelectIntMove(Variable *Dest, BrCond Cond, Operand *SrcT,
+                          Operand *SrcF);
+  /// Generic helper to move an arbitrary type from Src to Dest.
+  void lowerMove(Variable *Dest, Operand *Src, bool IsRedefinition);
+
+  /// Optimizations for idiom recognition.
+  bool lowerOptimizeFcmpSelect(const InstFcmp *Fcmp, const InstSelect *Select);
+
+  /// Complains loudly if invoked because the CPU can handle 64-bit types
+  /// natively.
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, void>::type lowerIcmp64(const InstIcmp *,
+                                                              const Inst *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (lowerIcmp64)");
+  }
+  /// lowerIcmp64 handles 64-bit icmp lowering.
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, void>::type
+  lowerIcmp64(const InstIcmp *Icmp, const Inst *Consumer);
+
+  BoolFolding FoldingInfo;
+
+  /// Helpers for lowering ShuffleVector
+  /// @{
+  Variable *lowerShuffleVector_AllFromSameSrc(Operand *Src, SizeT Index0,
+                                              SizeT Index1, SizeT Index2,
+                                              SizeT Index3);
+  static constexpr SizeT IGNORE_INDEX = 0x80000000u;
+  Variable *lowerShuffleVector_TwoFromSameSrc(Operand *Src0, SizeT Index0,
+                                              SizeT Index1, Operand *Src1,
+                                              SizeT Index2, SizeT Index3);
+  static constexpr SizeT UNIFIED_INDEX_0 = 0;
+  static constexpr SizeT UNIFIED_INDEX_1 = 2;
+  Variable *lowerShuffleVector_UnifyFromDifferentSrcs(Operand *Src0,
+                                                      SizeT Index0,
+                                                      Operand *Src1,
+                                                      SizeT Index1);
+  static constexpr SizeT CLEAR_ALL_BITS = 0x80;
+  SizeT PshufbMaskCount = 0;
+  GlobalString lowerShuffleVector_NewMaskName();
+  ConstantRelocatable *lowerShuffleVector_CreatePshufbMask(
+      int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
+      int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
+      int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
+      int8_t Idx15);
+  void lowerShuffleVector_UsingPshufb(Variable *Dest, Operand *Src0,
+                                      Operand *Src1, int8_t Idx0, int8_t Idx1,
+                                      int8_t Idx2, int8_t Idx3, int8_t Idx4,
+                                      int8_t Idx5, int8_t Idx6, int8_t Idx7,
+                                      int8_t Idx8, int8_t Idx9, int8_t Idx10,
+                                      int8_t Idx11, int8_t Idx12, int8_t Idx13,
+                                      int8_t Idx14, int8_t Idx15);
+  /// @}
+
+  static constexpr FixupKind PcRelFixup = Traits::FK_PcRel;
+  static constexpr FixupKind AbsFixup = Traits::FK_Abs;
+
+public:
   static std::unique_ptr<::Ice::TargetLowering> create(Cfg *Func) {
     return makeUnique<TargetX8632>(Func);
   }
@@ -43,50 +1021,54 @@
     return makeUnique<X8632::AssemblerX8632>();
   }
 
-protected:
-  void _add_sp(Operand *Adjustment);
-  void _mov_sp(Operand *NewValue);
-  void _sub_sp(Operand *Adjustment);
-  void _link_bp();
-  void _unlink_bp();
-  void _push_reg(RegNumT RegNum);
-  void _pop_reg(RegNumT RegNum);
+private:
+  ENABLE_MAKE_UNIQUE;
 
-  void emitStackProbe(size_t StackSizeBytes);
-  void lowerIndirectJump(Variable *JumpTarget);
-  Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg,
-                         size_t NumVariadicFpArgs = 0) override;
-  Variable *moveReturnValueToRegister(Operand *Value, Type ReturnType) override;
+  explicit TargetX8632(Cfg *Func);
+};
+
+class TargetDataX8632 final : public TargetDataLowering {
+  using Traits = TargetX8632Traits;
+  TargetDataX8632() = delete;
+  TargetDataX8632(const TargetDataX8632 &) = delete;
+  TargetDataX8632 &operator=(const TargetDataX8632 &) = delete;
+
+public:
+  ~TargetDataX8632() override = default;
+
+  static std::unique_ptr<TargetDataLowering> create(GlobalContext *Ctx) {
+    return makeUnique<TargetDataX8632>(Ctx);
+  }
+
+  void lowerGlobals(const VariableDeclarationList &Vars,
+                    const std::string &SectionSuffix) override;
+  void lowerConstants() override;
+  void lowerJumpTables() override;
 
 private:
   ENABLE_MAKE_UNIQUE;
-  friend class X8632::TargetX86Base<X8632::Traits>;
 
-  explicit TargetX8632(Cfg *Func) : TargetX86Base(Func) {}
+  explicit TargetDataX8632(GlobalContext *Ctx) : TargetDataLowering(Ctx) {}
+  template <typename T> static void emitConstantPool(GlobalContext *Ctx);
 };
 
-// The -Wundefined-var-template warning requires to forward-declare static
-// members of template class specializations. Note that "An explicit
-// specialization of a static data member of a template is a definition if the
-// declaration includes an initializer; otherwise, it is a declaration."
-// Visual Studio has a bug which treats these declarations as definitions,
-// leading to multiple definition errors. Since we only enable
-// -Wundefined-var-template for Clang, omit these declarations on other
-// compilers.
-#if defined(__clang__)
-template <>
-std::array<SmallBitVector, RCX86_NUM>
-    TargetX86Base<X8632::Traits>::TypeToRegisterSet;
+class TargetHeaderX86 : public TargetHeaderLowering {
+  TargetHeaderX86() = delete;
+  TargetHeaderX86(const TargetHeaderX86 &) = delete;
+  TargetHeaderX86 &operator=(const TargetHeaderX86 &) = delete;
 
-template <>
-std::array<SmallBitVector, RCX86_NUM>
-    TargetX86Base<X8632::Traits>::TypeToRegisterSetUnfiltered;
+public:
+  ~TargetHeaderX86() = default;
 
-template <>
-std::array<SmallBitVector,
-           TargetX86Base<X8632::Traits>::Traits::RegisterSet::Reg_NUM>
-    TargetX86Base<X8632::Traits>::RegisterAliases;
-#endif // defined(__clang__)
+  static std::unique_ptr<TargetHeaderLowering> create(GlobalContext *Ctx) {
+    return makeUnique<TargetHeaderX86>(Ctx);
+  }
+
+private:
+  ENABLE_MAKE_UNIQUE;
+
+  explicit TargetHeaderX86(GlobalContext *Ctx) : TargetHeaderLowering(Ctx) {}
+};
 
 } // end of namespace X8632
 } // end of namespace Ice
diff --git a/third_party/subzero/src/IceTargetLoweringX8632Base.h b/third_party/subzero/src/IceTargetLoweringX8632Base.h
deleted file mode 100644
index b39c8e6..0000000
--- a/third_party/subzero/src/IceTargetLoweringX8632Base.h
+++ /dev/null
@@ -1,1042 +0,0 @@
-//===- subzero/src/IceTargetLoweringX8632Base.h - x86 lowering ----*- C++
-//-*-===//
-//
-//                        The Subzero Code Generator
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// \brief Declares the TargetLoweringX86 template class, which implements the
-/// TargetLowering base interface for the x86 architecture.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef SUBZERO_SRC_ICETARGETLOWERINGX8632BASE_H
-#define SUBZERO_SRC_ICETARGETLOWERINGX8632BASE_H
-
-#include "IceDefs.h"
-#include "IceInst.h"
-#include "IceSwitchLowering.h"
-#include "IceTargetLowering.h"
-#include "IceTargetLoweringX86RegClass.h"
-#include "IceUtils.h"
-
-#include <array>
-#include <type_traits>
-#include <utility>
-
-namespace Ice {
-namespace X8632 {
-
-using namespace ::Ice::X86;
-
-template <typename Traits> class BoolFolding;
-
-/// TargetX86Base is a template for all X86 Targets, and it relies on the CRT
-/// pattern for generating code, delegating to actual backends target-specific
-/// lowerings (e.g., call, ret, and intrinsics.).
-///
-/// Note: Ideally, we should be able to
-///
-///  static_assert(std::is_base_of<TargetX86Base<TraitsType>,
-///  Machine>::value);
-///
-/// but that does not work: the compiler does not know that Machine inherits
-/// from TargetX86Base at this point in translation.
-template <typename TraitsType> class TargetX86Base : public TargetLowering {
-  TargetX86Base() = delete;
-  TargetX86Base(const TargetX86Base &) = delete;
-  TargetX86Base &operator=(const TargetX86Base &) = delete;
-
-public:
-  using Traits = TraitsType;
-  using ConcreteTarget = typename Traits::ConcreteTarget;
-  using InstructionSetEnum = typename Traits::InstructionSet;
-
-  using BrCond = CondX86::BrCond;
-  using CmppsCond = CondX86::CmppsCond;
-
-  using X86Address = typename Traits::Address;
-  using X86Operand = typename Traits::X86Operand;
-  using X86OperandMem = typename Traits::X86OperandMem;
-  using SegmentRegisters = typename Traits::X86OperandMem::SegmentRegisters;
-
-  using InstX86Br = typename Traits::Insts::Br;
-  using InstX86FakeRMW = typename Traits::Insts::FakeRMW;
-  using InstX86Label = typename Traits::Insts::Label;
-
-  ~TargetX86Base() override = default;
-
-  static void staticInit(GlobalContext *Ctx);
-  static bool shouldBePooled(const Constant *C);
-  static ::Ice::Type getPointerType();
-
-  static FixupKind getPcRelFixup() { return PcRelFixup; }
-  static FixupKind getAbsFixup() { return AbsFixup; }
-
-  void translateOm1() override;
-  void translateO2() override;
-  void doLoadOpt();
-  bool doBranchOpt(Inst *I, const CfgNode *NextNode) override;
-
-  SizeT getNumRegisters() const override {
-    return Traits::RegisterSet::Reg_NUM;
-  }
-
-  Inst *createLoweredMove(Variable *Dest, Variable *SrcVar) override {
-    if (isVectorType(Dest->getType())) {
-      return Traits::Insts::Movp::create(Func, Dest, SrcVar);
-    }
-    return Traits::Insts::Mov::create(Func, Dest, SrcVar);
-  }
-
-  Variable *getPhysicalRegister(RegNumT RegNum,
-                                Type Ty = IceType_void) override;
-  const char *getRegName(RegNumT RegNum, Type Ty) const override;
-  static const char *getRegClassName(RegClass C) {
-    auto ClassNum = static_cast<RegClassX86>(C);
-    assert(ClassNum < RCX86_NUM);
-    switch (ClassNum) {
-    default:
-      assert(C < RC_Target);
-      return regClassString(C);
-    case RCX86_Is64To8:
-      return "i64to8"; // 64-bit GPR truncable to i8
-    case RCX86_Is32To8:
-      return "i32to8"; // 32-bit GPR truncable to i8
-    case RCX86_Is16To8:
-      return "i16to8"; // 16-bit GPR truncable to i8
-    case RCX86_IsTrunc8Rcvr:
-      return "i8from"; // 8-bit GPR truncable from wider GPRs
-    case RCX86_IsAhRcvr:
-      return "i8fromah"; // 8-bit GPR that ah can be assigned to
-    }
-  }
-  SmallBitVector getRegisterSet(RegSetMask Include,
-                                RegSetMask Exclude) const override;
-  const SmallBitVector &
-  getRegistersForVariable(const Variable *Var) const override {
-    RegClass RC = Var->getRegClass();
-    assert(static_cast<RegClassX86>(RC) < RCX86_NUM);
-    return TypeToRegisterSet[RC];
-  }
-
-  const SmallBitVector &
-  getAllRegistersForVariable(const Variable *Var) const override {
-    RegClass RC = Var->getRegClass();
-    assert(static_cast<RegClassX86>(RC) < RCX86_NUM);
-    return TypeToRegisterSetUnfiltered[RC];
-  }
-
-  const SmallBitVector &getAliasesForRegister(RegNumT Reg) const override {
-    Reg.assertIsValid();
-    return RegisterAliases[Reg];
-  }
-
-  bool hasFramePointer() const override { return IsEbpBasedFrame; }
-  void setHasFramePointer() override { IsEbpBasedFrame = true; }
-  RegNumT getStackReg() const override { return Traits::StackPtr; }
-  RegNumT getFrameReg() const override { return Traits::FramePtr; }
-  RegNumT getFrameOrStackReg() const override {
-    // If the stack pointer needs to be aligned, then the frame pointer is
-    // unaligned, so always use the stack pointer.
-    if (needsStackPointerAlignment())
-      return getStackReg();
-    return IsEbpBasedFrame ? getFrameReg() : getStackReg();
-  }
-  size_t typeWidthInBytesOnStack(Type Ty) const override {
-    // Round up to the next multiple of WordType bytes.
-    const uint32_t WordSizeInBytes = typeWidthInBytes(Traits::WordType);
-    return Utils::applyAlignment(typeWidthInBytes(Ty), WordSizeInBytes);
-  }
-  uint32_t getStackAlignment() const override {
-    return Traits::X86_STACK_ALIGNMENT_BYTES;
-  }
-  bool needsStackPointerAlignment() const override {
-    // If the ABI's stack alignment is smaller than the vector size (16 bytes),
-    // use the (realigned) stack pointer for addressing any stack variables.
-    return Traits::X86_STACK_ALIGNMENT_BYTES < 16;
-  }
-  void reserveFixedAllocaArea(size_t Size, size_t Align) override {
-    FixedAllocaSizeBytes = Size;
-    assert(llvm::isPowerOf2_32(Align));
-    FixedAllocaAlignBytes = Align;
-    PrologEmitsFixedAllocas = true;
-  }
-  /// Returns the (negative) offset from ebp/rbp where the fixed Allocas start.
-  int32_t getFrameFixedAllocaOffset() const override {
-    return FixedAllocaSizeBytes - (SpillAreaSizeBytes - maxOutArgsSizeBytes());
-  }
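  // Worked example (illustrative numbers only): with SpillAreaSizeBytes = 64,
  // maxOutArgsSizeBytes() = 16, and FixedAllocaSizeBytes = 32, the fixed
  // allocas start at 32 - (64 - 16) = -16, i.e. 16 bytes below ebp/rbp.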
-  virtual uint32_t maxOutArgsSizeBytes() const override {
-    return MaxOutArgsSizeBytes;
-  }
-  virtual void updateMaxOutArgsSizeBytes(uint32_t Size) {
-    MaxOutArgsSizeBytes = std::max(MaxOutArgsSizeBytes, Size);
-  }
-
-  bool shouldSplitToVariable64On32(Type Ty) const override {
-    return Traits::Is64Bit ? false : Ty == IceType_i64;
-  }
-
-  SizeT getMinJumpTableSize() const override { return 4; }
-
-  void emitVariable(const Variable *Var) const override;
-
-  void emit(const ConstantInteger32 *C) const final;
-  void emit(const ConstantInteger64 *C) const final;
-  void emit(const ConstantFloat *C) const final;
-  void emit(const ConstantDouble *C) const final;
-  void emit(const ConstantUndef *C) const final;
-  void emit(const ConstantRelocatable *C) const final;
-
-  void initNodeForLowering(CfgNode *Node) override;
-
-  template <typename T = Traits>
-  typename std::enable_if<!T::Is64Bit, Operand>::type *
-  loOperand(Operand *Operand);
-  template <typename T = Traits>
-  typename std::enable_if<T::Is64Bit, Operand>::type *loOperand(Operand *) {
-    llvm::report_fatal_error(
-        "Hey, yo! This is x86-64. Watcha doin'? (loOperand)");
-  }
-
-  template <typename T = Traits>
-  typename std::enable_if<!T::Is64Bit, Operand>::type *
-  hiOperand(Operand *Operand);
-  template <typename T = Traits>
-  typename std::enable_if<T::Is64Bit, Operand>::type *hiOperand(Operand *) {
-    llvm::report_fatal_error(
-        "Hey, yo! This is x86-64. Watcha doin'? (hiOperand)");
-  }
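  // A minimal sketch (not actual lowering code) of how these helpers are used
  // on x86-32, where an i64 assignment is split into two 32-bit halves; Dest
  // and Src are assumed to be i64 values:
  //
  //   auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
  //   auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
  //   _mov(DestLo, loOperand(Src));
  //   _mov(DestHi, hiOperand(Src));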
-
-  void addProlog(CfgNode *Node) override;
-  void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
-                              size_t BasicFrameOffset, size_t StackAdjBytes,
-                              size_t &InArgsSizeBytes);
-  void addEpilog(CfgNode *Node) override;
-  X86Address stackVarToAsmOperand(const Variable *Var) const;
-
-  InstructionSetEnum getInstructionSet() const { return InstructionSet; }
-  Operand *legalizeUndef(Operand *From, RegNumT RegNum = RegNumT());
-
-protected:
-  explicit TargetX86Base(Cfg *Func);
-
-  void postLower() override;
-
-  void lowerAlloca(const InstAlloca *Instr) override;
-  void lowerArguments() override;
-  void lowerArithmetic(const InstArithmetic *Instr) override;
-  void lowerAssign(const InstAssign *Instr) override;
-  void lowerBr(const InstBr *Instr) override;
-  void lowerBreakpoint(const InstBreakpoint *Instr) override;
-  void lowerCall(const InstCall *Instr) override;
-  void lowerCast(const InstCast *Instr) override;
-  void lowerExtractElement(const InstExtractElement *Instr) override;
-  void lowerFcmp(const InstFcmp *Instr) override;
-  void lowerIcmp(const InstIcmp *Instr) override;
-
-  void lowerIntrinsic(const InstIntrinsic *Instr) override;
-  void lowerInsertElement(const InstInsertElement *Instr) override;
-  void lowerLoad(const InstLoad *Instr) override;
-  void lowerPhi(const InstPhi *Instr) override;
-  void lowerRet(const InstRet *Instr) override;
-  void lowerSelect(const InstSelect *Instr) override;
-  void lowerShuffleVector(const InstShuffleVector *Instr) override;
-  void lowerStore(const InstStore *Instr) override;
-  void lowerSwitch(const InstSwitch *Instr) override;
-  void lowerUnreachable(const InstUnreachable *Instr) override;
-  void lowerOther(const Inst *Instr) override;
-  void lowerRMW(const InstX86FakeRMW *RMW);
-  void prelowerPhis() override;
-  uint32_t getCallStackArgumentsSizeBytes(const CfgVector<Type> &ArgTypes,
-                                          Type ReturnType);
-  uint32_t getCallStackArgumentsSizeBytes(const InstCall *Instr) override;
-  void genTargetHelperCallFor(Inst *Instr) override;
-
-  /// OptAddr wraps all the possible operands that an x86 address might have.
-  struct OptAddr {
-    Variable *Base = nullptr;
-    Variable *Index = nullptr;
-    uint16_t Shift = 0;
-    int32_t Offset = 0;
-    ConstantRelocatable *Relocatable = nullptr;
-  };
-
-  /// Builds information for a canonical address expression:
-  ///   <Relocatable + Offset>(Base, Index, Shift)
-  X86OperandMem *computeAddressOpt(const Inst *Instr, Type MemType,
-                                   Operand *Addr);
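  // Illustrative example: the address of A[i] for 4-byte elements can be
  // canonicalized as <A + 0>(Base=nullptr, Index=i, Shift=2), which denotes
  // A + 0 + (i << 2).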
-  void doAddressOptOther() override;
-  void doAddressOptLoad() override;
-  void doAddressOptStore() override;
-  void doAddressOptLoadSubVector() override;
-  void doAddressOptStoreSubVector() override;
-  void doMockBoundsCheck(Operand *Opnd) override;
-
-  /// Naive lowering of cmpxchg.
-  void lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr, Operand *Expected,
-                          Operand *Desired);
-  /// Attempt a more optimized lowering of cmpxchg. Returns true if optimized.
-  bool tryOptimizedCmpxchgCmpBr(Variable *DestPrev, Operand *Ptr,
-                                Operand *Expected, Operand *Desired);
-  void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
-                      Operand *Val);
-  void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal,
-                       Operand *SecondVal);
-  /// Load from memory for a given type.
-  void typedLoad(Type Ty, Variable *Dest, Variable *Base, Constant *Offset);
-  /// Store to memory for a given type.
-  void typedStore(Type Ty, Variable *Value, Variable *Base, Constant *Offset);
-  /// Copy memory of given type from Src to Dest using OffsetAmt on both.
-  void copyMemory(Type Ty, Variable *Dest, Variable *Src, int32_t OffsetAmt);
-  /// Replace some calls to memcpy with inline instructions.
-  void lowerMemcpy(Operand *Dest, Operand *Src, Operand *Count);
-  /// Replace some calls to memmove with inline instructions.
-  void lowerMemmove(Operand *Dest, Operand *Src, Operand *Count);
-  /// Replace some calls to memset with inline instructions.
-  void lowerMemset(Operand *Dest, Operand *Val, Operand *Count);
-
-  void lowerIndirectJump(Variable *JumpTarget) {
-    // Without std::move below, the compiler deduces that the argument to
-    // lowerIndirectJump is a Variable *&, not a Variable *.
-    dispatchToConcrete(&Traits::ConcreteTarget::lowerIndirectJump,
-                       std::move(JumpTarget));
-  }
-
-  /// Check that the comparison is in [Min,Max]. The flags register will be
-  /// set as follows:
-  ///   - below-or-equal, if in range
-  ///   - above, if not in range
-  /// The index into the range is returned.
-  Operand *lowerCmpRange(Operand *Comparison, uint64_t Min, uint64_t Max);
-  /// Lowering of a cluster of switch cases. If the case is not matched,
-  /// control passes to the default label provided. If the default label is
-  /// nullptr, control falls through to the next instruction. DoneCmp should be
-  /// true if the flags already contain the result of a comparison with the
-  /// Comparison.
-  void lowerCaseCluster(const CaseCluster &Case, Operand *Src0, bool DoneCmp,
-                        CfgNode *DefaultLabel = nullptr);
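  // A sketch of the classic range-check idiom behind lowerCmpRange
  // (illustrative assembly, not emitted verbatim):
  //
  //   sub Comparison, Min       ; bias so the range starts at 0
  //   cmp Comparison, Max - Min
  //   jbe in_range              ; one unsigned compare tests both bounds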
-
-  using LowerBinOp = void (TargetX86Base::*)(Variable *, Operand *);
-  void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi,
-                                Variable *Dest, Operand *Ptr, Operand *Val);
-
-  void eliminateNextVectorSextInstruction(Variable *SignExtendedResult);
-
-  void emitStackProbe(size_t StackSizeBytes) {
-    dispatchToConcrete(&Traits::ConcreteTarget::emitStackProbe,
-                       std::move(StackSizeBytes));
-  }
-
-  /// Emit just the call instruction (without argument or return variable
-  /// processing).
-  virtual Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg,
-                                 size_t NumVariadicFpArgs = 0) = 0;
-  /// Materialize the moves needed to return a value of the specified type.
-  virtual Variable *moveReturnValueToRegister(Operand *Value,
-                                              Type ReturnType) = 0;
-
-  /// Emit a jump table to the constant pool.
-  void emitJumpTable(const Cfg *Func,
-                     const InstJumpTable *JumpTable) const override;
-
-  /// Emit a fake use of esp to make sure esp stays alive for the entire
-  /// function. Otherwise some esp adjustments get dead-code eliminated.
-  void keepEspLiveAtExit() {
-    Variable *esp =
-        Func->getTarget()->getPhysicalRegister(getStackReg(), Traits::WordType);
-    Context.insert<InstFakeUse>(esp);
-  }
-
-  /// Operand legalization helpers. To deal with address mode constraints, the
-  /// helpers will create a new Operand and emit instructions that guarantee
-  /// that the Operand kind is one of those indicated by the LegalMask (a
-  /// bitmask of allowed kinds). If the input Operand is known to already meet
-  /// the constraints, it may be simply returned as the result, without creating
-  /// any new instructions or operands.
-  enum OperandLegalization {
-    Legal_None = 0,
-    Legal_Reg = 1 << 0, // physical register, not stack location
-    Legal_Imm = 1 << 1,
-    Legal_Mem = 1 << 2, // includes [eax+4*ecx] as well as [esp+12]
-    Legal_Rematerializable = 1 << 3,
-    Legal_AddrAbs = 1 << 4, // ConstantRelocatable doesn't have to add RebasePtr
-    Legal_Default = ~(Legal_Rematerializable | Legal_AddrAbs)
-    // TODO(stichnot): Figure out whether this default works for x86-64.
-  };
-  using LegalMask = uint32_t;
-  Operand *legalize(Operand *From, LegalMask Allowed = Legal_Default,
-                    RegNumT RegNum = RegNumT());
-  Variable *legalizeToReg(Operand *From, RegNumT RegNum = RegNumT());
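  // Typical call sites, sketched (Src0 is an assumed Operand * in scope):
  //
  //   Operand *Src = legalize(Src0, Legal_Reg | Legal_Imm); // reg or imm only
  //   Variable *T = legalizeToReg(Src0); // force into a physical register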
-  /// Legalize the first source operand for use in the cmp instruction.
-  Operand *legalizeSrc0ForCmp(Operand *Src0, Operand *Src1);
-  /// Turn a pointer operand into a memory operand that can be used by a real
-  /// load/store operation. Legalizes the operand as well. This is a nop if the
-  /// operand is already a legal memory operand.
-  X86OperandMem *formMemoryOperand(Operand *Ptr, Type Ty,
-                                   bool DoLegalize = true);
-
-  Variable *makeReg(Type Ty, RegNumT RegNum = RegNumT());
-  static Type stackSlotType();
-
-  static constexpr uint32_t NoSizeLimit = 0;
-  /// Returns the largest type which is equal to or smaller than Size bytes.
-  /// The type is suitable for copying memory, i.e., a load and store will be a
-  /// single instruction (for example x86 will get f64, not i64).
-  static Type largestTypeInSize(uint32_t Size, uint32_t MaxSize = NoSizeLimit);
-  /// Returns the smallest type which is equal to or larger than Size bytes. If
-  /// one doesn't exist, then the largest type smaller than Size bytes is
-  /// returned. The type is suitable for memory copies as described at
-  /// largestTypeInSize.
-  static Type firstTypeThatFitsSize(uint32_t Size,
-                                    uint32_t MaxSize = NoSizeLimit);
-
-  Variable *copyToReg8(Operand *Src, RegNumT RegNum = RegNumT());
-  Variable *copyToReg(Operand *Src, RegNumT RegNum = RegNumT());
-
-  /// Returns a register containing all zeros, without affecting the FLAGS
-  /// register, using the best instruction for the type.
-  Variable *makeZeroedRegister(Type Ty, RegNumT RegNum = RegNumT());
-
-  /// \name Returns a vector in a register with the given constant entries.
-  /// @{
-  Variable *makeVectorOfZeros(Type Ty, RegNumT RegNum = RegNumT());
-  Variable *makeVectorOfOnes(Type Ty, RegNumT RegNum = RegNumT());
-  Variable *makeVectorOfMinusOnes(Type Ty, RegNumT RegNum = RegNumT());
-  Variable *makeVectorOfHighOrderBits(Type Ty, RegNumT RegNum = RegNumT());
-  Variable *makeVectorOfFabsMask(Type Ty, RegNumT RegNum = RegNumT());
-  /// @}
-
-  /// Return a memory operand corresponding to a stack allocated Variable.
-  X86OperandMem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
-                                              uint32_t Offset = 0);
-
-  /// The following are helpers that insert lowered x86 instructions with
-  /// minimal syntactic overhead, so that the lowering code can look as close to
-  /// assembly as practical.
-  void _adc(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Adc>(Dest, Src0);
-  }
-  void _adc_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::AdcRMW>(DestSrc0, Src1);
-  }
-  void _add(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Add>(Dest, Src0);
-  }
-  void _add_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::AddRMW>(DestSrc0, Src1);
-  }
-  void _addps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Addps>(Dest, Src0);
-  }
-  void _addss(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Addss>(Dest, Src0);
-  }
-  void _add_sp(Operand *Adjustment) {
-    dispatchToConcrete(&Traits::ConcreteTarget::_add_sp, std::move(Adjustment));
-  }
-  void _and(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::And>(Dest, Src0);
-  }
-  void _andnps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Andnps>(Dest, Src0);
-  }
-  void _andps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Andps>(Dest, Src0);
-  }
-  void _and_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::AndRMW>(DestSrc0, Src1);
-  }
-  void _blendvps(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Blendvps>(Dest, Src0, Src1);
-  }
-  void _br(BrCond Condition, CfgNode *TargetTrue, CfgNode *TargetFalse) {
-    Context.insert<InstX86Br>(TargetTrue, TargetFalse, Condition,
-                              InstX86Br::Far);
-  }
-  void _br(CfgNode *Target) {
-    Context.insert<InstX86Br>(Target, InstX86Br::Far);
-  }
-  void _br(BrCond Condition, CfgNode *Target) {
-    Context.insert<InstX86Br>(Target, Condition, InstX86Br::Far);
-  }
-  void _br(BrCond Condition, InstX86Label *Label,
-           typename InstX86Br::Mode Kind = InstX86Br::Near) {
-    Context.insert<InstX86Br>(Label, Condition, Kind);
-  }
-  void _bsf(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Bsf>(Dest, Src0);
-  }
-  void _bsr(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Bsr>(Dest, Src0);
-  }
-  void _bswap(Variable *SrcDest) {
-    Context.insert<typename Traits::Insts::Bswap>(SrcDest);
-  }
-  void _cbwdq(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Cbwdq>(Dest, Src0);
-  }
-  void _cmov(Variable *Dest, Operand *Src0, BrCond Condition) {
-    Context.insert<typename Traits::Insts::Cmov>(Dest, Src0, Condition);
-  }
-  void _cmp(Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Icmp>(Src0, Src1);
-  }
-  void _cmpps(Variable *Dest, Operand *Src0, CmppsCond Condition) {
-    Context.insert<typename Traits::Insts::Cmpps>(Dest, Src0, Condition);
-  }
-  void _cmpxchg(Operand *DestOrAddr, Variable *Eax, Variable *Desired,
-                bool Locked) {
-    Context.insert<typename Traits::Insts::Cmpxchg>(DestOrAddr, Eax, Desired,
-                                                    Locked);
-    // Mark eax as possibly modified by cmpxchg.
-    Context.insert<InstFakeDef>(Eax, llvm::dyn_cast<Variable>(DestOrAddr));
-    _set_dest_redefined();
-    Context.insert<InstFakeUse>(Eax);
-  }
-  void _cmpxchg8b(X86OperandMem *Addr, Variable *Edx, Variable *Eax,
-                  Variable *Ecx, Variable *Ebx, bool Locked) {
-    Context.insert<typename Traits::Insts::Cmpxchg8b>(Addr, Edx, Eax, Ecx, Ebx,
-                                                      Locked);
-    // Mark edx and eax as possibly modified by cmpxchg8b.
-    Context.insert<InstFakeDef>(Edx);
-    _set_dest_redefined();
-    Context.insert<InstFakeUse>(Edx);
-    Context.insert<InstFakeDef>(Eax);
-    _set_dest_redefined();
-    Context.insert<InstFakeUse>(Eax);
-  }
-  void _cvt(Variable *Dest, Operand *Src0,
-            typename Traits::Insts::Cvt::CvtVariant Variant) {
-    Context.insert<typename Traits::Insts::Cvt>(Dest, Src0, Variant);
-  }
-  void _round(Variable *Dest, Operand *Src0, Operand *Imm) {
-    Context.insert<typename Traits::Insts::Round>(Dest, Src0, Imm);
-  }
-  void _div(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Div>(Dest, Src0, Src1);
-  }
-  void _divps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Divps>(Dest, Src0);
-  }
-  void _divss(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Divss>(Dest, Src0);
-  }
-  void _fld(Operand *Src0) {
-    Context.insert<typename Traits::Insts::Fld>(Src0);
-  }
-  void _fstp(Variable *Dest) {
-    Context.insert<typename Traits::Insts::Fstp>(Dest);
-  }
-  void _idiv(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Idiv>(Dest, Src0, Src1);
-  }
-  void _imul(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Imul>(Dest, Src0);
-  }
-  void _imul_imm(Variable *Dest, Operand *Src0, Constant *Imm) {
-    Context.insert<typename Traits::Insts::ImulImm>(Dest, Src0, Imm);
-  }
-  void _insertps(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Insertps>(Dest, Src0, Src1);
-  }
-  void _int3() { Context.insert<typename Traits::Insts::Int3>(); }
-  void _jmp(Operand *Target) {
-    Context.insert<typename Traits::Insts::Jmp>(Target);
-  }
-  void _lea(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Lea>(Dest, Src0);
-  }
-  void _link_bp() { dispatchToConcrete(&Traits::ConcreteTarget::_link_bp); }
-  void _push_reg(RegNumT RegNum) {
-    dispatchToConcrete(&Traits::ConcreteTarget::_push_reg, std::move(RegNum));
-  }
-  void _pop_reg(RegNumT RegNum) {
-    dispatchToConcrete(&Traits::ConcreteTarget::_pop_reg, std::move(RegNum));
-  }
-  void _mfence() { Context.insert<typename Traits::Insts::Mfence>(); }
-  /// Moves can be used to redefine registers, creating "partial kills" for
-  /// liveness.  Mark where moves are used in this way.
-  void _redefined(Inst *MovInst, bool IsRedefinition = true) {
-    if (IsRedefinition)
-      MovInst->setDestRedefined();
-  }
-  /// If Dest=nullptr is passed in, then a new variable is created, marked as
-  /// infinite register allocation weight, and returned through the in/out Dest
-  /// argument.
-  typename Traits::Insts::Mov *_mov(Variable *&Dest, Operand *Src0,
-                                    RegNumT RegNum = RegNumT()) {
-    if (Dest == nullptr)
-      Dest = makeReg(Src0->getType(), RegNum);
-    return Context.insert<typename Traits::Insts::Mov>(Dest, Src0);
-  }
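  // The nullptr-Dest convention in action, sketched: a fresh infinite-weight
  // temporary is created and returned through the in/out argument.
  //
  //   Variable *T = nullptr;
  //   _mov(T, Src0); // T gets Src0's type, then "mov T, Src0" is inserted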
-  void _mov_sp(Operand *NewValue) {
-    dispatchToConcrete(&Traits::ConcreteTarget::_mov_sp, std::move(NewValue));
-  }
-  typename Traits::Insts::Movp *_movp(Variable *Dest, Operand *Src0) {
-    return Context.insert<typename Traits::Insts::Movp>(Dest, Src0);
-  }
-  void _movd(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Movd>(Dest, Src0);
-  }
-  void _movq(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Movq>(Dest, Src0);
-  }
-  void _movss(Variable *Dest, Variable *Src0) {
-    Context.insert<typename Traits::Insts::MovssRegs>(Dest, Src0);
-  }
-  void _movsx(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Movsx>(Dest, Src0);
-  }
-  typename Traits::Insts::Movzx *_movzx(Variable *Dest, Operand *Src0) {
-    return Context.insert<typename Traits::Insts::Movzx>(Dest, Src0);
-  }
-  void _maxss(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Maxss>(Dest, Src0);
-  }
-  void _minss(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Minss>(Dest, Src0);
-  }
-  void _maxps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Maxps>(Dest, Src0);
-  }
-  void _minps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Minps>(Dest, Src0);
-  }
-  void _mul(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Mul>(Dest, Src0, Src1);
-  }
-  void _mulps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Mulps>(Dest, Src0);
-  }
-  void _mulss(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Mulss>(Dest, Src0);
-  }
-  void _neg(Variable *SrcDest) {
-    Context.insert<typename Traits::Insts::Neg>(SrcDest);
-  }
-  void _nop(SizeT Variant) {
-    Context.insert<typename Traits::Insts::Nop>(Variant);
-  }
-  void _or(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Or>(Dest, Src0);
-  }
-  void _orps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Orps>(Dest, Src0);
-  }
-  void _or_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::OrRMW>(DestSrc0, Src1);
-  }
-  void _padd(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Padd>(Dest, Src0);
-  }
-  void _padds(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Padds>(Dest, Src0);
-  }
-  void _paddus(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Paddus>(Dest, Src0);
-  }
-  void _pand(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pand>(Dest, Src0);
-  }
-  void _pandn(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pandn>(Dest, Src0);
-  }
-  void _pblendvb(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Pblendvb>(Dest, Src0, Src1);
-  }
-  void _pcmpeq(Variable *Dest, Operand *Src0,
-               Type ArithmeticTypeOverride = IceType_void) {
-    Context.insert<typename Traits::Insts::Pcmpeq>(Dest, Src0,
-                                                   ArithmeticTypeOverride);
-  }
-  void _pcmpgt(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pcmpgt>(Dest, Src0);
-  }
-  void _pextr(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Pextr>(Dest, Src0, Src1);
-  }
-  void _pinsr(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Pinsr>(Dest, Src0, Src1);
-  }
-  void _pmull(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pmull>(Dest, Src0);
-  }
-  void _pmulhw(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pmulhw>(Dest, Src0);
-  }
-  void _pmulhuw(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pmulhuw>(Dest, Src0);
-  }
-  void _pmaddwd(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pmaddwd>(Dest, Src0);
-  }
-  void _pmuludq(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pmuludq>(Dest, Src0);
-  }
-  void _pop(Variable *Dest) {
-    Context.insert<typename Traits::Insts::Pop>(Dest);
-  }
-  void _por(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Por>(Dest, Src0);
-  }
-  void _punpckl(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Punpckl>(Dest, Src0);
-  }
-  void _punpckh(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Punpckh>(Dest, Src0);
-  }
-  void _packss(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Packss>(Dest, Src0);
-  }
-  void _packus(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Packus>(Dest, Src0);
-  }
-  void _pshufb(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pshufb>(Dest, Src0);
-  }
-  void _pshufd(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Pshufd>(Dest, Src0, Src1);
-  }
-  void _psll(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Psll>(Dest, Src0);
-  }
-  void _psra(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Psra>(Dest, Src0);
-  }
-  void _psrl(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Psrl>(Dest, Src0);
-  }
-  void _psub(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Psub>(Dest, Src0);
-  }
-  void _psubs(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Psubs>(Dest, Src0);
-  }
-  void _psubus(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Psubus>(Dest, Src0);
-  }
-  void _push(Operand *Src0) {
-    Context.insert<typename Traits::Insts::Push>(Src0);
-  }
-  void _pxor(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pxor>(Dest, Src0);
-  }
-  void _ret(Variable *Src0 = nullptr) {
-    Context.insert<typename Traits::Insts::Ret>(Src0);
-  }
-  void _rol(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Rol>(Dest, Src0);
-  }
-  void _round(Variable *Dest, Operand *Src, Constant *Imm) {
-    Context.insert<typename Traits::Insts::Round>(Dest, Src, Imm);
-  }
-  void _sar(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Sar>(Dest, Src0);
-  }
-  void _sbb(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Sbb>(Dest, Src0);
-  }
-  void _sbb_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::SbbRMW>(DestSrc0, Src1);
-  }
-  void _setcc(Variable *Dest, BrCond Condition) {
-    Context.insert<typename Traits::Insts::Setcc>(Dest, Condition);
-  }
-  void _shl(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Shl>(Dest, Src0);
-  }
-  void _shld(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Shld>(Dest, Src0, Src1);
-  }
-  void _shr(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Shr>(Dest, Src0);
-  }
-  void _shrd(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Shrd>(Dest, Src0, Src1);
-  }
-  void _shufps(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Shufps>(Dest, Src0, Src1);
-  }
-  void _movmsk(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Movmsk>(Dest, Src0);
-  }
-  void _sqrt(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Sqrt>(Dest, Src0);
-  }
-  void _store(Operand *Value, X86Operand *Mem) {
-    Context.insert<typename Traits::Insts::Store>(Value, Mem);
-  }
-  void _storep(Variable *Value, X86OperandMem *Mem) {
-    Context.insert<typename Traits::Insts::StoreP>(Value, Mem);
-  }
-  void _storeq(Operand *Value, X86OperandMem *Mem) {
-    Context.insert<typename Traits::Insts::StoreQ>(Value, Mem);
-  }
-  void _stored(Operand *Value, X86OperandMem *Mem) {
-    Context.insert<typename Traits::Insts::StoreD>(Value, Mem);
-  }
-  void _sub(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Sub>(Dest, Src0);
-  }
-  void _sub_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::SubRMW>(DestSrc0, Src1);
-  }
-  void _sub_sp(Operand *Adjustment) {
-    dispatchToConcrete(&Traits::ConcreteTarget::_sub_sp, std::move(Adjustment));
-  }
-  void _subps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Subps>(Dest, Src0);
-  }
-  void _subss(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Subss>(Dest, Src0);
-  }
-  void _test(Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Test>(Src0, Src1);
-  }
-  void _ucomiss(Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Ucomiss>(Src0, Src1);
-  }
-  void _ud2() { Context.insert<typename Traits::Insts::UD2>(); }
-  void _unlink_bp() { dispatchToConcrete(&Traits::ConcreteTarget::_unlink_bp); }
-  void _xadd(Operand *Dest, Variable *Src, bool Locked) {
-    Context.insert<typename Traits::Insts::Xadd>(Dest, Src, Locked);
-    // The xadd exchanges Dest and Src (modifying Src). Model that update with
-    // a FakeDef followed by a FakeUse.
-    Context.insert<InstFakeDef>(Src, llvm::dyn_cast<Variable>(Dest));
-    _set_dest_redefined();
-    Context.insert<InstFakeUse>(Src);
-  }
-  void _xchg(Operand *Dest, Variable *Src) {
-    Context.insert<typename Traits::Insts::Xchg>(Dest, Src);
-    // The xchg modifies Dest and Src -- model that update with a
-    // FakeDef/FakeUse.
-    Context.insert<InstFakeDef>(Src, llvm::dyn_cast<Variable>(Dest));
-    _set_dest_redefined();
-    Context.insert<InstFakeUse>(Src);
-  }
-  void _xor(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Xor>(Dest, Src0);
-  }
-  void _xorps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Xorps>(Dest, Src0);
-  }
-  void _xor_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::XorRMW>(DestSrc0, Src1);
-  }
-
-  void _iaca_start() {
-    if (!BuildDefs::minimal())
-      Context.insert<typename Traits::Insts::IacaStart>();
-  }
-  void _iaca_end() {
-    if (!BuildDefs::minimal())
-      Context.insert<typename Traits::Insts::IacaEnd>();
-  }
-
-  /// This RAII class wraps IACA markers around the code generated in the
-  /// current scope, so you don't need to emit an end marker before each
-  /// return.
-  class ScopedIacaMark {
-    ScopedIacaMark(const ScopedIacaMark &) = delete;
-    ScopedIacaMark &operator=(const ScopedIacaMark &) = delete;
-
-  public:
-    ScopedIacaMark(TargetX86Base *Lowering) : Lowering(Lowering) {
-      Lowering->_iaca_start();
-    }
-    ~ScopedIacaMark() { end(); }
-    void end() {
-      if (!Lowering)
-        return;
-      Lowering->_iaca_end();
-      Lowering = nullptr;
-    }
-
-  private:
-    TargetX86Base *Lowering;
-  };
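  // Usage sketch: the end marker is emitted automatically on scope exit.
  //
  //   void lowerSomething() {
  //     ScopedIacaMark Mark(this); // emits the IACA start marker
  //     // ... emit instructions ...
  //   } // destructor emits the IACA end marker on every return path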
-
-  bool optimizeScalarMul(Variable *Dest, Operand *Src0, int32_t Src1);
-  void findRMW();
-
-  InstructionSetEnum InstructionSet = Traits::InstructionSet::Begin;
-  bool IsEbpBasedFrame = false;
-  size_t RequiredStackAlignment = sizeof(Traits::WordType);
-  size_t SpillAreaSizeBytes = 0;
-  size_t FixedAllocaSizeBytes = 0;
-  size_t FixedAllocaAlignBytes = 0;
-  bool PrologEmitsFixedAllocas = false;
-  uint32_t MaxOutArgsSizeBytes = 0;
-  static std::array<SmallBitVector, RCX86_NUM> TypeToRegisterSet;
-  static std::array<SmallBitVector, RCX86_NUM> TypeToRegisterSetUnfiltered;
-  static std::array<SmallBitVector, Traits::RegisterSet::Reg_NUM>
-      RegisterAliases;
-  SmallBitVector RegsUsed;
-  std::array<VarList, IceType_NUM> PhysicalRegisters;
-
-private:
-  /// dispatchToConcrete is the template voodoo that allows TargetX86Base to
-  /// invoke methods in Machine (which inherits from TargetX86Base) without
-  /// having to rely on virtual method calls. There are two overloads, one
-  /// for non-void types, and one for void types. We need this because, for
-  /// non-void types, we need to return the method result, whereas for
-  /// void, we don't. While it is true that the code compiles without the
-  /// void "version", there used to be a time when compilers would reject
-  /// such code.
-  ///
-  /// This machinery is far from perfect. Note that, in particular, the
-  /// arguments provided to dispatchToConcrete() need to match the arguments
-  /// for Method **exactly** (i.e., no argument promotion is performed.)
-  template <typename Ret, typename... Args>
-  typename std::enable_if<!std::is_void<Ret>::value, Ret>::type
-  dispatchToConcrete(Ret (ConcreteTarget::*Method)(Args...), Args &&...args) {
-    return (static_cast<ConcreteTarget *>(this)->*Method)(
-        std::forward<Args>(args)...);
-  }
-
-  template <typename... Args>
-  void dispatchToConcrete(void (ConcreteTarget::*Method)(Args...),
-                          Args &&...args) {
-    (static_cast<ConcreteTarget *>(this)->*Method)(std::forward<Args>(args)...);
-  }
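  // The underlying CRTP idea, reduced to a standalone sketch with hypothetical
  // names (not Subzero code):
  //
  //   template <class Derived> struct Base {
  //     void hook() { static_cast<Derived *>(this)->hookImpl(); } // no vtable
  //   };
  //   struct Concrete : Base<Concrete> {
  //     void hookImpl() { /* target-specific work */ }
  //   };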
-
-  void lowerShift64(InstArithmetic::OpKind Op, Operand *Src0Lo, Operand *Src0Hi,
-                    Operand *Src1Lo, Variable *DestLo, Variable *DestHi);
-
-  /// Emit the code for a combined operation and consumer instruction, or set
-  /// the destination variable of the operation if Consumer == nullptr.
-  void lowerIcmpAndConsumer(const InstIcmp *Icmp, const Inst *Consumer);
-  void lowerFcmpAndConsumer(const InstFcmp *Fcmp, const Inst *Consumer);
-  void lowerArithAndConsumer(const InstArithmetic *Arith, const Inst *Consumer);
-
-  /// Emit a setcc instruction if Consumer == nullptr; otherwise emit a
-  /// specialized version of Consumer.
-  void setccOrConsumer(BrCond Condition, Variable *Dest, const Inst *Consumer);
-
-  /// Emit a mov [1|0] instruction if Consumer == nullptr; otherwise emit a
-  /// specialized version of Consumer.
-  void movOrConsumer(bool IcmpResult, Variable *Dest, const Inst *Consumer);
-
-  /// Emit the code for instructions with a vector type.
-  void lowerIcmpVector(const InstIcmp *Icmp);
-  void lowerFcmpVector(const InstFcmp *Icmp);
-  void lowerSelectVector(const InstSelect *Instr);
-
-  /// Helpers for select lowering.
-  void lowerSelectMove(Variable *Dest, BrCond Cond, Operand *SrcT,
-                       Operand *SrcF);
-  void lowerSelectIntMove(Variable *Dest, BrCond Cond, Operand *SrcT,
-                          Operand *SrcF);
-  /// Generic helper to move an arbitrary type from Src to Dest.
-  void lowerMove(Variable *Dest, Operand *Src, bool IsRedefinition);
-
-  /// Optimizations for idiom recognition.
-  bool lowerOptimizeFcmpSelect(const InstFcmp *Fcmp, const InstSelect *Select);
-
-  /// Complains loudly if invoked because the CPU can handle 64-bit types
-  /// natively.
-  template <typename T = Traits>
-  typename std::enable_if<T::Is64Bit, void>::type lowerIcmp64(const InstIcmp *,
-                                                              const Inst *) {
-    llvm::report_fatal_error(
-        "Hey, yo! This is x86-64. Watcha doin'? (lowerIcmp64)");
-  }
-  /// lowerIcmp64 handles 64-bit icmp lowering.
-  template <typename T = Traits>
-  typename std::enable_if<!T::Is64Bit, void>::type
-  lowerIcmp64(const InstIcmp *Icmp, const Inst *Consumer);
-
-  BoolFolding<Traits> FoldingInfo;
-
-  /// Helpers for lowering ShuffleVector
-  /// @{
-  Variable *lowerShuffleVector_AllFromSameSrc(Operand *Src, SizeT Index0,
-                                              SizeT Index1, SizeT Index2,
-                                              SizeT Index3);
-  static constexpr SizeT IGNORE_INDEX = 0x80000000u;
-  Variable *lowerShuffleVector_TwoFromSameSrc(Operand *Src0, SizeT Index0,
-                                              SizeT Index1, Operand *Src1,
-                                              SizeT Index2, SizeT Index3);
-  static constexpr SizeT UNIFIED_INDEX_0 = 0;
-  static constexpr SizeT UNIFIED_INDEX_1 = 2;
-  Variable *lowerShuffleVector_UnifyFromDifferentSrcs(Operand *Src0,
-                                                      SizeT Index0,
-                                                      Operand *Src1,
-                                                      SizeT Index1);
-  static constexpr SizeT CLEAR_ALL_BITS = 0x80;
-  SizeT PshufbMaskCount = 0;
-  GlobalString lowerShuffleVector_NewMaskName();
-  ConstantRelocatable *lowerShuffleVector_CreatePshufbMask(
-      int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
-      int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
-      int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
-      int8_t Idx15);
-  void lowerShuffleVector_UsingPshufb(Variable *Dest, Operand *Src0,
-                                      Operand *Src1, int8_t Idx0, int8_t Idx1,
-                                      int8_t Idx2, int8_t Idx3, int8_t Idx4,
-                                      int8_t Idx5, int8_t Idx6, int8_t Idx7,
-                                      int8_t Idx8, int8_t Idx9, int8_t Idx10,
-                                      int8_t Idx11, int8_t Idx12, int8_t Idx13,
-                                      int8_t Idx14, int8_t Idx15);
-  /// @}
-
-  static constexpr FixupKind PcRelFixup = Traits::FK_PcRel;
-  static constexpr FixupKind AbsFixup = Traits::FK_Abs;
-};
-
-template <typename TraitsType>
-class TargetDataX86 final : public TargetDataLowering {
-  using Traits = TraitsType;
-  TargetDataX86() = delete;
-  TargetDataX86(const TargetDataX86 &) = delete;
-  TargetDataX86 &operator=(const TargetDataX86 &) = delete;
-
-public:
-  ~TargetDataX86() override = default;
-
-  static std::unique_ptr<TargetDataLowering> create(GlobalContext *Ctx) {
-    return makeUnique<TargetDataX86>(Ctx);
-  }
-
-  void lowerGlobals(const VariableDeclarationList &Vars,
-                    const std::string &SectionSuffix) override;
-  void lowerConstants() override;
-  void lowerJumpTables() override;
-
-private:
-  ENABLE_MAKE_UNIQUE;
-
-  explicit TargetDataX86(GlobalContext *Ctx) : TargetDataLowering(Ctx) {}
-  template <typename T> static void emitConstantPool(GlobalContext *Ctx);
-};
-
-class TargetHeaderX86 : public TargetHeaderLowering {
-  TargetHeaderX86() = delete;
-  TargetHeaderX86(const TargetHeaderX86 &) = delete;
-  TargetHeaderX86 &operator=(const TargetHeaderX86 &) = delete;
-
-public:
-  ~TargetHeaderX86() = default;
-
-  static std::unique_ptr<TargetHeaderLowering> create(GlobalContext *Ctx) {
-    return makeUnique<TargetHeaderX86>(Ctx);
-  }
-
-private:
-  ENABLE_MAKE_UNIQUE;
-
-  explicit TargetHeaderX86(GlobalContext *Ctx) : TargetHeaderLowering(Ctx) {}
-};
-
-} // end of namespace X8632
-} // end of namespace Ice
-
-#include "IceTargetLoweringX8632BaseImpl.h"
-
-#endif // SUBZERO_SRC_ICETARGETLOWERINGX8632BASE_H
diff --git a/third_party/subzero/src/IceTargetLoweringX8632BaseImpl.h b/third_party/subzero/src/IceTargetLoweringX8632BaseImpl.h
deleted file mode 100644
index 185c48b..0000000
--- a/third_party/subzero/src/IceTargetLoweringX8632BaseImpl.h
+++ /dev/null
@@ -1,8139 +0,0 @@
-//===- subzero/src/IceTargetLoweringX8632BaseImpl.h - x86 lowering -*- C++ -*-===//
-//
-//                        The Subzero Code Generator
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// \brief Implements the TargetX86Base class, which consists almost
-/// entirely of the lowering sequence for each high-level instruction.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef SUBZERO_SRC_ICETARGETLOWERINGX8632BASEIMPL_H
-#define SUBZERO_SRC_ICETARGETLOWERINGX8632BASEIMPL_H
-
-#include "IceCfg.h"
-#include "IceCfgNode.h"
-#include "IceClFlags.h"
-#include "IceDefs.h"
-#include "IceELFObjectWriter.h"
-#include "IceGlobalInits.h"
-#include "IceInstVarIter.h"
-#include "IceInstX8632Base.h"
-#include "IceLiveness.h"
-#include "IceOperand.h"
-#include "IcePhiLoweringImpl.h"
-#include "IceTargetLoweringX86.h"
-#include "IceUtils.h"
-#include "IceVariableSplitting.h"
-
-#include "llvm/Support/MathExtras.h"
-
-#include <stack>
-
-namespace Ice {
-namespace X8632 {
-
-// The Microsoft x64 ABI requires the caller to allocate a 32-byte
-// "shadow store" (aka "home space") so that the callee may copy the four
-// register args to it.
-constexpr SizeT getShadowStoreSize() {
-#if defined(_WIN64)
-  return 4 * sizeof(int64_t);
-#else
-  return 0;
-#endif
-}
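// Illustrative only: a Win64 call site must reserve the home space even when
// fewer than four register args are passed, e.g.
//
//   const SizeT OutArgsBytes = getShadowStoreSize() + StackArgsBytes;
//
// where StackArgsBytes is a hypothetical count of additional stack-arg bytes.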
-
-using Utils::BoolFlagSaver;
-
-template <typename Traits> class BoolFoldingEntry {
-  BoolFoldingEntry(const BoolFoldingEntry &) = delete;
-
-public:
-  BoolFoldingEntry() = default;
-  explicit BoolFoldingEntry(Inst *I);
-  BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;
-  /// Instr is the instruction producing the i1-type variable of interest.
-  Inst *Instr = nullptr;
-  /// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr).
-  bool IsComplex = false;
-  /// IsLiveOut is initialized conservatively to true, and is set to false when
-  /// we encounter an instruction that ends Var's live range. We disable the
-  /// folding optimization when Var is live beyond this basic block. Note that
-  /// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will
-  /// always be true and the folding optimization will never be performed.
-  bool IsLiveOut = true;
-  /// NumUses counts the number of times Var is used as a source operand in the
-  /// basic block. If IsComplex is true and there is more than one use of Var,
-  /// then the folding optimization is disabled for Var.
-  uint32_t NumUses = 0;
-};
-
-template <typename Traits> class BoolFolding {
-public:
-  enum BoolFoldingProducerKind {
-    PK_None,
-    // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative.
-    PK_Icmp32,
-    PK_Icmp64,
-    PK_Fcmp,
-    PK_Trunc,
-    PK_Arith // A flag-setting arithmetic instruction.
-  };
-
-  /// Currently the actual enum values are not used (other than CK_None), but we
-  /// go ahead and produce them anyway for symmetry with the
-  /// BoolFoldingProducerKind.
-  enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };
-
-private:
-  BoolFolding(const BoolFolding &) = delete;
-  BoolFolding &operator=(const BoolFolding &) = delete;
-
-public:
-  BoolFolding() = default;
-  static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
-  static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
-  static bool hasComplexLowering(const Inst *Instr);
-  static bool isValidFolding(BoolFoldingProducerKind ProducerKind,
-                             BoolFoldingConsumerKind ConsumerKind);
-  void init(CfgNode *Node);
-  const Inst *getProducerFor(const Operand *Opnd) const;
-  void dump(const Cfg *Func) const;
-
-private:
-  /// Returns true if Producers contains a valid entry for the given VarNum.
-  bool containsValid(SizeT VarNum) const {
-    auto Element = Producers.find(VarNum);
-    return Element != Producers.end() && Element->second.Instr != nullptr;
-  }
-  void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
-  void invalidateProducersOnStore(const Inst *Instr);
-  /// Producers maps Variable::Number to a BoolFoldingEntry.
-  CfgUnorderedMap<SizeT, BoolFoldingEntry<Traits>> Producers;
-};
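// The transformation this class enables, sketched on a br consumer
// (illustrative pseudo-code): the boolean temporary disappears and the
// compare's flags feed the branch directly.
//
//   before folding:            after folding:
//     b = icmp eq a, 0           cmp a, 0
//     br b, L1, L2               je L1; jmp L2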
-
-template <typename Traits>
-BoolFoldingEntry<Traits>::BoolFoldingEntry(Inst *I)
-    : Instr(I), IsComplex(BoolFolding<Traits>::hasComplexLowering(I)) {}
-
-template <typename Traits>
-typename BoolFolding<Traits>::BoolFoldingProducerKind
-BoolFolding<Traits>::getProducerKind(const Inst *Instr) {
-  if (llvm::isa<InstIcmp>(Instr)) {
-    if (Traits::Is64Bit || Instr->getSrc(0)->getType() != IceType_i64)
-      return PK_Icmp32;
-    return PK_Icmp64;
-  }
-  if (llvm::isa<InstFcmp>(Instr))
-    return PK_Fcmp;
-  if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
-    if (Traits::Is64Bit || Arith->getSrc(0)->getType() != IceType_i64) {
-      switch (Arith->getOp()) {
-      default:
-        return PK_None;
-      case InstArithmetic::And:
-      case InstArithmetic::Or:
-        return PK_Arith;
-      }
-    }
-  }
-  return PK_None; // TODO(stichnot): remove this
-
-  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
-    switch (Cast->getCastKind()) {
-    default:
-      return PK_None;
-    case InstCast::Trunc:
-      return PK_Trunc;
-    }
-  }
-  return PK_None;
-}
-
-template <typename Traits>
-typename BoolFolding<Traits>::BoolFoldingConsumerKind
-BoolFolding<Traits>::getConsumerKind(const Inst *Instr) {
-  if (llvm::isa<InstBr>(Instr))
-    return CK_Br;
-  if (llvm::isa<InstSelect>(Instr))
-    return CK_Select;
-  return CK_None; // TODO(stichnot): remove this
-
-  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
-    switch (Cast->getCastKind()) {
-    default:
-      return CK_None;
-    case InstCast::Sext:
-      return CK_Sext;
-    case InstCast::Zext:
-      return CK_Zext;
-    }
-  }
-  return CK_None;
-}
-
-/// Returns true if the producing instruction has a "complex" lowering
-/// sequence. This generally means that its lowering sequence requires more
-/// than one conditional branch, namely 64-bit integer compares and some
-/// floating-point compares. When this is true and there is more than one
-/// consumer, we prefer to disable the folding optimization, since folding
-/// would duplicate the multi-branch sequence at each consumer.
-template <typename Traits>
-bool BoolFolding<Traits>::hasComplexLowering(const Inst *Instr) {
-  switch (getProducerKind(Instr)) {
-  default:
-    return false;
-  case PK_Icmp64:
-    return !Traits::Is64Bit;
-  case PK_Fcmp:
-    return Traits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()].C2 !=
-           CondX86::Br_None;
-  }
-}
-
-template <typename Traits>
-bool BoolFolding<Traits>::isValidFolding(
-    typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind,
-    typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind) {
-  switch (ProducerKind) {
-  default:
-    return false;
-  case PK_Icmp32:
-  case PK_Icmp64:
-  case PK_Fcmp:
-    return (ConsumerKind == CK_Br) || (ConsumerKind == CK_Select);
-  case PK_Arith:
-    return ConsumerKind == CK_Br;
-  }
-}
-
-template <typename Traits> void BoolFolding<Traits>::init(CfgNode *Node) {
-  Producers.clear();
-  for (Inst &Instr : Node->getInsts()) {
-    if (Instr.isDeleted())
-      continue;
-    invalidateProducersOnStore(&Instr);
-    // Check whether Instr is a valid producer.
-    Variable *Var = Instr.getDest();
-    if (Var) { // only consider instructions with an actual dest var
-      if (isBooleanType(Var->getType())) {        // only bool-type dest vars
-        if (getProducerKind(&Instr) != PK_None) { // white-listed instructions
-          Producers[Var->getIndex()] = BoolFoldingEntry<Traits>(&Instr);
-        }
-      }
-    }
-    // Check each src variable against the map.
-    FOREACH_VAR_IN_INST(Var, Instr) {
-      SizeT VarNum = Var->getIndex();
-      if (!containsValid(VarNum))
-        continue;
-      // All valid consumers use Var as the first source operand.
-      if (IndexOfVarOperandInInst(Var) != 0) {
-        setInvalid(VarNum);
-        continue;
-      }
-      // Consumer instructions must be white-listed.
-      typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind =
-          getConsumerKind(&Instr);
-      if (ConsumerKind == CK_None) {
-        setInvalid(VarNum);
-        continue;
-      }
-      typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind =
-          getProducerKind(Producers[VarNum].Instr);
-      if (!isValidFolding(ProducerKind, ConsumerKind)) {
-        setInvalid(VarNum);
-        continue;
-      }
-      // Avoid creating multiple copies of complex producer instructions.
-      if (Producers[VarNum].IsComplex && Producers[VarNum].NumUses > 0) {
-        setInvalid(VarNum);
-        continue;
-      }
-      ++Producers[VarNum].NumUses;
-      if (Instr.isLastUse(Var)) {
-        Producers[VarNum].IsLiveOut = false;
-      }
-    }
-  }
-  for (auto &I : Producers) {
-    // Ignore entries previously marked invalid.
-    if (I.second.Instr == nullptr)
-      continue;
-    // Disable the producer if its dest may be live beyond this block.
-    if (I.second.IsLiveOut) {
-      setInvalid(I.first);
-      continue;
-    }
-    // Mark as "dead" rather than outright deleting. This is so that other
-    // peephole style optimizations during or before lowering have access to
-    // this instruction in undeleted form. See for example
-    // tryOptimizedCmpxchgCmpBr().
-    I.second.Instr->setDead();
-  }
-}
-
-template <typename Traits>
-const Inst *BoolFolding<Traits>::getProducerFor(const Operand *Opnd) const {
-  auto *Var = llvm::dyn_cast<const Variable>(Opnd);
-  if (Var == nullptr)
-    return nullptr;
-  SizeT VarNum = Var->getIndex();
-  auto Element = Producers.find(VarNum);
-  if (Element == Producers.end())
-    return nullptr;
-  return Element->second.Instr;
-}
-
-template <typename Traits>
-void BoolFolding<Traits>::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump() || !Func->isVerbose(IceV_Folding))
-    return;
-  OstreamLocker L(Func->getContext());
-  Ostream &Str = Func->getContext()->getStrDump();
-  for (auto &I : Producers) {
-    if (I.second.Instr == nullptr)
-      continue;
-    Str << "Found foldable producer:\n  ";
-    I.second.Instr->dump(Func);
-    Str << "\n";
-  }
-}
-
-/// If the given instruction has potential memory side effects (e.g. store, rmw,
-/// or a call instruction with potential memory side effects), then we must not
-/// allow a pre-store Producer instruction with memory operands to be folded
-/// into a post-store Consumer instruction.  If this is detected, the Producer
-/// is invalidated.
-///
-/// We use the Producer's IsLiveOut field to determine whether any potential
-/// Consumers come after this store instruction.  The IsLiveOut field is
-/// initialized to true, and BoolFolding::init() sets IsLiveOut to false when it
-/// sees the variable's definitive last use (indicating the variable is not in
-/// the node's live-out set).  Thus if we see here that IsLiveOut is false, we
-/// know that there can be no consumers after the store, and therefore we know
-/// the folding is safe despite the store instruction.
-template <typename Traits>
-void BoolFolding<Traits>::invalidateProducersOnStore(const Inst *Instr) {
-  if (!Instr->isMemoryWrite())
-    return;
-  for (auto &ProducerPair : Producers) {
-    if (!ProducerPair.second.IsLiveOut)
-      continue;
-    Inst *PInst = ProducerPair.second.Instr;
-    if (PInst == nullptr)
-      continue;
-    bool HasMemOperand = false;
-    const SizeT SrcSize = PInst->getSrcSize();
-    for (SizeT I = 0; I < SrcSize; ++I) {
-      if (llvm::isa<typename Traits::X86OperandMem>(PInst->getSrc(I))) {
-        HasMemOperand = true;
-        break;
-      }
-    }
-    if (!HasMemOperand)
-      continue;
-    setInvalid(ProducerPair.first);
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::initNodeForLowering(CfgNode *Node) {
-  FoldingInfo.init(Node);
-  FoldingInfo.dump(Func);
-}
-
-template <typename TraitsType>
-TargetX86Base<TraitsType>::TargetX86Base(Cfg *Func) : TargetLowering(Func) {
-  static_assert(
-      (Traits::InstructionSet::End - Traits::InstructionSet::Begin) ==
-          (TargetInstructionSet::X86InstructionSet_End -
-           TargetInstructionSet::X86InstructionSet_Begin),
-      "Traits::InstructionSet range different from TargetInstructionSet");
-  if (getFlags().getTargetInstructionSet() !=
-      TargetInstructionSet::BaseInstructionSet) {
-    InstructionSet = static_cast<InstructionSetEnum>(
-        (getFlags().getTargetInstructionSet() -
-         TargetInstructionSet::X86InstructionSet_Begin) +
-        Traits::InstructionSet::Begin);
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::staticInit(GlobalContext *Ctx) {
-  RegNumT::setLimit(Traits::RegisterSet::Reg_NUM);
-  Traits::initRegisterSet(getFlags(), &TypeToRegisterSet, &RegisterAliases);
-  for (size_t i = 0; i < TypeToRegisterSet.size(); ++i)
-    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
-  filterTypeToRegisterSet(Ctx, Traits::RegisterSet::Reg_NUM,
-                          TypeToRegisterSet.data(), TypeToRegisterSet.size(),
-                          Traits::getRegName, getRegClassName);
-}
-
-template <typename TraitsType>
-bool TargetX86Base<TraitsType>::shouldBePooled(const Constant *C) {
-  if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(C)) {
-    return !Utils::isPositiveZero(ConstFloat->getValue());
-  }
-  if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(C)) {
-    return !Utils::isPositiveZero(ConstDouble->getValue());
-  }
-  return false;
-}
-
-template <typename TraitsType>
-::Ice::Type TargetX86Base<TraitsType>::getPointerType() {
-  return Traits::Is64Bit ? IceType_i64 : IceType_i32;
-}
-
-template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() {
-  TimerMarker T(TimerStack::TT_O2, Func);
-
-  genTargetHelperCalls();
-  Func->dump("After target helper call insertion");
-
-  // Merge Alloca instructions, and lay out the stack.
-  static constexpr bool SortAndCombineAllocas = true;
-  Func->processAllocas(SortAndCombineAllocas);
-  Func->dump("After Alloca processing");
-
-  // Run this early so it can be used to focus optimizations on potentially hot
-  // code.
-  // TODO(stichnot,ascull): currently only used for regalloc, not for the
-  // expensive high-level optimizations, which could also be focused on
-  // potentially hot code.
-  Func->generateLoopInfo();
-  Func->dump("After loop analysis");
-  if (getFlags().getLoopInvariantCodeMotion()) {
-    Func->loopInvariantCodeMotion();
-    Func->dump("After LICM");
-  }
-
-  if (getFlags().getLocalCSE() != Ice::LCSE_Disabled) {
-    Func->localCSE(getFlags().getLocalCSE() == Ice::LCSE_EnabledSSA);
-    Func->dump("After Local CSE");
-    Func->floatConstantCSE();
-  }
-  if (getFlags().getEnableShortCircuit()) {
-    Func->shortCircuitJumps();
-    Func->dump("After Short Circuiting");
-  }
-
-  if (!getFlags().getEnablePhiEdgeSplit()) {
-    // Lower Phi instructions.
-    Func->placePhiLoads();
-    if (Func->hasError())
-      return;
-    Func->placePhiStores();
-    if (Func->hasError())
-      return;
-    Func->deletePhis();
-    if (Func->hasError())
-      return;
-    Func->dump("After Phi lowering");
-  }
-
-  // Address mode optimization.
-  Func->getVMetadata()->init(VMK_SingleDefs);
-  Func->doAddressOpt();
-  Func->materializeVectorShuffles();
-
-  // Find read-modify-write opportunities. Do this after address mode
-  // optimization so that doAddressOpt() doesn't need to be applied to RMW
-  // instructions as well.
-  findRMW();
-  Func->dump("After RMW transform");
-
-  // Argument lowering
-  Func->doArgLowering();
-
-  // Target lowering. This requires liveness analysis for some parts of the
-  // lowering decisions, such as compare/branch fusing. If non-lightweight
-  // liveness analysis is used, the instructions need to be renumbered first.
-  // TODO: This renumbering should only be necessary if we're actually
-  // calculating live intervals, which we only do for register allocation.
-  Func->renumberInstructions();
-  if (Func->hasError())
-    return;
-
-  // TODO: It should be sufficient to use the fastest liveness calculation,
-  // i.e. livenessLightweight(). However, for some reason that slows down the
-  // rest of the translation. Investigate.
-  Func->liveness(Liveness_Basic);
-  if (Func->hasError())
-    return;
-  Func->dump("After x86 address mode opt");
-
-  doLoadOpt();
-
-  Func->genCode();
-  if (Func->hasError())
-    return;
-  Func->dump("After x86 codegen");
-  splitBlockLocalVariables(Func);
-
-  // Register allocation. This requires instruction renumbering and full
-  // liveness analysis. Loops must be identified before liveness so variable
-  // use weights are correct.
-  Func->renumberInstructions();
-  if (Func->hasError())
-    return;
-  Func->liveness(Liveness_Intervals);
-  if (Func->hasError())
-    return;
-  // The post-codegen dump is done here, after liveness analysis and associated
-  // cleanup, to make the dump cleaner and more useful.
-  Func->dump("After initial x86 codegen");
-  // Validate the live range computations. The expensive validation call is
-  // deliberately only made when assertions are enabled.
-  assert(Func->validateLiveness());
-  Func->getVMetadata()->init(VMK_All);
-  regAlloc(RAK_Global);
-  if (Func->hasError())
-    return;
-  Func->dump("After linear scan regalloc");
-
-  if (getFlags().getEnablePhiEdgeSplit()) {
-    Func->advancedPhiLowering();
-    Func->dump("After advanced Phi lowering");
-  }
-
-  // Stack frame mapping.
-  Func->genFrame();
-  if (Func->hasError())
-    return;
-  Func->dump("After stack frame mapping");
-
-  Func->contractEmptyNodes();
-  Func->reorderNodes();
-
-  // Branch optimization.  This needs to be done just before code emission. In
-  // particular, no transformations that insert or reorder CfgNodes should be
-  // done after branch optimization. We go ahead and do it before nop insertion
-  // to reduce the amount of work needed for searching for opportunities.
-  Func->doBranchOpt();
-  Func->dump("After branch optimization");
-}
-
-template <typename TraitsType> void TargetX86Base<TraitsType>::translateOm1() {
-  TimerMarker T(TimerStack::TT_Om1, Func);
-
-  genTargetHelperCalls();
-
-  // Lay out the stack. Ideally Alloca instructions would not be merged at
-  // Om1, but for now they are, as a workaround.
-  // TODO(b/171222930): Fix the Win32 bug triggered when this is false.
-  static constexpr bool SortAndCombineAllocas = true;
-  Func->processAllocas(SortAndCombineAllocas);
-  Func->dump("After Alloca processing");
-
-  Func->placePhiLoads();
-  if (Func->hasError())
-    return;
-  Func->placePhiStores();
-  if (Func->hasError())
-    return;
-  Func->deletePhis();
-  if (Func->hasError())
-    return;
-  Func->dump("After Phi lowering");
-
-  Func->doArgLowering();
-  Func->genCode();
-  if (Func->hasError())
-    return;
-  Func->dump("After initial x86 codegen");
-
-  regAlloc(RAK_InfOnly);
-  if (Func->hasError())
-    return;
-  Func->dump("After regalloc of infinite-weight variables");
-
-  Func->genFrame();
-  if (Func->hasError())
-    return;
-  Func->dump("After stack frame mapping");
-}
-
-inline bool canRMW(const InstArithmetic *Arith) {
-  Type Ty = Arith->getDest()->getType();
-  // X86 vector instructions write to a register and have no RMW option.
-  if (isVectorType(Ty))
-    return false;
-
-  switch (Arith->getOp()) {
-  // Not handled for lack of simple lowering:
-  //   shift on i64
-  //   mul, udiv, urem, sdiv, srem, frem
-  // Not handled for lack of RMW instructions:
-  //   fadd, fsub, fmul, fdiv (also vector types)
-  default:
-    return false;
-  case InstArithmetic::Add:
-  case InstArithmetic::Sub:
-  case InstArithmetic::And:
-  case InstArithmetic::Or:
-  case InstArithmetic::Xor:
-    return true;
-  case InstArithmetic::Shl:
-  case InstArithmetic::Lshr:
-  case InstArithmetic::Ashr:
-    // TODO(stichnot): implement; should then return (Ty != IceType_i64).
-    return false;
-  }
-}
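-
-// Example (illustrative): "a = load addr; b = add a, 1; store b, addr" can be
-// lowered as a single read-modify-write "add [addr], 1", whereas the same
-// sequence with fadd cannot, since x86 has no RMW forms for floating-point or
-// vector operations.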
-
-template <typename TraitsType>
-bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
-  if (A == B)
-    return true;
-  if (auto *MemA =
-          llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>(
-              A)) {
-    if (auto *MemB =
-            llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>(
-                B)) {
-      return MemA->getBase() == MemB->getBase() &&
-             MemA->getOffset() == MemB->getOffset() &&
-             MemA->getIndex() == MemB->getIndex() &&
-             MemA->getShift() == MemB->getShift() &&
-             MemA->getSegmentRegister() == MemB->getSegmentRegister();
-    }
-  }
-  return false;
-}
-
-template <typename TraitsType> void TargetX86Base<TraitsType>::findRMW() {
-  TimerMarker _(TimerStack::TT_findRMW, Func);
-  Func->dump("Before RMW");
-  if (Func->isVerbose(IceV_RMW))
-    Func->getContext()->lockStr();
-  for (CfgNode *Node : Func->getNodes()) {
-    // Walk through the instructions, considering each sequence of 3
-    // instructions, and look for the particular RMW pattern. Note that this
-    // search can be "broken" (false negatives) if there are intervening
-    // deleted instructions, or intervening instructions that could be safely
-    // moved out of the way to reveal an RMW pattern.
-    auto E = Node->getInsts().end();
-    auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
-    for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
-      // Make I3 skip over deleted instructions.
-      while (I3 != E && I3->isDeleted())
-        ++I3;
-      if (I1 == E || I2 == E || I3 == E)
-        continue;
-      assert(!I1->isDeleted());
-      assert(!I2->isDeleted());
-      assert(!I3->isDeleted());
-      auto *Load = llvm::dyn_cast<InstLoad>(I1);
-      auto *Arith = llvm::dyn_cast<InstArithmetic>(I2);
-      auto *Store = llvm::dyn_cast<InstStore>(I3);
-      if (!Load || !Arith || !Store)
-        continue;
-      // Look for:
-      //   a = Load addr
-      //   b = <op> a, other
-      //   Store b, addr
-      // Change to:
-      //   a = Load addr
-      //   b = <op> a, other
-      //   x = FakeDef
-      //   RMW <op>, addr, other, x
-      //   b = Store b, addr, x
-      // Note that inferTwoAddress() makes sure setDestRedefined() gets called
-      // on the updated Store instruction, to avoid liveness problems later.
-      //
-      // With this transformation, the Store instruction acquires a Dest
-      // variable and is now subject to dead code elimination if there are no
-      // more uses of "b".  Variable "x" is a beacon for determining whether the
-      // Store instruction gets dead-code eliminated.  If the Store instruction
-      // is eliminated, then it must be the case that the RMW instruction ends
-      // x's live range, and therefore the RMW instruction will be retained and
-      // later lowered.  On the other hand, if the RMW instruction does not end
-      // x's live range, then the Store instruction must still be present, and
-      // therefore the RMW instruction is ignored during lowering because it is
-      // redundant with the Store instruction.
-      //
-      // Note that if "a" has further uses, the RMW transformation may still
-      // trigger, resulting in two loads and one store, which is worse than the
-      // original one load and one store.  However, this is probably rare, and
-      // caching probably keeps it just as fast.
-      if (!isSameMemAddressOperand<TraitsType>(Load->getLoadAddress(),
-                                               Store->getStoreAddress()))
-        continue;
-      Operand *ArithSrcFromLoad = Arith->getSrc(0);
-      Operand *ArithSrcOther = Arith->getSrc(1);
-      if (ArithSrcFromLoad != Load->getDest()) {
-        if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
-          continue;
-        std::swap(ArithSrcFromLoad, ArithSrcOther);
-      }
-      if (Arith->getDest() != Store->getData())
-        continue;
-      if (!canRMW(Arith))
-        continue;
-      if (Func->isVerbose(IceV_RMW)) {
-        Ostream &Str = Func->getContext()->getStrDump();
-        Str << "Found RMW in " << Func->getFunctionName() << ":\n  ";
-        Load->dump(Func);
-        Str << "\n  ";
-        Arith->dump(Func);
-        Str << "\n  ";
-        Store->dump(Func);
-        Str << "\n";
-      }
-      Variable *Beacon = Func->makeVariable(IceType_i32);
-      Beacon->setMustNotHaveReg();
-      Store->setRmwBeacon(Beacon);
-      auto *BeaconDef = InstFakeDef::create(Func, Beacon);
-      Node->getInsts().insert(I3, BeaconDef);
-      auto *RMW =
-          InstX86FakeRMW::create(Func, ArithSrcOther, Store->getStoreAddress(),
-                                 Beacon, Arith->getOp());
-      Node->getInsts().insert(I3, RMW);
-    }
-  }
-  if (Func->isVerbose(IceV_RMW))
-    Func->getContext()->unlockStr();
-}
-
-// Converts a ConstantInteger32 operand into its constant value, or
-// MemoryOrderInvalid if the operand is not a ConstantInteger32.
-inline uint64_t getConstantMemoryOrder(Operand *Opnd) {
-  if (auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
-    return Integer->getValue();
-  return Intrinsics::MemoryOrderInvalid;
-}
-
-/// Determines whether the dest of a Load instruction can be folded into one of
-/// the src operands of a 2-operand instruction. This is true as long as the
-/// load dest matches exactly one of the binary instruction's src operands.
-/// Replaces Src0 or Src1 with LoadSrc if the answer is true.
-inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
-                                      Operand *&Src0, Operand *&Src1) {
-  if (Src0 == LoadDest && Src1 != LoadDest) {
-    Src0 = LoadSrc;
-    return true;
-  }
-  if (Src0 != LoadDest && Src1 == LoadDest) {
-    Src1 = LoadSrc;
-    return true;
-  }
-  return false;
-}
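-
-// Hypothetical example of the fold this enables (names are illustrative):
-//   a = load addr
-//   b = add a, c        ; "a" matches exactly one src and dies here
-// becomes, in doLoadOpt() below:
-//   b = add [addr], c
-// Note that "b = add a, a" is rejected, since both srcs match and at most one
-// memory operand is allowed.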
-
-template <typename TraitsType> void TargetX86Base<TraitsType>::doLoadOpt() {
-  TimerMarker _(TimerStack::TT_loadOpt, Func);
-  for (CfgNode *Node : Func->getNodes()) {
-    Context.init(Node);
-    while (!Context.atEnd()) {
-      Variable *LoadDest = nullptr;
-      Operand *LoadSrc = nullptr;
-      Inst *CurInst = iteratorToInst(Context.getCur());
-      Inst *Next = Context.getNextInst();
-      // Determine whether the current instruction is a Load instruction or
-      // equivalent.
-      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
-        // An InstLoad qualifies unless it uses a 64-bit absolute address,
-        // which requires legalization to insert a copy to register.
-        // TODO(b/148272103): Fold these after legalization.
-        if (!Traits::Is64Bit || !llvm::isa<Constant>(Load->getLoadAddress())) {
-          LoadDest = Load->getDest();
-          constexpr bool DoLegalize = false;
-          LoadSrc = formMemoryOperand(Load->getLoadAddress(),
-                                      LoadDest->getType(), DoLegalize);
-        }
-      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsic>(CurInst)) {
-        // An AtomicLoad intrinsic qualifies as long as it has a valid memory
-        // ordering, and can be implemented in a single instruction (i.e., not
-        // i64 on x86-32).
-        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicID();
-        if (ID == Intrinsics::AtomicLoad &&
-            (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) &&
-            Intrinsics::isMemoryOrderValid(
-                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
-          LoadDest = Intrin->getDest();
-          constexpr bool DoLegalize = false;
-          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
-                                      DoLegalize);
-        }
-      }
-      // A Load instruction can be folded into the following instruction only
-      // if the following instruction ends the Load's Dest variable's live
-      // range.
-      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
-        assert(LoadSrc);
-        Inst *NewInst = nullptr;
-        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
-          Operand *Src0 = Arith->getSrc(0);
-          Operand *Src1 = Arith->getSrc(1);
-          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
-            NewInst = InstArithmetic::create(Func, Arith->getOp(),
-                                             Arith->getDest(), Src0, Src1);
-          }
-        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
-          Operand *Src0 = Icmp->getSrc(0);
-          Operand *Src1 = Icmp->getSrc(1);
-          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
-            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
-                                       Icmp->getDest(), Src0, Src1);
-          }
-        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
-          Operand *Src0 = Fcmp->getSrc(0);
-          Operand *Src1 = Fcmp->getSrc(1);
-          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
-            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
-                                       Fcmp->getDest(), Src0, Src1);
-          }
-        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
-          Operand *Src0 = Select->getTrueOperand();
-          Operand *Src1 = Select->getFalseOperand();
-          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
-            NewInst = InstSelect::create(Func, Select->getDest(),
-                                         Select->getCondition(), Src0, Src1);
-          }
-        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
-          // The load dest can always be folded into a Cast instruction.
-          auto *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
-          if (Src0 == LoadDest) {
-            NewInst = InstCast::create(Func, Cast->getCastKind(),
-                                       Cast->getDest(), LoadSrc);
-          }
-        }
-        if (NewInst) {
-          CurInst->setDeleted();
-          Next->setDeleted();
-          Context.insert(NewInst);
-          // Update NewInst->LiveRangesEnded so that target lowering may
-          // benefit. Also update NewInst->HasSideEffects.
-          NewInst->spliceLivenessInfo(Next, CurInst);
-        }
-      }
-      Context.advanceCur();
-      Context.advanceNext();
-    }
-  }
-  Func->dump("After load optimization");
-}
-
-template <typename TraitsType>
-bool TargetX86Base<TraitsType>::doBranchOpt(Inst *I, const CfgNode *NextNode) {
-  if (auto *Br = llvm::dyn_cast<InstX86Br>(I)) {
-    return Br->optimizeBranch(NextNode);
-  }
-  return false;
-}
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::getPhysicalRegister(RegNumT RegNum,
-                                                         Type Ty) {
-  if (Ty == IceType_void)
-    Ty = IceType_i32;
-  if (PhysicalRegisters[Ty].empty())
-    PhysicalRegisters[Ty].resize(Traits::RegisterSet::Reg_NUM);
-  assert(unsigned(RegNum) < PhysicalRegisters[Ty].size());
-  Variable *Reg = PhysicalRegisters[Ty][RegNum];
-  if (Reg == nullptr) {
-    Reg = Func->makeVariable(Ty);
-    Reg->setRegNum(RegNum);
-    PhysicalRegisters[Ty][RegNum] = Reg;
-    // Specially mark a named physical register as an "argument" so that it is
-    // considered live upon function entry.  Otherwise it's possible to get
-    // liveness validation errors for saving callee-save registers.
-    Func->addImplicitArg(Reg);
-    // Don't bother tracking the live range of a named physical register.
-    Reg->setIgnoreLiveness();
-  }
-  assert(Traits::getGprForType(Ty, RegNum) == RegNum);
-  return Reg;
-}
-
-template <typename TraitsType>
-const char *TargetX86Base<TraitsType>::getRegName(RegNumT RegNum,
-                                                  Type Ty) const {
-  return Traits::getRegName(Traits::getGprForType(Ty, RegNum));
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::emitVariable(const Variable *Var) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  if (Var->hasReg()) {
-    Str << "%" << getRegName(Var->getRegNum(), Var->getType());
-    return;
-  }
-  if (Var->mustHaveReg()) {
-    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
-                             ") has no register assigned - function " +
-                             Func->getFunctionName());
-  }
-  const int32_t Offset = Var->getStackOffset();
-  auto BaseRegNum = Var->getBaseRegNum();
-  if (BaseRegNum.hasNoValue())
-    BaseRegNum = getFrameOrStackReg();
-
-  // Print in the form "Offset(%reg)", omitting Offset when it is 0.
-  if (getFlags().getDecorateAsm()) {
-    Str << Var->getSymbolicStackOffset();
-  } else if (Offset != 0) {
-    Str << Offset;
-  }
-  const Type FrameSPTy = Traits::WordType;
-  Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
-}
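-
-// Example output (illustrative): a stack variable at offset 16 on an
-// ebp-based x86-32 frame is emitted as "16(%ebp)"; at offset 0 the offset is
-// omitted, giving "(%ebp)".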
-
-template <typename TraitsType>
-typename TargetX86Base<TraitsType>::X86Address
-TargetX86Base<TraitsType>::stackVarToAsmOperand(const Variable *Var) const {
-  if (Var->hasReg())
-    llvm::report_fatal_error("Stack Variable has a register assigned");
-  if (Var->mustHaveReg()) {
-    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
-                             ") has no register assigned - function " +
-                             Func->getFunctionName());
-  }
-  int32_t Offset = Var->getStackOffset();
-  auto BaseRegNum = Var->getBaseRegNum();
-  if (BaseRegNum.hasNoValue()) {
-    // If the stack pointer needs alignment, we must use the frame pointer for
-    // arguments. For locals, getFrameOrStackReg will return the stack pointer
-    // in this case.
-    if (needsStackPointerAlignment() && Var->getIsArg()) {
-      assert(hasFramePointer());
-      BaseRegNum = getFrameReg();
-    } else {
-      BaseRegNum = getFrameOrStackReg();
-    }
-  }
-  return X86Address(Traits::getEncodedGPR(BaseRegNum), Offset,
-                    AssemblerFixup::NoFixup);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
-  // Stack frame layout:
-  //
-  // +------------------------+  ^ +
-  // | 1. return address      |  |
-  // +------------------------+  v -
-  // | 2. preserved registers |
-  // +------------------------+ <--- BasePointer (if used)
-  // | 3. padding             |
-  // +------------------------+
-  // | 4. global spill area   |
-  // +------------------------+
-  // | 5. padding             |
-  // +------------------------+
-  // | 6. local spill area    |
-  // +------------------------+
-  // | 7. padding             |
-  // +------------------------+
-  // | 7.5 shadow (WinX64)    |
-  // +------------------------+
-  // | 8. allocas             |
-  // +------------------------+
-  // | 9. padding             |
-  // +------------------------+
-  // | 10. out args           |
-  // +------------------------+ <--- StackPointer
-  //
-  // The following variables record the size in bytes of the given areas:
-  //  * X86_RET_IP_SIZE_BYTES:   area 1
-  //  * PreservedRegsSizeBytes:  area 2
-  //  * SpillAreaPaddingBytes:   area 3
-  //  * GlobalsSize:             area 4
-  //  * LocalsSlotsPaddingBytes: area 5
-  //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
-  //  * LocalsSpillAreaSize:     area 6
-  //  * ShadowStoreSize:         area 7.5 (Win64 only)
-  //  * FixedAllocaSizeBytes:    areas 7 - 8
-  //  * SpillAreaSizeBytes:      areas 3 - 10
-  //  * maxOutArgsSizeBytes():   areas 9 - 10
-
-  // Determine stack frame offsets for each Variable without a register
-  // assignment. This can be done as one variable per stack slot. Or, do
-  // coalescing by running the register allocator again with an infinite set of
-  // registers (as a side effect, this gives variables a second chance at
-  // physical register assignment).
-  //
-  // A middle ground approach is to leverage sparsity and allocate one block of
-  // space on the frame for globals (variables with multi-block lifetime), and
-  // one block to share for locals (single-block lifetime).
-
-  const SizeT ShadowStoreSize = getShadowStoreSize();
-
-  // StackPointer: points just past return address of calling function
-
-  Context.init(Node);
-  Context.setInsertPoint(Context.getCur());
-
-  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
-  RegsUsed = SmallBitVector(CalleeSaves.size());
-  VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
-  size_t GlobalsSize = 0;
-  // If there is a separate locals area, this represents that area. Otherwise
-  // it counts any variable not counted by GlobalsSize.
-  SpillAreaSizeBytes = 0;
-  // If there is a separate locals area, this specifies the alignment for it.
-  uint32_t LocalsSlotsAlignmentBytes = 0;
-  // The entire spill locations area gets aligned to largest natural alignment
-  // of the variables that have a spill slot.
-  uint32_t SpillAreaAlignmentBytes = 0;
-  // A spill slot linked to a variable with a stack slot should reuse that
-  // stack slot.
-  std::function<bool(Variable *)> TargetVarHook =
-      [&VariablesLinkedToSpillSlots](Variable *Var) {
-        // TODO(stichnot): Refactor this into the base class.
-        Variable *Root = Var->getLinkedToStackRoot();
-        if (Root != nullptr) {
-          assert(!Root->hasReg());
-          if (!Root->hasReg()) {
-            VariablesLinkedToSpillSlots.push_back(Var);
-            return true;
-          }
-        }
-        return false;
-      };
-
-  // Compute the list of spilled variables and bounds for GlobalsSize, etc.
-  getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
-                        &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
-                        &LocalsSlotsAlignmentBytes, TargetVarHook);
-  uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
-  SpillAreaSizeBytes += GlobalsSize;
-
-  // Add push instructions for preserved registers.
-  uint32_t NumCallee = 0;
-  size_t PreservedRegsSizeBytes = 0;
-  SmallBitVector Pushed(CalleeSaves.size());
-  for (RegNumT i : RegNumBVIter(CalleeSaves)) {
-    const auto Canonical = Traits::getBaseReg(i);
-    assert(Canonical == Traits::getBaseReg(Canonical));
-    if (RegsUsed[i]) {
-      Pushed[Canonical] = true;
-    }
-  }
-  for (RegNumT RegNum : RegNumBVIter(Pushed)) {
-    assert(RegNum == Traits::getBaseReg(RegNum));
-    ++NumCallee;
-    if (Traits::isXmm(RegNum)) {
-      PreservedRegsSizeBytes += 16;
-    } else {
-      PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
-    }
-    _push_reg(RegNum);
-  }
-  Ctx->statsUpdateRegistersSaved(NumCallee);
-
-  // StackPointer: points past preserved registers at start of spill area
-
-  // Generate "push frameptr; mov frameptr, stackptr"
-  if (IsEbpBasedFrame) {
-    assert(
-        (RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None)).count() ==
-        0);
-    PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
-    _link_bp();
-  }
-
-  // Align the variables area. SpillAreaPaddingBytes is the size of the region
-  // after the preserved registers and before the spill areas.
-  // LocalsSlotsPaddingBytes is the amount of padding between the globals and
-  // locals area if they are separate.
-  assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
-  uint32_t SpillAreaPaddingBytes = 0;
-  uint32_t LocalsSlotsPaddingBytes = 0;
-  alignStackSpillAreas(Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
-                       SpillAreaAlignmentBytes, GlobalsSize,
-                       LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
-                       &LocalsSlotsPaddingBytes);
-  SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
-  uint32_t GlobalsAndSubsequentPaddingSize =
-      GlobalsSize + LocalsSlotsPaddingBytes;
-
-  // Functions returning scalar floating point types may need to convert values
-  // from an in-register xmm value to the top of the x87 floating point stack.
-  // This is done by a movp[sd] and an fld[sd].  Ensure there is enough scratch
-  // space on the stack for this.
-  const Type ReturnType = Func->getReturnType();
-  if (isScalarFloatingType(ReturnType)) {
-    // Avoid misaligned double-precision load/store.
-    RequiredStackAlignment = std::max<size_t>(
-        RequiredStackAlignment, Traits::X86_STACK_ALIGNMENT_BYTES);
-    SpillAreaSizeBytes =
-        std::max(typeWidthInBytesOnStack(ReturnType), SpillAreaSizeBytes);
-  }
-
-  RequiredStackAlignment =
-      std::max<size_t>(RequiredStackAlignment, SpillAreaAlignmentBytes);
-
-  if (PrologEmitsFixedAllocas) {
-    RequiredStackAlignment =
-        std::max(RequiredStackAlignment, FixedAllocaAlignBytes);
-  }
-
-  // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
-  // fixed allocations in the prolog.
-  if (PrologEmitsFixedAllocas)
-    SpillAreaSizeBytes += FixedAllocaSizeBytes;
-
-  // Win64 ABI: add space for shadow store (aka home space)
-  SpillAreaSizeBytes += ShadowStoreSize;
-
-  // Entering the function has made the stack pointer unaligned. Re-align it by
-  // adjusting the stack size.
-  // Note that StackOffset does not include the spill area. It's the offset
-  // from the base stack pointer (ebp), whether we set it or not, to the first
-  // stack arg (if any). StackSize, on the other hand, does include the spill
-  // area.
-  const uint32_t StackOffset =
-      ShadowStoreSize + Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
-  uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
-                                             RequiredStackAlignment);
-  StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
-                                    RequiredStackAlignment);
-  SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any
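-
-  // Worked example with hypothetical numbers: ShadowStoreSize = 0,
-  // X86_RET_IP_SIZE_BYTES = 4, PreservedRegsSizeBytes = 8, a 20-byte spill
-  // area, 8 bytes of out args, and RequiredStackAlignment = 16:
-  //   StackOffset = 0 + 4 + 8           = 12
-  //   StackSize   = align(12 + 20, 16)  = 32
-  //   StackSize   = align(32 + 8, 16)   = 48
-  //   SpillAreaSizeBytes = 48 - 12 = 36 (20 spill + 8 out args + 8 padding)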
-
-  if (SpillAreaSizeBytes) {
-    auto *Func = Node->getCfg();
-    if (SpillAreaSizeBytes > Func->getStackSizeLimit()) {
-      Func->setError("Stack size limit exceeded");
-    }
-
-    emitStackProbe(SpillAreaSizeBytes);
-
-    // Generate "sub stackptr, SpillAreaSizeBytes"
-    _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
-  }
-
-  // StackPointer: points just past the spill area (end of stack frame)
-
-  // If the required alignment is greater than the stack pointer's guaranteed
-  // alignment, align the stack pointer accordingly.
-  if (RequiredStackAlignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
-    assert(IsEbpBasedFrame);
-    _and(getPhysicalRegister(getStackReg(), Traits::WordType),
-         Ctx->getConstantInt32(-RequiredStackAlignment));
-  }
-
-  // StackPointer: may have just been offset for alignment
-
-  // Account for known-frame-offset alloca instructions that were not already
-  // combined into the prolog.
-  if (!PrologEmitsFixedAllocas)
-    SpillAreaSizeBytes += FixedAllocaSizeBytes;
-
-  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
-
-  // Fill in stack offsets for stack args, and copy args into registers for
-  // those that were register-allocated. Args are pushed right to left, so
-  // Arg[0] is closest to the stack/frame pointer.
-  RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
-  Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, Traits::WordType);
-  size_t BasicFrameOffset = StackOffset;
-  if (!IsEbpBasedFrame)
-    BasicFrameOffset += SpillAreaSizeBytes;
-
-  const VarList &Args = Func->getArgs();
-  size_t InArgsSizeBytes = 0;
-  unsigned NumXmmArgs = 0;
-  unsigned NumGPRArgs = 0;
-  for (SizeT i = 0, NumArgs = Args.size(); i < NumArgs; ++i) {
-    Variable *Arg = Args[i];
-    // Skip arguments passed in registers.
-    if (isVectorType(Arg->getType())) {
-      if (Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
-              .hasValue()) {
-        ++NumXmmArgs;
-        continue;
-      }
-    } else if (!isScalarFloatingType(Arg->getType())) {
-      assert(isScalarIntegerType(Arg->getType()));
-      if (Traits::getRegisterForGprArgNum(Traits::WordType,
-                                          Traits::getArgIndex(i, NumGPRArgs))
-              .hasValue()) {
-        ++NumGPRArgs;
-        continue;
-      }
-    }
-    // For esp-based frames where the allocas are done outside the prolog, the
-    // esp value may not stabilize to its home value until after all the
-    // fixed-size alloca instructions have executed.  In this case, a stack
-    // adjustment is needed when accessing in-args in order to copy them into
-    // registers.
-    size_t StackAdjBytes = 0;
-    if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas)
-      StackAdjBytes -= FixedAllocaSizeBytes;
-    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
-                           InArgsSizeBytes);
-  }
-
-  // Fill in stack offsets for locals.
-  assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
-                      SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
-                      IsEbpBasedFrame && !needsStackPointerAlignment());
-  // Assign stack offsets to variables that have been linked to spilled
-  // variables.
-  for (Variable *Var : VariablesLinkedToSpillSlots) {
-    const Variable *Root = Var->getLinkedToStackRoot();
-    assert(Root != nullptr);
-    Var->setStackOffset(Root->getStackOffset());
-
-    // If the stack root variable is an arg, make this variable an arg too so
-    // that stackVarToAsmOperand uses the correct base pointer (e.g. ebp on
-    // x86).
-    Var->setIsArg(Root->getIsArg());
-  }
-  this->HasComputedFrame = true;
-
-  if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
-    OstreamLocker L(Func->getContext());
-    Ostream &Str = Func->getContext()->getStrDump();
-
-    Str << "Stack layout:\n";
-    uint32_t EspAdjustmentPaddingSize =
-        SpillAreaSizeBytes - LocalsSpillAreaSize -
-        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
-        maxOutArgsSizeBytes();
-    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
-        << " return address = " << Traits::X86_RET_IP_SIZE_BYTES << " bytes\n"
-        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
-        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
-        << " globals spill area = " << GlobalsSize << " bytes\n"
-        << " globals-locals spill areas intermediate padding = "
-        << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
-        << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
-        << " esp alignment padding = " << EspAdjustmentPaddingSize
-        << " bytes\n";
-
-    Str << "Stack details:\n"
-        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
-        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
-        << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n"
-        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
-        << " bytes\n"
-        << " is ebp based = " << IsEbpBasedFrame << "\n";
-  }
-}
-
-/// Helper function for addProlog().
-///
-/// This assumes Arg is an argument passed on the stack. This sets the frame
-/// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
-/// I64 arg that has been split into Lo and Hi components, it calls itself
-/// recursively on the components, taking care to handle Lo first because of the
-/// little-endian architecture. Lastly, this function generates an instruction
-/// to copy Arg into its assigned register if applicable.
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::finishArgumentLowering(
-    Variable *Arg, Variable *FramePtr, size_t BasicFrameOffset,
-    size_t StackAdjBytes, size_t &InArgsSizeBytes) {
-  if (!Traits::Is64Bit) {
-    if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
-      Variable *Lo = Arg64On32->getLo();
-      Variable *Hi = Arg64On32->getHi();
-      finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, StackAdjBytes,
-                             InArgsSizeBytes);
-      finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, StackAdjBytes,
-                             InArgsSizeBytes);
-      return;
-    }
-  }
-  Type Ty = Arg->getType();
-  if (isVectorType(Ty)) {
-    InArgsSizeBytes = Traits::applyStackAlignment(InArgsSizeBytes);
-  }
-  Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
-  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
-  if (Arg->hasReg()) {
-    assert(Ty != IceType_i64 || Traits::Is64Bit);
-    auto *Mem = X86OperandMem::create(
-        Func, Ty, FramePtr,
-        Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes));
-    if (isVectorType(Arg->getType())) {
-      _movp(Arg, Mem);
-    } else {
-      _mov(Arg, Mem);
-    }
-    // This argument-copying instruction uses an explicit X86OperandMem
-    // operand instead of a Variable, so its fill-from-stack operation has to
-    // be tracked separately for statistics.
-    Ctx->statsUpdateFills();
-  }
-}
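-
-// Illustrative sketch (hypothetical offsets, x86-32): an i64 argument passed
-// on the stack is split into Lo and Hi components, handled in that order
-// because of little-endianness:
-//   Lo: stack offset +8,  InArgsSizeBytes 8 -> 12
-//   Hi: stack offset +12, InArgsSizeBytes 12 -> 16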
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::addEpilog(CfgNode *Node) {
-  InstList &Insts = Node->getInsts();
-  InstList::reverse_iterator RI, E;
-  for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
-    if (llvm::isa<typename Traits::Insts::Ret>(*RI))
-      break;
-  }
-  if (RI == E)
-    return;
-
-  // Convert the reverse_iterator position into its corresponding (forward)
-  // iterator position.
-  InstList::iterator InsertPoint = reverseToForwardIterator(RI);
-  --InsertPoint;
-  Context.init(Node);
-  Context.setInsertPoint(InsertPoint);
-
-  if (IsEbpBasedFrame) {
-    _unlink_bp();
-  } else {
-    // add stackptr, SpillAreaSizeBytes
-    if (SpillAreaSizeBytes != 0) {
-      _add_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
-    }
-  }
-
-  // Add pop instructions for preserved registers.
-  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
-  SmallBitVector Popped(CalleeSaves.size());
-  for (int32_t i = CalleeSaves.size() - 1; i >= 0; --i) {
-    const auto RegNum = RegNumT::fromInt(i);
-    if (RegNum == getFrameReg() && IsEbpBasedFrame)
-      continue;
-    const RegNumT Canonical = Traits::getBaseReg(RegNum);
-    if (CalleeSaves[i] && RegsUsed[i]) {
-      Popped[Canonical] = true;
-    }
-  }
-  for (int32_t i = Popped.size() - 1; i >= 0; --i) {
-    if (!Popped[i])
-      continue;
-    const auto RegNum = RegNumT::fromInt(i);
-    assert(RegNum == Traits::getBaseReg(RegNum));
-    _pop_reg(RegNum);
-  }
-}
-
-template <typename TraitsType> Type TargetX86Base<TraitsType>::stackSlotType() {
-  return Traits::WordType;
-}
-
-template <typename TraitsType>
-template <typename T>
-typename std::enable_if<!T::Is64Bit, Operand>::type *
-TargetX86Base<TraitsType>::loOperand(Operand *Operand) {
-  assert(Operand->getType() == IceType_i64 ||
-         Operand->getType() == IceType_f64);
-  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
-    return Operand;
-  if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
-    return Var64On32->getLo();
-  if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
-    auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
-        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue())));
-    // Check if we need to blind/pool the constant.
-    return legalize(ConstInt);
-  }
-  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
-    auto *MemOperand = X86OperandMem::create(
-        Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getIndex(),
-        Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
-    // Test whether the offset should be randomized or pooled; if so, create
-    // the mem operand with the blinded/pooled constant. Otherwise, return it
-    // as an ordinary mem operand.
-    return legalize(MemOperand);
-  }
-  llvm_unreachable("Unsupported operand type");
-  return nullptr;
-}
-
-template <typename TraitsType>
-template <typename T>
-typename std::enable_if<!T::Is64Bit, Operand>::type *
-TargetX86Base<TraitsType>::hiOperand(Operand *Operand) {
-  assert(Operand->getType() == IceType_i64 ||
-         Operand->getType() == IceType_f64);
-  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
-    return Operand;
-  if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
-    return Var64On32->getHi();
-  if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
-    auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
-        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue() >> 32)));
-    // Check if we need to blind/pool the constant.
-    return legalize(ConstInt);
-  }
-  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
-    Constant *Offset = Mem->getOffset();
-    if (Offset == nullptr) {
-      Offset = Ctx->getConstantInt32(4);
-    } else if (auto *IntOffset = llvm::dyn_cast<ConstantInteger32>(Offset)) {
-      Offset = Ctx->getConstantInt32(4 + IntOffset->getValue());
-    } else if (auto *SymOffset = llvm::dyn_cast<ConstantRelocatable>(Offset)) {
-      assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4));
-      Offset =
-          Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName());
-    }
-    auto *MemOperand = X86OperandMem::create(
-        Func, IceType_i32, Mem->getBase(), Offset, Mem->getIndex(),
-        Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
-    // Test if the Offset is an eligible i32 constant for randomization or
-    // pooling; blind/pool it if so. Otherwise return an ordinary mem operand.
-    return legalize(MemOperand);
-  }
-  llvm_unreachable("Unsupported operand type");
-  return nullptr;
-}
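-
-// Example (illustrative): for an i64 value at 8(%ebp), loOperand() above
-// yields the i32 operand 8(%ebp) and hiOperand() yields 12(%ebp), i.e. the
-// same address with the offset bumped by 4 for the upper half.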
-
-template <typename TraitsType>
-SmallBitVector
-TargetX86Base<TraitsType>::getRegisterSet(RegSetMask Include,
-                                          RegSetMask Exclude) const {
-  return Traits::getRegisterSet(getFlags(), Include, Exclude);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerAlloca(const InstAlloca *Instr) {
-  // Conservatively require the stack to be aligned. Some stack adjustment
-  // operations implemented below assume that the stack is aligned before the
-  // alloca. All the alloca code ensures that the stack alignment is preserved
-  // after the alloca. The stack alignment restriction can be relaxed in some
-  // cases.
-  RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
-                                            Traits::X86_STACK_ALIGNMENT_BYTES);
-
-  // For default align=0, set it to the real value 1, to avoid any
-  // bit-manipulation problems below.
-  const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes());
-
-  // LLVM enforces power of 2 alignment.
-  assert(llvm::isPowerOf2_32(AlignmentParam));
-  assert(llvm::isPowerOf2_32(Traits::X86_STACK_ALIGNMENT_BYTES));
-
-  const uint32_t Alignment =
-      std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES);
-  const bool OverAligned = Alignment > Traits::X86_STACK_ALIGNMENT_BYTES;
-  const bool OptM1 = Func->getOptLevel() == Opt_m1;
-  const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset();
-  const bool UseFramePointer =
-      hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;
-
-  if (UseFramePointer)
-    setHasFramePointer();
-
-  Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
-  if (OverAligned) {
-    _and(esp, Ctx->getConstantInt32(-Alignment));
-  }
-
-  Variable *Dest = Instr->getDest();
-  Operand *TotalSize = legalize(Instr->getSizeInBytes());
-
-  if (const auto *ConstantTotalSize =
-          llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
-    const uint32_t Value =
-        Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
-    if (UseFramePointer) {
-      _sub_sp(Ctx->getConstantInt32(Value));
-    } else {
-      // If we don't need a Frame Pointer, this alloca has a known offset to
-      // the stack pointer. We don't need to adjust the stack pointer, nor
-      // assign any value to Dest, as Dest is rematerializable.
-      assert(Dest->isRematerializable());
-      FixedAllocaSizeBytes += Value;
-      Context.insert<InstFakeDef>(Dest);
-    }
-  } else {
-    // Non-constant sizes need to be adjusted to the next highest multiple of
-    // the required alignment at runtime.
-    Variable *T = nullptr;
-    if (Traits::Is64Bit && TotalSize->getType() != IceType_i64) {
-      T = makeReg(IceType_i64);
-      _movzx(T, TotalSize);
-    } else {
-      T = makeReg(IceType_i32);
-      _mov(T, TotalSize);
-    }
-    _add(T, Ctx->getConstantInt32(Alignment - 1));
-    _and(T, Ctx->getConstantInt32(-Alignment));
-    _sub_sp(T);
-  }
-  // Add enough to the returned address to account for the out args area.
-  uint32_t OutArgsSize = maxOutArgsSizeBytes();
-  if (OutArgsSize > 0) {
-    Variable *T = makeReg(Dest->getType());
-    auto *CalculateOperand = X86OperandMem::create(
-        Func, IceType_void, esp, Ctx->getConstantInt(IceType_i32, OutArgsSize));
-    _lea(T, CalculateOperand);
-    _mov(Dest, T);
-  } else {
-    _mov(Dest, esp);
-  }
-}
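-
-// Illustrative sketch (assumed values): "alloca 20 bytes, align 32" with a
-// 16-byte stack alignment and a 48-byte out args area over-aligns the stack,
-// reserves align(20, 32) = 32 bytes, and returns an address past the out
-// args area, roughly:
-//   and  esp, -32
-//   sub  esp, 32
-//   lea  t, [esp + 48]
-//   mov  dest, t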
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerArguments() {
-  const bool OptM1 = Func->getOptLevel() == Opt_m1;
-  VarList &Args = Func->getArgs();
-  unsigned NumXmmArgs = 0;
-  bool XmmSlotsRemain = true;
-  unsigned NumGprArgs = 0;
-  bool GprSlotsRemain = true;
-
-  Context.init(Func->getEntryNode());
-  Context.setInsertPoint(Context.getCur());
-
-  for (SizeT i = 0, End = Args.size();
-       i < End && (XmmSlotsRemain || GprSlotsRemain); ++i) {
-    Variable *Arg = Args[i];
-    Type Ty = Arg->getType();
-    Variable *RegisterArg = nullptr;
-    RegNumT RegNum;
-    if (isVectorType(Ty)) {
-      RegNum =
-          Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
-      if (RegNum.hasNoValue()) {
-        XmmSlotsRemain = false;
-        continue;
-      }
-      ++NumXmmArgs;
-      RegisterArg = Func->makeVariable(Ty);
-    } else if (isScalarFloatingType(Ty)) {
-      continue;
-    } else if (isScalarIntegerType(Ty)) {
-      RegNum = Traits::getRegisterForGprArgNum(
-          Ty, Traits::getArgIndex(i, NumGprArgs));
-      if (RegNum.hasNoValue()) {
-        GprSlotsRemain = false;
-        continue;
-      }
-      ++NumGprArgs;
-      RegisterArg = Func->makeVariable(Ty);
-    }
-    assert(RegNum.hasValue());
-    assert(RegisterArg != nullptr);
-    // Replace Arg in the argument list with the home register. Then generate
-    // an instruction in the prolog to copy the home register to the assigned
-    // location of Arg.
-    if (BuildDefs::dump())
-      RegisterArg->setName(Func, "home_reg:" + Arg->getName());
-    RegisterArg->setRegNum(RegNum);
-    RegisterArg->setIsArg();
-    Arg->setIsArg(false);
-
-    Args[i] = RegisterArg;
-    // When not Om1, do the assignment through a temporary, instead of directly
-    // from the pre-colored variable, so that a subsequent availabilityGet()
-    // call has a chance to work.  (In Om1, don't bother creating extra
-    // instructions with extra variables to register-allocate.)
-    if (OptM1) {
-      Context.insert<InstAssign>(Arg, RegisterArg);
-    } else {
-      Variable *Tmp = makeReg(RegisterArg->getType());
-      Context.insert<InstAssign>(Tmp, RegisterArg);
-      Context.insert<InstAssign>(Arg, Tmp);
-    }
-  }
-  if (!OptM1)
-    Context.availabilityUpdate();
-}
-
-/// Strength-reduce scalar integer multiplication by a constant (for i32 or
-/// narrower, plus i64 on x86-64) for certain constants. The lea instruction
-/// can be used to multiply by 3, 5, or 9, and the shl instruction can be used
-/// to multiply by powers of 2. These can be combined such that e.g.
-/// multiplying by 100 can be done as 2 lea-based multiplies by 5, combined
-/// with left-shifting by 2.
-template <typename TraitsType>
-bool TargetX86Base<TraitsType>::optimizeScalarMul(Variable *Dest, Operand *Src0,
-                                                  int32_t Src1) {
-  // Disable this optimization for Om1 and O0, just to keep things simple
-  // there.
-  if (Func->getOptLevel() < Opt_1)
-    return false;
-  Type Ty = Dest->getType();
-  if (Src1 == -1) {
-    Variable *T = nullptr;
-    _mov(T, Src0);
-    _neg(T);
-    _mov(Dest, T);
-    return true;
-  }
-  if (Src1 == 0) {
-    _mov(Dest, Ctx->getConstantZero(Ty));
-    return true;
-  }
-  if (Src1 == 1) {
-    Variable *T = nullptr;
-    _mov(T, Src0);
-    _mov(Dest, T);
-    return true;
-  }
-  // Don't bother with the edge case where Src1 == MININT.
-  if (Src1 == -Src1)
-    return false;
-  const bool Src1IsNegative = Src1 < 0;
-  if (Src1IsNegative)
-    Src1 = -Src1;
-  uint32_t Count9 = 0;
-  uint32_t Count5 = 0;
-  uint32_t Count3 = 0;
-  uint32_t Count2 = 0;
-  uint32_t CountOps = 0;
-  while (Src1 > 1) {
-    if (Src1 % 9 == 0) {
-      ++CountOps;
-      ++Count9;
-      Src1 /= 9;
-    } else if (Src1 % 5 == 0) {
-      ++CountOps;
-      ++Count5;
-      Src1 /= 5;
-    } else if (Src1 % 3 == 0) {
-      ++CountOps;
-      ++Count3;
-      Src1 /= 3;
-    } else if (Src1 % 2 == 0) {
-      if (Count2 == 0)
-        ++CountOps;
-      ++Count2;
-      Src1 /= 2;
-    } else {
-      return false;
-    }
-  }
-  // The lea optimization only works for i32 (and i64 on x86-64), not narrower
-  // types such as i8 or i16.
-  if (Ty != IceType_i32 && !(Traits::Is64Bit && Ty == IceType_i64) &&
-      (Count3 || Count5 || Count9))
-    return false;
-  // Limit the number of lea/shl operations for a single multiply, to a
-  // somewhat arbitrary choice of 3.
-  constexpr uint32_t MaxOpsForOptimizedMul = 3;
-  if (CountOps > MaxOpsForOptimizedMul)
-    return false;
-  Variable *T = makeReg(Traits::WordType);
-  if (typeWidthInBytes(Src0->getType()) < typeWidthInBytes(T->getType())) {
-    Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    _movzx(T, Src0RM);
-  } else {
-    _mov(T, Src0);
-  }
-  Constant *Zero = Ctx->getConstantZero(IceType_i32);
-  for (uint32_t i = 0; i < Count9; ++i) {
-    constexpr uint16_t Shift = 3; // log2(9-1)
-    _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
-  }
-  for (uint32_t i = 0; i < Count5; ++i) {
-    constexpr uint16_t Shift = 2; // log2(5-1)
-    _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
-  }
-  for (uint32_t i = 0; i < Count3; ++i) {
-    constexpr uint16_t Shift = 1; // log2(3-1)
-    _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
-  }
-  if (Count2) {
-    _shl(T, Ctx->getConstantInt(Ty, Count2));
-  }
-  if (Src1IsNegative)
-    _neg(T);
-  _mov(Dest, T);
-  return true;
-}
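-
-// Worked example (illustrative): multiplying by 100 = 5 * 5 * 4 stays within
-// the 3-op budget (CountOps == 3, since the shl counts once), emitting:
-//   lea t, [t + t*4]   ; t *= 5
-//   lea t, [t + t*4]   ; t *= 5
-//   shl t, 2           ; t *= 4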
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerShift64(InstArithmetic::OpKind Op,
-                                             Operand *Src0Lo, Operand *Src0Hi,
-                                             Operand *Src1Lo, Variable *DestLo,
-                                             Variable *DestHi) {
-  // TODO: Refactor the similarities between Shl, Lshr, and Ashr.
-  Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
-  Constant *Zero = Ctx->getConstantZero(IceType_i32);
-  Constant *SignExtend = Ctx->getConstantInt32(0x1f);
-  if (auto *ConstantShiftAmount = llvm::dyn_cast<ConstantInteger32>(Src1Lo)) {
-    uint32_t ShiftAmount = ConstantShiftAmount->getValue();
-    if (ShiftAmount > 32) {
-      Constant *ReducedShift = Ctx->getConstantInt32(ShiftAmount - 32);
-      switch (Op) {
-      default:
-        assert(0 && "non-shift op");
-        break;
-      case InstArithmetic::Shl: {
-        // a=b<<c ==>
-        //   t2 = b.lo
-        //   t2 = shl t2, ShiftAmount-32
-        //   a.hi = t2
-        //   a.lo = 0
-        _mov(T_2, Src0Lo);
-        _shl(T_2, ReducedShift);
-        _mov(DestHi, T_2);
-        _mov(DestLo, Zero);
-      } break;
-      case InstArithmetic::Lshr: {
-        // a=b>>c (unsigned) ==>
-        //   t2 = b.hi
-        //   t2 = shr t2, ShiftAmount-32
-        //   a.lo = t2
-        //   a.hi = 0
-        _mov(T_2, Src0Hi);
-        _shr(T_2, ReducedShift);
-        _mov(DestLo, T_2);
-        _mov(DestHi, Zero);
-      } break;
-      case InstArithmetic::Ashr: {
-        // a=b>>c (signed) ==>
-        //   t3 = b.hi
-        //   t3 = sar t3, 0x1f
-        //   t2 = b.hi
-        //   t2 = shrd t2, t3, ShiftAmount-32
-        //   a.lo = t2
-        //   a.hi = t3
-        _mov(T_3, Src0Hi);
-        _sar(T_3, SignExtend);
-        _mov(T_2, Src0Hi);
-        _shrd(T_2, T_3, ReducedShift);
-        _mov(DestLo, T_2);
-        _mov(DestHi, T_3);
-      } break;
-      }
-    } else if (ShiftAmount == 32) {
-      switch (Op) {
-      default:
-        assert(0 && "non-shift op");
-        break;
-      case InstArithmetic::Shl: {
-        // a=b<<c ==>
-        //   t2 = b.lo
-        //   a.hi = t2
-        //   a.lo = 0
-        _mov(T_2, Src0Lo);
-        _mov(DestHi, T_2);
-        _mov(DestLo, Zero);
-      } break;
-      case InstArithmetic::Lshr: {
-        // a=b>>c (unsigned) ==>
-        //   t2 = b.hi
-        //   a.lo = t2
-        //   a.hi = 0
-        _mov(T_2, Src0Hi);
-        _mov(DestLo, T_2);
-        _mov(DestHi, Zero);
-      } break;
-      case InstArithmetic::Ashr: {
-        // a=b>>c (signed) ==>
-        //   t2 = b.hi
-        //   a.lo = t2
-        //   t3 = b.hi
-        //   t3 = sar t3, 0x1f
-        //   a.hi = t3
-        _mov(T_2, Src0Hi);
-        _mov(DestLo, T_2);
-        _mov(T_3, Src0Hi);
-        _sar(T_3, SignExtend);
-        _mov(DestHi, T_3);
-      } break;
-      }
-    } else {
-      // COMMON PREFIX OF: a=b SHIFT_OP c ==>
-      //   t2 = b.lo
-      //   t3 = b.hi
-      _mov(T_2, Src0Lo);
-      _mov(T_3, Src0Hi);
-      switch (Op) {
-      default:
-        assert(0 && "non-shift op");
-        break;
-      case InstArithmetic::Shl: {
-        // a=b<<c ==>
-        //   t3 = shld t3, t2, ShiftAmount
-        //   t2 = shl t2, ShiftAmount
-        _shld(T_3, T_2, ConstantShiftAmount);
-        _shl(T_2, ConstantShiftAmount);
-      } break;
-      case InstArithmetic::Lshr: {
-        // a=b>>c (unsigned) ==>
-        //   t2 = shrd t2, t3, ShiftAmount
-        //   t3 = shr t3, ShiftAmount
-        _shrd(T_2, T_3, ConstantShiftAmount);
-        _shr(T_3, ConstantShiftAmount);
-      } break;
-      case InstArithmetic::Ashr: {
-        // a=b>>c (signed) ==>
-        //   t2 = shrd t2, t3, ShiftAmount
-        //   t3 = sar t3, ShiftAmount
-        _shrd(T_2, T_3, ConstantShiftAmount);
-        _sar(T_3, ConstantShiftAmount);
-      } break;
-      }
-      // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
-      //   a.lo = t2
-      //   a.hi = t3
-      _mov(DestLo, T_2);
-      _mov(DestHi, T_3);
-    }
-  } else {
-    // NON-CONSTANT CASES.
-    Constant *BitTest = Ctx->getConstantInt32(0x20);
-    InstX86Label *Label = InstX86Label::create(Func, this);
-    // COMMON PREFIX OF: a=b SHIFT_OP c ==>
-    //   t1:ecx = c.lo & 0xff
-    //   t2 = b.lo
-    //   t3 = b.hi
-    T_1 = copyToReg8(Src1Lo, Traits::RegisterSet::Reg_cl);
-    _mov(T_2, Src0Lo);
-    _mov(T_3, Src0Hi);
-    switch (Op) {
-    default:
-      assert(0 && "non-shift op");
-      break;
-    case InstArithmetic::Shl: {
-      // a=b<<c ==>
-      //   t3 = shld t3, t2, t1
-      //   t2 = shl t2, t1
-      //   test t1, 0x20
-      //   je L1
-      //   use(t3)
-      //   t3 = t2
-      //   t2 = 0
-      _shld(T_3, T_2, T_1);
-      _shl(T_2, T_1);
-      _test(T_1, BitTest);
-      _br(CondX86::Br_e, Label);
-      // T_2 and T_3 are being assigned again because of the intra-block control
-      // flow, so we need to use _redefined to avoid liveness problems.
-      _redefined(_mov(T_3, T_2));
-      _redefined(_mov(T_2, Zero));
-    } break;
-    case InstArithmetic::Lshr: {
-      // a=b>>c (unsigned) ==>
-      //   t2 = shrd t2, t3, t1
-      //   t3 = shr t3, t1
-      //   test t1, 0x20
-      //   je L1
-      //   use(t2)
-      //   t2 = t3
-      //   t3 = 0
-      _shrd(T_2, T_3, T_1);
-      _shr(T_3, T_1);
-      _test(T_1, BitTest);
-      _br(CondX86::Br_e, Label);
-      // T_2 and T_3 are being assigned again because of the intra-block control
-      // flow, so we need to use _redefined to avoid liveness problems.
-      _redefined(_mov(T_2, T_3));
-      _redefined(_mov(T_3, Zero));
-    } break;
-    case InstArithmetic::Ashr: {
-      // a=b>>c (signed) ==>
-      //   t2 = shrd t2, t3, t1
-      //   t3 = sar t3, t1
-      //   test t1, 0x20
-      //   je L1
-      //   use(t2)
-      //   t2 = t3
-      //   t3 = sar t3, 0x1f
-      _shrd(T_2, T_3, T_1);
-      _sar(T_3, T_1);
-      _test(T_1, BitTest);
-      _br(CondX86::Br_e, Label);
-      // T_2 and T_3 are being assigned again because of the intra-block control
-      // flow, so T_2 needs to use _redefined to avoid liveness problems. T_3
-      // doesn't need special treatment because it is reassigned via _sar
-      // instead of _mov.
-      _redefined(_mov(T_2, T_3));
-      _sar(T_3, SignExtend);
-    } break;
-    }
-    // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
-    // L1:
-    //   a.lo = t2
-    //   a.hi = t3
-    Context.insert(Label);
-    _mov(DestLo, T_2);
-    _mov(DestHi, T_3);
-  }
-}
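-
-// Concrete example (illustrative): the constant shift "a = b << 40" on
-// x86-32 takes the ShiftAmount > 32 path above and reduces to a 32-bit shift
-// of the low word into the high word:
-//   mov t2, b.lo
-//   shl t2, 8          ; 40 - 32
-//   mov a.hi, t2
-//   mov a.lo, 0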
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerArithmetic(const InstArithmetic *Instr) {
-  Variable *Dest = Instr->getDest();
-  if (Dest->isRematerializable()) {
-    Context.insert<InstFakeDef>(Dest);
-    return;
-  }
-  Type Ty = Dest->getType();
-  Operand *Src0 = legalize(Instr->getSrc(0));
-  Operand *Src1 = legalize(Instr->getSrc(1));
-  if (Instr->isCommutative()) {
-    uint32_t SwapCount = 0;
-    if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) {
-      std::swap(Src0, Src1);
-      ++SwapCount;
-    }
-    if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) {
-      std::swap(Src0, Src1);
-      ++SwapCount;
-    }
-    // Improve two-address code patterns by avoiding a copy to the dest
-    // register when one of the source operands ends its lifetime here.
-    if (!Instr->isLastUse(Src0) && Instr->isLastUse(Src1)) {
-      std::swap(Src0, Src1);
-      ++SwapCount;
-    }
-    assert(SwapCount <= 1);
-    (void)SwapCount;
-  }
-  if (!Traits::Is64Bit && Ty == IceType_i64) {
-    // These x86-32 helper-call-involved instructions are lowered in this
-    // separate switch. This is because loOperand() and hiOperand() may insert
-    // redundant instructions for constant blinding and pooling. Such redundant
-    // instructions will fail liveness analysis under the -Om1 setting.
-    // Moreover, these arguments do not actually need to be processed by
-    // loOperand() and hiOperand() in order to be used.
-    switch (Instr->getOp()) {
-    case InstArithmetic::Udiv:
-    case InstArithmetic::Sdiv:
-    case InstArithmetic::Urem:
-    case InstArithmetic::Srem:
-      llvm::report_fatal_error("Helper call was expected");
-      return;
-    default:
-      break;
-    }
-
-    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-    Operand *Src0Lo = loOperand(Src0);
-    Operand *Src0Hi = hiOperand(Src0);
-    Operand *Src1Lo = loOperand(Src1);
-    Operand *Src1Hi = hiOperand(Src1);
-    Variable *T_Lo = nullptr, *T_Hi = nullptr;
-    switch (Instr->getOp()) {
-    case InstArithmetic::_num:
-      llvm_unreachable("Unknown arithmetic operator");
-      break;
-    case InstArithmetic::Add:
-      _mov(T_Lo, Src0Lo);
-      _add(T_Lo, Src1Lo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, Src0Hi);
-      _adc(T_Hi, Src1Hi);
-      _mov(DestHi, T_Hi);
-      break;
-    case InstArithmetic::And:
-      _mov(T_Lo, Src0Lo);
-      _and(T_Lo, Src1Lo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, Src0Hi);
-      _and(T_Hi, Src1Hi);
-      _mov(DestHi, T_Hi);
-      break;
-    case InstArithmetic::Or:
-      _mov(T_Lo, Src0Lo);
-      _or(T_Lo, Src1Lo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, Src0Hi);
-      _or(T_Hi, Src1Hi);
-      _mov(DestHi, T_Hi);
-      break;
-    case InstArithmetic::Xor:
-      _mov(T_Lo, Src0Lo);
-      _xor(T_Lo, Src1Lo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, Src0Hi);
-      _xor(T_Hi, Src1Hi);
-      _mov(DestHi, T_Hi);
-      break;
-    case InstArithmetic::Sub:
-      _mov(T_Lo, Src0Lo);
-      _sub(T_Lo, Src1Lo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, Src0Hi);
-      _sbb(T_Hi, Src1Hi);
-      _mov(DestHi, T_Hi);
-      break;
-    case InstArithmetic::Mul: {
-      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
-      Variable *T_4Lo = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
-      Variable *T_4Hi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
-      // gcc does the following:
-      // a=b*c ==>
-      //   t1 = b.hi; t1 *=(imul) c.lo
-      //   t2 = c.hi; t2 *=(imul) b.lo
-      //   t3:eax = b.lo
-      //   t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo
-      //   a.lo = t4.lo
-      //   t4.hi += t1
-      //   t4.hi += t2
-      //   a.hi = t4.hi
-      // The mul instruction cannot take an immediate operand.
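-      // In 64-bit algebra: (b.hi*2^32 + b.lo) * (c.hi*2^32 + c.lo)
-      //   == b.lo*c.lo + 2^32*(b.hi*c.lo + b.lo*c.hi)  (mod 2^64)
-      // t1 and t2 are the two cross products; the widening mul of the low
-      // halves supplies b.lo*c.lo plus the carry into the high word.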
-      Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
-      _mov(T_1, Src0Hi);
-      _imul(T_1, Src1Lo);
-      _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax);
-      _mul(T_4Lo, T_3, Src1Lo);
-      // The mul instruction produces its result in the register pair
-      // edx:eax, so we create a fake definition of edx to account for the
-      // second destination.
-      Context.insert<InstFakeDef>(T_4Hi, T_4Lo);
-      Context.insert<InstFakeUse>(T_4Hi);
-      _mov(DestLo, T_4Lo);
-      _add(T_4Hi, T_1);
-      _mov(T_2, Src1Hi);
-      Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
-      _imul(T_2, Src0Lo);
-      _add(T_4Hi, T_2);
-      _mov(DestHi, T_4Hi);
-    } break;
-    case InstArithmetic::Shl:
-    case InstArithmetic::Lshr:
-    case InstArithmetic::Ashr:
-      lowerShift64(Instr->getOp(), Src0Lo, Src0Hi, Src1Lo, DestLo, DestHi);
-      break;
-    case InstArithmetic::Fadd:
-    case InstArithmetic::Fsub:
-    case InstArithmetic::Fmul:
-    case InstArithmetic::Fdiv:
-    case InstArithmetic::Frem:
-      llvm_unreachable("FP instruction with i64 type");
-      break;
-    case InstArithmetic::Udiv:
-    case InstArithmetic::Sdiv:
-    case InstArithmetic::Urem:
-    case InstArithmetic::Srem:
-      llvm_unreachable("Call-helper-involved instruction for i64 type \
-                       should have already been handled before");
-      break;
-    }
-    return;
-  }
-  if (isVectorType(Ty)) {
-    // TODO: Trap on integer divide and integer modulo by zero. See:
-    // https://code.google.com/p/nativeclient/issues/detail?id=3899
-    if (llvm::isa<X86OperandMem>(Src1))
-      Src1 = legalizeToReg(Src1);
-    switch (Instr->getOp()) {
-    case InstArithmetic::_num:
-      llvm_unreachable("Unknown arithmetic operator");
-      break;
-    case InstArithmetic::Add: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _padd(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::And: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _pand(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Or: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _por(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Xor: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _pxor(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Sub: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _psub(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Mul: {
-      bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16;
-      bool InstructionSetIsValidForPmull =
-          Ty == IceType_v8i16 || InstructionSet >= Traits::SSE4_1;
-      if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
-        Variable *T = makeReg(Ty);
-        _movp(T, Src0);
-        _pmull(T, Src0 == Src1 ? T : Src1);
-        _movp(Dest, T);
-      } else if (Ty == IceType_v4i32) {
-        // Lowering sequence:
-        // Note: The mask arguments have index 0 on the left.
-        //
-        // movups  T1, Src0
-        // pshufd  T2, Src0, {1,0,3,0}
-        // pshufd  T3, Src1, {1,0,3,0}
-        // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
-        // pmuludq T1, Src1
-        // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
-        // pmuludq T2, T3
-        // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
-        // shufps  T1, T2, {0,2,0,2}
-        // pshufd  T4, T1, {0,2,1,3}
-        // movups  Dest, T4
-
-        // Mask that directs pshufd to create a vector with entries
-        // Src[1, 0, 3, 0]
-        constexpr unsigned Constant1030 = 0x31;
-        Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
-        // Mask that directs shufps to create a vector with entries
-        // Dest[0, 2], Src[0, 2]
-        constexpr unsigned Mask0202 = 0x88;
-        // Mask that directs pshufd to create a vector with entries
-        // Src[0, 2, 1, 3]
-        constexpr unsigned Mask0213 = 0xd8;
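-        // pshufd's immediate encodes the source lane of each destination
-        // lane in two-bit fields, lane 0 in the low bits: 0x31 is
-        // 0b00110001, i.e. lanes {1,0,3,0}, and 0xd8 is 0b11011000, i.e.
-        // lanes {0,2,1,3}.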
-        Variable *T1 = makeReg(IceType_v4i32);
-        Variable *T2 = makeReg(IceType_v4i32);
-        Variable *T3 = makeReg(IceType_v4i32);
-        Variable *T4 = makeReg(IceType_v4i32);
-        _movp(T1, Src0);
-        _pshufd(T2, Src0, Mask1030);
-        _pshufd(T3, Src1, Mask1030);
-        _pmuludq(T1, Src1);
-        _pmuludq(T2, T3);
-        _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
-        _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
-        _movp(Dest, T4);
-      } else if (Ty == IceType_v16i8) {
-        llvm::report_fatal_error("Scalarized operation was expected");
-      } else {
-        llvm::report_fatal_error("Invalid vector multiply type");
-      }
-    } break;
-    case InstArithmetic::Shl: {
-      assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _psll(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Lshr: {
-      assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _psrl(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Ashr: {
-      assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _psra(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Udiv:
-    case InstArithmetic::Urem:
-    case InstArithmetic::Sdiv:
-    case InstArithmetic::Srem:
-      llvm::report_fatal_error("Scalarized operation was expected");
-      break;
-    case InstArithmetic::Fadd: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _addps(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Fsub: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _subps(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Fmul: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _mulps(T, Src0 == Src1 ? T : Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Fdiv: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _divps(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Frem:
-      llvm::report_fatal_error("Scalarized operation was expected");
-      break;
-    }
-    return;
-  }
-  Variable *T_edx = nullptr;
-  Variable *T = nullptr;
-  switch (Instr->getOp()) {
-  case InstArithmetic::_num:
-    llvm_unreachable("Unknown arithmetic operator");
-    break;
-  case InstArithmetic::Add: {
-    const bool ValidType =
-        Ty == IceType_i32 || (Ty == IceType_i64 && Traits::Is64Bit);
-    auto *Const = llvm::dyn_cast<Constant>(Instr->getSrc(1));
-    const bool ValidKind =
-        Const != nullptr && (llvm::isa<ConstantInteger32>(Const) ||
-                             llvm::isa<ConstantRelocatable>(Const));
-    if (getFlags().getAggressiveLea() && ValidType && ValidKind) {
-      auto *Var = legalizeToReg(Src0);
-      auto *Mem = Traits::X86OperandMem::create(Func, IceType_void, Var, Const);
-      T = makeReg(Ty);
-      _lea(T, Mem);
-      _mov(Dest, T);
-      break;
-    }
-    _mov(T, Src0);
-    _add(T, Src1);
-    _mov(Dest, T);
-  } break;
-  case InstArithmetic::And:
-    _mov(T, Src0);
-    _and(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Or:
-    _mov(T, Src0);
-    _or(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Xor:
-    _mov(T, Src0);
-    _xor(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Sub:
-    _mov(T, Src0);
-    _sub(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Mul:
-    if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
-      if (optimizeScalarMul(Dest, Src0, C->getValue()))
-        return;
-    }
-    // The 8-bit version of imul only allows the form "imul r/m8" where T must
-    // be in al.
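-    // (The one-operand form computes ax = al * r/m8, so the 8-bit product
-    // is taken from al afterwards.)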
-    if (isByteSizedArithType(Ty)) {
-      _mov(T, Src0, Traits::RegisterSet::Reg_al);
-      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-      _imul(T, Src0 == Src1 ? T : Src1);
-      _mov(Dest, T);
-    } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
-      T = makeReg(Ty);
-      Src0 = legalize(Src0, Legal_Reg | Legal_Mem);
-      _imul_imm(T, Src0, ImmConst);
-      _mov(Dest, T);
-    } else {
-      _mov(T, Src0);
-      // No need to legalize Src1 to Reg | Mem because the Imm case is handled
-      // already by the ConstantInteger32 case above.
-      _imul(T, Src0 == Src1 ? T : Src1);
-      _mov(Dest, T);
-    }
-    break;
-  case InstArithmetic::Shl:
-    _mov(T, Src0);
-    if (!llvm::isa<ConstantInteger32>(Src1) &&
-        !llvm::isa<ConstantInteger64>(Src1))
-      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
-    _shl(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Lshr:
-    _mov(T, Src0);
-    if (!llvm::isa<ConstantInteger32>(Src1) &&
-        !llvm::isa<ConstantInteger64>(Src1))
-      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
-    _shr(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Ashr:
-    _mov(T, Src0);
-    if (!llvm::isa<ConstantInteger32>(Src1) &&
-        !llvm::isa<ConstantInteger64>(Src1))
-      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
-    _sar(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Udiv: {
-    // div and idiv are among the few arithmetic instructions that do not
-    // allow an immediate operand.
-    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    RegNumT Eax;
-    RegNumT Edx;
-    switch (Ty) {
-    default:
-      llvm::report_fatal_error("Bad type for udiv");
-    case IceType_i64:
-      Eax = Traits::getRaxOrDie();
-      Edx = Traits::getRdxOrDie();
-      break;
-    case IceType_i32:
-      Eax = Traits::RegisterSet::Reg_eax;
-      Edx = Traits::RegisterSet::Reg_edx;
-      break;
-    case IceType_i16:
-      Eax = Traits::RegisterSet::Reg_ax;
-      Edx = Traits::RegisterSet::Reg_dx;
-      break;
-    case IceType_i8:
-      Eax = Traits::RegisterSet::Reg_al;
-      Edx = Traits::RegisterSet::Reg_ah;
-      break;
-    }
-    T_edx = makeReg(Ty, Edx);
-    _mov(T, Src0, Eax);
-    _mov(T_edx, Ctx->getConstantZero(Ty));
-    _div(T_edx, Src1, T);
-    _redefined(Context.insert<InstFakeDef>(T, T_edx));
-    _mov(Dest, T);
-  } break;
-  case InstArithmetic::Sdiv:
-    // TODO(stichnot): Enable this after doing better performance and cross
-    // testing.
-    if (false && Func->getOptLevel() >= Opt_1) {
-      // Optimize division by constant power of 2, but not for Om1 or O0, just
-      // to keep things simple there.
-      if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
-        const int32_t Divisor = C->getValue();
-        const uint32_t UDivisor = Divisor;
-        if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
-          uint32_t LogDiv = llvm::Log2_32(UDivisor);
-          // LLVM does the following for dest=src/(1<<log):
-          //   t=src
-          //   sar t,typewidth-1 // -1 if src is negative, 0 if not
-          //   shr t,typewidth-log
-          //   add t,src
-          //   sar t,log
-          //   dest=t
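-          // E.g. dest=src/4 with src=-7 (log=2): sar gives -1, shr keeps
-          // the low 2 bits (3), add gives -7+3=-4, and sar by 2 gives -1,
-          // i.e. -7/4 rounded toward zero. For non-negative src the bias
-          // term is 0 and the sequence is a plain shift.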
-          uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
-          _mov(T, Src0);
-          // If for some reason we are dividing by 1, just treat it like an
-          // assignment.
-          if (LogDiv > 0) {
-            // The initial sar is unnecessary when dividing by 2.
-            if (LogDiv > 1)
-              _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
-            _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
-            _add(T, Src0);
-            _sar(T, Ctx->getConstantInt(Ty, LogDiv));
-          }
-          _mov(Dest, T);
-          return;
-        }
-      }
-    }
-    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    switch (Ty) {
-    default:
-      llvm::report_fatal_error("Bad type for sdiv");
-    case IceType_i64:
-      T_edx = makeReg(Ty, Traits::getRdxOrDie());
-      _mov(T, Src0, Traits::getRaxOrDie());
-      break;
-    case IceType_i32:
-      T_edx = makeReg(Ty, Traits::RegisterSet::Reg_edx);
-      _mov(T, Src0, Traits::RegisterSet::Reg_eax);
-      break;
-    case IceType_i16:
-      T_edx = makeReg(Ty, Traits::RegisterSet::Reg_dx);
-      _mov(T, Src0, Traits::RegisterSet::Reg_ax);
-      break;
-    case IceType_i8:
-      T_edx = makeReg(IceType_i16, Traits::RegisterSet::Reg_ax);
-      _mov(T, Src0, Traits::RegisterSet::Reg_al);
-      break;
-    }
-    _cbwdq(T_edx, T);
-    _idiv(T_edx, Src1, T);
-    _redefined(Context.insert<InstFakeDef>(T, T_edx));
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Urem: {
-    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    RegNumT Eax;
-    RegNumT Edx;
-    switch (Ty) {
-    default:
-      llvm::report_fatal_error("Bad type for urem");
-    case IceType_i64:
-      Eax = Traits::getRaxOrDie();
-      Edx = Traits::getRdxOrDie();
-      break;
-    case IceType_i32:
-      Eax = Traits::RegisterSet::Reg_eax;
-      Edx = Traits::RegisterSet::Reg_edx;
-      break;
-    case IceType_i16:
-      Eax = Traits::RegisterSet::Reg_ax;
-      Edx = Traits::RegisterSet::Reg_dx;
-      break;
-    case IceType_i8:
-      Eax = Traits::RegisterSet::Reg_al;
-      Edx = Traits::RegisterSet::Reg_ah;
-      break;
-    }
-    T_edx = makeReg(Ty, Edx);
-    _mov(T_edx, Ctx->getConstantZero(Ty));
-    _mov(T, Src0, Eax);
-    _div(T, Src1, T_edx);
-    _redefined(Context.insert<InstFakeDef>(T_edx, T));
-    if (Ty == IceType_i8) {
-      // Register ah must be moved into one of {al,bl,cl,dl} before it can be
-      // moved into a general 8-bit register.
-      auto *T_AhRcvr = makeReg(Ty);
-      T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
-      _mov(T_AhRcvr, T_edx);
-      T_edx = T_AhRcvr;
-    }
-    _mov(Dest, T_edx);
-  } break;
-  case InstArithmetic::Srem: {
-    // TODO(stichnot): Enable this after doing better performance and cross
-    // testing.
-    if (false && Func->getOptLevel() >= Opt_1) {
-      // Optimize mod by constant power of 2, but not for Om1 or O0, just to
-      // keep things simple there.
-      if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
-        const int32_t Divisor = C->getValue();
-        const uint32_t UDivisor = Divisor;
-        if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
-          uint32_t LogDiv = llvm::Log2_32(UDivisor);
-          // LLVM does the following for dest=src%(1<<log):
-          //   t=src
-          //   sar t,typewidth-1 // -1 if src is negative, 0 if not
-          //   shr t,typewidth-log
-          //   add t,src
-          //   and t, -(1<<log)
-          //   sub t,src
-          //   neg t
-          //   dest=t
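-          // E.g. dest=src%4 with src=-7 (log=2): t=-7+3=-4; the and clears
-          // the low 2 bits, leaving -4; -4-(-7)=3; neg gives -3, matching
-          // C's truncating remainder -7%4 == -3.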
-          uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
-          // If for some reason we are dividing by 1, just assign 0.
-          if (LogDiv == 0) {
-            _mov(Dest, Ctx->getConstantZero(Ty));
-            return;
-          }
-          _mov(T, Src0);
-          // The initial sar is unnecessary when dividing by 2.
-          if (LogDiv > 1)
-            _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
-          _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
-          _add(T, Src0);
-          _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
-          _sub(T, Src0);
-          _neg(T);
-          _mov(Dest, T);
-          return;
-        }
-      }
-    }
-    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    RegNumT Eax;
-    RegNumT Edx;
-    switch (Ty) {
-    default:
-      llvm::report_fatal_error("Bad type for srem");
-    case IceType_i64:
-      Eax = Traits::getRaxOrDie();
-      Edx = Traits::getRdxOrDie();
-      break;
-    case IceType_i32:
-      Eax = Traits::RegisterSet::Reg_eax;
-      Edx = Traits::RegisterSet::Reg_edx;
-      break;
-    case IceType_i16:
-      Eax = Traits::RegisterSet::Reg_ax;
-      Edx = Traits::RegisterSet::Reg_dx;
-      break;
-    case IceType_i8:
-      Eax = Traits::RegisterSet::Reg_al;
-      Edx = Traits::RegisterSet::Reg_ah;
-      break;
-    }
-    T_edx = makeReg(Ty, Edx);
-    _mov(T, Src0, Eax);
-    _cbwdq(T_edx, T);
-    _idiv(T, Src1, T_edx);
-    _redefined(Context.insert<InstFakeDef>(T_edx, T));
-    if (Ty == IceType_i8) {
-      // Register ah must be moved into one of {al,bl,cl,dl} before it can be
-      // moved into a general 8-bit register.
-      auto *T_AhRcvr = makeReg(Ty);
-      T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
-      _mov(T_AhRcvr, T_edx);
-      T_edx = T_AhRcvr;
-    }
-    _mov(Dest, T_edx);
-  } break;
-  case InstArithmetic::Fadd:
-    _mov(T, Src0);
-    _addss(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Fsub:
-    _mov(T, Src0);
-    _subss(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Fmul:
-    _mov(T, Src0);
-    _mulss(T, Src0 == Src1 ? T : Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Fdiv:
-    _mov(T, Src0);
-    _divss(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Frem:
-    llvm::report_fatal_error("Helper call was expected");
-    break;
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerAssign(const InstAssign *Instr) {
-  Variable *Dest = Instr->getDest();
-  if (Dest->isRematerializable()) {
-    Context.insert<InstFakeDef>(Dest);
-    return;
-  }
-  Operand *Src = Instr->getSrc(0);
-  assert(Dest->getType() == Src->getType());
-  lowerMove(Dest, Src, false);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerBr(const InstBr *Br) {
-  if (Br->isUnconditional()) {
-    _br(Br->getTargetUnconditional());
-    return;
-  }
-  Operand *Cond = Br->getCondition();
-
-  // Handle folding opportunities.
-  if (const Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
-    assert(Producer->isDeleted());
-    switch (BoolFolding<Traits>::getProducerKind(Producer)) {
-    default:
-      break;
-    case BoolFolding<Traits>::PK_Icmp32:
-    case BoolFolding<Traits>::PK_Icmp64: {
-      lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Br);
-      return;
-    }
-    case BoolFolding<Traits>::PK_Fcmp: {
-      lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Br);
-      return;
-    }
-    case BoolFolding<Traits>::PK_Arith: {
-      lowerArithAndConsumer(llvm::cast<InstArithmetic>(Producer), Br);
-      return;
-    }
-    }
-  }
-  Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
-  Constant *Zero = Ctx->getConstantZero(IceType_i32);
-  _cmp(Src0, Zero);
-  _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
-}
-
-// constexprMax returns a (constexpr) max(S0, S1), and is used for defining
-// OperandList in lowerCall. std::max() would be the natural choice, but it
-// is not constexpr until C++14.
-inline constexpr SizeT constexprMax(SizeT S0, SizeT S1) {
-  return S0 < S1 ? S1 : S0;
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerCall(const InstCall *Instr) {
-  // Common x86 calling convention lowering:
-  //
-  // * At the point before the call, the stack must be aligned to 16 bytes.
-  //
-  // * Non-register arguments are pushed onto the stack in right-to-left order,
-  // such that the left-most argument ends up on the top of the stack at the
-  // lowest memory address.
-  //
-  // * Stack arguments of vector type are aligned to start at the next highest
-  // multiple of 16 bytes. Other stack arguments are aligned to the next word
-  // size boundary (4 or 8 bytes, respectively).
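-  //
-  // For example, if none of the arguments land in registers and there is no
-  // shadow store area, f(i32 a, <4 x i32> b, i32 c) on a 4-byte-word target
-  // places a at [esp+0], pads up to 16, b at [esp+16..31], and c at
-  // [esp+32].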
-  RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
-                                            Traits::X86_STACK_ALIGNMENT_BYTES);
-
-  constexpr SizeT MaxOperands =
-      constexprMax(Traits::X86_MAX_XMM_ARGS, Traits::X86_MAX_GPR_ARGS);
-  using OperandList = llvm::SmallVector<Operand *, MaxOperands>;
-
-  OperandList XmmArgs;
-  llvm::SmallVector<SizeT, MaxOperands> XmmArgIndices;
-  CfgVector<std::pair<const Type, Operand *>> GprArgs;
-  CfgVector<SizeT> GprArgIndices;
-  OperandList StackArgs, StackArgLocations;
-  uint32_t ParameterAreaSizeBytes = 0;
-
-  ParameterAreaSizeBytes += getShadowStoreSize();
-
-  // Classify each argument operand according to the location where the argument
-  // is passed.
-  for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
-    Operand *Arg = Instr->getArg(i);
-    const Type Ty = Arg->getType();
-    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
-    assert(typeWidthInBytes(Ty) >= 4);
-    if (isVectorType(Ty) &&
-        Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgs.size()))
-            .hasValue()) {
-      XmmArgs.push_back(Arg);
-      XmmArgIndices.push_back(i);
-    } else if (isScalarIntegerType(Ty) &&
-               Traits::getRegisterForGprArgNum(
-                   Ty, Traits::getArgIndex(i, GprArgs.size()))
-                   .hasValue()) {
-      GprArgs.emplace_back(Ty, Arg);
-      GprArgIndices.push_back(i);
-    } else {
-      // Place on stack.
-      StackArgs.push_back(Arg);
-      if (isVectorType(Arg->getType())) {
-        ParameterAreaSizeBytes =
-            Traits::applyStackAlignment(ParameterAreaSizeBytes);
-      }
-      Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
-      Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
-      StackArgLocations.push_back(
-          Traits::X86OperandMem::create(Func, Ty, esp, Loc));
-      ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
-    }
-  }
-  // Ensure there is enough space for the fstp/movs for floating returns.
-  Variable *Dest = Instr->getDest();
-  const Type DestTy = Dest ? Dest->getType() : IceType_void;
-  if (isScalarFloatingType(DestTy)) {
-    ParameterAreaSizeBytes =
-        std::max(static_cast<size_t>(ParameterAreaSizeBytes),
-                 typeWidthInBytesOnStack(DestTy));
-  }
-  // Adjust the parameter area so that the stack is aligned. It is assumed that
-  // the stack is already aligned at the start of the calling sequence.
-  ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes);
-  assert(ParameterAreaSizeBytes <= maxOutArgsSizeBytes());
-  // Copy arguments that are passed on the stack to the appropriate stack
-  // locations.  We make sure legalize() is called on each argument at this
-  // point, to allow availabilityGet() to work.
-  for (SizeT i = 0, NumStackArgs = StackArgs.size(); i < NumStackArgs; ++i) {
-    lowerStore(
-        InstStore::create(Func, legalize(StackArgs[i]), StackArgLocations[i]));
-  }
-  // Copy arguments to be passed in registers to the appropriate registers.
-  for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
-    XmmArgs[i] = legalizeToReg(legalize(XmmArgs[i]),
-                               Traits::getRegisterForXmmArgNum(
-                                   Traits::getArgIndex(XmmArgIndices[i], i)));
-  }
-  // Materialize moves for arguments passed in GPRs.
-  for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
-    const Type SignatureTy = GprArgs[i].first;
-    Operand *Arg =
-        legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable);
-    GprArgs[i].second = legalizeToReg(
-        Arg, Traits::getRegisterForGprArgNum(
-                 Arg->getType(), Traits::getArgIndex(GprArgIndices[i], i)));
-    assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32);
-    assert(SignatureTy == Arg->getType());
-    (void)SignatureTy;
-  }
-  // Generate a FakeUse of register arguments so that they do not get dead code
-  // eliminated as a result of the FakeKill of scratch registers after the call.
-  // These need to be right before the call instruction.
-  for (auto *Arg : XmmArgs) {
-    Context.insert<InstFakeUse>(llvm::cast<Variable>(Arg));
-  }
-  for (auto &ArgPair : GprArgs) {
-    Context.insert<InstFakeUse>(llvm::cast<Variable>(ArgPair.second));
-  }
-  // Generate the call instruction. Assign its result to a temporary with high
-  // register allocation weight.
-  // ReturnReg doubles as ReturnRegLo as necessary.
-  Variable *ReturnReg = nullptr;
-  Variable *ReturnRegHi = nullptr;
-  if (Dest) {
-    switch (DestTy) {
-    case IceType_NUM:
-    case IceType_void:
-    case IceType_i1:
-    case IceType_i8:
-    case IceType_i16:
-      llvm::report_fatal_error("Invalid Call dest type");
-      break;
-    case IceType_i32:
-      ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_eax);
-      break;
-    case IceType_i64:
-      if (Traits::Is64Bit) {
-        ReturnReg = makeReg(IceType_i64, Traits::getRaxOrDie());
-      } else {
-        ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
-        ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
-      }
-      break;
-    case IceType_f32:
-    case IceType_f64:
-      // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with
-      // the fstp instruction.
-      break;
-    case IceType_v4i1:
-    case IceType_v8i1:
-    case IceType_v16i1:
-    case IceType_v16i8:
-    case IceType_v8i16:
-    case IceType_v4i32:
-    case IceType_v4f32:
-      ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_xmm0);
-      break;
-    }
-  }
-  // Emit the call to the function.
-  Operand *CallTarget =
-      legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm | Legal_AddrAbs);
-  size_t NumVariadicFpArgs = Instr->isVariadic() ? XmmArgs.size() : 0;
-  Inst *NewCall = emitCallToTarget(CallTarget, ReturnReg, NumVariadicFpArgs);
-  // Keep the upper return register live on 32-bit platforms.
-  if (ReturnRegHi)
-    Context.insert<InstFakeDef>(ReturnRegHi);
-  // Mark the call as killing all the caller-save registers.
-  Context.insert<InstFakeKill>(NewCall);
-  // Handle x86-32 floating point returns.
-  if (Dest != nullptr && isScalarFloatingType(DestTy)) {
-    // Special treatment for an FP function which returns its result in st(0).
-    // If Dest ends up being a physical xmm register, the fstp emit code will
-    // route st(0) through the space reserved in the function argument area
-    // we allocated.
-    _fstp(Dest);
-    // Create a fake use of Dest in case it actually isn't used, because st(0)
-    // still needs to be popped.
-    Context.insert<InstFakeUse>(Dest);
-  }
-  // Generate a FakeUse to keep the call live if necessary.
-  if (Instr->hasSideEffects() && ReturnReg) {
-    Context.insert<InstFakeUse>(ReturnReg);
-  }
-  // Process the return value, if any.
-  if (Dest == nullptr)
-    return;
-  // Assign the result of the call to Dest.  Route it through a temporary so
-  // that the local register availability peephole can be subsequently used.
-  Variable *Tmp = nullptr;
-  if (isVectorType(DestTy)) {
-    assert(ReturnReg && "Vector type requires a return register");
-    Tmp = makeReg(DestTy);
-    _movp(Tmp, ReturnReg);
-    _movp(Dest, Tmp);
-  } else if (!isScalarFloatingType(DestTy)) {
-    assert(isScalarIntegerType(DestTy));
-    assert(ReturnReg && "Integer type requires a return register");
-    if (DestTy == IceType_i64 && !Traits::Is64Bit) {
-      assert(ReturnRegHi && "64-bit type requires two return registers");
-      auto *Dest64On32 = llvm::cast<Variable64On32>(Dest);
-      Variable *DestLo = Dest64On32->getLo();
-      Variable *DestHi = Dest64On32->getHi();
-      _mov(Tmp, ReturnReg);
-      _mov(DestLo, Tmp);
-      Variable *TmpHi = nullptr;
-      _mov(TmpHi, ReturnRegHi);
-      _mov(DestHi, TmpHi);
-    } else {
-      _mov(Tmp, ReturnReg);
-      _mov(Dest, Tmp);
-    }
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerCast(const InstCast *Instr) {
-  // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
-  InstCast::OpKind CastKind = Instr->getCastKind();
-  Variable *Dest = Instr->getDest();
-  Type DestTy = Dest->getType();
-  switch (CastKind) {
-  default:
-    Func->setError("Cast type not supported");
-    return;
-  case InstCast::Sext: {
-    // Src0RM is the source operand legalized to physical register or memory,
-    // but not immediate, since the relevant x86 native instructions don't
-    // allow an immediate operand. If the operand is an immediate, we could
-    // consider computing the strength-reduced result at translation time, but
-    // we're unlikely to see something like that in the bitcode that the
-    // optimizer wouldn't have already taken care of.
-    Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
-    if (isVectorType(DestTy)) {
-      if (DestTy == IceType_v16i8) {
-        // onemask = materialize(1,1,...); dst = (src & onemask) > 0
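-        // SSE2 has no per-byte shift, so the shl/sar trick used below for
-        // wider elements is unavailable here; instead, isolate bit 0 and
-        // compare it against zero, which yields 0 or -1 in each byte lane.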
-        Variable *OneMask = makeVectorOfOnes(DestTy);
-        Variable *T = makeReg(DestTy);
-        _movp(T, Src0RM);
-        _pand(T, OneMask);
-        Variable *Zeros = makeVectorOfZeros(DestTy);
-        _pcmpgt(T, Zeros);
-        _movp(Dest, T);
-      } else {
-        // width = width(elty) - 1; dest = (src << width) >> width
-        SizeT ShiftAmount =
-            Traits::X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) -
-            1;
-        Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
-        Variable *T = makeReg(DestTy);
-        _movp(T, Src0RM);
-        _psll(T, ShiftConstant);
-        _psra(T, ShiftConstant);
-        _movp(Dest, T);
-      }
-    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
-      // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
-      Constant *Shift = Ctx->getConstantInt32(31);
-      auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-      auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-      Variable *T_Lo = makeReg(DestLo->getType());
-      if (Src0RM->getType() == IceType_i32) {
-        _mov(T_Lo, Src0RM);
-      } else if (Src0RM->getType() == IceType_i1) {
-        _movzx(T_Lo, Src0RM);
-        _shl(T_Lo, Shift);
-        _sar(T_Lo, Shift);
-      } else {
-        _movsx(T_Lo, Src0RM);
-      }
-      _mov(DestLo, T_Lo);
-      Variable *T_Hi = nullptr;
-      _mov(T_Hi, T_Lo);
-      if (Src0RM->getType() != IceType_i1)
-        // For i1, the sar instruction is already done above.
-        _sar(T_Hi, Shift);
-      _mov(DestHi, T_Hi);
-    } else if (Src0RM->getType() == IceType_i1) {
-      // t1 = src
-      // shl t1, dst_bitwidth - 1
-      // sar t1, dst_bitwidth - 1
-      // dst = t1
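-      // E.g. for i1 -> i32, src = 1: 1 << 31 = 0x80000000, and the
-      // arithmetic shift right by 31 smears the sign bit back down,
-      // giving -1; src = 0 stays 0.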
-      size_t DestBits = Traits::X86_CHAR_BIT * typeWidthInBytes(DestTy);
-      Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
-      Variable *T = makeReg(DestTy);
-      if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) {
-        _mov(T, Src0RM);
-      } else {
-        // Widen the source using movsx or movzx. (It doesn't matter which one,
-        // since the following shl/sar overwrite the bits.)
-        _movzx(T, Src0RM);
-      }
-      _shl(T, ShiftAmount);
-      _sar(T, ShiftAmount);
-      _mov(Dest, T);
-    } else {
-      // t1 = movsx src; dst = t1
-      Variable *T = makeReg(DestTy);
-      _movsx(T, Src0RM);
-      _mov(Dest, T);
-    }
-    break;
-  }
-  case InstCast::Zext: {
-    Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
-    if (isVectorType(DestTy)) {
-      // onemask = materialize(1,1,...); dest = onemask & src
-      Variable *OneMask = makeVectorOfOnes(DestTy);
-      Variable *T = makeReg(DestTy);
-      _movp(T, Src0RM);
-      _pand(T, OneMask);
-      _movp(Dest, T);
-    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
-      // t1=movzx src; dst.lo=t1; dst.hi=0
-      Constant *Zero = Ctx->getConstantZero(IceType_i32);
-      auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-      auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-      Variable *Tmp = makeReg(DestLo->getType());
-      if (Src0RM->getType() == IceType_i32) {
-        _mov(Tmp, Src0RM);
-      } else {
-        _movzx(Tmp, Src0RM);
-      }
-      _mov(DestLo, Tmp);
-      _mov(DestHi, Zero);
-    } else if (Src0RM->getType() == IceType_i1) {
-      // t = Src0RM; Dest = t
-      Variable *T = nullptr;
-      if (DestTy == IceType_i8) {
-        _mov(T, Src0RM);
-      } else {
-        assert(DestTy != IceType_i1);
-        assert(Traits::Is64Bit || DestTy != IceType_i64);
-        // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
-        // In x86-64 we need to widen T to 64 bits to ensure that T, if
-        // written to the stack (i.e., in -Om1), will be fully zero-extended.
-        T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32);
-        _movzx(T, Src0RM);
-      }
-      _mov(Dest, T);
-    } else {
-      // t1 = movzx src; dst = t1
-      Variable *T = makeReg(DestTy);
-      _movzx(T, Src0RM);
-      _mov(Dest, T);
-    }
-    break;
-  }
-  case InstCast::Trunc: {
-    if (isVectorType(DestTy)) {
-      // onemask = materialize(1,1,...); dst = src & onemask
-      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
-      Type Src0Ty = Src0RM->getType();
-      Variable *OneMask = makeVectorOfOnes(Src0Ty);
-      Variable *T = makeReg(DestTy);
-      _movp(T, Src0RM);
-      _pand(T, OneMask);
-      _movp(Dest, T);
-    } else if (DestTy == IceType_i1 || DestTy == IceType_i8) {
-      // Make sure we truncate from and into valid registers.
-      Operand *Src0 = legalizeUndef(Instr->getSrc(0));
-      if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
-        Src0 = loOperand(Src0);
-      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      Variable *T = copyToReg8(Src0RM);
-      if (DestTy == IceType_i1)
-        _and(T, Ctx->getConstantInt1(1));
-      _mov(Dest, T);
-    } else {
-      Operand *Src0 = legalizeUndef(Instr->getSrc(0));
-      if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
-        Src0 = loOperand(Src0);
-      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      // t1 = trunc Src0RM; Dest = t1
-      Variable *T = makeReg(DestTy);
-      _mov(T, Src0RM);
-      _mov(Dest, T);
-    }
-    break;
-  }
-  case InstCast::Fptrunc:
-  case InstCast::Fpext: {
-    Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
-    // t1 = cvt Src0RM; Dest = t1
-    Variable *T = makeReg(DestTy);
-    _cvt(T, Src0RM, Traits::Insts::Cvt::Float2float);
-    _mov(Dest, T);
-    break;
-  }
-  case InstCast::Fptosi:
-    if (isVectorType(DestTy)) {
-      assert(DestTy == IceType_v4i32);
-      assert(Instr->getSrc(0)->getType() == IceType_v4f32);
-      Operand *Src0R = legalizeToReg(Instr->getSrc(0));
-      Variable *T = makeReg(DestTy);
-      _cvt(T, Src0R, Traits::Insts::Cvt::Tps2dq);
-      _movp(Dest, T);
-    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
-      llvm::report_fatal_error("Helper call was expected");
-    } else {
-      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
-      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
-      Variable *T_1 = nullptr;
-      if (Traits::Is64Bit && DestTy == IceType_i64) {
-        T_1 = makeReg(IceType_i64);
-      } else {
-        assert(DestTy != IceType_i64);
-        T_1 = makeReg(IceType_i32);
-      }
-      // cvt() requires its integer argument to be a GPR.
-      Variable *T_2 = makeReg(DestTy);
-      if (isByteSizedType(DestTy)) {
-        assert(T_1->getType() == IceType_i32);
-        T_1->setRegClass(RCX86_Is32To8);
-        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
-      }
-      _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
-      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
-      if (DestTy == IceType_i1)
-        _and(T_2, Ctx->getConstantInt1(1));
-      _mov(Dest, T_2);
-    }
-    break;
-  case InstCast::Fptoui:
-    if (isVectorType(DestTy)) {
-      llvm::report_fatal_error("Helper call was expected");
-    } else if (DestTy == IceType_i64 ||
-               (!Traits::Is64Bit && DestTy == IceType_i32)) {
-      llvm::report_fatal_error("Helper call was expected");
-    } else {
-      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
-      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
-      assert(DestTy != IceType_i64);
-      Variable *T_1 = nullptr;
-      if (Traits::Is64Bit && DestTy == IceType_i32) {
-        T_1 = makeReg(IceType_i64);
-      } else {
-        assert(DestTy != IceType_i32);
-        T_1 = makeReg(IceType_i32);
-      }
-      Variable *T_2 = makeReg(DestTy);
-      if (isByteSizedType(DestTy)) {
-        assert(T_1->getType() == IceType_i32);
-        T_1->setRegClass(RCX86_Is32To8);
-        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
-      }
-      _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
-      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
-      if (DestTy == IceType_i1)
-        _and(T_2, Ctx->getConstantInt1(1));
-      _mov(Dest, T_2);
-    }
-    break;
-  case InstCast::Sitofp:
-    if (isVectorType(DestTy)) {
-      assert(DestTy == IceType_v4f32);
-      assert(Instr->getSrc(0)->getType() == IceType_v4i32);
-      Operand *Src0R = legalizeToReg(Instr->getSrc(0));
-      Variable *T = makeReg(DestTy);
-      _cvt(T, Src0R, Traits::Insts::Cvt::Dq2ps);
-      _movp(Dest, T);
-    } else if (!Traits::Is64Bit && Instr->getSrc(0)->getType() == IceType_i64) {
-      llvm::report_fatal_error("Helper call was expected");
-    } else {
-      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
-      // Sign-extend the operand.
-      // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
-      Variable *T_1 = nullptr;
-      if (Traits::Is64Bit && Src0RM->getType() == IceType_i64) {
-        T_1 = makeReg(IceType_i64);
-      } else {
-        assert(Src0RM->getType() != IceType_i64);
-        T_1 = makeReg(IceType_i32);
-      }
-      Variable *T_2 = makeReg(DestTy);
-      if (Src0RM->getType() == T_1->getType())
-        _mov(T_1, Src0RM);
-      else
-        _movsx(T_1, Src0RM);
-      _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss);
-      _mov(Dest, T_2);
-    }
-    break;
-  case InstCast::Uitofp: {
-    Operand *Src0 = Instr->getSrc(0);
-    if (isVectorType(Src0->getType())) {
-      llvm::report_fatal_error("Helper call was expected");
-    } else if (Src0->getType() == IceType_i64 ||
-               (!Traits::Is64Bit && Src0->getType() == IceType_i32)) {
-      llvm::report_fatal_error("Helper call was expected");
-    } else {
-      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      // Zero-extend the operand.
-      // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
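-      // The zero-extended value is non-negative and fits in the wider
-      // signed type, so the signed cvt instruction yields the correct
-      // unsigned conversion.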
-      Variable *T_1 = nullptr;
-      if (Traits::Is64Bit && Src0RM->getType() == IceType_i32) {
-        T_1 = makeReg(IceType_i64);
-      } else {
-        assert(Src0RM->getType() != IceType_i64);
-        assert(Traits::Is64Bit || Src0RM->getType() != IceType_i32);
-        T_1 = makeReg(IceType_i32);
-      }
-      Variable *T_2 = makeReg(DestTy);
-      if (Src0RM->getType() == T_1->getType())
-        _mov(T_1, Src0RM);
-      else
-        _movzx(T_1, Src0RM)->setMustKeep();
-      _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss);
-      _mov(Dest, T_2);
-    }
-    break;
-  }
-  case InstCast::Bitcast: {
-    Operand *Src0 = Instr->getSrc(0);
-    if (DestTy == Src0->getType()) {
-      auto *Assign = InstAssign::create(Func, Dest, Src0);
-      lowerAssign(Assign);
-      return;
-    }
-    switch (DestTy) {
-    default:
-      llvm_unreachable("Unexpected Bitcast dest type");
-    case IceType_i8: {
-      llvm::report_fatal_error("Helper call was expected");
-    } break;
-    case IceType_i16: {
-      llvm::report_fatal_error("Helper call was expected");
-    } break;
-    case IceType_i32:
-    case IceType_f32: {
-      Variable *Src0R = legalizeToReg(Src0);
-      Variable *T = makeReg(DestTy);
-      _movd(T, Src0R);
-      _mov(Dest, T);
-    } break;
-    case IceType_i64: {
-      assert(Src0->getType() == IceType_f64);
-      if (Traits::Is64Bit) {
-        Variable *Src0R = legalizeToReg(Src0);
-        Variable *T = makeReg(IceType_i64);
-        _movd(T, Src0R);
-        _mov(Dest, T);
-      } else {
-        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-        // a.i64 = bitcast b.f64 ==>
-        //   s.f64 = spill b.f64
-        //   t_lo.i32 = lo(s.f64)
-        //   a_lo.i32 = t_lo.i32
-        //   t_hi.i32 = hi(s.f64)
-        //   a_hi.i32 = t_hi.i32
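-        // (One reason to route through a spill slot: without SSE4.1's
-        // pextrd there is no single xmm-to-GPR move for bits [63:32].)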
-        Operand *SpillLo, *SpillHi;
-        if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
-          Variable *Spill = Func->makeVariable(IceType_f64);
-          Spill->setLinkedTo(Src0Var);
-          Spill->setMustNotHaveReg();
-          _movq(Spill, Src0RM);
-          SpillLo = Traits::VariableSplit::create(Func, Spill,
-                                                  Traits::VariableSplit::Low);
-          SpillHi = Traits::VariableSplit::create(Func, Spill,
-                                                  Traits::VariableSplit::High);
-        } else {
-          SpillLo = loOperand(Src0RM);
-          SpillHi = hiOperand(Src0RM);
-        }
-
-        auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-        auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-        Variable *T_Lo = makeReg(IceType_i32);
-        Variable *T_Hi = makeReg(IceType_i32);
-
-        _mov(T_Lo, SpillLo);
-        _mov(DestLo, T_Lo);
-        _mov(T_Hi, SpillHi);
-        _mov(DestHi, T_Hi);
-      }
-    } break;
-    case IceType_f64: {
-      assert(Src0->getType() == IceType_i64);
-      if (Traits::Is64Bit) {
-        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-        Variable *T = makeReg(IceType_f64);
-        _movd(T, Src0RM);
-        _mov(Dest, T);
-      } else {
-        Src0 = legalize(Src0);
-        if (llvm::isa<X86OperandMem>(Src0)) {
-          Variable *T = makeReg(DestTy);
-          _movq(T, Src0);
-          _movq(Dest, T);
-          break;
-        }
-        // a.f64 = bitcast b.i64 ==>
-        //   t_lo.i32 = b_lo.i32
-        //   FakeDef(s.f64)
-        //   lo(s.f64) = t_lo.i32
-        //   t_hi.i32 = b_hi.i32
-        //   hi(s.f64) = t_hi.i32
-        //   a.f64 = s.f64
-        Variable *Spill = Func->makeVariable(IceType_f64);
-        Spill->setLinkedTo(Dest);
-        Spill->setMustNotHaveReg();
-
-        Variable *T_Lo = nullptr, *T_Hi = nullptr;
-        auto *SpillLo = Traits::VariableSplit::create(
-            Func, Spill, Traits::VariableSplit::Low);
-        auto *SpillHi = Traits::VariableSplit::create(
-            Func, Spill, Traits::VariableSplit::High);
-        _mov(T_Lo, loOperand(Src0));
-        // Technically, the Spill is defined after the _store happens, but
-        // SpillLo is considered a "use" of Spill, so define Spill before it
-        // is used.
-        Context.insert<InstFakeDef>(Spill);
-        _store(T_Lo, SpillLo);
-        _mov(T_Hi, hiOperand(Src0));
-        _store(T_Hi, SpillHi);
-        _movq(Dest, Spill);
-      }
-    } break;
-    case IceType_v8i1: {
-      llvm::report_fatal_error("Helper call was expected");
-    } break;
-    case IceType_v16i1: {
-      llvm::report_fatal_error("Helper call was expected");
-    } break;
-    case IceType_v8i16:
-    case IceType_v16i8:
-    case IceType_v4i32:
-    case IceType_v4f32: {
-      if (Src0->getType() == IceType_i32) {
-        // Bitcast requires equal type sizes, which isn't strictly the case
-        // between scalars and vectors: emulating a v4i8 vector requires a
-        // full v16i8 register, so a 32-bit scalar source is accepted here.
-        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-        Variable *T = makeReg(DestTy);
-        _movd(T, Src0RM);
-        _mov(Dest, T);
-      } else {
-        _movp(Dest, legalizeToReg(Src0));
-      }
-    } break;
-    }
-    break;
-  }
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerExtractElement(
-    const InstExtractElement *Instr) {
-  Operand *SourceVectNotLegalized = Instr->getSrc(0);
-  auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(1));
-  // Only constant indices are allowed in PNaCl IR.
-  assert(ElementIndex);
-
-  unsigned Index = ElementIndex->getValue();
-  Type Ty = SourceVectNotLegalized->getType();
-  Type ElementTy = typeElementType(Ty);
-  Type InVectorElementTy = Traits::getInVectorElementType(Ty);
-
-  // TODO(wala): Determine the best lowering sequences for each type.
-  bool CanUsePextr = Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
-                     (InstructionSet >= Traits::SSE4_1 && Ty != IceType_v4f32);
-  Variable *ExtractedElementR =
-      makeReg(CanUsePextr ? IceType_i32 : InVectorElementTy);
-  if (CanUsePextr) {
-    // Use pextrb, pextrw, or pextrd.  The "b" and "w" versions clear the upper
-    // bits of the destination register, so we represent this by always
-    // extracting into an i32 register.  The _mov into Dest below will do
-    // truncation as necessary.
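-    // E.g. pextrw r32, xmm, imm writes the selected 16-bit lane to the low
-    // word of the GPR and zeroes the upper bits, so an i32 temporary is
-    // always safe.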
-    Constant *Mask = Ctx->getConstantInt32(Index);
-    Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized);
-    _pextr(ExtractedElementR, SourceVectR, Mask);
-  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
-    // Use pshufd and movd/movss.
-    Variable *T = nullptr;
-    if (Index) {
-      // The shuffle only needs to occur if the element to be extracted is not
-      // at the lowest index.
-      Constant *Mask = Ctx->getConstantInt32(Index);
-      T = makeReg(Ty);
-      _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
-    } else {
-      T = legalizeToReg(SourceVectNotLegalized);
-    }
-
-    if (InVectorElementTy == IceType_i32) {
-      _movd(ExtractedElementR, T);
-    } else { // Ty == IceType_f32
-      // TODO(wala): _movss is used here only because _mov does not allow a
-      // vector source and a scalar destination; _mov should be extended to
-      // handle this case.
-      // _movss is a binary instruction, so the FakeDef is needed to keep the
-      // live range analysis consistent.
-      Context.insert<InstFakeDef>(ExtractedElementR);
-      _movss(ExtractedElementR, T);
-    }
-  } else {
-    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
-    // Spill the value to a stack slot and do the extraction in memory.
-    //
-    // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
-    // for legalizing to mem is implemented.
-    Variable *Slot = Func->makeVariable(Ty);
-    Slot->setMustNotHaveReg();
-    _movp(Slot, legalizeToReg(SourceVectNotLegalized));
-
-    // Compute the location of the element in memory.
-    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
-    X86OperandMem *Loc =
-        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
-    _mov(ExtractedElementR, Loc);
-  }
-
-  if (ElementTy == IceType_i1) {
-    // Truncate extracted integers to i1s if necessary.
-    Variable *T = makeReg(IceType_i1);
-    InstCast *Cast =
-        InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
-    lowerCast(Cast);
-    ExtractedElementR = T;
-  }
-
-  // Copy the element to the destination.
-  Variable *Dest = Instr->getDest();
-  _mov(Dest, ExtractedElementR);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerFcmp(const InstFcmp *Fcmp) {
-  Variable *Dest = Fcmp->getDest();
-
-  if (isVectorType(Dest->getType())) {
-    lowerFcmpVector(Fcmp);
-  } else {
-    constexpr Inst *Consumer = nullptr;
-    lowerFcmpAndConsumer(Fcmp, Consumer);
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerFcmpAndConsumer(const InstFcmp *Fcmp,
-                                                     const Inst *Consumer) {
-  Operand *Src0 = Fcmp->getSrc(0);
-  Operand *Src1 = Fcmp->getSrc(1);
-  Variable *Dest = Fcmp->getDest();
-
-  if (Consumer != nullptr) {
-    if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
-      if (lowerOptimizeFcmpSelect(Fcmp, Select))
-        return;
-    }
-  }
-
-  if (isVectorType(Dest->getType())) {
-    lowerFcmp(Fcmp);
-    if (Consumer != nullptr)
-      lowerSelectVector(llvm::cast<InstSelect>(Consumer));
-    return;
-  }
-
-  // Lowering a = fcmp cond, b, c
-  //   ucomiss b, c       /* only if C1 != Br_None */
-  //                      /* but swap b,c order if SwapOperands==true */
-  //   mov a, <default>
-  //   j<C1> label        /* only if C1 != Br_None */
-  //   j<C2> label        /* only if C2 != Br_None */
-  //   FakeUse(a)         /* only if C1 != Br_None */
-  //   mov a, !<default>  /* only if C1 != Br_None */
-  //   label:             /* only if C1 != Br_None */
-  //
-  // setcc lowering when C1 != Br_None && C2 == Br_None:
-  //   ucomiss b, c       /* but swap b,c order if SwapOperands==true */
-  //   setcc a, C1
-  InstFcmp::FCond Condition = Fcmp->getCondition();
-  assert(static_cast<size_t>(Condition) < Traits::TableFcmpSize);
-  if (Traits::TableFcmp[Condition].SwapScalarOperands)
-    std::swap(Src0, Src1);
-  const bool HasC1 = (Traits::TableFcmp[Condition].C1 != CondX86::Br_None);
-  const bool HasC2 = (Traits::TableFcmp[Condition].C2 != CondX86::Br_None);
-  if (HasC1) {
-    Src0 = legalize(Src0);
-    Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    Variable *T = nullptr;
-    _mov(T, Src0);
-    _ucomiss(T, Src1RM);
-    if (!HasC2) {
-      assert(Traits::TableFcmp[Condition].Default);
-      setccOrConsumer(Traits::TableFcmp[Condition].C1, Dest, Consumer);
-      return;
-    }
-  }
-  int32_t IntDefault = Traits::TableFcmp[Condition].Default;
-  if (Consumer == nullptr) {
-    Constant *Default = Ctx->getConstantInt(Dest->getType(), IntDefault);
-    _mov(Dest, Default);
-    if (HasC1) {
-      InstX86Label *Label = InstX86Label::create(Func, this);
-      _br(Traits::TableFcmp[Condition].C1, Label);
-      if (HasC2) {
-        _br(Traits::TableFcmp[Condition].C2, Label);
-      }
-      Constant *NonDefault = Ctx->getConstantInt(Dest->getType(), !IntDefault);
-      _redefined(_mov(Dest, NonDefault));
-      Context.insert(Label);
-    }
-    return;
-  }
-  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
-    CfgNode *TrueSucc = Br->getTargetTrue();
-    CfgNode *FalseSucc = Br->getTargetFalse();
-    if (IntDefault != 0)
-      std::swap(TrueSucc, FalseSucc);
-    if (HasC1) {
-      _br(Traits::TableFcmp[Condition].C1, FalseSucc);
-      if (HasC2) {
-        _br(Traits::TableFcmp[Condition].C2, FalseSucc);
-      }
-      _br(TrueSucc);
-      return;
-    }
-    _br(FalseSucc);
-    return;
-  }
-  if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
-    Operand *SrcT = Select->getTrueOperand();
-    Operand *SrcF = Select->getFalseOperand();
-    Variable *SelectDest = Select->getDest();
-    if (IntDefault != 0)
-      std::swap(SrcT, SrcF);
-    lowerMove(SelectDest, SrcF, false);
-    if (HasC1) {
-      InstX86Label *Label = InstX86Label::create(Func, this);
-      _br(Traits::TableFcmp[Condition].C1, Label);
-      if (HasC2) {
-        _br(Traits::TableFcmp[Condition].C2, Label);
-      }
-      static constexpr bool IsRedefinition = true;
-      lowerMove(SelectDest, SrcT, IsRedefinition);
-      Context.insert(Label);
-    }
-    return;
-  }
-  llvm::report_fatal_error("Unexpected consumer type");
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerFcmpVector(const InstFcmp *Fcmp) {
-  Operand *Src0 = Fcmp->getSrc(0);
-  Operand *Src1 = Fcmp->getSrc(1);
-  Variable *Dest = Fcmp->getDest();
-
-  if (!isVectorType(Dest->getType()))
-    llvm::report_fatal_error("Expected vector compare");
-
-  InstFcmp::FCond Condition = Fcmp->getCondition();
-  assert(static_cast<size_t>(Condition) < Traits::TableFcmpSize);
-
-  if (Traits::TableFcmp[Condition].SwapVectorOperands)
-    std::swap(Src0, Src1);
-
-  Variable *T = nullptr;
-
-  if (Condition == InstFcmp::True) {
-    // makeVectorOfOnes() requires an integer vector type.
-    T = makeVectorOfMinusOnes(IceType_v4i32);
-  } else if (Condition == InstFcmp::False) {
-    T = makeVectorOfZeros(Dest->getType());
-  } else {
-    Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    if (llvm::isa<X86OperandMem>(Src1RM))
-      Src1RM = legalizeToReg(Src1RM);
-
-    switch (Condition) {
-    default: {
-      const CmppsCond Predicate = Traits::TableFcmp[Condition].Predicate;
-      assert(Predicate != CondX86::Cmpps_Invalid);
-      T = makeReg(Src0RM->getType());
-      _movp(T, Src0RM);
-      _cmpps(T, Src1RM, Predicate);
-    } break;
-    case InstFcmp::One: {
-      // Check both unequal and ordered.
-      T = makeReg(Src0RM->getType());
-      Variable *T2 = makeReg(Src0RM->getType());
-      _movp(T, Src0RM);
-      _cmpps(T, Src1RM, CondX86::Cmpps_neq);
-      _movp(T2, Src0RM);
-      _cmpps(T2, Src1RM, CondX86::Cmpps_ord);
-      _pand(T, T2);
-    } break;
-    case InstFcmp::Ueq: {
-      // Check both equal or unordered.
-      T = makeReg(Src0RM->getType());
-      Variable *T2 = makeReg(Src0RM->getType());
-      _movp(T, Src0RM);
-      _cmpps(T, Src1RM, CondX86::Cmpps_eq);
-      _movp(T2, Src0RM);
-      _cmpps(T2, Src1RM, CondX86::Cmpps_unord);
-      _por(T, T2);
-    } break;
-    }
-  }
-
-  assert(T != nullptr);
-  _movp(Dest, T);
-  eliminateNextVectorSextInstruction(Dest);
-}
-
-inline bool isZero(const Operand *Opnd) {
-  if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Opnd))
-    return C64->getValue() == 0;
-  if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(Opnd))
-    return C32->getValue() == 0;
-  return false;
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerIcmpAndConsumer(const InstIcmp *Icmp,
-                                                     const Inst *Consumer) {
-  Operand *Src0 = legalize(Icmp->getSrc(0));
-  Operand *Src1 = legalize(Icmp->getSrc(1));
-  Variable *Dest = Icmp->getDest();
-
-  if (isVectorType(Dest->getType())) {
-    lowerIcmp(Icmp);
-    if (Consumer != nullptr)
-      lowerSelectVector(llvm::cast<InstSelect>(Consumer));
-    return;
-  }
-
-  if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
-    lowerIcmp64(Icmp, Consumer);
-    return;
-  }
-
-  // cmp b, c
-  if (isZero(Src1)) {
-    switch (Icmp->getCondition()) {
-    default:
-      break;
-    case InstIcmp::Uge:
-      movOrConsumer(true, Dest, Consumer);
-      return;
-    case InstIcmp::Ult:
-      movOrConsumer(false, Dest, Consumer);
-      return;
-    }
-  }
-  Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
-  _cmp(Src0RM, Src1);
-  setccOrConsumer(Traits::getIcmp32Mapping(Icmp->getCondition()), Dest,
-                  Consumer);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerIcmpVector(const InstIcmp *Icmp) {
-  Operand *Src0 = legalize(Icmp->getSrc(0));
-  Operand *Src1 = legalize(Icmp->getSrc(1));
-  Variable *Dest = Icmp->getDest();
-
-  if (!isVectorType(Dest->getType()))
-    llvm::report_fatal_error("Expected a vector compare");
-
-  Type Ty = Src0->getType();
-  // Promote i1 vectors to 128-bit integer vector types.
-  if (typeElementType(Ty) == IceType_i1) {
-    Type NewTy = IceType_NUM;
-    switch (Ty) {
-    default:
-      llvm::report_fatal_error("unexpected type");
-      break;
-    case IceType_v4i1:
-      NewTy = IceType_v4i32;
-      break;
-    case IceType_v8i1:
-      NewTy = IceType_v8i16;
-      break;
-    case IceType_v16i1:
-      NewTy = IceType_v16i8;
-      break;
-    }
-    Variable *NewSrc0 = Func->makeVariable(NewTy);
-    Variable *NewSrc1 = Func->makeVariable(NewTy);
-    lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
-    lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
-    Src0 = NewSrc0;
-    Src1 = NewSrc1;
-    Ty = NewTy;
-  }
-
-  InstIcmp::ICond Condition = Icmp->getCondition();
-
-  Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-  Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-
-  // SSE2 only has signed comparison operations. Transform unsigned inputs in
-  // a manner that allows for the use of signed comparison operations by
-  // flipping the high order bits.
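-  // E.g. for v16i8, HighOrderBits is a vector of 0x80 bytes: unsigned
-  // 0xFF > 0x01 becomes signed 0x7F (127) > 0x81 (-127) after the pxor,
-  // preserving the ordering.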
-  if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
-      Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
-    Variable *T0 = makeReg(Ty);
-    Variable *T1 = makeReg(Ty);
-    Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
-    _movp(T0, Src0RM);
-    _pxor(T0, HighOrderBits);
-    _movp(T1, Src1RM);
-    _pxor(T1, HighOrderBits);
-    Src0RM = T0;
-    Src1RM = T1;
-  }
-
-  Variable *T = makeReg(Ty);
-  switch (Condition) {
-  default:
-    llvm_unreachable("unexpected condition");
-    break;
-  case InstIcmp::Eq: {
-    if (llvm::isa<X86OperandMem>(Src1RM))
-      Src1RM = legalizeToReg(Src1RM);
-    _movp(T, Src0RM);
-    _pcmpeq(T, Src1RM);
-  } break;
-  case InstIcmp::Ne: {
-    if (llvm::isa<X86OperandMem>(Src1RM))
-      Src1RM = legalizeToReg(Src1RM);
-    _movp(T, Src0RM);
-    _pcmpeq(T, Src1RM);
-    Variable *MinusOne = makeVectorOfMinusOnes(Ty);
-    _pxor(T, MinusOne);
-  } break;
-  case InstIcmp::Ugt:
-  case InstIcmp::Sgt: {
-    if (llvm::isa<X86OperandMem>(Src1RM))
-      Src1RM = legalizeToReg(Src1RM);
-    _movp(T, Src0RM);
-    _pcmpgt(T, Src1RM);
-  } break;
-  case InstIcmp::Uge:
-  case InstIcmp::Sge: {
-    // !(Src1RM > Src0RM)
-    if (llvm::isa<X86OperandMem>(Src0RM))
-      Src0RM = legalizeToReg(Src0RM);
-    _movp(T, Src1RM);
-    _pcmpgt(T, Src0RM);
-    Variable *MinusOne = makeVectorOfMinusOnes(Ty);
-    _pxor(T, MinusOne);
-  } break;
-  case InstIcmp::Ult:
-  case InstIcmp::Slt: {
-    if (llvm::isa<X86OperandMem>(Src0RM))
-      Src0RM = legalizeToReg(Src0RM);
-    _movp(T, Src1RM);
-    _pcmpgt(T, Src0RM);
-  } break;
-  case InstIcmp::Ule:
-  case InstIcmp::Sle: {
-    // !(Src0RM > Src1RM)
-    if (llvm::isa<X86OperandMem>(Src1RM))
-      Src1RM = legalizeToReg(Src1RM);
-    _movp(T, Src0RM);
-    _pcmpgt(T, Src1RM);
-    Variable *MinusOne = makeVectorOfMinusOnes(Ty);
-    _pxor(T, MinusOne);
-  } break;
-  }
-
-  _movp(Dest, T);
-  eliminateNextVectorSextInstruction(Dest);
-}
-
-template <typename TraitsType>
-template <typename T>
-typename std::enable_if<!T::Is64Bit, void>::type
-TargetX86Base<TraitsType>::lowerIcmp64(const InstIcmp *Icmp,
-                                       const Inst *Consumer) {
-  // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
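-  // For the general case with no consumer, the emitted skeleton is roughly:
-  //   a = 1
-  //   cmp hi(b), hi(c); br C1, L_true; br C2, L_false
-  //   cmp lo(b), lo(c); br C3, L_true
-  // L_false:
-  //   a = 0
-  // L_true:
-  // where C1, C2 and C3 come from TableIcmp64 for the given condition.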
-  Operand *Src0 = legalize(Icmp->getSrc(0));
-  Operand *Src1 = legalize(Icmp->getSrc(1));
-  Variable *Dest = Icmp->getDest();
-  InstIcmp::ICond Condition = Icmp->getCondition();
-  assert(static_cast<size_t>(Condition) < Traits::TableIcmp64Size);
-  Operand *Src0LoRM = nullptr;
-  Operand *Src0HiRM = nullptr;
-  // Legalize the portions of Src0 that are going to be needed.
-  if (isZero(Src1)) {
-    switch (Condition) {
-    default:
-      llvm_unreachable("unexpected condition");
-      break;
-    // These two are not optimized, so we fall through to the general case,
-    // which needs the upper and lower halves legalized.
-    case InstIcmp::Sgt:
-    case InstIcmp::Sle:
-    // These four compare after performing an "or" of the high and low half, so
-    // they need the upper and lower halves legalized.
-    case InstIcmp::Eq:
-    case InstIcmp::Ule:
-    case InstIcmp::Ne:
-    case InstIcmp::Ugt:
-      Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
-    // These two test only the high half's sign bit, so they need only
-    // the upper half legalized.
-    case InstIcmp::Sge:
-    case InstIcmp::Slt:
-      Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
-      break;
-
-    // These two move constants and hence need no legalization.
-    case InstIcmp::Uge:
-    case InstIcmp::Ult:
-      break;
-    }
-  } else {
-    Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
-    Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
-  }
-  // Optimize comparisons with zero.
-  if (isZero(Src1)) {
-    Constant *SignMask = Ctx->getConstantInt32(0x80000000);
-    Variable *Temp = nullptr;
-    switch (Condition) {
-    default:
-      llvm_unreachable("unexpected condition");
-      break;
-    case InstIcmp::Eq:
-    case InstIcmp::Ule:
-      // Mov Src0HiRM first, because it was legalized most recently, and will
-      // sometimes avoid a move before the OR.
-      _mov(Temp, Src0HiRM);
-      _or(Temp, Src0LoRM);
-      Context.insert<InstFakeUse>(Temp);
-      setccOrConsumer(CondX86::Br_e, Dest, Consumer);
-      return;
-    case InstIcmp::Ne:
-    case InstIcmp::Ugt:
-      // Mov Src0HiRM first, because it was legalized most recently, and will
-      // sometimes avoid a move before the OR.
-      _mov(Temp, Src0HiRM);
-      _or(Temp, Src0LoRM);
-      Context.insert<InstFakeUse>(Temp);
-      setccOrConsumer(CondX86::Br_ne, Dest, Consumer);
-      return;
-    case InstIcmp::Uge:
-      movOrConsumer(true, Dest, Consumer);
-      return;
-    case InstIcmp::Ult:
-      movOrConsumer(false, Dest, Consumer);
-      return;
-    case InstIcmp::Sgt:
-      break;
-    case InstIcmp::Sge:
-      _test(Src0HiRM, SignMask);
-      setccOrConsumer(CondX86::Br_e, Dest, Consumer);
-      return;
-    case InstIcmp::Slt:
-      _test(Src0HiRM, SignMask);
-      setccOrConsumer(CondX86::Br_ne, Dest, Consumer);
-      return;
-    case InstIcmp::Sle:
-      break;
-    }
-  }
-  // Handle general compares.
-  Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
-  Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
-  if (Consumer == nullptr) {
-    Constant *Zero = Ctx->getConstantInt(Dest->getType(), 0);
-    Constant *One = Ctx->getConstantInt(Dest->getType(), 1);
-    InstX86Label *LabelFalse = InstX86Label::create(Func, this);
-    InstX86Label *LabelTrue = InstX86Label::create(Func, this);
-    _mov(Dest, One);
-    _cmp(Src0HiRM, Src1HiRI);
-    if (Traits::TableIcmp64[Condition].C1 != CondX86::Br_None)
-      _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
-    if (Traits::TableIcmp64[Condition].C2 != CondX86::Br_None)
-      _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
-    _cmp(Src0LoRM, Src1LoRI);
-    _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
-    Context.insert(LabelFalse);
-    _redefined(_mov(Dest, Zero));
-    Context.insert(LabelTrue);
-    return;
-  }
-  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
-    _cmp(Src0HiRM, Src1HiRI);
-    if (Traits::TableIcmp64[Condition].C1 != CondX86::Br_None)
-      _br(Traits::TableIcmp64[Condition].C1, Br->getTargetTrue());
-    if (Traits::TableIcmp64[Condition].C2 != CondX86::Br_None)
-      _br(Traits::TableIcmp64[Condition].C2, Br->getTargetFalse());
-    _cmp(Src0LoRM, Src1LoRI);
-    _br(Traits::TableIcmp64[Condition].C3, Br->getTargetTrue(),
-        Br->getTargetFalse());
-    return;
-  }
-  if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
-    Operand *SrcT = Select->getTrueOperand();
-    Operand *SrcF = Select->getFalseOperand();
-    Variable *SelectDest = Select->getDest();
-    InstX86Label *LabelFalse = InstX86Label::create(Func, this);
-    InstX86Label *LabelTrue = InstX86Label::create(Func, this);
-    lowerMove(SelectDest, SrcT, false);
-    _cmp(Src0HiRM, Src1HiRI);
-    if (Traits::TableIcmp64[Condition].C1 != CondX86::Br_None)
-      _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
-    if (Traits::TableIcmp64[Condition].C2 != CondX86::Br_None)
-      _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
-    _cmp(Src0LoRM, Src1LoRI);
-    _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
-    Context.insert(LabelFalse);
-    static constexpr bool IsRedefinition = true;
-    lowerMove(SelectDest, SrcF, IsRedefinition);
-    Context.insert(LabelTrue);
-    return;
-  }
-  llvm::report_fatal_error("Unexpected consumer type");
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::setccOrConsumer(BrCond Condition,
-                                                Variable *Dest,
-                                                const Inst *Consumer) {
-  if (Consumer == nullptr) {
-    _setcc(Dest, Condition);
-    return;
-  }
-  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
-    _br(Condition, Br->getTargetTrue(), Br->getTargetFalse());
-    return;
-  }
-  if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
-    Operand *SrcT = Select->getTrueOperand();
-    Operand *SrcF = Select->getFalseOperand();
-    Variable *SelectDest = Select->getDest();
-    lowerSelectMove(SelectDest, Condition, SrcT, SrcF);
-    return;
-  }
-  llvm::report_fatal_error("Unexpected consumer type");
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::movOrConsumer(bool IcmpResult, Variable *Dest,
-                                              const Inst *Consumer) {
-  if (Consumer == nullptr) {
-    _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
-    return;
-  }
-  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
-    // TODO(sehr,stichnot): This could be done with a single unconditional
-    // branch instruction, but Subzero doesn't yet know how to handle the
-    // resulting control flow graph changes. Make it do so to eliminate the
-    // mov and cmp.
-    _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
-    _cmp(Dest, Ctx->getConstantInt(Dest->getType(), 0));
-    _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
-    return;
-  }
-  if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
-    Operand *Src = nullptr;
-    if (IcmpResult) {
-      Src = legalize(Select->getTrueOperand(), Legal_Reg | Legal_Imm);
-    } else {
-      Src = legalize(Select->getFalseOperand(), Legal_Reg | Legal_Imm);
-    }
-    Variable *SelectDest = Select->getDest();
-    lowerMove(SelectDest, Src, false);
-    return;
-  }
-  llvm::report_fatal_error("Unexpected consumer type");
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerArithAndConsumer(
-    const InstArithmetic *Arith, const Inst *Consumer) {
-  Variable *T = nullptr;
-  Operand *Src0 = legalize(Arith->getSrc(0));
-  Operand *Src1 = legalize(Arith->getSrc(1));
-  Variable *Dest = Arith->getDest();
-  switch (Arith->getOp()) {
-  default:
-    llvm_unreachable("arithmetic operator not AND or OR");
-    break;
-  case InstArithmetic::And:
-    _mov(T, Src0);
-    // Test cannot have an address in the second position.  Since T is
-    // guaranteed to be a register and Src1 could be a memory load, ensure
-    // that the second argument is a register.
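-    // Swapping the operands is safe because test only sets flags from the
-    // bitwise AND of its operands, which is commutative.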
-    if (llvm::isa<Constant>(Src1))
-      _test(T, Src1);
-    else
-      _test(Src1, T);
-    break;
-  case InstArithmetic::Or:
-    _mov(T, Src0);
-    _or(T, Src1);
-    break;
-  }
-
-  if (Consumer == nullptr) {
-    llvm::report_fatal_error("Expected a consumer instruction");
-  }
-  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
-    Context.insert<InstFakeUse>(T);
-    Context.insert<InstFakeDef>(Dest);
-    _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
-    return;
-  }
-  llvm::report_fatal_error("Unexpected consumer type");
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerInsertElement(
-    const InstInsertElement *Instr) {
-  Operand *SourceVectNotLegalized = Instr->getSrc(0);
-  Operand *ElementToInsertNotLegalized = Instr->getSrc(1);
-  auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(2));
-  // Only constant indices are allowed in PNaCl IR.
-  assert(ElementIndex);
-  unsigned Index = ElementIndex->getValue();
-  assert(Index < typeNumElements(SourceVectNotLegalized->getType()));
-
-  Type Ty = SourceVectNotLegalized->getType();
-  Type ElementTy = typeElementType(Ty);
-  Type InVectorElementTy = Traits::getInVectorElementType(Ty);
-
-  if (ElementTy == IceType_i1) {
-    // Expand the element to the appropriate size for it to be inserted in the
-    // vector.
-    Variable *Expanded = Func->makeVariable(InVectorElementTy);
-    auto *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
-                                  ElementToInsertNotLegalized);
-    lowerCast(Cast);
-    ElementToInsertNotLegalized = Expanded;
-  }
-
-  if (Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
-      InstructionSet >= Traits::SSE4_1) {
-    // Use insertps, pinsrb, pinsrw, or pinsrd.
-    Operand *ElementRM =
-        legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
-    Operand *SourceVectRM =
-        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
-    Variable *T = makeReg(Ty);
-    _movp(T, SourceVectRM);
-    if (Ty == IceType_v4f32) {
-      _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
-    } else {
-      // For the pinsrb and pinsrw instructions, when the source operand is a
-      // register, it must be a full r32 register like eax, and not ax/al/ah.
-      // For filetype=asm, InstX86Pinsr<TraitsType>::emit() compensates for the
-      // use of r16 and r8 by converting them through getBaseReg(), while
-      // emitIAS() validates that the original and base register encodings are
-      // the same.
-      if (ElementRM->getType() == IceType_i8 &&
-          llvm::isa<Variable>(ElementRM)) {
-        // Don't use ah/bh/ch/dh for pinsrb.
-        ElementRM = copyToReg8(ElementRM);
-      }
-      _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
-    }
-    _movp(Instr->getDest(), T);
-  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
-    // Use shufps or movss.
-    Variable *ElementR = nullptr;
-    Operand *SourceVectRM =
-        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
-
-    if (InVectorElementTy == IceType_f32) {
-      // ElementR will be in an XMM register since it is floating point.
-      ElementR = legalizeToReg(ElementToInsertNotLegalized);
-    } else {
-      // Copy an integer to an XMM register.
-      Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
-      ElementR = makeReg(Ty);
-      _movd(ElementR, T);
-    }
-
-    if (Index == 0) {
-      Variable *T = makeReg(Ty);
-      _movp(T, SourceVectRM);
-      _movss(T, ElementR);
-      _movp(Instr->getDest(), T);
-      return;
-    }
-
-    // shufps treats the source and destination operands as vectors of four
-    // doublewords. The destination's two high doublewords are selected from
-    // the source operand and the two low doublewords are selected from the
-    // (original value of) the destination operand. An insertelement operation
-    // can be effected with a sequence of two shufps operations with
-    // appropriate masks. In all cases below, Element[0] is being inserted into
-    // SourceVectOperand. Indices are ordered from left to right.
-    //
-    // insertelement into index 1 (result is stored in ElementR):
-    //   ElementR := ElementR[0, 0] SourceVectRM[0, 0]
-    //   ElementR := ElementR[3, 0] SourceVectRM[2, 3]
-    //
-    // insertelement into index 2 (result is stored in T):
-    //   T := SourceVectRM
-    //   ElementR := ElementR[0, 0] T[0, 3]
-    //   T := T[0, 1] ElementR[0, 3]
-    //
-    // insertelement into index 3 (result is stored in T):
-    //   T := SourceVectRM
-    //   ElementR := ElementR[0, 0] T[0, 2]
-    //   T := T[0, 1] ElementR[3, 0]
-    const unsigned char Mask1[3] = {0, 192, 128};
-    const unsigned char Mask2[3] = {227, 196, 52};
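-    // Decoding the shufps immediates: each imm8 holds four 2-bit lane
-    // selectors, low bits first; the low two lanes come from the destination
-    // operand and the high two from the source. E.g. for Index == 1,
-    // Mask2[0] = 227 = 0b11100011 selects ElementR[3], ElementR[0],
-    // SourceVectRM[2], SourceVectRM[3], i.e. "ElementR[3, 0]
-    // SourceVectRM[2, 3]" above.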
-
-    Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
-    Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);
-
-    if (Index == 1) {
-      _shufps(ElementR, SourceVectRM, Mask1Constant);
-      _shufps(ElementR, SourceVectRM, Mask2Constant);
-      _movp(Instr->getDest(), ElementR);
-    } else {
-      Variable *T = makeReg(Ty);
-      _movp(T, SourceVectRM);
-      _shufps(ElementR, T, Mask1Constant);
-      _shufps(T, ElementR, Mask2Constant);
-      _movp(Instr->getDest(), T);
-    }
-  } else {
-    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
-    // Spill the value to a stack slot and perform the insertion in memory.
-    //
-    // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
-    // for legalizing to mem is implemented.
-    Variable *Slot = Func->makeVariable(Ty);
-    Slot->setMustNotHaveReg();
-    _movp(Slot, legalizeToReg(SourceVectNotLegalized));
-
-    // Compute the location of the position to insert in memory.
-    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
-    X86OperandMem *Loc =
-        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
-    _store(legalizeToReg(ElementToInsertNotLegalized), Loc);
-
-    Variable *T = makeReg(Ty);
-    _movp(T, Slot);
-    _movp(Instr->getDest(), T);
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerIntrinsic(const InstIntrinsic *Instr) {
-  switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicID()) {
-  case Intrinsics::AtomicCmpxchg: {
-    if (!Intrinsics::isMemoryOrderValid(
-            ID, getConstantMemoryOrder(Instr->getArg(3)),
-            getConstantMemoryOrder(Instr->getArg(4)))) {
-      Func->setError("Unexpected memory ordering for AtomicCmpxchg");
-      return;
-    }
-    Variable *DestPrev = Instr->getDest();
-    Operand *PtrToMem = legalize(Instr->getArg(0));
-    Operand *Expected = legalize(Instr->getArg(1));
-    Operand *Desired = legalize(Instr->getArg(2));
-    if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
-      return;
-    lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
-    return;
-  }
-  case Intrinsics::AtomicFence:
-    if (!Intrinsics::isMemoryOrderValid(
-            ID, getConstantMemoryOrder(Instr->getArg(0)))) {
-      Func->setError("Unexpected memory ordering for AtomicFence");
-      return;
-    }
-    _mfence();
-    return;
-  case Intrinsics::AtomicFenceAll:
-    // NOTE: FenceAll should prevent any load/store from being moved across the
-    // fence (both atomic and non-atomic). The InstX86Mfence instruction is
-    // currently marked coarsely as "HasSideEffects".
-    _mfence();
-    return;
-  case Intrinsics::AtomicIsLockFree: {
-    // X86 is always lock free for 8/16/32/64 bit accesses.
-    // TODO(jvoung): Since the result is constant when given a constant byte
-    // size, this opens up DCE opportunities.
-    Operand *ByteSize = Instr->getArg(0);
-    Variable *Dest = Instr->getDest();
-    if (auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
-      Constant *Result;
-      switch (CI->getValue()) {
-      default:
-        // Some x86-64 processors support the cmpxchg16b instruction, which can
-        // make 16-byte operations lock free (when used with the LOCK prefix).
-        // However, that's not supported in 32-bit mode, so just return 0 even
-        // for large sizes.
-        Result = Ctx->getConstantZero(IceType_i32);
-        break;
-      case 1:
-      case 2:
-      case 4:
-      case 8:
-        Result = Ctx->getConstantInt32(1);
-        break;
-      }
-      _mov(Dest, Result);
-      return;
-    }
-    // The PNaCl ABI requires the byte size to be a compile-time constant.
-    Func->setError("AtomicIsLockFree byte size should be compile-time const");
-    return;
-  }
-  case Intrinsics::AtomicLoad: {
-    // We require the memory address to be naturally aligned. Given that is the
-    // case, normal loads are atomic.
-    if (!Intrinsics::isMemoryOrderValid(
-            ID, getConstantMemoryOrder(Instr->getArg(1)))) {
-      Func->setError("Unexpected memory ordering for AtomicLoad");
-      return;
-    }
-    Variable *Dest = Instr->getDest();
-    if (!Traits::Is64Bit) {
-      if (auto *Dest64On32 = llvm::dyn_cast<Variable64On32>(Dest)) {
-        // Follow what GCC does and use a movq instead of what lowerLoad()
-        // normally does (split the load into two). Thus, this skips
-        // load/arithmetic op folding. Load/arithmetic folding can't happen
-        // anyway, since this is x86-32 and integer arithmetic only happens on
-        // 32-bit quantities.
-        Variable *T = makeReg(IceType_f64);
-        X86OperandMem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64);
-        _movq(T, Addr);
-        // Then cast the bits back out of the XMM register to the i64 Dest.
-        auto *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
-        lowerCast(Cast);
-        // Make sure that the atomic load isn't elided when unused.
-        Context.insert<InstFakeUse>(Dest64On32->getLo());
-        Context.insert<InstFakeUse>(Dest64On32->getHi());
-        return;
-      }
-    }
-    auto *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
-    lowerLoad(Load);
-    // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
-    // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert
-    // the FakeUse on the last-inserted instruction's dest.
-    Context.insert<InstFakeUse>(Context.getLastInserted()->getDest());
-    return;
-  }
-  case Intrinsics::AtomicRMW:
-    if (!Intrinsics::isMemoryOrderValid(
-            ID, getConstantMemoryOrder(Instr->getArg(3)))) {
-      Func->setError("Unexpected memory ordering for AtomicRMW");
-      return;
-    }
-    lowerAtomicRMW(
-        Instr->getDest(),
-        static_cast<uint32_t>(
-            llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
-        Instr->getArg(1), Instr->getArg(2));
-    return;
-  case Intrinsics::AtomicStore: {
-    if (!Intrinsics::isMemoryOrderValid(
-            ID, getConstantMemoryOrder(Instr->getArg(2)))) {
-      Func->setError("Unexpected memory ordering for AtomicStore");
-      return;
-    }
-    // We require the memory address to be naturally aligned. Given that is the
-    // case, normal stores are atomic. Add a fence after the store to make
-    // it visible.
-    Operand *Value = Instr->getArg(0);
-    Operand *Ptr = Instr->getArg(1);
-    if (!Traits::Is64Bit && Value->getType() == IceType_i64) {
-      // Use a movq instead of what lowerStore() normally does (split the store
-      // into two), following what GCC does. Cast the bits from the i64 value
-      // to an XMM register first.
-      Variable *T = makeReg(IceType_f64);
-      auto *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value);
-      lowerCast(Cast);
-      // Then store XMM w/ a movq.
-      X86OperandMem *Addr = formMemoryOperand(Ptr, IceType_f64);
-      _storeq(T, Addr);
-      _mfence();
-      return;
-    }
-    auto *Store = InstStore::create(Func, Value, Ptr);
-    lowerStore(Store);
-    _mfence();
-    return;
-  }
-  case Intrinsics::Bswap: {
-    Variable *Dest = Instr->getDest();
-    Operand *Val = Instr->getArg(0);
-    // In 32-bit mode, bswap only works on 32-bit arguments, and the argument
-    // must be a register. Use rotate left for 16-bit bswap.
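-    // E.g. rol 0x1234, 8 gives 0x3412, i.e. the 16-bit byte swap.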
-    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
-      Val = legalizeUndef(Val);
-      Variable *T_Lo = legalizeToReg(loOperand(Val));
-      Variable *T_Hi = legalizeToReg(hiOperand(Val));
-      auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-      auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-      _bswap(T_Lo);
-      _bswap(T_Hi);
-      _mov(DestLo, T_Hi);
-      _mov(DestHi, T_Lo);
-    } else if ((Traits::Is64Bit && Val->getType() == IceType_i64) ||
-               Val->getType() == IceType_i32) {
-      Variable *T = legalizeToReg(Val);
-      _bswap(T);
-      _mov(Dest, T);
-    } else {
-      assert(Val->getType() == IceType_i16);
-      Constant *Eight = Ctx->getConstantInt16(8);
-      Variable *T = nullptr;
-      Val = legalize(Val);
-      _mov(T, Val);
-      _rol(T, Eight);
-      _mov(Dest, T);
-    }
-    return;
-  }
-  case Intrinsics::Ctpop: {
-    Variable *Dest = Instr->getDest();
-    Variable *T = nullptr;
-    Operand *Val = Instr->getArg(0);
-    Type ValTy = Val->getType();
-    assert(ValTy == IceType_i32 || ValTy == IceType_i64);
-
-    if (!Traits::Is64Bit) {
-      T = Dest;
-    } else {
-      T = makeReg(IceType_i64);
-      if (ValTy == IceType_i32) {
-        // In x86-64, __popcountsi2 is not defined, so we cheat a bit by
-        // converting it to a 64-bit value and using ctpop_i64. _movzx should
-        // ensure we will not have any bits set on Val's upper 32 bits.
-        Variable *V = makeReg(IceType_i64);
-        Operand *ValRM = legalize(Val, Legal_Reg | Legal_Mem);
-        _movzx(V, ValRM);
-        Val = V;
-      }
-      ValTy = IceType_i64;
-    }
-
-    InstCall *Call =
-        makeHelperCall(ValTy == IceType_i32 ? RuntimeHelper::H_call_ctpop_i32
-                                            : RuntimeHelper::H_call_ctpop_i64,
-                       T, 1);
-    Call->addArg(Val);
-    lowerCall(Call);
-    // The popcount helpers always return 32-bit values, while the intrinsic's
-    // signature matches the native POPCNT instruction and fills a 64-bit reg
-    // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
-    // the user doesn't do that in the IR. If the user does that in the IR,
-    // then this zeroing instruction is dead and gets optimized out.
-    if (!Traits::Is64Bit) {
-      assert(T == Dest);
-      if (Val->getType() == IceType_i64) {
-        auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-        Constant *Zero = Ctx->getConstantZero(IceType_i32);
-        _mov(DestHi, Zero);
-      }
-    } else {
-      assert(Val->getType() == IceType_i64);
-      // T is 64-bit. It needs to be copied to Dest. We need to:
-      //
-      // T_1.32 = trunc T.64 to i32
-      // T_2.64 = zext T_1.32 to i64
-      // Dest.<<right_size>> = T_2.<<right_size>>
-      //
-      // which ensures the upper 32 bits will always be cleared. Just doing a
-      //
-      // mov Dest.32 = trunc T.32 to i32
-      //
-      // is dangerous because there's a chance the compiler will optimize this
-      // copy out. To use _movzx we need two new registers (one 32-, and
-      // another 64-bit wide.)
-      Variable *T_1 = makeReg(IceType_i32);
-      _mov(T_1, T);
-      Variable *T_2 = makeReg(IceType_i64);
-      _movzx(T_2, T_1);
-      _mov(Dest, T_2);
-    }
-    return;
-  }
-  case Intrinsics::Ctlz: {
-    // The "is zero undef" parameter is ignored and we always return a
-    // well-defined value.
-    Operand *Val = legalize(Instr->getArg(0));
-    Operand *FirstVal;
-    Operand *SecondVal = nullptr;
-    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
-      FirstVal = loOperand(Val);
-      SecondVal = hiOperand(Val);
-    } else {
-      FirstVal = Val;
-    }
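-    // lowerCountZeros counts FirstVal and speculatively adds 32, keeping
-    // that result only if SecondVal tests as zero. For ctlz the speculation
-    // is that the high half is zero, since ctlz64(x) = 32 + ctlz32(lo(x))
-    // in that case, so FirstVal is the low half.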
-    constexpr bool IsCttz = false;
-    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
-                    SecondVal);
-    return;
-  }
-  case Intrinsics::Cttz: {
-    // The "is zero undef" parameter is ignored and we always return a
-    // well-defined value.
-    Operand *Val = legalize(Instr->getArg(0));
-    Operand *FirstVal;
-    Operand *SecondVal = nullptr;
-    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
-      FirstVal = hiOperand(Val);
-      SecondVal = loOperand(Val);
-    } else {
-      FirstVal = Val;
-    }
-    constexpr bool IsCttz = true;
-    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
-                    SecondVal);
-    return;
-  }
-  case Intrinsics::Fabs: {
-    Operand *Src = legalize(Instr->getArg(0));
-    Type Ty = Src->getType();
-    Variable *Dest = Instr->getDest();
-    Variable *T = makeVectorOfFabsMask(Ty);
-    // The pand instruction operates on an m128 memory operand, so if Src is an
-    // f32 or f64, we need to make sure it's in a register.
-    if (isVectorType(Ty)) {
-      if (llvm::isa<X86OperandMem>(Src))
-        Src = legalizeToReg(Src);
-    } else {
-      Src = legalizeToReg(Src);
-    }
-    _pand(T, Src);
-    if (isVectorType(Ty))
-      _movp(Dest, T);
-    else
-      _mov(Dest, T);
-    return;
-  }
-  case Intrinsics::Longjmp: {
-    InstCall *Call = makeHelperCall(RuntimeHelper::H_call_longjmp, nullptr, 2);
-    Call->addArg(Instr->getArg(0));
-    Call->addArg(Instr->getArg(1));
-    lowerCall(Call);
-    return;
-  }
-  case Intrinsics::Memcpy: {
-    lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
-    return;
-  }
-  case Intrinsics::Memmove: {
-    lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
-    return;
-  }
-  case Intrinsics::Memset: {
-    lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
-    return;
-  }
-  case Intrinsics::Setjmp: {
-    InstCall *Call =
-        makeHelperCall(RuntimeHelper::H_call_setjmp, Instr->getDest(), 1);
-    Call->addArg(Instr->getArg(0));
-    lowerCall(Call);
-    return;
-  }
-  case Intrinsics::Sqrt: {
-    Operand *Src = legalize(Instr->getArg(0));
-    Variable *Dest = Instr->getDest();
-    Variable *T = makeReg(Dest->getType());
-    _sqrt(T, Src);
-    if (isVectorType(Dest->getType())) {
-      _movp(Dest, T);
-    } else {
-      _mov(Dest, T);
-    }
-    return;
-  }
-  case Intrinsics::Stacksave: {
-    Variable *esp =
-        Func->getTarget()->getPhysicalRegister(getStackReg(), Traits::WordType);
-    Variable *Dest = Instr->getDest();
-    _mov(Dest, esp);
-    return;
-  }
-  case Intrinsics::Stackrestore: {
-    Operand *Src = Instr->getArg(0);
-    _mov_sp(Src);
-    return;
-  }
-
-  case Intrinsics::Trap:
-    _ud2();
-    return;
-  case Intrinsics::LoadSubVector: {
-    assert(llvm::isa<ConstantInteger32>(Instr->getArg(1)) &&
-           "LoadSubVector second argument must be a constant");
-    Variable *Dest = Instr->getDest();
-    Type Ty = Dest->getType();
-    auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(1));
-    Operand *Addr = Instr->getArg(0);
-    X86OperandMem *Src = formMemoryOperand(Addr, Ty);
-    doMockBoundsCheck(Src);
-
-    if (Dest->isRematerializable()) {
-      Context.insert<InstFakeDef>(Dest);
-      return;
-    }
-
-    auto *T = makeReg(Ty);
-    switch (SubVectorSize->getValue()) {
-    case 4:
-      _movd(T, Src);
-      break;
-    case 8:
-      _movq(T, Src);
-      break;
-    default:
-      Func->setError("Unexpected size for LoadSubVector");
-      return;
-    }
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::StoreSubVector: {
-    assert(llvm::isa<ConstantInteger32>(Instr->getArg(2)) &&
-           "StoreSubVector third argument must be a constant");
-    auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(2));
-    Operand *Value = Instr->getArg(0);
-    Operand *Addr = Instr->getArg(1);
-    X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
-    doMockBoundsCheck(NewAddr);
-
-    Value = legalizeToReg(Value);
-
-    switch (SubVectorSize->getValue()) {
-    case 4:
-      _stored(Value, NewAddr);
-      break;
-    case 8:
-      _storeq(Value, NewAddr);
-      break;
-    default:
-      Func->setError("Unexpected size for StoreSubVector");
-      return;
-    }
-    return;
-  }
-  case Intrinsics::VectorPackSigned: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Src0->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _packss(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::VectorPackUnsigned: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Src0->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _packus(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::SignMask: {
-    Operand *SrcReg = legalizeToReg(Instr->getArg(0));
-    Variable *Dest = Instr->getDest();
-    Variable *T = makeReg(IceType_i32);
-    if (SrcReg->getType() == IceType_v4f32 ||
-        SrcReg->getType() == IceType_v4i32 ||
-        SrcReg->getType() == IceType_v16i8) {
-      _movmsk(T, SrcReg);
-    } else {
-      // TODO(capn): We could implement v8i16 sign mask using packsswb/pmovmskb
-      llvm::report_fatal_error("Invalid type for SignMask intrinsic");
-    }
-    _mov(Dest, T);
-    return;
-  }
-  case Intrinsics::MultiplyHighSigned: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Dest->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _pmulhw(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::MultiplyHighUnsigned: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Dest->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _pmulhuw(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::MultiplyAddPairs: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Dest->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _pmaddwd(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::AddSaturateSigned: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Dest->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _padds(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::SubtractSaturateSigned: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Dest->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _psubs(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::AddSaturateUnsigned: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Dest->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _paddus(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::SubtractSaturateUnsigned: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Dest->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _psubus(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::Nearbyint: {
-    Operand *Src = Instr->getArg(0);
-    Variable *Dest = Instr->getDest();
-    Type DestTy = Dest->getType();
-    if (isVectorType(DestTy)) {
-      assert(DestTy == IceType_v4i32);
-      assert(Src->getType() == IceType_v4f32);
-      Operand *Src0R = legalizeToReg(Src);
-      Variable *T = makeReg(DestTy);
-      _cvt(T, Src0R, Traits::Insts::Cvt::Ps2dq);
-      _movp(Dest, T);
-    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
-      llvm::report_fatal_error("Helper call was expected");
-    } else {
-      Operand *Src0RM = legalize(Src, Legal_Reg | Legal_Mem);
-      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
-      Variable *T_1 = nullptr;
-      if (Traits::Is64Bit && DestTy == IceType_i64) {
-        T_1 = makeReg(IceType_i64);
-      } else {
-        assert(DestTy != IceType_i64);
-        T_1 = makeReg(IceType_i32);
-      }
-      // cvt() requires its integer argument to be a GPR.
-      Variable *T_2 = makeReg(DestTy);
-      if (isByteSizedType(DestTy)) {
-        assert(T_1->getType() == IceType_i32);
-        T_1->setRegClass(RCX86_Is32To8);
-        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
-      }
-      _cvt(T_1, Src0RM, Traits::Insts::Cvt::Ss2si);
-      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
-      if (DestTy == IceType_i1)
-        _and(T_2, Ctx->getConstantInt1(1));
-      _mov(Dest, T_2);
-    }
-    return;
-  }
-  case Intrinsics::Round: {
-    assert(InstructionSet >= Traits::SSE4_1);
-    Variable *Dest = Instr->getDest();
-    Operand *Src = Instr->getArg(0);
-    Operand *Mode = Instr->getArg(1);
-    assert(llvm::isa<ConstantInteger32>(Mode) &&
-           "Round last argument must be a constant");
-    auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
-    int32_t Imm = llvm::cast<ConstantInteger32>(Mode)->getValue();
-    (void)Imm;
-    assert(Imm >= 0 && Imm < 4 && "Invalid rounding mode");
-    auto *T = makeReg(Dest->getType());
-    _round(T, SrcRM, Mode);
-    _movp(Dest, T);
-    return;
-  }
-  default: // UnknownIntrinsic
-    Func->setError("Unexpected intrinsic");
-    return;
-  }
-  return;
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerAtomicCmpxchg(Variable *DestPrev,
-                                                   Operand *Ptr,
-                                                   Operand *Expected,
-                                                   Operand *Desired) {
-  Type Ty = Expected->getType();
-  if (!Traits::Is64Bit && Ty == IceType_i64) {
-    // Reserve the pre-colored registers first, before adding any more
-    // infinite-weight variables from formMemoryOperand's legalization.
-    Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
-    Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
-    Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
-    Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
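-    // cmpxchg8b compares edx:eax with the memory operand and, on a match,
-    // stores ecx:ebx into it, so Expected goes to edx:eax and Desired to
-    // ecx:ebx.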
-    _mov(T_eax, loOperand(Expected));
-    _mov(T_edx, hiOperand(Expected));
-    _mov(T_ebx, loOperand(Desired));
-    _mov(T_ecx, hiOperand(Desired));
-    X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
-    constexpr bool Locked = true;
-    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
-    auto *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
-    auto *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
-    _mov(DestLo, T_eax);
-    _mov(DestHi, T_edx);
-    return;
-  }
-  RegNumT Eax;
-  switch (Ty) {
-  default:
-    llvm::report_fatal_error("Bad type for cmpxchg");
-  case IceType_i64:
-    Eax = Traits::getRaxOrDie();
-    break;
-  case IceType_i32:
-    Eax = Traits::RegisterSet::Reg_eax;
-    break;
-  case IceType_i16:
-    Eax = Traits::RegisterSet::Reg_ax;
-    break;
-  case IceType_i8:
-    Eax = Traits::RegisterSet::Reg_al;
-    break;
-  }
-  Variable *T_eax = makeReg(Ty, Eax);
-  _mov(T_eax, Expected);
-  X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
-  Variable *DesiredReg = legalizeToReg(Desired);
-  constexpr bool Locked = true;
-  _cmpxchg(Addr, T_eax, DesiredReg, Locked);
-  _mov(DestPrev, T_eax);
-}
-
-template <typename TraitsType>
-bool TargetX86Base<TraitsType>::tryOptimizedCmpxchgCmpBr(Variable *Dest,
-                                                         Operand *PtrToMem,
-                                                         Operand *Expected,
-                                                         Operand *Desired) {
-  if (Func->getOptLevel() == Opt_m1)
-    return false;
-  // Peek ahead a few instructions and see how Dest is used.
-  // It's very common to have:
-  //
-  // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
-  // [%y_phi = ...] // list of phi stores
-  // %p = icmp eq i32 %x, %expected
-  // br i1 %p, label %l1, label %l2
-  //
-  // which we can optimize into:
-  //
-  // %x = <cmpxchg code>
-  // [%y_phi = ...] // list of phi stores
-  // br eq, %l1, %l2
-  InstList::iterator I = Context.getCur();
-  // I is currently the InstIntrinsic. Peek past that.
-  // This assumes that the atomic cmpxchg has not been lowered yet,
-  // so that the instructions seen in the scan from "Cur" are simple.
-  assert(llvm::isa<InstIntrinsic>(*I));
-  Inst *NextInst = Context.getNextInst(I);
-  if (!NextInst)
-    return false;
-  // There might be phi assignments right before the compare+branch, since this
-  // could be a backward branch for a loop. This placement of assignments is
-  // determined by placePhiStores().
-  CfgVector<InstAssign *> PhiAssigns;
-  while (auto *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
-    if (PhiAssign->getDest() == Dest)
-      return false;
-    PhiAssigns.push_back(PhiAssign);
-    NextInst = Context.getNextInst(I);
-    if (!NextInst)
-      return false;
-  }
-  if (auto *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
-    if (!(NextCmp->getCondition() == InstIcmp::Eq &&
-          ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
-           (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
-      return false;
-    }
-    NextInst = Context.getNextInst(I);
-    if (!NextInst)
-      return false;
-    if (auto *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
-      if (!NextBr->isUnconditional() &&
-          NextCmp->getDest() == NextBr->getCondition() &&
-          NextBr->isLastUse(NextCmp->getDest())) {
-        lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
-        for (size_t i = 0; i < PhiAssigns.size(); ++i) {
-          // Lower the phi assignments now, before the branch (same placement
-          // as before).
-          InstAssign *PhiAssign = PhiAssigns[i];
-          PhiAssign->setDeleted();
-          lowerAssign(PhiAssign);
-          Context.advanceNext();
-        }
-        _br(CondX86::Br_e, NextBr->getTargetTrue(), NextBr->getTargetFalse());
-        // Skip over the old compare and branch, by deleting them.
-        NextCmp->setDeleted();
-        NextBr->setDeleted();
-        Context.advanceNext();
-        Context.advanceNext();
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerAtomicRMW(Variable *Dest,
-                                               uint32_t Operation, Operand *Ptr,
-                                               Operand *Val) {
-  bool NeedsCmpxchg = false;
-  LowerBinOp Op_Lo = nullptr;
-  LowerBinOp Op_Hi = nullptr;
-  switch (Operation) {
-  default:
-    Func->setError("Unknown AtomicRMW operation");
-    return;
-  case Intrinsics::AtomicAdd: {
-    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
-      // All the fall-through paths must set this to true; it is only used
-      // for asserting.
-      NeedsCmpxchg = true;
-      Op_Lo = &TargetX86Base<TraitsType>::_add;
-      Op_Hi = &TargetX86Base<TraitsType>::_adc;
-      break;
-    }
-    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
-    constexpr bool Locked = true;
-    Variable *T = nullptr;
-    _mov(T, Val);
-    _xadd(Addr, T, Locked);
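-    // xadd leaves the old memory value in T while memory receives old + T,
-    // so the mov below yields the fetched value, as AtomicAdd requires.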
-    _mov(Dest, T);
-    return;
-  }
-  case Intrinsics::AtomicSub: {
-    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
-      NeedsCmpxchg = true;
-      Op_Lo = &TargetX86Base<TraitsType>::_sub;
-      Op_Hi = &TargetX86Base<TraitsType>::_sbb;
-      break;
-    }
-    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
-    constexpr bool Locked = true;
-    Variable *T = nullptr;
-    _mov(T, Val);
-    _neg(T);
-    _xadd(Addr, T, Locked);
-    _mov(Dest, T);
-    return;
-  }
-  case Intrinsics::AtomicOr:
-    // TODO(jvoung): If Dest is null or dead, then some of these
-    // operations do not need an "exchange", but just a locked op.
-    // That appears to be "worth" it for sub, or, and, and xor.
-    // xadd is probably fine vs lock add for add, and xchg is fine
-    // vs an atomic store.
-    NeedsCmpxchg = true;
-    Op_Lo = &TargetX86Base<TraitsType>::_or;
-    Op_Hi = &TargetX86Base<TraitsType>::_or;
-    break;
-  case Intrinsics::AtomicAnd:
-    NeedsCmpxchg = true;
-    Op_Lo = &TargetX86Base<TraitsType>::_and;
-    Op_Hi = &TargetX86Base<TraitsType>::_and;
-    break;
-  case Intrinsics::AtomicXor:
-    NeedsCmpxchg = true;
-    Op_Lo = &TargetX86Base<TraitsType>::_xor;
-    Op_Hi = &TargetX86Base<TraitsType>::_xor;
-    break;
-  case Intrinsics::AtomicExchange:
-    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
-      NeedsCmpxchg = true;
-      // NeedsCmpxchg, but no real Op_Lo/Op_Hi are needed. The values just
-      // need to be moved to the ecx and ebx registers.
-      Op_Lo = nullptr;
-      Op_Hi = nullptr;
-      break;
-    }
-    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
-    Variable *T = nullptr;
-    _mov(T, Val);
-    _xchg(Addr, T);
-    _mov(Dest, T);
-    return;
-  }
-  // Otherwise, we need a cmpxchg loop.
-  (void)NeedsCmpxchg;
-  assert(NeedsCmpxchg);
-  expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo,
-                                                         LowerBinOp Op_Hi,
-                                                         Variable *Dest,
-                                                         Operand *Ptr,
-                                                         Operand *Val) {
-  // Expand a more complex RMW operation as a cmpxchg loop:
-  // For 64-bit:
-  //   mov     eax, [ptr]
-  //   mov     edx, [ptr + 4]
-  // .LABEL:
-  //   mov     ebx, eax
-  //   <Op_Lo> ebx, <desired_adj_lo>
-  //   mov     ecx, edx
-  //   <Op_Hi> ecx, <desired_adj_hi>
-  //   lock cmpxchg8b [ptr]
-  //   jne     .LABEL
-  //   mov     <dest_lo>, eax
-  //   mov     <dest_hi>, edx
-  //
-  // For 32-bit:
-  //   mov     eax, [ptr]
-  // .LABEL:
-  //   mov     <reg>, eax
-  //   op      <reg>, [desired_adj]
-  //   lock cmpxchg [ptr], <reg>
-  //   jne     .LABEL
-  //   mov     <dest>, eax
-  //
-  // If Op_{Lo,Hi} are nullptr, then just copy the value.
-  Val = legalize(Val);
-  Type Ty = Val->getType();
-  if (!Traits::Is64Bit && Ty == IceType_i64) {
-    Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
-    Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
-    X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
-    _mov(T_eax, loOperand(Addr));
-    _mov(T_edx, hiOperand(Addr));
-    Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
-    Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
-    InstX86Label *Label = InstX86Label::create(Func, this);
-    const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr;
-    if (!IsXchg8b) {
-      Context.insert(Label);
-      _mov(T_ebx, T_eax);
-      (this->*Op_Lo)(T_ebx, loOperand(Val));
-      _mov(T_ecx, T_edx);
-      (this->*Op_Hi)(T_ecx, hiOperand(Val));
-    } else {
-      // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
-      // It just needs the Val loaded into ebx and ecx.
-      // That can also be done before the loop.
-      _mov(T_ebx, loOperand(Val));
-      _mov(T_ecx, hiOperand(Val));
-      Context.insert(Label);
-    }
-    constexpr bool Locked = true;
-    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
-    _br(CondX86::Br_ne, Label);
-    if (!IsXchg8b) {
-      // If Val is a variable, model the extended live range of Val through
-      // the end of the loop, since it will be re-used by the loop.
-      if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
-        auto *ValLo = llvm::cast<Variable>(loOperand(ValVar));
-        auto *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
-        Context.insert<InstFakeUse>(ValLo);
-        Context.insert<InstFakeUse>(ValHi);
-      }
-    } else {
-      // For xchg, the loop is slightly smaller and ebx/ecx are used.
-      Context.insert<InstFakeUse>(T_ebx);
-      Context.insert<InstFakeUse>(T_ecx);
-    }
-    // The address base (if any) is also reused in the loop.
-    if (Variable *Base = Addr->getBase())
-      Context.insert<InstFakeUse>(Base);
-    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-    _mov(DestLo, T_eax);
-    _mov(DestHi, T_edx);
-    return;
-  }
-  X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
-  RegNumT Eax;
-  switch (Ty) {
-  default:
-    llvm::report_fatal_error("Bad type for atomicRMW");
-  case IceType_i64:
-    Eax = Traits::getRaxOrDie();
-    break;
-  case IceType_i32:
-    Eax = Traits::RegisterSet::Reg_eax;
-    break;
-  case IceType_i16:
-    Eax = Traits::RegisterSet::Reg_ax;
-    break;
-  case IceType_i8:
-    Eax = Traits::RegisterSet::Reg_al;
-    break;
-  }
-  Variable *T_eax = makeReg(Ty, Eax);
-  _mov(T_eax, Addr);
-  auto *Label = Context.insert<InstX86Label>(this);
-  // We want to pick a register for T other than Eax, so don't use
-  // _mov(T == nullptr, T_eax).
-  Variable *T = makeReg(Ty);
-  _mov(T, T_eax);
-  (this->*Op_Lo)(T, Val);
-  constexpr bool Locked = true;
-  _cmpxchg(Addr, T_eax, T, Locked);
-  _br(CondX86::Br_ne, Label);
-  // If Val is a variable, model the extended live range of Val through
-  // the end of the loop, since it will be re-used by the loop.
-  if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
-    Context.insert<InstFakeUse>(ValVar);
-  }
-  // The address base (if any) is also reused in the loop.
-  if (Variable *Base = Addr->getBase())
-    Context.insert<InstFakeUse>(Base);
-  _mov(Dest, T_eax);
-}
-
-/// Lowers count {trailing, leading} zeros intrinsic.
-///
-/// We could do constant folding here, but that should have
-/// been done by the front-end/middle-end optimizations.
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerCountZeros(bool Cttz, Type Ty,
-                                                Variable *Dest,
-                                                Operand *FirstVal,
-                                                Operand *SecondVal) {
-  // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
-  // Then the instructions will handle the Val == 0 case much more simply
-  // and won't require conversion from bit position to number of zeros.
-  //
-  // Otherwise:
-  //   bsr IF_NOT_ZERO, Val
-  //   mov T_DEST, ((Ty == i32) ? 63 : 127)
-  //   cmovne T_DEST, IF_NOT_ZERO
-  //   xor T_DEST, ((Ty == i32) ? 31 : 63)
-  //   mov DEST, T_DEST
-  //
-  // NOTE: T_DEST must be a register because cmov requires its dest to be a
-  // register. Also, bsf and bsr require their dest to be a register.
-  //
-  // The xor DEST, C(31|63) converts a bit position to # of leading zeroes.
-  // E.g., for 000... 00001100, bsr will say that the most significant bit
-  // set is at position 3, while the number of leading zeros is 28. Xor is
-  // like (M - N) for N <= M, and converts 63 to 32, and 127 to 64 (for the
-  // all-zeros case).
-  //
-  // X8632 only: Similar for 64-bit, but start w/ speculating that the upper 32
-  // bits are all zero, and compute the result for that case (checking the
-  // lower 32 bits). Then actually compute the result for the upper bits and
-  // cmov in the result from the lower computation if the earlier speculation
-  // was correct.
-  //
-  // Cttz is similar, but uses bsf instead, doesn't require the xor bit
-  // position conversion, and the speculation is reversed.
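-  // A small worked example: ctlz(0x0000000C) does bsr -> 3, then
-  // 3 xor 31 = 28 leading zeros; ctlz(0) leaves T_DEST at 63 (the cmov is
-  // not taken), and 63 xor 31 = 32. Likewise cttz(8) does bsf -> 3 with no
-  // xor needed, and cttz(0) keeps the preloaded 32 (or 64).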
-
-  // TODO(jpp): refactor this method.
-  assert(Ty == IceType_i32 || Ty == IceType_i64);
-  const Type DestTy = Traits::Is64Bit ? Dest->getType() : IceType_i32;
-  Variable *T = makeReg(DestTy);
-  Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
-  if (Cttz) {
-    _bsf(T, FirstValRM);
-  } else {
-    _bsr(T, FirstValRM);
-  }
-  Variable *T_Dest = makeReg(DestTy);
-  Constant *_31 = Ctx->getConstantInt32(31);
-  Constant *_32 = Ctx->getConstantInt(DestTy, 32);
-  Constant *_63 = Ctx->getConstantInt(DestTy, 63);
-  Constant *_64 = Ctx->getConstantInt(DestTy, 64);
-  if (Cttz) {
-    if (DestTy == IceType_i64) {
-      _mov(T_Dest, _64);
-    } else {
-      _mov(T_Dest, _32);
-    }
-  } else {
-    Constant *_127 = Ctx->getConstantInt(DestTy, 127);
-    if (DestTy == IceType_i64) {
-      _mov(T_Dest, _127);
-    } else {
-      _mov(T_Dest, _63);
-    }
-  }
-  _cmov(T_Dest, T, CondX86::Br_ne);
-  if (!Cttz) {
-    if (DestTy == IceType_i64) {
-      // Even though there's a _63 available at this point, that constant might
-      // not be an i32, which will cause the xor emission to fail.
-      Constant *_63 = Ctx->getConstantInt32(63);
-      _xor(T_Dest, _63);
-    } else {
-      _xor(T_Dest, _31);
-    }
-  }
-  if (Traits::Is64Bit || Ty == IceType_i32) {
-    _mov(Dest, T_Dest);
-    return;
-  }
-  _add(T_Dest, _32);
-  auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-  auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-  // Will be using "test" on this, so we need a registerized variable.
-  Variable *SecondVar = legalizeToReg(SecondVal);
-  Variable *T_Dest2 = makeReg(IceType_i32);
-  if (Cttz) {
-    _bsf(T_Dest2, SecondVar);
-  } else {
-    _bsr(T_Dest2, SecondVar);
-    _xor(T_Dest2, _31);
-  }
-  _test(SecondVar, SecondVar);
-  _cmov(T_Dest2, T_Dest, CondX86::Br_e);
-  _mov(DestLo, T_Dest2);
-  _mov(DestHi, Ctx->getConstantZero(IceType_i32));
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::typedLoad(Type Ty, Variable *Dest,
-                                          Variable *Base, Constant *Offset) {
-  // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
-  // legalize Mem properly.
-  if (Offset)
-    assert(!llvm::isa<ConstantRelocatable>(Offset));
-
-  auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
-
-  if (isVectorType(Ty))
-    _movp(Dest, Mem);
-  else if (Ty == IceType_f64)
-    _movq(Dest, Mem);
-  else
-    _mov(Dest, Mem);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::typedStore(Type Ty, Variable *Value,
-                                           Variable *Base, Constant *Offset) {
-  // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
-  // legalize Mem properly.
-  if (Offset)
-    assert(!llvm::isa<ConstantRelocatable>(Offset));
-
-  auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
-
-  if (isVectorType(Ty))
-    _storep(Value, Mem);
-  else if (Ty == IceType_f64)
-    _storeq(Value, Mem);
-  else
-    _store(Value, Mem);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::copyMemory(Type Ty, Variable *Dest,
-                                           Variable *Src, int32_t OffsetAmt) {
-  Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
-  // TODO(ascull): do this, or add a nullptr test to _movp, _movq.
-  Variable *Data = makeReg(Ty);
-
-  typedLoad(Ty, Data, Src, Offset);
-  typedStore(Ty, Data, Dest, Offset);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerMemcpy(Operand *Dest, Operand *Src,
-                                            Operand *Count) {
-  // There is a load and store for each chunk in the unroll
-  constexpr uint32_t BytesPerStorep = 16;
-
-  // Check if the operands are constants
-  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
-  const bool IsCountConst = CountConst != nullptr;
-  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
-
-  if (shouldOptimizeMemIntrins() && IsCountConst &&
-      CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) {
-    // Unlikely, but nothing to do if it does happen
-    if (CountValue == 0)
-      return;
-
-    Variable *SrcBase = legalizeToReg(Src);
-    Variable *DestBase = legalizeToReg(Dest);
-
-    // Find the largest type that can be used and use it as much as possible in
-    // reverse order. Then handle any remainder with overlapping copies. Since
-    // the remainder will be at the end, there will be reduced pressure on the
-    // memory unit as the accesses to the same memory are far apart.
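-    // E.g. a 13-byte copy becomes one 8-byte chunk at offset 0 plus an
-    // overlapping 8-byte chunk at offset 5; bytes 5..7 are simply written
-    // twice with identical data, avoiding extra small tail stores.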
-    Type Ty = largestTypeInSize(CountValue);
-    uint32_t TyWidth = typeWidthInBytes(Ty);
-
-    uint32_t RemainingBytes = CountValue;
-    int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
-    while (RemainingBytes >= TyWidth) {
-      copyMemory(Ty, DestBase, SrcBase, Offset);
-      RemainingBytes -= TyWidth;
-      Offset -= TyWidth;
-    }
-
-    if (RemainingBytes == 0)
-      return;
-
-    // Lower the remaining bytes. Adjust to larger types in order to make use
-    // of overlaps in the copies.
-    Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
-    Offset = CountValue - typeWidthInBytes(LeftOverTy);
-    copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
-    return;
-  }
-
-  // Fall back on a function call
-  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memcpy, nullptr, 3);
-  Call->addArg(Dest);
-  Call->addArg(Src);
-  Call->addArg(Count);
-  lowerCall(Call);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerMemmove(Operand *Dest, Operand *Src,
-                                             Operand *Count) {
-  // There is a load and store for each chunk in the unroll
-  constexpr uint32_t BytesPerStorep = 16;
-
-  // Check if the operands are constants
-  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
-  const bool IsCountConst = CountConst != nullptr;
-  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
-
-  if (shouldOptimizeMemIntrins() && IsCountConst &&
-      CountValue <= BytesPerStorep * Traits::MEMMOVE_UNROLL_LIMIT) {
-    // Unlikely, but nothing to do if it does happen
-    if (CountValue == 0)
-      return;
-
-    Variable *SrcBase = legalizeToReg(Src);
-    Variable *DestBase = legalizeToReg(Dest);
-
-    std::tuple<Type, Constant *, Variable *>
-        Moves[Traits::MEMMOVE_UNROLL_LIMIT];
-    Constant *Offset;
-    Variable *Reg;
-
-    // Copy the data into registers as the source and destination could overlap
-    // so make sure not to clobber the memory. This also means overlapping
-    // moves can be used as we are taking a safe snapshot of the memory.
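-    // This is also why the unroll limit matters here: every chunk occupies a
-    // live register until all loads are done, so at most
-    // MEMMOVE_UNROLL_LIMIT chunks (including the overlapping tail) are in
-    // flight at once.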
-    Type Ty = largestTypeInSize(CountValue);
-    uint32_t TyWidth = typeWidthInBytes(Ty);
-
-    uint32_t RemainingBytes = CountValue;
-    int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
-    size_t N = 0;
-    while (RemainingBytes >= TyWidth) {
-      assert(N < Traits::MEMMOVE_UNROLL_LIMIT);
-      Offset = Ctx->getConstantInt32(OffsetAmt);
-      Reg = makeReg(Ty);
-      typedLoad(Ty, Reg, SrcBase, Offset);
-      RemainingBytes -= TyWidth;
-      OffsetAmt -= TyWidth;
-      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
-    }
-
-    if (RemainingBytes != 0) {
-      // Lower the remaining bytes. Adjust to larger types in order to make use
-      // of overlaps in the copies.
-      assert(N < Traits::MEMMOVE_UNROLL_LIMIT);
-      Ty = firstTypeThatFitsSize(RemainingBytes);
-      Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
-      Reg = makeReg(Ty);
-      typedLoad(Ty, Reg, SrcBase, Offset);
-      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
-    }
-
-    // Copy the data out into the destination memory
-    for (size_t i = 0; i < N; ++i) {
-      std::tie(Ty, Offset, Reg) = Moves[i];
-      typedStore(Ty, Reg, DestBase, Offset);
-    }
-
-    return;
-  }
-
-  // Fall back on a function call
-  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memmove, nullptr, 3);
-  Call->addArg(Dest);
-  Call->addArg(Src);
-  Call->addArg(Count);
-  lowerCall(Call);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerMemset(Operand *Dest, Operand *Val,
-                                            Operand *Count) {
-  constexpr uint32_t BytesPerStorep = 16;
-  constexpr uint32_t BytesPerStoreq = 8;
-  constexpr uint32_t BytesPerStorei32 = 4;
-  assert(Val->getType() == IceType_i8);
-
-  // Check if the operands are constants
-  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
-  const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
-  const bool IsCountConst = CountConst != nullptr;
-  const bool IsValConst = ValConst != nullptr;
-  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
-  const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
-
-  // Unlikely, but nothing to do if it does happen
-  if (IsCountConst && CountValue == 0)
-    return;
-
-  // TODO(ascull): if the count is constant but val is not it would be possible
-  // to inline by spreading the value across 4 bytes and accessing subregs e.g.
-  // eax, ax and al.
-  if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
-    Variable *Base = nullptr;
-    Variable *VecReg = nullptr;
-    const uint32_t MaskValue = (ValValue & 0xff);
-    const uint32_t SpreadValue =
-        (MaskValue << 24) | (MaskValue << 16) | (MaskValue << 8) | MaskValue;
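-    // E.g. Val == 0xAB spreads to 0xABABABAB, so any 1-, 2- or 4-byte slice
-    // of the spread constant stores the correct repeated byte pattern.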
-
-    auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
-                                                        uint32_t OffsetAmt) {
-      assert(Base != nullptr);
-      Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
-
-      // TODO(ascull): is 64-bit better with vector or scalar movq?
-      auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
-      if (isVectorType(Ty)) {
-        assert(VecReg != nullptr);
-        _storep(VecReg, Mem);
-      } else if (Ty == IceType_f64) {
-        assert(VecReg != nullptr);
-        _storeq(VecReg, Mem);
-      } else {
-        assert(Ty != IceType_i64);
-        _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
-      }
-    };
-
-    // Find the largest type that can be used and use it as much as possible in
-    // reverse order. Then handle any remainder with overlapping copies. Since
-    // the remainder will be at the end, there will be reduced pressure on the
-    // memory unit as the accesses to the same memory are far apart.
-    Type Ty = IceType_void;
-    if (ValValue == 0 && CountValue >= BytesPerStoreq &&
-        CountValue <= BytesPerStorep * Traits::MEMSET_UNROLL_LIMIT) {
-      // When the value is zero it can be loaded into a vector register cheaply
-      // using the xor trick.
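-      // (Typically a pxor reg, reg, which needs no load and is a recognized
-      // zeroing idiom.)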
-      Base = legalizeToReg(Dest);
-      VecReg = makeVectorOfZeros(IceType_v16i8);
-      Ty = largestTypeInSize(CountValue);
-    } else if (CountValue <= BytesPerStorei32 * Traits::MEMSET_UNROLL_LIMIT) {
-      // When the value is non-zero or the count is small we can't use vector
-      // instructions so are limited to 32-bit stores.
-      Base = legalizeToReg(Dest);
-      constexpr uint32_t MaxSize = 4;
-      Ty = largestTypeInSize(CountValue, MaxSize);
-    }
-
-    if (Base) {
-      uint32_t TyWidth = typeWidthInBytes(Ty);
-
-      uint32_t RemainingBytes = CountValue;
-      uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
-      while (RemainingBytes >= TyWidth) {
-        lowerSet(Ty, Offset);
-        RemainingBytes -= TyWidth;
-        Offset -= TyWidth;
-      }
-
-      if (RemainingBytes == 0)
-        return;
-
-      // Lower the remaining bytes. Adjust to larger types in order to make use
-      // of overlaps in the copies.
-      Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
-      Offset = CountValue - typeWidthInBytes(LeftOverTy);
-      lowerSet(LeftOverTy, Offset);
-      return;
-    }
-  }
-
-  // Fall back on calling the memset function. The value operand needs to be
-  // extended to a stack slot size because the PNaCl ABI requires arguments to
-  // be at least 32 bits wide.
-  Operand *ValExt;
-  if (IsValConst) {
-    ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
-  } else {
-    Variable *ValExtVar = Func->makeVariable(stackSlotType());
-    lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val));
-    ValExt = ValExtVar;
-  }
-  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memset, nullptr, 3);
-  Call->addArg(Dest);
-  Call->addArg(ValExt);
-  Call->addArg(Count);
-  lowerCall(Call);
-}
-
-class AddressOptimizer {
-  AddressOptimizer() = delete;
-  AddressOptimizer(const AddressOptimizer &) = delete;
-  AddressOptimizer &operator=(const AddressOptimizer &) = delete;
-
-public:
-  explicit AddressOptimizer(const Cfg *Func)
-      : Func(Func), VMetadata(Func->getVMetadata()) {}
-
-  inline void dumpAddressOpt(const ConstantRelocatable *const Relocatable,
-                             int32_t Offset, const Variable *Base,
-                             const Variable *Index, uint16_t Shift,
-                             const Inst *Reason) const;
-
-  inline const Inst *matchAssign(Variable **Var,
-                                 ConstantRelocatable **Relocatable,
-                                 int32_t *Offset);
-
-  inline const Inst *matchCombinedBaseIndex(Variable **Base, Variable **Index,
-                                            uint16_t *Shift);
-
-  inline const Inst *matchShiftedIndex(Variable **Index, uint16_t *Shift);
-
-  inline const Inst *matchOffsetIndexOrBase(Variable **IndexOrBase,
-                                            const uint16_t Shift,
-                                            ConstantRelocatable **Relocatable,
-                                            int32_t *Offset);
-
-private:
-  const Cfg *const Func;
-  const VariablesMetadata *const VMetadata;
-
-  static bool isAdd(const Inst *Instr) {
-    if (auto *Arith = llvm::dyn_cast_or_null<const InstArithmetic>(Instr)) {
-      return (Arith->getOp() == InstArithmetic::Add);
-    }
-    return false;
-  }
-};
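-// Together, these matchers fold single-definition chains into the canonical
-// x86 address form Base + Index * (1 << Shift) + Offset + Relocatable, where
-// Shift is at most 3 (the hardware scale factors 1, 2, 4 and 8).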
-
-void AddressOptimizer::dumpAddressOpt(
-    const ConstantRelocatable *const Relocatable, int32_t Offset,
-    const Variable *Base, const Variable *Index, uint16_t Shift,
-    const Inst *Reason) const {
-  if (!BuildDefs::dump())
-    return;
-  if (!Func->isVerbose(IceV_AddrOpt))
-    return;
-  OstreamLocker L(Func->getContext());
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "Instruction: ";
-  Reason->dumpDecorated(Func);
-  Str << "  results in Base=";
-  if (Base)
-    Base->dump(Func);
-  else
-    Str << "<null>";
-  Str << ", Index=";
-  if (Index)
-    Index->dump(Func);
-  else
-    Str << "<null>";
-  Str << ", Shift=" << Shift << ", Offset=" << Offset
-      << ", Relocatable=" << Relocatable << "\n";
-}
-
-const Inst *AddressOptimizer::matchAssign(Variable **Var,
-                                          ConstantRelocatable **Relocatable,
-                                          int32_t *Offset) {
-  // Var originates from Var=SrcVar ==> set Var:=SrcVar
-  if (*Var == nullptr)
-    return nullptr;
-  if (const Inst *VarAssign = VMetadata->getSingleDefinition(*Var)) {
-    assert(!VMetadata->isMultiDef(*Var));
-    if (llvm::isa<InstAssign>(VarAssign)) {
-      Operand *SrcOp = VarAssign->getSrc(0);
-      assert(SrcOp);
-      if (auto *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
-        if (!VMetadata->isMultiDef(SrcVar) &&
-            // TODO: ensure SrcVar stays single-BB
-            true) {
-          *Var = SrcVar;
-          return VarAssign;
-        }
-      } else if (auto *Const = llvm::dyn_cast<ConstantInteger32>(SrcOp)) {
-        int32_t MoreOffset = Const->getValue();
-        if (Utils::WouldOverflowAdd(*Offset, MoreOffset))
-          return nullptr;
-        *Var = nullptr;
-        *Offset += MoreOffset;
-        return VarAssign;
-      } else if (auto *AddReloc = llvm::dyn_cast<ConstantRelocatable>(SrcOp)) {
-        if (*Relocatable == nullptr) {
-          // It is always safe to fold a relocatable through assignment -- the
-          // assignment frees a slot in the address operand that can be used to
-          // hold the Sandbox Pointer -- if any.
-          *Var = nullptr;
-          *Relocatable = AddReloc;
-          return VarAssign;
-        }
-      }
-    }
-  }
-  return nullptr;
-}
-
-const Inst *AddressOptimizer::matchCombinedBaseIndex(Variable **Base,
-                                                     Variable **Index,
-                                                     uint16_t *Shift) {
-  // Index==nullptr && Base is Base=Var1+Var2 ==>
-  //   set Base=Var1, Index=Var2, Shift=0
-  if (*Base == nullptr)
-    return nullptr;
-  if (*Index != nullptr)
-    return nullptr;
-  auto *BaseInst = VMetadata->getSingleDefinition(*Base);
-  if (BaseInst == nullptr)
-    return nullptr;
-  assert(!VMetadata->isMultiDef(*Base));
-  if (BaseInst->getSrcSize() < 2)
-    return nullptr;
-  if (auto *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
-    if (VMetadata->isMultiDef(Var1))
-      return nullptr;
-    if (auto *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
-      if (VMetadata->isMultiDef(Var2))
-        return nullptr;
-      if (isAdd(BaseInst) &&
-          // TODO: ensure Var1 and Var2 stay single-BB
-          true) {
-        *Base = Var1;
-        *Index = Var2;
-        *Shift = 0; // should already have been 0
-        return BaseInst;
-      }
-    }
-  }
-  return nullptr;
-}
-
-const Inst *AddressOptimizer::matchShiftedIndex(Variable **Index,
-                                                uint16_t *Shift) {
-  // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
-  //   Index=Var, Shift+=log2(Const)
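-  // E.g. Index=Var*8 with Shift==0 becomes Index=Var, Shift=3 (scale 8).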
-  if (*Index == nullptr)
-    return nullptr;
-  auto *IndexInst = VMetadata->getSingleDefinition(*Index);
-  if (IndexInst == nullptr)
-    return nullptr;
-  assert(!VMetadata->isMultiDef(*Index));
-
-  // When using an unsigned 32-bit array index on x64, it gets zero-extended
-  // before the shift & add. The explicit zero extension can be eliminated
-  // because x86 32-bit operations automatically get zero-extended into the
-  // corresponding 64-bit register.
-  if (auto *CastInst = llvm::dyn_cast<InstCast>(IndexInst)) {
-    if (CastInst->getCastKind() == InstCast::Zext) {
-      if (auto *Var = llvm::dyn_cast<Variable>(CastInst->getSrc(0))) {
-        if (Var->getType() == IceType_i32 &&
-            CastInst->getDest()->getType() == IceType_i64) {
-          IndexInst = VMetadata->getSingleDefinition(Var);
-        }
-      }
-    }
-  }
-
-  if (IndexInst->getSrcSize() < 2)
-    return nullptr;
-  if (auto *ArithInst = llvm::dyn_cast<InstArithmetic>(IndexInst)) {
-    if (auto *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
-      if (auto *Const =
-              llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
-        if (VMetadata->isMultiDef(Var) || Const->getType() != IceType_i32)
-          return nullptr;
-        switch (ArithInst->getOp()) {
-        default:
-          return nullptr;
-        case InstArithmetic::Mul: {
-          uint32_t Mult = Const->getValue();
-          uint32_t LogMult;
-          switch (Mult) {
-          case 1:
-            LogMult = 0;
-            break;
-          case 2:
-            LogMult = 1;
-            break;
-          case 4:
-            LogMult = 2;
-            break;
-          case 8:
-            LogMult = 3;
-            break;
-          default:
-            return nullptr;
-          }
-          if (*Shift + LogMult <= 3) {
-            *Index = Var;
-            *Shift += LogMult;
-            return IndexInst;
-          }
-          break; // Do not fall through into the Shl case.
-        }
-        case InstArithmetic::Shl: {
-          uint32_t ShiftAmount = Const->getValue();
-          switch (ShiftAmount) {
-          case 0:
-          case 1:
-          case 2:
-          case 3:
-            break;
-          default:
-            return nullptr;
-          }
-          if (*Shift + ShiftAmount <= 3) {
-            *Index = Var;
-            *Shift += ShiftAmount;
-            return IndexInst;
-          }
-        }
-        }
-      }
-    }
-  }
-  return nullptr;
-}
-
-const Inst *AddressOptimizer::matchOffsetIndexOrBase(
-    Variable **IndexOrBase, const uint16_t Shift,
-    ConstantRelocatable **Relocatable, int32_t *Offset) {
-  // Base is Base=Var+Const || Base is Base=Const+Var ==>
-  //   set Base=Var, Offset+=Const
-  // Base is Base=Var-Const ==>
-  //   set Base=Var, Offset-=Const
-  // Index is Index=Var+Const ==>
-  //   set Index=Var, Offset+=(Const<<Shift)
-  // Index is Index=Const+Var ==>
-  //   set Index=Var, Offset+=(Const<<Shift)
-  // Index is Index=Var-Const ==>
-  //   set Index=Var, Offset-=(Const<<Shift)
-  // Treat Index=Var Or Const as Index=Var + Const
-  //    when Var = Var' << N and log2(Const) <= N
-  // or when Var = (2^M) * (2^N) and log2(Const) <= (M+N)
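-  // E.g. if Var is known to be Var' << 4, its low four bits are zero, so
-  // Var | 12 computes the same value as Var + 12.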
-
-  if (*IndexOrBase == nullptr) {
-    return nullptr;
-  }
-  const Inst *Definition = VMetadata->getSingleDefinition(*IndexOrBase);
-  if (Definition == nullptr) {
-    return nullptr;
-  }
-  assert(!VMetadata->isMultiDef(*IndexOrBase));
-  if (auto *ArithInst = llvm::dyn_cast<const InstArithmetic>(Definition)) {
-    switch (ArithInst->getOp()) {
-    case InstArithmetic::Add:
-    case InstArithmetic::Sub:
-    case InstArithmetic::Or:
-      break;
-    default:
-      return nullptr;
-    }
-
-    Operand *Src0 = ArithInst->getSrc(0);
-    Operand *Src1 = ArithInst->getSrc(1);
-    auto *Var0 = llvm::dyn_cast<Variable>(Src0);
-    auto *Var1 = llvm::dyn_cast<Variable>(Src1);
-    auto *Const0 = llvm::dyn_cast<ConstantInteger32>(Src0);
-    auto *Const1 = llvm::dyn_cast<ConstantInteger32>(Src1);
-    auto *Reloc0 = llvm::dyn_cast<ConstantRelocatable>(Src0);
-    auto *Reloc1 = llvm::dyn_cast<ConstantRelocatable>(Src1);
-
-    bool IsAdd = false;
-    if (ArithInst->getOp() == InstArithmetic::Or) {
-      Variable *Var = nullptr;
-      ConstantInteger32 *Const = nullptr;
-      if (Var0 && Const1) {
-        Var = Var0;
-        Const = Const1;
-      } else if (Const0 && Var1) {
-        Var = Var1;
-        Const = Const0;
-      } else {
-        return nullptr;
-      }
-      auto *VarDef =
-          llvm::dyn_cast<InstArithmetic>(VMetadata->getSingleDefinition(Var));
-      if (VarDef == nullptr)
-        return nullptr;
-
-      SizeT ZeroesAvailable = 0;
-      if (VarDef->getOp() == InstArithmetic::Shl) {
-        if (auto *ConstInt =
-                llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
-          ZeroesAvailable = ConstInt->getValue();
-        }
-      } else if (VarDef->getOp() == InstArithmetic::Mul) {
-        SizeT PowerOfTwo = 0;
-        if (auto *MultConst =
-                llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(0))) {
-          if (llvm::isPowerOf2_32(MultConst->getValue())) {
-            PowerOfTwo += MultConst->getValue();
-          }
-        }
-        if (auto *MultConst =
-                llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
-          if (llvm::isPowerOf2_32(MultConst->getValue())) {
-            PowerOfTwo += MultConst->getValue();
-          }
-        }
-        ZeroesAvailable = llvm::Log2_32(PowerOfTwo) + 1;
-      }
-      SizeT ZeroesNeeded = llvm::Log2_32(Const->getValue()) + 1;
-      if (ZeroesNeeded == 0 || ZeroesNeeded > ZeroesAvailable)
-        return nullptr;
-      IsAdd = true; // treat it as an add if the above conditions hold
-    } else {
-      IsAdd = ArithInst->getOp() == InstArithmetic::Add;
-    }
-
-    Variable *NewIndexOrBase = nullptr;
-    int32_t NewOffset = 0;
-    ConstantRelocatable *NewRelocatable = *Relocatable;
-    if (Var0 && Var1)
-      // TODO(sehr): merge base/index splitting into here.
-      return nullptr;
-    if (!IsAdd && Var1)
-      return nullptr;
-    if (Var0)
-      NewIndexOrBase = Var0;
-    else if (Var1)
-      NewIndexOrBase = Var1;
-    // Don't know how to add/subtract two relocatables.
-    if ((*Relocatable && (Reloc0 || Reloc1)) || (Reloc0 && Reloc1))
-      return nullptr;
-    // Don't know how to subtract a relocatable.
-    if (!IsAdd && Reloc1)
-      return nullptr;
-    // Incorporate ConstantRelocatables.
-    if (Reloc0)
-      NewRelocatable = Reloc0;
-    else if (Reloc1)
-      NewRelocatable = Reloc1;
-    // Compute the updated constant offset.
-    if (Const0) {
-      const int32_t MoreOffset =
-          IsAdd ? Const0->getValue() : -Const0->getValue();
-      if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
-        return nullptr;
-      NewOffset += MoreOffset;
-    }
-    if (Const1) {
-      const int32_t MoreOffset =
-          IsAdd ? Const1->getValue() : -Const1->getValue();
-      if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
-        return nullptr;
-      NewOffset += MoreOffset;
-    }
-    if (Utils::WouldOverflowAdd(*Offset, NewOffset << Shift))
-      return nullptr;
-    *IndexOrBase = NewIndexOrBase;
-    *Offset += (NewOffset << Shift);
-    // Shift is always zero if this is called with the base
-    *Relocatable = NewRelocatable;
-    return Definition;
-  }
-  return nullptr;
-}
-
-template <typename TraitsType>
-typename TargetX86Base<TraitsType>::X86OperandMem *
-TargetX86Base<TraitsType>::computeAddressOpt(const Inst *Instr, Type MemType,
-                                             Operand *Addr) {
-  Func->resetCurrentNode();
-  if (Func->isVerbose(IceV_AddrOpt)) {
-    OstreamLocker L(Func->getContext());
-    Ostream &Str = Func->getContext()->getStrDump();
-    Str << "\nStarting computeAddressOpt for instruction:\n  ";
-    Instr->dumpDecorated(Func);
-  }
-
-  OptAddr NewAddr;
-  NewAddr.Base = llvm::dyn_cast<Variable>(Addr);
-  if (NewAddr.Base == nullptr)
-    return nullptr;
-
-  // If the Base has more than one use or is live across multiple blocks, then
-  // don't go further. Alternatively (?), never consider a transformation that
-  // would change a variable that is currently *not* live across basic block
-  // boundaries into one that *is*.
-  if (!getFlags().getLoopInvariantCodeMotion()) {
-    // Multi-block address optimization is needed when LICM is enabled.
-    // It might make sense to restrict it to the current node and loop header.
-    if (Func->getVMetadata()->isMultiBlock(
-            NewAddr.Base) /* || Base->getUseCount() > 1*/)
-      return nullptr;
-  }
-  AddressOptimizer AddrOpt(Func);
-  const bool MockBounds = getFlags().getMockBoundsCheck();
-  const Inst *Reason = nullptr;
-  bool AddressWasOptimized = false;
-  // The following unnamed struct identifies the address mode formation steps
-  // that could potentially create an invalid memory operand (i.e., no free
-  // slots for RebasePtr.) We add all those variables to this struct so that we
-  // can use memset() to reset all members to false.
-  struct {
-    bool AssignBase = false;
-    bool AssignIndex = false;
-    bool OffsetFromBase = false;
-    bool OffsetFromIndex = false;
-    bool CombinedBaseIndex = false;
-  } Skip;
-  // NewAddrCheckpoint is used to rollback the address being formed in case an
-  // invalid address is formed.
-  OptAddr NewAddrCheckpoint;
-  Reason = Instr;
-  do {
-    if (Reason) {
-      AddrOpt.dumpAddressOpt(NewAddr.Relocatable, NewAddr.Offset, NewAddr.Base,
-                             NewAddr.Index, NewAddr.Shift, Reason);
-      AddressWasOptimized = true;
-      Reason = nullptr;
-      memset(reinterpret_cast<void *>(&Skip), 0, sizeof(Skip));
-    }
-
-    NewAddrCheckpoint = NewAddr;
-
-    // Update Base and Index to follow through assignments to definitions.
-    if (!Skip.AssignBase &&
-        (Reason = AddrOpt.matchAssign(&NewAddr.Base, &NewAddr.Relocatable,
-                                      &NewAddr.Offset))) {
-      // Assignments of Base from a Relocatable or ConstantInt32 can result
-      // in Base becoming nullptr.  To avoid code duplication in this loop we
-      // prefer that Base be non-nullptr if possible.
-      if ((NewAddr.Base == nullptr) && (NewAddr.Index != nullptr) &&
-          NewAddr.Shift == 0) {
-        std::swap(NewAddr.Base, NewAddr.Index);
-      }
-      continue;
-    }
-    if (!Skip.AssignIndex &&
-        (Reason = AddrOpt.matchAssign(&NewAddr.Index, &NewAddr.Relocatable,
-                                      &NewAddr.Offset))) {
-      continue;
-    }
-
-    if (!MockBounds) {
-      // Transition from:
-      //   <Relocatable + Offset>(Base) to
-      //   <Relocatable + Offset>(Base, Index)
-      if (!Skip.CombinedBaseIndex &&
-          (Reason = AddrOpt.matchCombinedBaseIndex(
-               &NewAddr.Base, &NewAddr.Index, &NewAddr.Shift))) {
-        continue;
-      }
-
-      // Recognize multiply/shift and update Shift amount.
-      // Index becomes Index=Var<<Const && Const+Shift<=3 ==>
-      //   Index=Var, Shift+=Const
-      // Index becomes Index=Const*Var && log2(Const)+Shift<=3 ==>
-      //   Index=Var, Shift+=log2(Const)
-      if ((Reason =
-               AddrOpt.matchShiftedIndex(&NewAddr.Index, &NewAddr.Shift))) {
-        continue;
-      }
-
-      // If Shift is zero, the choice of Base and Index was purely arbitrary.
-      // Recognize multiply/shift and set Shift amount.
-      // Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
-      //   swap(Index,Base)
-      // Similar for Base=Const*Var and Base=Var<<Const
-      if (NewAddr.Shift == 0 &&
-          (Reason = AddrOpt.matchShiftedIndex(&NewAddr.Base, &NewAddr.Shift))) {
-        std::swap(NewAddr.Base, NewAddr.Index);
-        continue;
-      }
-    }
-
-    // Update Offset to reflect additions/subtractions with constants and
-    // relocatables.
-    // TODO: consider overflow issues with respect to Offset.
-    if (!Skip.OffsetFromBase && (Reason = AddrOpt.matchOffsetIndexOrBase(
-                                     &NewAddr.Base, /*Shift =*/0,
-                                     &NewAddr.Relocatable, &NewAddr.Offset))) {
-      continue;
-    }
-    if (!Skip.OffsetFromIndex && (Reason = AddrOpt.matchOffsetIndexOrBase(
-                                      &NewAddr.Index, NewAddr.Shift,
-                                      &NewAddr.Relocatable, &NewAddr.Offset))) {
-      continue;
-    }
-
-    break;
-  } while (Reason);
-
-  if (!AddressWasOptimized) {
-    return nullptr;
-  }
-
-  Constant *OffsetOp = nullptr;
-  if (NewAddr.Relocatable == nullptr) {
-    OffsetOp = Ctx->getConstantInt32(NewAddr.Offset);
-  } else {
-    OffsetOp =
-        Ctx->getConstantSym(NewAddr.Relocatable->getOffset() + NewAddr.Offset,
-                            NewAddr.Relocatable->getName());
-  }
-  // Vanilla ICE load instructions should not use the segment registers, and
-  // computeAddressOpt only works at the level of Variables and Constants, not
-  // other X86OperandMem, so there should be no mention of segment
-  // registers there either.
-  static constexpr auto SegmentReg =
-      X86OperandMem::SegmentRegisters::DefaultSegment;
-
-  return X86OperandMem::create(Func, MemType, NewAddr.Base, OffsetOp,
-                               NewAddr.Index, NewAddr.Shift, SegmentReg);
-}
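-// For example, the chain  t1 = a + 16;  t2 = i * 4;  t3 = t1 + t2;  load t3
-// folds into the single operand  16(a, i, 4)  (offset 16, base a, index i,
-// scale 4), provided the intermediate variables each have a single
-// definition.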
-
-/// Add a mock bounds check on the memory address before using it as a load or
-/// store operand.  The basic idea is that given a memory operand [reg], we
-/// would first add bounds-check code something like:
-///
-///   cmp reg, <lb>
-///   jl out_of_line_error
-///   cmp reg, <ub>
-///   jg out_of_line_error
-///
-/// In reality, the specific code will depend on how <lb> and <ub> are
-/// represented, e.g. an immediate, a global, or a function argument.
-///
-/// As such, we need to enforce that the memory operand does not have the form
-/// [reg1+reg2], because then there is no simple cmp instruction that would
-/// suffice.  However, we consider [reg+offset] to be OK because the offset is
-/// usually small, so <ub> could have a safety buffer built in; we could then
-/// branch to a custom out_of_line_error that does the precise check and jumps
-/// back if it turns out OK.
-///
-/// For the purpose of mocking the bounds check, we'll do something like this:
-///
-///   cmp reg, 0
-///   je label
-///   cmp reg, 1
-///   je label
-///   label:
-///
-/// Also note that we don't need to add a bounds check to a dereference of a
-/// simple global variable address.
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::doMockBoundsCheck(Operand *Opnd) {
-  if (!getFlags().getMockBoundsCheck())
-    return;
-  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd)) {
-    if (Mem->getIndex()) {
-      llvm::report_fatal_error("doMockBoundsCheck: Opnd contains index reg");
-    }
-    Opnd = Mem->getBase();
-  }
-  // At this point Opnd could be nullptr, a Variable, a Constant, or perhaps
-  // something else.  We only care if it is a Variable.
-  auto *Var = llvm::dyn_cast_or_null<Variable>(Opnd);
-  if (Var == nullptr)
-    return;
-  // We use lowerStore() to copy out-args onto the stack.  This creates a memory
-  // operand with the stack pointer as the base register.  Don't do bounds
-  // checks on that.
-  if (Var->getRegNum() == getStackReg())
-    return;
-
-  auto *Label = InstX86Label::create(Func, this);
-  _cmp(Opnd, Ctx->getConstantZero(IceType_i32));
-  _br(CondX86::Br_e, Label);
-  _cmp(Opnd, Ctx->getConstantInt32(1));
-  _br(CondX86::Br_e, Label);
-  Context.insert(Label);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerLoad(const InstLoad *Load) {
-  // A Load instruction can be treated the same as an Assign instruction, after
-  // the source operand is transformed into an X86OperandMem operand.  Note that
-  // the address mode optimization already creates an X86OperandMem operand, so
-  // it doesn't need another level of transformation.
-  Variable *DestLoad = Load->getDest();
-  Type Ty = DestLoad->getType();
-  Operand *Src0 = formMemoryOperand(Load->getLoadAddress(), Ty);
-  doMockBoundsCheck(Src0);
-  auto *Assign = InstAssign::create(Func, DestLoad, Src0);
-  lowerAssign(Assign);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::doAddressOptOther() {
-  // Inverts some Icmp instructions, which helps doAddressOptLoad later.
-  // TODO(manasijm): Refactor to unify the conditions for Var0 and Var1
-  Inst *Instr = iteratorToInst(Context.getCur());
-  auto *VMetadata = Func->getVMetadata();
-  if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Instr)) {
-    if (llvm::isa<Constant>(Icmp->getSrc(0)) ||
-        llvm::isa<Constant>(Icmp->getSrc(1)))
-      return;
-    auto *Var0 = llvm::dyn_cast<Variable>(Icmp->getSrc(0));
-    if (Var0 == nullptr)
-      return;
-    if (!VMetadata->isTracked(Var0))
-      return;
-    auto *Op0Def = VMetadata->getFirstDefinitionSingleBlock(Var0);
-    if (Op0Def == nullptr || !llvm::isa<InstLoad>(Op0Def))
-      return;
-    if (VMetadata->getLocalUseNode(Var0) != Context.getNode())
-      return;
-
-    auto *Var1 = llvm::dyn_cast<Variable>(Icmp->getSrc(1));
-    if (Var1 != nullptr && VMetadata->isTracked(Var1)) {
-      auto *Op1Def = VMetadata->getFirstDefinitionSingleBlock(Var1);
-      if (Op1Def != nullptr && !VMetadata->isMultiBlock(Var1) &&
-          llvm::isa<InstLoad>(Op1Def)) {
-        return; // Both are loads
-      }
-    }
-    Icmp->reverseConditionAndOperands();
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::doAddressOptLoad() {
-  Inst *Instr = iteratorToInst(Context.getCur());
-  Operand *Addr = Instr->getSrc(0);
-  Variable *Dest = Instr->getDest();
-  if (auto *OptAddr = computeAddressOpt(Instr, Dest->getType(), Addr)) {
-    Instr->setDeleted();
-    Context.insert<InstLoad>(Dest, OptAddr);
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::doAddressOptLoadSubVector() {
-  auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
-  Operand *Addr = Intrinsic->getArg(0);
-  Variable *Dest = Intrinsic->getDest();
-  if (auto *OptAddr = computeAddressOpt(Intrinsic, Dest->getType(), Addr)) {
-    Intrinsic->setDeleted();
-    const Ice::Intrinsics::IntrinsicInfo Info = {
-        Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F,
-        Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-    auto *NewLoad = Context.insert<InstIntrinsic>(2, Dest, Info);
-    NewLoad->addArg(OptAddr);
-    NewLoad->addArg(Intrinsic->getArg(1));
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerPhi(const InstPhi * /*Instr*/) {
-  Func->setError("Phi found in regular instruction list");
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerRet(const InstRet *Instr) {
-  Variable *Reg = nullptr;
-  if (Instr->hasRetValue()) {
-    Operand *RetValue = legalize(Instr->getRetValue());
-    const Type ReturnType = RetValue->getType();
-    assert(isVectorType(ReturnType) || isScalarFloatingType(ReturnType) ||
-           (ReturnType == IceType_i32) || (ReturnType == IceType_i64));
-    Reg = moveReturnValueToRegister(RetValue, ReturnType);
-  }
-  // Add a ret instruction even if sandboxing is enabled, because addEpilog
-  // explicitly looks for a ret instruction as a marker for where to insert the
-  // frame removal instructions.
-  _ret(Reg);
-  // Add a fake use of esp to make sure esp stays alive for the entire
-  // function. Otherwise post-call esp adjustments get dead-code eliminated.
-  keepEspLiveAtExit();
-}
-
-inline uint32_t makePshufdMask(SizeT Index0, SizeT Index1, SizeT Index2,
-                               SizeT Index3) {
-  const SizeT Mask = (Index0 & 0x3) | ((Index1 & 0x3) << 2) |
-                     ((Index2 & 0x3) << 4) | ((Index3 & 0x3) << 6);
-  assert(Mask < 256);
-  return Mask;
-}
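-// E.g. makePshufdMask(3, 2, 1, 0) yields 0x1B, the pshufd immediate that
-// reverses the order of the four 32-bit lanes.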
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::lowerShuffleVector_AllFromSameSrc(
-    Operand *Src, SizeT Index0, SizeT Index1, SizeT Index2, SizeT Index3) {
-  constexpr SizeT SrcBit = 1 << 2;
-  assert((Index0 & SrcBit) == (Index1 & SrcBit));
-  assert((Index0 & SrcBit) == (Index2 & SrcBit));
-  assert((Index0 & SrcBit) == (Index3 & SrcBit));
-  (void)SrcBit;
-
-  const Type SrcTy = Src->getType();
-  auto *T = makeReg(SrcTy);
-  auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
-  auto *Mask =
-      Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
-  _pshufd(T, SrcRM, Mask);
-  return T;
-}
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::lowerShuffleVector_TwoFromSameSrc(
-    Operand *Src0, SizeT Index0, SizeT Index1, Operand *Src1, SizeT Index2,
-    SizeT Index3) {
-  constexpr SizeT SrcBit = 1 << 2;
-  assert((Index0 & SrcBit) == (Index1 & SrcBit) || (Index1 == IGNORE_INDEX));
-  assert((Index2 & SrcBit) == (Index3 & SrcBit) || (Index3 == IGNORE_INDEX));
-  (void)SrcBit;
-
-  const Type SrcTy = Src0->getType();
-  assert(Src1->getType() == SrcTy);
-  auto *T = makeReg(SrcTy);
-  auto *Src0R = legalizeToReg(Src0);
-  auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-  auto *Mask =
-      Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
-  _movp(T, Src0R);
-  _shufps(T, Src1RM, Mask);
-  return T;
-}
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::lowerShuffleVector_UnifyFromDifferentSrcs(
-    Operand *Src0, SizeT Index0, Operand *Src1, SizeT Index1) {
-  return lowerShuffleVector_TwoFromSameSrc(Src0, Index0, IGNORE_INDEX, Src1,
-                                           Index1, IGNORE_INDEX);
-}
-
-inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2,
-                               SizeT Index3) {
-  constexpr SizeT SrcBit = 1 << 2;
-  const SizeT Index0Bits = ((Index0 & SrcBit) == 0) ? 0 : (1 << 0);
-  const SizeT Index1Bits = ((Index1 & SrcBit) == 0) ? 0 : (1 << 1);
-  const SizeT Index2Bits = ((Index2 & SrcBit) == 0) ? 0 : (1 << 2);
-  const SizeT Index3Bits = ((Index3 & SrcBit) == 0) ? 0 : (1 << 3);
-  return Index0Bits | Index1Bits | Index2Bits | Index3Bits;
-}
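-// E.g. indexes (0, 5, 2, 7) on 4-lane sources yield mask 0b1010: lanes 1 and
-// 3 select from Src1 (bit 2 of their index is set), lanes 0 and 2 from Src0.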
-
-template <typename TraitsType>
-GlobalString TargetX86Base<TraitsType>::lowerShuffleVector_NewMaskName() {
-  GlobalString FuncName = Func->getFunctionName();
-  const SizeT Id = PshufbMaskCount++;
-  if (!BuildDefs::dump() || !FuncName.hasStdString()) {
-    return GlobalString::createWithString(
-        Ctx,
-        "$PS" + std::to_string(FuncName.getID()) + "_" + std::to_string(Id));
-  }
-  return GlobalString::createWithString(
-      Ctx, "Pshufb$" + Func->getFunctionName() + "$" + std::to_string(Id));
-}
-
-template <typename TraitsType>
-ConstantRelocatable *
-TargetX86Base<TraitsType>::lowerShuffleVector_CreatePshufbMask(
-    int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
-    int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
-    int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
-    int8_t Idx15) {
-  static constexpr uint8_t NumElements = 16;
-  const char Initializer[NumElements] = {
-      Idx0, Idx1, Idx2,  Idx3,  Idx4,  Idx5,  Idx6,  Idx7,
-      Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15,
-  };
-
-  static constexpr Type V4VectorType = IceType_v4i32;
-  const uint32_t MaskAlignment = typeWidthInBytesOnStack(V4VectorType);
-  auto *Mask = VariableDeclaration::create(Func->getGlobalPool());
-  GlobalString MaskName = lowerShuffleVector_NewMaskName();
-  Mask->setIsConstant(true);
-  Mask->addInitializer(VariableDeclaration::DataInitializer::create(
-      Func->getGlobalPool(), Initializer, NumElements));
-  Mask->setName(MaskName);
-  // Mask needs to be 16-byte aligned, or pshufb will seg fault.
-  Mask->setAlignment(MaskAlignment);
-  Func->addGlobal(Mask);
-
-  constexpr RelocOffsetT Offset = 0;
-  return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName));
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerShuffleVector_UsingPshufb(
-    Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1,
-    int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6,
-    int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11,
-    int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) {
-  const Type DestTy = Dest->getType();
-  static constexpr bool NotRebased = false;
-  static constexpr Variable *NoBase = nullptr;
-  // We use void for the memory operand instead of DestTy because using the
-  // latter causes a validation failure: the X86 Inst layer complains that
-  // vector mem operands could be under-aligned. Using void avoids the
-  // validation error. Note that the mask global declaration is aligned, so it
-  // can be used as an XMM mem operand.
-  static constexpr Type MaskType = IceType_void;
-#define IDX_IN_SRC(N, S)                                                       \
-  ((((N) & (1 << 4)) == ((S) << 4)) ? ((N) & 0xf) : CLEAR_ALL_BITS)
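-  // IDX_IN_SRC keeps index N when bit 4 of N says it selects from source S,
-  // and otherwise yields CLEAR_ALL_BITS, which pshufb treats as "zero this
-  // byte lane" (pshufb zeroes a lane whenever the mask byte's high bit is
-  // set).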
-  auto *Mask0M = X86OperandMem::create(
-      Func, MaskType, NoBase,
-      lowerShuffleVector_CreatePshufbMask(
-          IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0),
-          IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0),
-          IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0),
-          IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0),
-          IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0),
-          IDX_IN_SRC(Idx15, 0)),
-      NotRebased);
-
-  auto *T0 = makeReg(DestTy);
-  auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-  _movp(T0, Src0RM);
-
-  _pshufb(T0, Mask0M);
-
-  if (Idx0 >= 16 || Idx1 >= 16 || Idx2 >= 16 || Idx3 >= 16 || Idx4 >= 16 ||
-      Idx5 >= 16 || Idx6 >= 16 || Idx7 >= 16 || Idx8 >= 16 || Idx9 >= 16 ||
-      Idx10 >= 16 || Idx11 >= 16 || Idx12 >= 16 || Idx13 >= 16 || Idx14 >= 16 ||
-      Idx15 >= 16) {
-    auto *Mask1M = X86OperandMem::create(
-        Func, MaskType, NoBase,
-        lowerShuffleVector_CreatePshufbMask(
-            IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1),
-            IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1),
-            IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1),
-            IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1),
-            IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1),
-            IDX_IN_SRC(Idx15, 1)),
-        NotRebased);
-#undef IDX_IN_SRC
-    auto *T1 = makeReg(DestTy);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T1, Src1RM);
-    _pshufb(T1, Mask1M);
-    _por(T0, T1);
-  }
-
-  _movp(Dest, T0);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerShuffleVector(
-    const InstShuffleVector *Instr) {
-  auto *Dest = Instr->getDest();
-  const Type DestTy = Dest->getType();
-  auto *Src0 = Instr->getSrc(0);
-  auto *Src1 = Instr->getSrc(1);
-  const SizeT NumElements = typeNumElements(DestTy);
-
-  auto *T = makeReg(DestTy);
-
-  switch (DestTy) {
-  default:
-    llvm::report_fatal_error("Unexpected vector type.");
-  case IceType_v16i1:
-  case IceType_v16i8: {
-    static constexpr SizeT ExpectedNumElements = 16;
-    assert(ExpectedNumElements == Instr->getNumIndexes());
-    (void)ExpectedNumElements;
-
-    if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckl(T, Src0RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
-                          23)) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckl(T, Src1RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (Instr->indexesAre(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
-                          15, 15)) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckh(T, Src0RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (Instr->indexesAre(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30,
-                          15, 31)) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckh(T, Src1RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (InstructionSet < Traits::SSE4_1) {
-      // TODO(jpp): figure out how to lower with sse2.
-      break;
-    }
-
-    const SizeT Index0 = Instr->getIndexValue(0);
-    const SizeT Index1 = Instr->getIndexValue(1);
-    const SizeT Index2 = Instr->getIndexValue(2);
-    const SizeT Index3 = Instr->getIndexValue(3);
-    const SizeT Index4 = Instr->getIndexValue(4);
-    const SizeT Index5 = Instr->getIndexValue(5);
-    const SizeT Index6 = Instr->getIndexValue(6);
-    const SizeT Index7 = Instr->getIndexValue(7);
-    const SizeT Index8 = Instr->getIndexValue(8);
-    const SizeT Index9 = Instr->getIndexValue(9);
-    const SizeT Index10 = Instr->getIndexValue(10);
-    const SizeT Index11 = Instr->getIndexValue(11);
-    const SizeT Index12 = Instr->getIndexValue(12);
-    const SizeT Index13 = Instr->getIndexValue(13);
-    const SizeT Index14 = Instr->getIndexValue(14);
-    const SizeT Index15 = Instr->getIndexValue(15);
-
-    lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
-                                   Index3, Index4, Index5, Index6, Index7,
-                                   Index8, Index9, Index10, Index11, Index12,
-                                   Index13, Index14, Index15);
-    return;
-  }
-  case IceType_v8i1:
-  case IceType_v8i16: {
-    static constexpr SizeT ExpectedNumElements = 8;
-    assert(ExpectedNumElements == Instr->getNumIndexes());
-    (void)ExpectedNumElements;
-
-    if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckl(T, Src0RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckl(T, Src1RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (Instr->indexesAre(4, 4, 5, 5, 6, 6, 7, 7)) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckh(T, Src0RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (Instr->indexesAre(4, 12, 5, 13, 6, 14, 7, 15)) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckh(T, Src1RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (InstructionSet < Traits::SSE4_1) {
-      // TODO(jpp): figure out how to lower with sse2.
-      break;
-    }
-
-    const SizeT Index0 = Instr->getIndexValue(0);
-    const SizeT Index1 = Instr->getIndexValue(1);
-    const SizeT Index2 = Instr->getIndexValue(2);
-    const SizeT Index3 = Instr->getIndexValue(3);
-    const SizeT Index4 = Instr->getIndexValue(4);
-    const SizeT Index5 = Instr->getIndexValue(5);
-    const SizeT Index6 = Instr->getIndexValue(6);
-    const SizeT Index7 = Instr->getIndexValue(7);
-
-#define TO_BYTE_INDEX(I) ((I) << 1)
-    lowerShuffleVector_UsingPshufb(
-        Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
-        TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2),
-        TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3),
-        TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4),
-        TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5),
-        TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6),
-        TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7),
-        TO_BYTE_INDEX(Index7) + 1);
-#undef TO_BYTE_INDEX
-    return;
-  }
-  case IceType_v4i1:
-  case IceType_v4i32:
-  case IceType_v4f32: {
-    static constexpr SizeT ExpectedNumElements = 4;
-    assert(ExpectedNumElements == Instr->getNumIndexes());
-    const SizeT Index0 = Instr->getIndexValue(0);
-    const SizeT Index1 = Instr->getIndexValue(1);
-    const SizeT Index2 = Instr->getIndexValue(2);
-    const SizeT Index3 = Instr->getIndexValue(3);
-    Variable *T = nullptr;
-    switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
-#define CASE_SRCS_IN(S0, S1, S2, S3)                                           \
-  case (((S0) << 0) | ((S1) << 1) | ((S2) << 2) | ((S3) << 3))
-      CASE_SRCS_IN(0, 0, 0, 0) : {
-        T = lowerShuffleVector_AllFromSameSrc(Src0, Index0, Index1, Index2,
-                                              Index3);
-      }
-      break;
-      CASE_SRCS_IN(0, 0, 0, 1) : {
-        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
-                                                                  Src1, Index3);
-        T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
-                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
-      }
-      break;
-      CASE_SRCS_IN(0, 0, 1, 0) : {
-        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
-                                                                  Src0, Index3);
-        T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
-                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
-      }
-      break;
-      CASE_SRCS_IN(0, 0, 1, 1) : {
-        T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Src1,
-                                              Index2, Index3);
-      }
-      break;
-      CASE_SRCS_IN(0, 1, 0, 0) : {
-        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
-                                                                  Src1, Index1);
-        T = lowerShuffleVector_TwoFromSameSrc(
-            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
-      }
-      break;
-      CASE_SRCS_IN(0, 1, 0, 1) : {
-        if (Index0 == 0 && (Index1 - ExpectedNumElements) == 0 && Index2 == 1 &&
-            (Index3 - ExpectedNumElements) == 1) {
-          auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-          auto *Src0R = legalizeToReg(Src0);
-          T = makeReg(DestTy);
-          _movp(T, Src0R);
-          _punpckl(T, Src1RM);
-        } else if (Index0 == Index2 && Index1 == Index3) {
-          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src0, Index0, Src1, Index1);
-          T = lowerShuffleVector_AllFromSameSrc(
-              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
-              UNIFIED_INDEX_1);
-        } else {
-          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src0, Index0, Src1, Index1);
-          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src0, Index2, Src1, Index3);
-          T = lowerShuffleVector_TwoFromSameSrc(
-              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
-              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
-        }
-      }
-      break;
-      CASE_SRCS_IN(0, 1, 1, 0) : {
-        if (Index0 == Index3 && Index1 == Index2) {
-          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src0, Index0, Src1, Index1);
-          T = lowerShuffleVector_AllFromSameSrc(
-              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
-              UNIFIED_INDEX_0);
-        } else {
-          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src0, Index0, Src1, Index1);
-          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src1, Index2, Src0, Index3);
-          T = lowerShuffleVector_TwoFromSameSrc(
-              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
-              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
-        }
-      }
-      break;
-      CASE_SRCS_IN(0, 1, 1, 1) : {
-        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
-                                                                  Src1, Index1);
-        T = lowerShuffleVector_TwoFromSameSrc(
-            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
-      }
-      break;
-      CASE_SRCS_IN(1, 0, 0, 0) : {
-        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
-                                                                  Src0, Index1);
-        T = lowerShuffleVector_TwoFromSameSrc(
-            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
-      }
-      break;
-      CASE_SRCS_IN(1, 0, 0, 1) : {
-        if (Index0 == Index3 && Index1 == Index2) {
-          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src1, Index0, Src0, Index1);
-          T = lowerShuffleVector_AllFromSameSrc(
-              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
-              UNIFIED_INDEX_0);
-        } else {
-          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src1, Index0, Src0, Index1);
-          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src0, Index2, Src1, Index3);
-          T = lowerShuffleVector_TwoFromSameSrc(
-              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
-              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
-        }
-      }
-      break;
-      CASE_SRCS_IN(1, 0, 1, 0) : {
-        if ((Index0 - ExpectedNumElements) == 0 && Index1 == 0 &&
-            (Index2 - ExpectedNumElements) == 1 && Index3 == 1) {
-          auto *Src1RM = legalize(Src0, Legal_Reg | Legal_Mem);
-          auto *Src0R = legalizeToReg(Src1);
-          T = makeReg(DestTy);
-          _movp(T, Src0R);
-          _punpckl(T, Src1RM);
-        } else if (Index0 == Index2 && Index1 == Index3) {
-          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src1, Index0, Src0, Index1);
-          T = lowerShuffleVector_AllFromSameSrc(
-              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
-              UNIFIED_INDEX_1);
-        } else {
-          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src1, Index0, Src0, Index1);
-          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src1, Index2, Src0, Index3);
-          T = lowerShuffleVector_TwoFromSameSrc(
-              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
-              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
-        }
-      }
-      break;
-      CASE_SRCS_IN(1, 0, 1, 1) : {
-        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
-                                                                  Src0, Index1);
-        T = lowerShuffleVector_TwoFromSameSrc(
-            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
-      }
-      break;
-      CASE_SRCS_IN(1, 1, 0, 0) : {
-        T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Src0,
-                                              Index2, Index3);
-      }
-      break;
-      CASE_SRCS_IN(1, 1, 0, 1) : {
-        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
-                                                                  Src1, Index3);
-        T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
-                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
-      }
-      break;
-      CASE_SRCS_IN(1, 1, 1, 0) : {
-        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
-                                                                  Src0, Index3);
-        T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
-                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
-      }
-      break;
-      CASE_SRCS_IN(1, 1, 1, 1) : {
-        T = lowerShuffleVector_AllFromSameSrc(Src1, Index0, Index1, Index2,
-                                              Index3);
-      }
-      break;
-#undef CASE_SRCS_IN
-    }
-
-    assert(T != nullptr);
-    assert(T->getType() == DestTy);
-    _movp(Dest, T);
-    return;
-  }
-  }
-
-  // Unoptimized shuffle. Perform a series of inserts and extracts.
-  Context.insert<InstFakeDef>(T);
-  const Type ElementType = typeElementType(DestTy);
-  for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
-    auto *Index = Instr->getIndex(I);
-    const SizeT Elem = Index->getValue();
-    auto *ExtElmt = makeReg(ElementType);
-    if (Elem < NumElements) {
-      lowerExtractElement(
-          InstExtractElement::create(Func, ExtElmt, Src0, Index));
-    } else {
-      lowerExtractElement(InstExtractElement::create(
-          Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements)));
-    }
-    auto *NewT = makeReg(DestTy);
-    lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
-                                                 Ctx->getConstantInt32(I)));
-    T = NewT;
-  }
-  _movp(Dest, T);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerSelect(const InstSelect *Select) {
-  Variable *Dest = Select->getDest();
-
-  Operand *Condition = Select->getCondition();
-  // Handle folding opportunities.
-  if (const Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
-    assert(Producer->isDeleted());
-    switch (BoolFolding<Traits>::getProducerKind(Producer)) {
-    default:
-      break;
-    case BoolFolding<Traits>::PK_Icmp32:
-    case BoolFolding<Traits>::PK_Icmp64: {
-      lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Select);
-      return;
-    }
-    case BoolFolding<Traits>::PK_Fcmp: {
-      lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Select);
-      return;
-    }
-    }
-  }
-
-  if (isVectorType(Dest->getType())) {
-    lowerSelectVector(Select);
-    return;
-  }
-
-  Operand *CmpResult = legalize(Condition, Legal_Reg | Legal_Mem);
-  Operand *Zero = Ctx->getConstantZero(IceType_i32);
-  _cmp(CmpResult, Zero);
-  Operand *SrcT = Select->getTrueOperand();
-  Operand *SrcF = Select->getFalseOperand();
-  const BrCond Cond = CondX86::Br_ne;
-  lowerSelectMove(Dest, Cond, SrcT, SrcF);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerSelectMove(Variable *Dest, BrCond Cond,
-                                                Operand *SrcT, Operand *SrcF) {
-  Type DestTy = Dest->getType();
-  if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
-    // The cmov instruction doesn't allow 8-bit or FP operands, so we need
-    // explicit control flow.
-    // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
-    auto *Label = InstX86Label::create(Func, this);
-    SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
-    _mov(Dest, SrcT);
-    _br(Cond, Label);
-    SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
-    _redefined(_mov(Dest, SrcF));
-    Context.insert(Label);
-    return;
-  }
-  // mov t, SrcF; cmov_cond t, SrcT; mov dest, t
-  // But if SrcT is immediate, we might be able to do better, as the cmov
-  // instruction doesn't allow an immediate operand:
-  // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
-  if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
-    std::swap(SrcT, SrcF);
-    Cond = InstImpl<TraitsType>::InstX86Base::getOppositeCondition(Cond);
-  }
-  if (!Traits::Is64Bit && DestTy == IceType_i64) {
-    SrcT = legalizeUndef(SrcT);
-    SrcF = legalizeUndef(SrcF);
-    // Set the low portion.
-    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-    lowerSelectIntMove(DestLo, Cond, loOperand(SrcT), loOperand(SrcF));
-    // Set the high portion.
-    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-    lowerSelectIntMove(DestHi, Cond, hiOperand(SrcT), hiOperand(SrcF));
-    return;
-  }
-
-  assert(DestTy == IceType_i16 || DestTy == IceType_i32 ||
-         (Traits::Is64Bit && DestTy == IceType_i64));
-  lowerSelectIntMove(Dest, Cond, SrcT, SrcF);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerSelectIntMove(Variable *Dest, BrCond Cond,
-                                                   Operand *SrcT,
-                                                   Operand *SrcF) {
-  Variable *T = nullptr;
-  SrcF = legalize(SrcF);
-  _mov(T, SrcF);
-  SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
-  _cmov(T, SrcT, Cond);
-  _mov(Dest, T);
-}
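-
-// Illustrative sketch, not part of Subzero: the operand swap in
-// lowerSelectMove above (exchange SrcT and SrcF and flip the condition when
-// SrcT is an immediate, which cmov cannot encode) preserves select
-// semantics for both condition values.
-constexpr int sketchSelect(bool C, int T, int F) { return C ? T : F; }
-static_assert(sketchSelect(true, 1, 2) == sketchSelect(false, 2, 1),
-              "swapping operands and negating the condition is equivalent");
-static_assert(sketchSelect(false, 1, 2) == sketchSelect(true, 2, 1),
-              "swapping operands and negating the condition is equivalent");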
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerMove(Variable *Dest, Operand *Src,
-                                          bool IsRedefinition) {
-  assert(Dest->getType() == Src->getType());
-  assert(!Dest->isRematerializable());
-  if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
-    Src = legalize(Src);
-    Operand *SrcLo = loOperand(Src);
-    Operand *SrcHi = hiOperand(Src);
-    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-    Variable *T_Lo = nullptr, *T_Hi = nullptr;
-    _mov(T_Lo, SrcLo);
-    _redefined(_mov(DestLo, T_Lo), IsRedefinition);
-    _mov(T_Hi, SrcHi);
-    _redefined(_mov(DestHi, T_Hi), IsRedefinition);
-  } else {
-    Operand *SrcLegal;
-    if (Dest->hasReg()) {
-      // If Dest already has a physical register, then only basic legalization
-      // is needed, as the source operand can be a register, immediate, or
-      // memory.
-      SrcLegal = legalize(Src, Legal_Reg, Dest->getRegNum());
-    } else {
-      // If Dest could be a stack operand, then RI must be a physical register
-      // or a scalar integer immediate.
-      SrcLegal = legalize(Src, Legal_Reg | Legal_Imm);
-    }
-    if (isVectorType(Dest->getType())) {
-      _redefined(_movp(Dest, SrcLegal), IsRedefinition);
-    } else {
-      _redefined(_mov(Dest, SrcLegal), IsRedefinition);
-    }
-  }
-}
-
-template <typename TraitsType>
-bool TargetX86Base<TraitsType>::lowerOptimizeFcmpSelect(
-    const InstFcmp *Fcmp, const InstSelect *Select) {
-  Operand *CmpSrc0 = Fcmp->getSrc(0);
-  Operand *CmpSrc1 = Fcmp->getSrc(1);
-  Operand *SelectSrcT = Select->getTrueOperand();
-  Operand *SelectSrcF = Select->getFalseOperand();
-  Variable *SelectDest = Select->getDest();
-
-  // TODO(capn): also handle swapped compare/select operand order.
-  if (CmpSrc0 != SelectSrcT || CmpSrc1 != SelectSrcF)
-    return false;
-
-  // TODO(sehr, stichnot): fcmp/select patterns (e.g., minsd/maxss) go here.
-  InstFcmp::FCond Condition = Fcmp->getCondition();
-  switch (Condition) {
-  default:
-    return false;
-  case InstFcmp::True:
-    break;
-  case InstFcmp::False:
-    break;
-  case InstFcmp::Ogt: {
-    Variable *T = makeReg(SelectDest->getType());
-    if (isScalarFloatingType(SelectSrcT->getType())) {
-      _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
-      _maxss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
-      _mov(SelectDest, T);
-    } else {
-      _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
-      _maxps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
-      _movp(SelectDest, T);
-    }
-    return true;
-  } break;
-  case InstFcmp::Olt: {
-    Variable *T = makeReg(SelectSrcT->getType());
-    if (isScalarFloatingType(SelectSrcT->getType())) {
-      _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
-      _minss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
-      _mov(SelectDest, T);
-    } else {
-      _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
-      _minps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
-      _movp(SelectDest, T);
-    }
-    return true;
-  } break;
-  }
-  return false;
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerIcmp(const InstIcmp *Icmp) {
-  Variable *Dest = Icmp->getDest();
-  if (isVectorType(Dest->getType())) {
-    lowerIcmpVector(Icmp);
-  } else {
-    constexpr Inst *Consumer = nullptr;
-    lowerIcmpAndConsumer(Icmp, Consumer);
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerSelectVector(const InstSelect *Instr) {
-  Variable *Dest = Instr->getDest();
-  Type DestTy = Dest->getType();
-  Operand *SrcT = Instr->getTrueOperand();
-  Operand *SrcF = Instr->getFalseOperand();
-  Operand *Condition = Instr->getCondition();
-
-  if (!isVectorType(DestTy))
-    llvm::report_fatal_error("Expected a vector select");
-
-  Type SrcTy = SrcT->getType();
-  Variable *T = makeReg(SrcTy);
-  Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
-  Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
-
-  if (InstructionSet >= Traits::SSE4_1) {
-    // TODO(wala): If the condition operand is a constant, use blendps or
-    // pblendw.
-    //
-    // Use blendvps or pblendvb to implement select.
-    if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
-        SrcTy == IceType_v4f32) {
-      Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
-      Variable *xmm0 = makeReg(IceType_v4i32, Traits::RegisterSet::Reg_xmm0);
-      _movp(xmm0, ConditionRM);
-      _psll(xmm0, Ctx->getConstantInt8(31));
-      _movp(T, SrcFRM);
-      _blendvps(T, SrcTRM, xmm0);
-      _movp(Dest, T);
-    } else {
-      assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
-      Type SignExtTy =
-          Condition->getType() == IceType_v8i1 ? IceType_v8i16 : IceType_v16i8;
-      Variable *xmm0 = makeReg(SignExtTy, Traits::RegisterSet::Reg_xmm0);
-      lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
-      _movp(T, SrcFRM);
-      _pblendvb(T, SrcTRM, xmm0);
-      _movp(Dest, T);
-    }
-    return;
-  }
-  // Lower select without SSE4.1:
-  // a=d?b:c ==>
-  //   if elementtype(d) != i1:
-  //      d=sext(d);
-  //   a=(b&d)|(c&~d);
-  Variable *T2 = makeReg(SrcTy);
-  // Sign extend the condition operand if applicable.
-  if (SrcTy == IceType_v4f32) {
-    // The sext operation takes only integer arguments.
-    Variable *T3 = Func->makeVariable(IceType_v4i32);
-    lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
-    _movp(T, T3);
-  } else if (typeElementType(SrcTy) != IceType_i1) {
-    lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
-  } else {
-    Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
-    _movp(T, ConditionRM);
-  }
-  _movp(T2, T);
-  _pand(T, SrcTRM);
-  _pandn(T2, SrcFRM);
-  _por(T, T2);
-  _movp(Dest, T);
-
-  return;
-}
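-
-// Illustrative sketch, not part of Subzero: once the condition is
-// sign-extended so each lane is all-ones or all-zeros, the pre-SSE4.1
-// lowering above reduces per lane to a = (b & d) | (c & ~d), an ordinary
-// select.
-constexpr unsigned sketchBlendLane(unsigned B, unsigned C, unsigned D) {
-  return (B & D) | (C & ~D);
-}
-static_assert(sketchBlendLane(0x11111111u, 0x22222222u, ~0u) == 0x11111111u,
-              "an all-ones lane mask selects the true operand");
-static_assert(sketchBlendLane(0x11111111u, 0x22222222u, 0u) == 0x22222222u,
-              "an all-zeros lane mask selects the false operand");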
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerStore(const InstStore *Instr) {
-  Operand *Value = Instr->getData();
-  Operand *Addr = Instr->getStoreAddress();
-  X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
-  doMockBoundsCheck(NewAddr);
-  Type Ty = NewAddr->getType();
-
-  if (!Traits::Is64Bit && Ty == IceType_i64) {
-    Value = legalizeUndef(Value);
-    Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
-    _store(ValueHi, llvm::cast<X86OperandMem>(hiOperand(NewAddr)));
-    Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
-    _store(ValueLo, llvm::cast<X86OperandMem>(loOperand(NewAddr)));
-  } else if (isVectorType(Ty)) {
-    _storep(legalizeToReg(Value), NewAddr);
-  } else {
-    Value = legalize(Value, Legal_Reg | Legal_Imm);
-    _store(Value, NewAddr);
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::doAddressOptStore() {
-  auto *Instr = llvm::cast<InstStore>(Context.getCur());
-  Operand *Addr = Instr->getStoreAddress();
-  Operand *Data = Instr->getData();
-  if (auto *OptAddr = computeAddressOpt(Instr, Data->getType(), Addr)) {
-    Instr->setDeleted();
-    auto *NewStore = Context.insert<InstStore>(Data, OptAddr);
-    if (Instr->getDest())
-      NewStore->setRmwBeacon(Instr->getRmwBeacon());
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::doAddressOptStoreSubVector() {
-  auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
-  Operand *Addr = Intrinsic->getArg(1);
-  Operand *Data = Intrinsic->getArg(0);
-  if (auto *OptAddr = computeAddressOpt(Intrinsic, Data->getType(), Addr)) {
-    Intrinsic->setDeleted();
-    const Ice::Intrinsics::IntrinsicInfo Info = {
-        Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T,
-        Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
-    auto *NewStore = Context.insert<InstIntrinsic>(3, nullptr, Info);
-    NewStore->addArg(Data);
-    NewStore->addArg(OptAddr);
-    NewStore->addArg(Intrinsic->getArg(2));
-  }
-}
-
-template <typename TraitsType>
-Operand *TargetX86Base<TraitsType>::lowerCmpRange(Operand *Comparison,
-                                                  uint64_t Min, uint64_t Max) {
-  // TODO(ascull): 64-bit values do not reach here yet, only because the
-  // 64-bit case is not implemented. This should be able to handle it.
-  assert(Traits::Is64Bit || Comparison->getType() != IceType_i64);
-  // Subtracting 0 is a nop so don't do it
-  if (Min != 0) {
-    // Avoid clobbering the comparison by copying it
-    Variable *T = nullptr;
-    _mov(T, Comparison);
-    _sub(T, Ctx->getConstantInt32(Min));
-    Comparison = T;
-  }
-
-  _cmp(Comparison, Ctx->getConstantInt32(Max - Min));
-
-  return Comparison;
-}
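-
-// Illustrative sketch, not part of Subzero: the subtract-then-compare
-// sequence above implements Min <= X <= Max as one unsigned comparison,
-// because values below Min wrap around to large unsigned values.
-constexpr bool sketchInRange(unsigned X, unsigned Min, unsigned Max) {
-  return X - Min <= Max - Min;
-}
-static_assert(sketchInRange(5, 3, 9), "inside the range");
-static_assert(!sketchInRange(2, 3, 9), "below Min wraps and fails");
-static_assert(!sketchInRange(10, 3, 9), "above Max fails");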
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerCaseCluster(const CaseCluster &Case,
-                                                 Operand *Comparison,
-                                                 bool DoneCmp,
-                                                 CfgNode *DefaultTarget) {
-  switch (Case.getKind()) {
-  case CaseCluster::JumpTable: {
-    InstX86Label *SkipJumpTable;
-
-    Operand *RangeIndex =
-        lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
-    if (DefaultTarget == nullptr) {
-      // Skip over jump table logic if comparison not in range and no default
-      SkipJumpTable = InstX86Label::create(Func, this);
-      _br(CondX86::Br_a, SkipJumpTable);
-    } else {
-      _br(CondX86::Br_a, DefaultTarget);
-    }
-
-    InstJumpTable *JumpTable = Case.getJumpTable();
-    Context.insert(JumpTable);
-
-    // Make sure the index is a register of the same width as the base
-    Variable *Index;
-    const Type PointerType = getPointerType();
-    if (RangeIndex->getType() != PointerType) {
-      Index = makeReg(PointerType);
-      if (RangeIndex->getType() == IceType_i64) {
-        assert(Traits::Is64Bit);
-        _mov(Index, RangeIndex); // trunc
-      } else {
-        Operand *RangeIndexRM = legalize(RangeIndex, Legal_Reg | Legal_Mem);
-        _movzx(Index, RangeIndexRM);
-      }
-    } else {
-      Index = legalizeToReg(RangeIndex);
-    }
-
-    constexpr RelocOffsetT RelocOffset = 0;
-    constexpr Variable *NoBase = nullptr;
-    constexpr Constant *NoOffset = nullptr;
-    auto JTName = GlobalString::createWithString(Ctx, JumpTable->getName());
-    Constant *Offset = Ctx->getConstantSym(RelocOffset, JTName);
-    uint16_t Shift = typeWidthInBytesLog2(PointerType);
-    constexpr auto Segment = X86OperandMem::SegmentRegisters::DefaultSegment;
-
-    Variable *Target = nullptr;
-    if (PointerType == IceType_i32) {
-      _mov(Target, X86OperandMem::create(Func, PointerType, NoBase, Offset,
-                                         Index, Shift, Segment));
-    } else {
-      auto *Base = makeReg(IceType_i64);
-      _lea(Base, X86OperandMem::create(Func, IceType_void, NoBase, Offset));
-      _mov(Target, X86OperandMem::create(Func, PointerType, Base, NoOffset,
-                                         Index, Shift, Segment));
-    }
-
-    lowerIndirectJump(Target);
-
-    if (DefaultTarget == nullptr)
-      Context.insert(SkipJumpTable);
-    return;
-  }
-  case CaseCluster::Range: {
-    if (Case.isUnitRange()) {
-      // Single item
-      if (!DoneCmp) {
-        Constant *Value = Ctx->getConstantInt32(Case.getLow());
-        _cmp(Comparison, Value);
-      }
-      _br(CondX86::Br_e, Case.getTarget());
-    } else if (DoneCmp && Case.isPairRange()) {
-      // Range of two items with first item already compared against
-      _br(CondX86::Br_e, Case.getTarget());
-      Constant *Value = Ctx->getConstantInt32(Case.getHigh());
-      _cmp(Comparison, Value);
-      _br(CondX86::Br_e, Case.getTarget());
-    } else {
-      // Range
-      lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
-      _br(CondX86::Br_be, Case.getTarget());
-    }
-    if (DefaultTarget != nullptr)
-      _br(DefaultTarget);
-    return;
-  }
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerSwitch(const InstSwitch *Instr) {
-  // Group cases together and navigate through them with a binary search
-  CaseClusterArray CaseClusters = CaseCluster::clusterizeSwitch(Func, Instr);
-  Operand *Src0 = Instr->getComparison();
-  CfgNode *DefaultTarget = Instr->getLabelDefault();
-
-  assert(CaseClusters.size() != 0); // Should always be at least one
-
-  if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
-    Src0 = legalize(Src0); // get Base/Index into physical registers
-    Operand *Src0Lo = loOperand(Src0);
-    Operand *Src0Hi = hiOperand(Src0);
-    if (CaseClusters.back().getHigh() > UINT32_MAX) {
-      // TODO(ascull): handle 64-bit case properly (currently naive version)
-      // This might be handled by a higher level lowering of switches.
-      SizeT NumCases = Instr->getNumCases();
-      if (NumCases >= 2) {
-        Src0Lo = legalizeToReg(Src0Lo);
-        Src0Hi = legalizeToReg(Src0Hi);
-      } else {
-        Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
-        Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
-      }
-      for (SizeT I = 0; I < NumCases; ++I) {
-        Constant *ValueLo = Ctx->getConstantInt32(Instr->getValue(I));
-        Constant *ValueHi = Ctx->getConstantInt32(Instr->getValue(I) >> 32);
-        InstX86Label *Label = InstX86Label::create(Func, this);
-        _cmp(Src0Lo, ValueLo);
-        _br(CondX86::Br_ne, Label);
-        _cmp(Src0Hi, ValueHi);
-        _br(CondX86::Br_e, Instr->getLabel(I));
-        Context.insert(Label);
-      }
-      _br(Instr->getLabelDefault());
-      return;
-    } else {
-      // All the values are 32-bit so just check the operand is too and then
-      // fall through to the 32-bit implementation. This is a common case.
-      Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
-      Constant *Zero = Ctx->getConstantInt32(0);
-      _cmp(Src0Hi, Zero);
-      _br(CondX86::Br_ne, DefaultTarget);
-      Src0 = Src0Lo;
-    }
-  }
-
-  // 32-bit lowering
-
-  if (CaseClusters.size() == 1) {
-    // Jump straight to default if needed. Currently a common case as jump
-    // tables occur on their own.
-    constexpr bool DoneCmp = false;
-    lowerCaseCluster(CaseClusters.front(), Src0, DoneCmp, DefaultTarget);
-    return;
-  }
-
-  // Going to be using multiple times so get it in a register early
-  Variable *Comparison = legalizeToReg(Src0);
-
-  // A span covers a contiguous range of clusters
-  struct SearchSpan {
-    SearchSpan(SizeT Begin, SizeT Size, InstX86Label *Label)
-        : Begin(Begin), Size(Size), Label(Label) {}
-
-    SizeT Begin;
-    SizeT Size;
-    InstX86Label *Label;
-  };
-  // The stack will only grow to the height of the tree so 12 should be plenty
-  std::stack<SearchSpan, llvm::SmallVector<SearchSpan, 12>> SearchSpanStack;
-  SearchSpanStack.emplace(0, CaseClusters.size(), nullptr);
-  bool DoneCmp = false;
-
-  while (!SearchSpanStack.empty()) {
-    SearchSpan Span = SearchSpanStack.top();
-    SearchSpanStack.pop();
-
-    if (Span.Label != nullptr)
-      Context.insert(Span.Label);
-
-    switch (Span.Size) {
-    case 0:
-      llvm::report_fatal_error("Invalid SearchSpan size");
-      break;
-
-    case 1:
-      lowerCaseCluster(CaseClusters[Span.Begin], Comparison, DoneCmp,
-                       SearchSpanStack.empty() ? nullptr : DefaultTarget);
-      DoneCmp = false;
-      break;
-
-    case 2: {
-      const CaseCluster *CaseA = &CaseClusters[Span.Begin];
-      const CaseCluster *CaseB = &CaseClusters[Span.Begin + 1];
-
-      // Placing a range last may allow register clobbering during the range
-      // test. That means there is no need to clone the register. If it is a
-      // unit range the comparison may have already been done in the binary
-      // search (DoneCmp) and so it should be placed first. If this is a range
-      // of two items and the comparison with the low value has already been
-      // done, comparing with the other element is cheaper than a range test.
-      // If the low end of the range is zero then there is no subtraction and
-      // nothing to be gained.
-      if (!CaseA->isUnitRange() &&
-          !(CaseA->getLow() == 0 || (DoneCmp && CaseA->isPairRange()))) {
-        std::swap(CaseA, CaseB);
-        DoneCmp = false;
-      }
-
-      lowerCaseCluster(*CaseA, Comparison, DoneCmp);
-      DoneCmp = false;
-      lowerCaseCluster(*CaseB, Comparison, DoneCmp,
-                       SearchSpanStack.empty() ? nullptr : DefaultTarget);
-    } break;
-
-    default:
-      // Pick the middle item and branch b or ae
-      SizeT PivotIndex = Span.Begin + (Span.Size / 2);
-      const CaseCluster &Pivot = CaseClusters[PivotIndex];
-      Constant *Value = Ctx->getConstantInt32(Pivot.getLow());
-      InstX86Label *Label = InstX86Label::create(Func, this);
-      _cmp(Comparison, Value);
-      // TODO(ascull): does it always have to be far?
-      _br(CondX86::Br_b, Label, InstX86Br::Far);
-      // Lower the left and (pivot+right) sides, falling through to the right
-      SearchSpanStack.emplace(Span.Begin, Span.Size / 2, Label);
-      SearchSpanStack.emplace(PivotIndex, Span.Size - (Span.Size / 2), nullptr);
-      DoneCmp = true;
-      break;
-    }
-  }
-
-  _br(DefaultTarget);
-}
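-
-// Illustrative sketch, not part of Subzero: at run time the compares emitted
-// by lowerSwitch trace one root-to-leaf path of a binary search over the
-// sorted clusters, halving the span at each pivot. This is a recursive form
-// of the iterative SearchSpan worklist above; SketchLows is a hypothetical
-// stand-in for the cluster low bounds.
-constexpr int sketchFindCluster(const unsigned *Lows, int Begin, int Size,
-                                unsigned X) {
-  return Size == 1 ? (Lows[Begin] == X ? Begin : -1)
-                   : (X < Lows[Begin + Size / 2]
-                          ? sketchFindCluster(Lows, Begin, Size / 2, X)
-                          : sketchFindCluster(Lows, Begin + Size / 2,
-                                              Size - Size / 2, X));
-}
-constexpr unsigned SketchLows[] = {1, 4, 9, 16, 25};
-static_assert(sketchFindCluster(SketchLows, 0, 5, 9) == 2, "hit");
-static_assert(sketchFindCluster(SketchLows, 0, 5, 25) == 4, "hit at the end");
-static_assert(sketchFindCluster(SketchLows, 0, 5, 10) == -1, "miss");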
-
-/// The following pattern occurs often in lowered C and C++ code:
-///
-///   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
-///   %cmp.ext = sext <n x i1> %cmp to <n x ty>
-///
-/// We can eliminate the sext operation by copying the result of pcmpeqd,
-/// pcmpgtd, or cmpps (which produce sign extended results) to the result of the
-/// sext operation.
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::eliminateNextVectorSextInstruction(
-    Variable *SignExtendedResult) {
-  if (auto *NextCast =
-          llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
-    if (NextCast->getCastKind() == InstCast::Sext &&
-        NextCast->getSrc(0) == SignExtendedResult) {
-      NextCast->setDeleted();
-      _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult));
-      // Skip over the instruction.
-      Context.advanceNext();
-    }
-  }
-}
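-
-// Illustrative sketch, not part of Subzero: a lane produced by a vector
-// compare is already 0 or all-ones, so sign-extending its i1 view back to
-// full lane width is the identity; that is the property exploited above.
-constexpr unsigned sketchCmpLane(int A, int B) { return A > B ? ~0u : 0u; }
-constexpr unsigned sketchSextI1(unsigned L) { return (L & 1) ? ~0u : 0u; }
-static_assert(sketchSextI1(sketchCmpLane(2, 1)) == sketchCmpLane(2, 1),
-              "sext of an all-ones compare lane is a no-op");
-static_assert(sketchSextI1(sketchCmpLane(1, 2)) == sketchCmpLane(1, 2),
-              "sext of an all-zeros compare lane is a no-op");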
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerUnreachable(
-    const InstUnreachable * /*Instr*/) {
-  _ud2();
-  // Add a fake use of esp to make sure esp adjustments after the unreachable
-  // do not get dead-code eliminated.
-  keepEspLiveAtExit();
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerBreakpoint(
-    const InstBreakpoint * /*Instr*/) {
-  _int3();
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerRMW(const InstX86FakeRMW *RMW) {
-  // If the beacon variable's live range does not end in this instruction, then
-  // it must end in the modified Store instruction that follows. This means
-  // that the original Store instruction is still there, either because the
-  // value being stored is used beyond the Store instruction, or because dead
-  // code elimination did not happen. In either case, we cancel RMW lowering
-  // (and the caller deletes the RMW instruction).
-  if (!RMW->isLastUse(RMW->getBeacon()))
-    return;
-  Operand *Src = RMW->getData();
-  Type Ty = Src->getType();
-  X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
-  doMockBoundsCheck(Addr);
-  if (!Traits::Is64Bit && Ty == IceType_i64) {
-    Src = legalizeUndef(Src);
-    Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm);
-    Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm);
-    auto *AddrLo = llvm::cast<X86OperandMem>(loOperand(Addr));
-    auto *AddrHi = llvm::cast<X86OperandMem>(hiOperand(Addr));
-    switch (RMW->getOp()) {
-    default:
-      // TODO(stichnot): Implement other arithmetic operators.
-      break;
-    case InstArithmetic::Add:
-      _add_rmw(AddrLo, SrcLo);
-      _adc_rmw(AddrHi, SrcHi);
-      return;
-    case InstArithmetic::Sub:
-      _sub_rmw(AddrLo, SrcLo);
-      _sbb_rmw(AddrHi, SrcHi);
-      return;
-    case InstArithmetic::And:
-      _and_rmw(AddrLo, SrcLo);
-      _and_rmw(AddrHi, SrcHi);
-      return;
-    case InstArithmetic::Or:
-      _or_rmw(AddrLo, SrcLo);
-      _or_rmw(AddrHi, SrcHi);
-      return;
-    case InstArithmetic::Xor:
-      _xor_rmw(AddrLo, SrcLo);
-      _xor_rmw(AddrHi, SrcHi);
-      return;
-    }
-  } else {
-    // x86-32: i8, i16, i32
-    // x86-64: i8, i16, i32, i64
-    switch (RMW->getOp()) {
-    default:
-      // TODO(stichnot): Implement other arithmetic operators.
-      break;
-    case InstArithmetic::Add:
-      Src = legalize(Src, Legal_Reg | Legal_Imm);
-      _add_rmw(Addr, Src);
-      return;
-    case InstArithmetic::Sub:
-      Src = legalize(Src, Legal_Reg | Legal_Imm);
-      _sub_rmw(Addr, Src);
-      return;
-    case InstArithmetic::And:
-      Src = legalize(Src, Legal_Reg | Legal_Imm);
-      _and_rmw(Addr, Src);
-      return;
-    case InstArithmetic::Or:
-      Src = legalize(Src, Legal_Reg | Legal_Imm);
-      _or_rmw(Addr, Src);
-      return;
-    case InstArithmetic::Xor:
-      Src = legalize(Src, Legal_Reg | Legal_Imm);
-      _xor_rmw(Addr, Src);
-      return;
-    }
-  }
-  llvm::report_fatal_error("Couldn't lower RMW instruction");
-}
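-
-// Illustrative sketch, not part of Subzero: on x86-32 the i64 Add case above
-// lowers to _add_rmw on the low 32-bit halves and _adc_rmw on the high
-// halves, with the carry of the low add threaded through, as this C++14
-// constexpr check shows.
-constexpr uint64_t sketchAddAdc(uint64_t Mem, uint64_t Src) {
-  const uint32_t Lo = static_cast<uint32_t>(Mem) + static_cast<uint32_t>(Src);
-  const uint32_t Carry = Lo < static_cast<uint32_t>(Mem) ? 1 : 0;
-  const uint32_t Hi = static_cast<uint32_t>(Mem >> 32) +
-                      static_cast<uint32_t>(Src >> 32) + Carry;
-  return (static_cast<uint64_t>(Hi) << 32) | Lo;
-}
-static_assert(sketchAddAdc(0x00000001FFFFFFFFull, 1) == 0x0000000200000000ull,
-              "the carry of the low add propagates into the high add");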
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerOther(const Inst *Instr) {
-  if (const auto *RMW = llvm::dyn_cast<InstX86FakeRMW>(Instr)) {
-    lowerRMW(RMW);
-  } else {
-    TargetLowering::lowerOther(Instr);
-  }
-}
-
-/// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to preserve
-/// integrity of liveness analysis. Undef values are also turned into zeroes,
-/// since loOperand() and hiOperand() don't expect Undef input.  Also, in
-/// Non-SFI mode, add a FakeUse(RebasePtr) for every pooled constant operand.
-template <typename TraitsType> void TargetX86Base<TraitsType>::prelowerPhis() {
-  if (Traits::Is64Bit) {
-    // On x86-64 we don't need to prelower phis -- the architecture can handle
-    // 64-bit integers natively.
-    return;
-  }
-
-  PhiLowering::prelowerPhis32Bit<TargetX86Base<TraitsType>>(
-      this, Context.getNode(), Func);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::genTargetHelperCallFor(Inst *Instr) {
-  uint32_t StackArgumentsSize = 0;
-  if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
-    RuntimeHelper HelperID = RuntimeHelper::H_Num;
-    Variable *Dest = Arith->getDest();
-    Type DestTy = Dest->getType();
-    if (!Traits::Is64Bit && DestTy == IceType_i64) {
-      switch (Arith->getOp()) {
-      default:
-        return;
-      case InstArithmetic::Udiv:
-        HelperID = RuntimeHelper::H_udiv_i64;
-        break;
-      case InstArithmetic::Sdiv:
-        HelperID = RuntimeHelper::H_sdiv_i64;
-        break;
-      case InstArithmetic::Urem:
-        HelperID = RuntimeHelper::H_urem_i64;
-        break;
-      case InstArithmetic::Srem:
-        HelperID = RuntimeHelper::H_srem_i64;
-        break;
-      }
-    } else if (isVectorType(DestTy)) {
-      Variable *Dest = Arith->getDest();
-      Operand *Src0 = Arith->getSrc(0);
-      Operand *Src1 = Arith->getSrc(1);
-      switch (Arith->getOp()) {
-      default:
-        return;
-      case InstArithmetic::Mul:
-        if (DestTy == IceType_v16i8) {
-          scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
-          Arith->setDeleted();
-        }
-        return;
-      case InstArithmetic::Shl:
-      case InstArithmetic::Lshr:
-      case InstArithmetic::Ashr:
-        if (llvm::isa<Constant>(Src1)) {
-          return;
-        }
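-        // Fall through: non-constant shift amounts are scalarized below.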
-      case InstArithmetic::Udiv:
-      case InstArithmetic::Urem:
-      case InstArithmetic::Sdiv:
-      case InstArithmetic::Srem:
-      case InstArithmetic::Frem:
-        scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
-        Arith->setDeleted();
-        return;
-      }
-    } else {
-      switch (Arith->getOp()) {
-      default:
-        return;
-      case InstArithmetic::Frem:
-        if (isFloat32Asserting32Or64(DestTy))
-          HelperID = RuntimeHelper::H_frem_f32;
-        else
-          HelperID = RuntimeHelper::H_frem_f64;
-      }
-    }
-    constexpr SizeT MaxSrcs = 2;
-    InstCall *Call = makeHelperCall(HelperID, Dest, MaxSrcs);
-    Call->addArg(Arith->getSrc(0));
-    Call->addArg(Arith->getSrc(1));
-    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
-    Context.insert(Call);
-    Arith->setDeleted();
-  } else if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
-    InstCast::OpKind CastKind = Cast->getCastKind();
-    Operand *Src0 = Cast->getSrc(0);
-    const Type SrcType = Src0->getType();
-    Variable *Dest = Cast->getDest();
-    const Type DestTy = Dest->getType();
-    RuntimeHelper HelperID = RuntimeHelper::H_Num;
-    Variable *CallDest = Dest;
-    switch (CastKind) {
-    default:
-      return;
-    case InstCast::Fptosi:
-      if (!Traits::Is64Bit && DestTy == IceType_i64) {
-        HelperID = isFloat32Asserting32Or64(SrcType)
-                       ? RuntimeHelper::H_fptosi_f32_i64
-                       : RuntimeHelper::H_fptosi_f64_i64;
-      } else {
-        return;
-      }
-      break;
-    case InstCast::Fptoui:
-      if (isVectorType(DestTy)) {
-        assert(DestTy == IceType_v4i32);
-        assert(SrcType == IceType_v4f32);
-        HelperID = RuntimeHelper::H_fptoui_4xi32_f32;
-      } else if (DestTy == IceType_i64 ||
-                 (!Traits::Is64Bit && DestTy == IceType_i32)) {
-        if (Traits::Is64Bit) {
-          HelperID = isFloat32Asserting32Or64(SrcType)
-                         ? RuntimeHelper::H_fptoui_f32_i64
-                         : RuntimeHelper::H_fptoui_f64_i64;
-        } else if (isInt32Asserting32Or64(DestTy)) {
-          HelperID = isFloat32Asserting32Or64(SrcType)
-                         ? RuntimeHelper::H_fptoui_f32_i32
-                         : RuntimeHelper::H_fptoui_f64_i32;
-        } else {
-          HelperID = isFloat32Asserting32Or64(SrcType)
-                         ? RuntimeHelper::H_fptoui_f32_i64
-                         : RuntimeHelper::H_fptoui_f64_i64;
-        }
-      } else {
-        return;
-      }
-      break;
-    case InstCast::Sitofp:
-      if (!Traits::Is64Bit && SrcType == IceType_i64) {
-        HelperID = isFloat32Asserting32Or64(DestTy)
-                       ? RuntimeHelper::H_sitofp_i64_f32
-                       : RuntimeHelper::H_sitofp_i64_f64;
-      } else {
-        return;
-      }
-      break;
-    case InstCast::Uitofp:
-      if (isVectorType(SrcType)) {
-        assert(DestTy == IceType_v4f32);
-        assert(SrcType == IceType_v4i32);
-        HelperID = RuntimeHelper::H_uitofp_4xi32_4xf32;
-      } else if (SrcType == IceType_i64 ||
-                 (!Traits::Is64Bit && SrcType == IceType_i32)) {
-        if (isInt32Asserting32Or64(SrcType)) {
-          HelperID = isFloat32Asserting32Or64(DestTy)
-                         ? RuntimeHelper::H_uitofp_i32_f32
-                         : RuntimeHelper::H_uitofp_i32_f64;
-        } else {
-          HelperID = isFloat32Asserting32Or64(DestTy)
-                         ? RuntimeHelper::H_uitofp_i64_f32
-                         : RuntimeHelper::H_uitofp_i64_f64;
-        }
-      } else {
-        return;
-      }
-      break;
-    case InstCast::Bitcast: {
-      if (DestTy == Src0->getType())
-        return;
-      switch (DestTy) {
-      default:
-        return;
-      case IceType_i8:
-        assert(Src0->getType() == IceType_v8i1);
-        HelperID = RuntimeHelper::H_bitcast_8xi1_i8;
-        CallDest = Func->makeVariable(IceType_i32);
-        break;
-      case IceType_i16:
-        assert(Src0->getType() == IceType_v16i1);
-        HelperID = RuntimeHelper::H_bitcast_16xi1_i16;
-        CallDest = Func->makeVariable(IceType_i32);
-        break;
-      case IceType_v8i1: {
-        assert(Src0->getType() == IceType_i8);
-        HelperID = RuntimeHelper::H_bitcast_i8_8xi1;
-        Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
-        // Arguments to functions are required to be at least 32 bits wide.
-        Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
-        Src0 = Src0AsI32;
-      } break;
-      case IceType_v16i1: {
-        assert(Src0->getType() == IceType_i16);
-        HelperID = RuntimeHelper::H_bitcast_i16_16xi1;
-        Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
-        // Arguments to functions are required to be at least 32 bits wide.
-        Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
-        Src0 = Src0AsI32;
-      } break;
-      }
-    } break;
-    }
-    constexpr SizeT MaxSrcs = 1;
-    InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs);
-    Call->addArg(Src0);
-    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
-    Context.insert(Call);
-    // The PNaCl ABI disallows i8/i16 return types, so truncate the helper call
-    // result to the appropriate type as necessary.
-    if (CallDest->getType() != Dest->getType())
-      Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest);
-    Cast->setDeleted();
-  } else if (auto *Intrinsic = llvm::dyn_cast<InstIntrinsic>(Instr)) {
-    CfgVector<Type> ArgTypes;
-    Type ReturnType = IceType_void;
-    switch (Intrinsics::IntrinsicID ID = Intrinsic->getIntrinsicID()) {
-    default:
-      return;
-    case Intrinsics::Ctpop: {
-      Operand *Val = Intrinsic->getArg(0);
-      Type ValTy = Val->getType();
-      if (ValTy == IceType_i64)
-        ArgTypes = {IceType_i64};
-      else
-        ArgTypes = {IceType_i32};
-      ReturnType = IceType_i32;
-    } break;
-    case Intrinsics::Longjmp:
-      ArgTypes = {IceType_i32, IceType_i32};
-      ReturnType = IceType_void;
-      break;
-    case Intrinsics::Memcpy:
-      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
-      ReturnType = IceType_void;
-      break;
-    case Intrinsics::Memmove:
-      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
-      ReturnType = IceType_void;
-      break;
-    case Intrinsics::Memset:
-      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
-      ReturnType = IceType_void;
-      break;
-    case Intrinsics::Setjmp:
-      ArgTypes = {IceType_i32};
-      ReturnType = IceType_i32;
-      break;
-    }
-    StackArgumentsSize = getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
-  } else if (auto *Call = llvm::dyn_cast<InstCall>(Instr)) {
-    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
-  } else if (auto *Ret = llvm::dyn_cast<InstRet>(Instr)) {
-    if (!Ret->hasRetValue())
-      return;
-    Operand *RetValue = Ret->getRetValue();
-    Type ReturnType = RetValue->getType();
-    if (!isScalarFloatingType(ReturnType))
-      return;
-    StackArgumentsSize = typeWidthInBytes(ReturnType);
-  } else {
-    return;
-  }
-  StackArgumentsSize = Traits::applyStackAlignment(StackArgumentsSize);
-  updateMaxOutArgsSizeBytes(StackArgumentsSize);
-}
-
-template <typename TraitsType>
-uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes(
-    const CfgVector<Type> &ArgTypes, Type ReturnType) {
-  uint32_t OutArgumentsSizeBytes = 0;
-  uint32_t XmmArgCount = 0;
-  uint32_t GprArgCount = 0;
-  for (SizeT i = 0, NumArgTypes = ArgTypes.size(); i < NumArgTypes; ++i) {
-    Type Ty = ArgTypes[i];
-    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
-    assert(typeWidthInBytes(Ty) >= 4);
-    if (isVectorType(Ty) &&
-        Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgCount))
-            .hasValue()) {
-      ++XmmArgCount;
-    } else if (isScalarIntegerType(Ty) &&
-               Traits::getRegisterForGprArgNum(
-                   Ty, Traits::getArgIndex(i, GprArgCount))
-                   .hasValue()) {
-      // The 64 bit ABI allows some integers to be passed in GPRs.
-      ++GprArgCount;
-    } else {
-      if (isVectorType(Ty)) {
-        OutArgumentsSizeBytes =
-            Traits::applyStackAlignment(OutArgumentsSizeBytes);
-      }
-      OutArgumentsSizeBytes += typeWidthInBytesOnStack(Ty);
-    }
-  }
-  if (Traits::Is64Bit)
-    return OutArgumentsSizeBytes;
-  // The 32 bit ABI requires floating point values to be returned on the x87 FP
-  // stack. Ensure there is enough space for the fstp/movs for floating returns.
-  if (isScalarFloatingType(ReturnType)) {
-    OutArgumentsSizeBytes =
-        std::max(OutArgumentsSizeBytes,
-                 static_cast<uint32_t>(typeWidthInBytesOnStack(ReturnType)));
-  }
-  return OutArgumentsSizeBytes;
-}
-
-template <typename TraitsType>
-uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes(
-    const InstCall *Instr) {
-  // Build a vector of the arguments' types.
-  const SizeT NumArgs = Instr->getNumArgs();
-  CfgVector<Type> ArgTypes;
-  ArgTypes.reserve(NumArgs);
-  for (SizeT i = 0; i < NumArgs; ++i) {
-    Operand *Arg = Instr->getArg(i);
-    ArgTypes.emplace_back(Arg->getType());
-  }
-  // Compute the return type (if any).
-  Type ReturnType = IceType_void;
-  Variable *Dest = Instr->getDest();
-  if (Dest != nullptr)
-    ReturnType = Dest->getType();
-  return getShadowStoreSize() +
-         getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
-}
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::makeZeroedRegister(Type Ty,
-                                                        RegNumT RegNum) {
-  Variable *Reg = makeReg(Ty, RegNum);
-  switch (Ty) {
-  case IceType_i1:
-  case IceType_i8:
-  case IceType_i16:
-  case IceType_i32:
-  case IceType_i64:
-    // Conservatively do "mov reg, 0" to avoid modifying FLAGS.
-    _mov(Reg, Ctx->getConstantZero(Ty));
-    break;
-  case IceType_f32:
-  case IceType_f64:
-    Context.insert<InstFakeDef>(Reg);
-    _xorps(Reg, Reg);
-    break;
-  default:
-    // All vector types use the same pxor instruction.
-    assert(isVectorType(Ty));
-    Context.insert<InstFakeDef>(Reg);
-    _pxor(Reg, Reg);
-    break;
-  }
-  return Reg;
-}
-
-// There is no support for loading or emitting vector constants, so the vector
-// values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are
-// initialized with register operations.
-//
-// TODO(wala): Add limited support for vector constants so that complex
-// initialization in registers is unnecessary.
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::makeVectorOfZeros(Type Ty,
-                                                       RegNumT RegNum) {
-  return makeZeroedRegister(Ty, RegNum);
-}
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::makeVectorOfMinusOnes(Type Ty,
-                                                           RegNumT RegNum) {
-  Variable *MinusOnes = makeReg(Ty, RegNum);
-  // Insert a FakeDef so the live range of MinusOnes is not overestimated.
-  Context.insert<InstFakeDef>(MinusOnes);
-  if (Ty == IceType_f64)
-    // Making a vector of minus ones of type f64 is currently only used for the
-    // fabs intrinsic.  To use the f64 type to create this mask with pcmpeqq
-    // requires SSE 4.1.  Since we're just creating a mask, pcmpeqd does the
-    // same job and only requires SSE2.
-    _pcmpeq(MinusOnes, MinusOnes, IceType_f32);
-  else
-    _pcmpeq(MinusOnes, MinusOnes);
-  return MinusOnes;
-}
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::makeVectorOfOnes(Type Ty, RegNumT RegNum) {
-  Variable *Dest = makeVectorOfZeros(Ty, RegNum);
-  Variable *MinusOne = makeVectorOfMinusOnes(Ty);
-  _psub(Dest, MinusOne);
-  return Dest;
-}
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::makeVectorOfHighOrderBits(Type Ty,
-                                                               RegNumT RegNum) {
-  assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
-         Ty == IceType_v16i8);
-  if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
-    Variable *Reg = makeVectorOfOnes(Ty, RegNum);
-    SizeT Shift =
-        typeWidthInBytes(typeElementType(Ty)) * Traits::X86_CHAR_BIT - 1;
-    _psll(Reg, Ctx->getConstantInt8(Shift));
-    return Reg;
-  } else {
-    // SSE has no left shift operation for vectors of 8 bit integers.
-    constexpr uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
-    Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
-    Variable *Reg = makeReg(Ty, RegNum);
-    _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
-    _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
-    return Reg;
-  }
-}
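-
-// Illustrative sketch, not part of Subzero: shifting an all-ones lane left
-// by (element width - 1) leaves only the sign bit, and the broadcast
-// constant 0x80808080 packs the same per-byte sign bits for the i8 case
-// that SSE cannot shift per element, matching the two paths above.
-static_assert((~0u << 31) == 0x80000000u, "i32 lane keeps only the sign bit");
-static_assert(static_cast<uint16_t>(~0u << 15) == 0x8000u,
-              "i16 lane keeps only the sign bit");
-static_assert((0x80808080u & 0xFFu) == 0x80u,
-              "each byte of the i8 mask holds just its sign bit");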
-
-/// Construct a mask in a register that can be and'ed with a floating-point
-/// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32
-/// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as a vector
-/// of ones logically right shifted one bit.
-// TODO(stichnot): Fix the wala TODO above, to represent vector constants in
-// memory.
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::makeVectorOfFabsMask(Type Ty,
-                                                          RegNumT RegNum) {
-  Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
-  _psrl(Reg, Ctx->getConstantInt8(1));
-  return Reg;
-}
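-
-// Illustrative sketch, not part of Subzero: all-ones shifted right one bit
-// per lane yields 0x7fffffff (0x7fffffffffffffff for f64 lanes); ANDing an
-// IEEE-754 value's bits with it clears exactly the sign bit, e.g. the bits
-// of -3.5f (0xC0600000) become those of 3.5f (0x40600000).
-static_assert((~0u >> 1) == 0x7FFFFFFFu, "f32 fabs mask");
-static_assert((~0ull >> 1) == 0x7FFFFFFFFFFFFFFFull, "f64 fabs mask");
-static_assert((0xC0600000u & (~0u >> 1)) == 0x40600000u,
-              "masking clears only the sign bit of -3.5f");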
-
-template <typename TraitsType>
-typename TargetX86Base<TraitsType>::X86OperandMem *
-TargetX86Base<TraitsType>::getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
-                                                        uint32_t Offset) {
-  // Ensure that Slot is a stack slot.
-  assert(Slot->mustNotHaveReg());
-  assert(Slot->getRegNum().hasNoValue());
-  // Compute the location of Slot in memory.
-  // TODO(wala,stichnot): lea should not be required. The address of the
-  // stack slot is known at compile time (although not until after
-  // addProlog()).
-  const Type PointerType = getPointerType();
-  Variable *Loc = makeReg(PointerType);
-  _lea(Loc, Slot);
-  Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
-  return X86OperandMem::create(Func, Ty, Loc, ConstantOffset);
-}
-
-/// Lowering helper to copy a scalar integer source operand into some 8-bit GPR.
-/// Src is assumed to already be legalized.  If the source operand is known to
-/// be a memory or immediate operand, a simple mov will suffice.  But if the
-/// source operand can be a physical register, then it must first be copied into
-/// a physical register that is truncatable to 8-bit, then truncated into a
-/// physical register that can receive a truncation, and finally copied into the
-/// result 8-bit register (which in general can be any 8-bit register).  For
-/// example, moving %ebp into %ah may be accomplished as:
-///   movl %ebp, %edx
-///   mov_trunc %edx, %dl  // this redundant assignment is ultimately elided
-///   movb %dl, %ah
-/// On the other hand, moving a memory or immediate operand into ah:
-///   movb 4(%ebp), %ah
-///   movb $my_imm, %ah
-///
-/// Note #1.  On a 64-bit target, the "movb 4(%ebp), %ah" is likely not
-/// encodable, so RegNum=Reg_ah should NOT be given as an argument.  Instead,
-/// use RegNum=RegNumT() and then let the caller do a separate copy into
-/// Reg_ah.
-///
-/// Note #2.  ConstantRelocatable operands are also put through this process
-/// (not truncated directly) because our ELF emitter does R_386_32 relocations
-/// but not R_386_8 relocations.
-///
-/// Note #3.  If Src is a Variable, the result will be an infinite-weight i8
-/// Variable with the RCX86_IsTrunc8Rcvr register class.  As such, this helper
-/// is a convenient way to prevent ah/bh/ch/dh from being an (invalid) argument
-/// to the pinsrb instruction.
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::copyToReg8(Operand *Src, RegNumT RegNum) {
-  Type Ty = Src->getType();
-  assert(isScalarIntegerType(Ty));
-  assert(Ty != IceType_i1);
-  Variable *Reg = makeReg(IceType_i8, RegNum);
-  Reg->setRegClass(RCX86_IsTrunc8Rcvr);
-  if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) {
-    Variable *SrcTruncable = makeReg(Ty);
-    switch (Ty) {
-    case IceType_i64:
-      SrcTruncable->setRegClass(RCX86_Is64To8);
-      break;
-    case IceType_i32:
-      SrcTruncable->setRegClass(RCX86_Is32To8);
-      break;
-    case IceType_i16:
-      SrcTruncable->setRegClass(RCX86_Is16To8);
-      break;
-    default:
-      // i8 - just use default register class
-      break;
-    }
-    Variable *SrcRcvr = makeReg(IceType_i8);
-    SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr);
-    _mov(SrcTruncable, Src);
-    _mov(SrcRcvr, SrcTruncable);
-    Src = SrcRcvr;
-  }
-  _mov(Reg, Src);
-  return Reg;
-}
-
-/// Helper for legalize() to emit the right code to lower an operand to a
-/// register of the appropriate type.
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::copyToReg(Operand *Src, RegNumT RegNum) {
-  Type Ty = Src->getType();
-  Variable *Reg = makeReg(Ty, RegNum);
-  if (isVectorType(Ty)) {
-    _movp(Reg, Src);
-  } else {
-    _mov(Reg, Src);
-  }
-  return Reg;
-}
-
-template <typename TraitsType>
-Operand *TargetX86Base<TraitsType>::legalize(Operand *From, LegalMask Allowed,
-                                             RegNumT RegNum) {
-  const Type Ty = From->getType();
-  // Assert that a physical register is allowed. To date, all calls to
-  // legalize() allow a physical register. If a physical register needs to be
-  // explicitly disallowed, then new code will need to be written to force a
-  // spill.
-  assert(Allowed & Legal_Reg);
-  // If we're asking for a specific physical register, make sure we're not
-  // allowing any other operand kinds. (This could be future work, e.g. allow
-  // the shl shift amount to be either an immediate or in ecx.)
-  assert(RegNum.hasNoValue() || Allowed == Legal_Reg);
-
-  // Substitute with an available infinite-weight variable if possible.  Only do
-  // this when we are not asking for a specific register, and when the
-  // substitution is not locked to a specific register, and when the types
-  // match, in order to capture the vast majority of opportunities and avoid
-  // corner cases in the lowering.
-  if (RegNum.hasNoValue()) {
-    if (Variable *Subst = getContext().availabilityGet(From)) {
-      // At this point we know there is a potential substitution available.
-      if (Subst->mustHaveReg() && !Subst->hasReg()) {
-        // At this point we know the substitution will have a register.
-        if (From->getType() == Subst->getType()) {
-          // At this point we know the substitution's register is compatible.
-          return Subst;
-        }
-      }
-    }
-  }
-
-  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(From)) {
-    // Before doing anything with a Mem operand, we need to ensure that the
-    // Base and Index components are in physical registers.
-    Variable *Base = Mem->getBase();
-    Variable *Index = Mem->getIndex();
-    Constant *Offset = Mem->getOffset();
-    Variable *RegBase = nullptr;
-    Variable *RegIndex = nullptr;
-    uint16_t Shift = Mem->getShift();
-    if (Base) {
-      RegBase = llvm::cast<Variable>(
-          legalize(Base, Legal_Reg | Legal_Rematerializable));
-    }
-    if (Index) {
-      // TODO(jpp): perhaps we should only allow Legal_Reg if
-      // Base->isRematerializable.
-      RegIndex = llvm::cast<Variable>(
-          legalize(Index, Legal_Reg | Legal_Rematerializable));
-    }
-
-    if (Base != RegBase || Index != RegIndex) {
-      Mem = X86OperandMem::create(Func, Ty, RegBase, Offset, RegIndex, Shift,
-                                  Mem->getSegmentRegister());
-    }
-
-    From = Mem;
-
-    if (!(Allowed & Legal_Mem)) {
-      From = copyToReg(From, RegNum);
-    }
-    return From;
-  }
-
-  if (auto *Const = llvm::dyn_cast<Constant>(From)) {
-    if (llvm::isa<ConstantUndef>(Const)) {
-      From = legalizeUndef(Const, RegNum);
-      if (isVectorType(Ty))
-        return From;
-      Const = llvm::cast<Constant>(From);
-    }
-    // There should be no constants of vector type (other than undef).
-    assert(!isVectorType(Ty));
-
-    // If the operand is a 64 bit constant integer we need to legalize it to a
-    // register in x86-64.
-    if (Traits::Is64Bit) {
-      if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Const)) {
-        if (!Utils::IsInt(32, C64->getValue())) {
-          if (RegNum.hasValue()) {
-            assert(Traits::getGprForType(IceType_i64, RegNum) == RegNum);
-          }
-          return copyToReg(Const, RegNum);
-        }
-      }
-    }
-
-    if (!llvm::dyn_cast<ConstantRelocatable>(Const)) {
-      if (isScalarFloatingType(Ty)) {
-        // Convert a scalar floating point constant into an explicit memory
-        // operand.
-        if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(Const)) {
-          if (Utils::isPositiveZero(ConstFloat->getValue()))
-            return makeZeroedRegister(Ty, RegNum);
-        } else if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(Const)) {
-          if (Utils::isPositiveZero(ConstDouble->getValue()))
-            return makeZeroedRegister(Ty, RegNum);
-        }
-
-        auto *CFrom = llvm::cast<Constant>(From);
-        assert(CFrom->getShouldBePooled());
-        Constant *Offset = Ctx->getConstantSym(0, CFrom->getLabelName());
-        auto *Mem = X86OperandMem::create(Func, Ty, nullptr, Offset);
-        From = Mem;
-      }
-    }
-
-    bool NeedsReg = false;
-    if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
-      // Immediate specifically not allowed.
-      NeedsReg = true;
-    if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
-      // On x86, FP constants are lowered to mem operands.
-      NeedsReg = true;
-    if (NeedsReg) {
-      From = copyToReg(From, RegNum);
-    }
-    return From;
-  }
-
-  if (auto *Var = llvm::dyn_cast<Variable>(From)) {
-    // Check if the variable is guaranteed a physical register. This can happen
-    // either when the variable is pre-colored or when it is assigned infinite
-    // weight.
-    bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
-    bool MustRematerialize =
-        (Var->isRematerializable() && !(Allowed & Legal_Rematerializable));
-    // We need a new physical register for the operand if:
-    // - Mem is not allowed and Var isn't guaranteed a physical register, or
-    // - RegNum is required and Var->getRegNum() doesn't match, or
-    // - Var is a rematerializable variable and rematerializable pass-through is
-    //   not allowed (in which case we need a lea instruction).
-    if (MustRematerialize) {
-      Variable *NewVar = makeReg(Ty, RegNum);
-      // Since Var is rematerializable, the offset will be added when the lea is
-      // emitted.
-      constexpr Constant *NoOffset = nullptr;
-      auto *Mem = X86OperandMem::create(Func, Ty, Var, NoOffset);
-      _lea(NewVar, Mem);
-      From = NewVar;
-    } else if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
-               (RegNum.hasValue() && RegNum != Var->getRegNum())) {
-      From = copyToReg(From, RegNum);
-    }
-    return From;
-  }
-
-  llvm::report_fatal_error("Unhandled operand kind in legalize()");
-  return From;
-}
-
-/// Provide a trivial wrapper to legalize() for this common usage.
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::legalizeToReg(Operand *From,
-                                                   RegNumT RegNum) {
-  return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
-}
-
-/// Legalize undef values to concrete values.
-template <typename TraitsType>
-Operand *TargetX86Base<TraitsType>::legalizeUndef(Operand *From,
-                                                  RegNumT RegNum) {
-  Type Ty = From->getType();
-  if (llvm::isa<ConstantUndef>(From)) {
-    // Lower undefs to zero.  Another option is to lower undefs to an
-    // uninitialized register; however, using an uninitialized register results
-    // in less predictable code.
-    //
-    // If in the future the implementation is changed to lower undef values to
-    // uninitialized registers, a FakeDef will be needed:
-    //     Context.insert<InstFakeDef>(Reg);
-    // This is in order to ensure that the live range of Reg is not
-    // overestimated.  If the constant being lowered is a 64 bit value, then
-    // the result should be split and the lo and hi components will need to go
-    // in uninitialized registers.
-    if (isVectorType(Ty))
-      return makeVectorOfZeros(Ty, RegNum);
-    return Ctx->getConstantZero(Ty);
-  }
-  return From;
-}
-
-/// For the cmp instruction, if Src1 is an immediate, or known to be a physical
-/// register, we can allow Src0 to be a memory operand. Otherwise, Src0 must be
-/// copied into a physical register. (Actually, either Src0 or Src1 can be
-/// chosen for the physical register, but unfortunately we have to commit to one
-/// or the other before register allocation.)
-template <typename TraitsType>
-Operand *TargetX86Base<TraitsType>::legalizeSrc0ForCmp(Operand *Src0,
-                                                       Operand *Src1) {
-  bool IsSrc1ImmOrReg = false;
-  if (llvm::isa<Constant>(Src1)) {
-    IsSrc1ImmOrReg = true;
-  } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) {
-    if (Var->hasReg())
-      IsSrc1ImmOrReg = true;
-  }
-  return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
-}
-
-template <typename TraitsType>
-typename TargetX86Base<TraitsType>::X86OperandMem *
-TargetX86Base<TraitsType>::formMemoryOperand(Operand *Opnd, Type Ty,
-                                             bool DoLegalize) {
-  auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd);
-  // Address mode optimization may already have created an X86OperandMem, in
-  // which case no further transformation is needed.
-  if (!Mem) {
-    auto *Base = llvm::dyn_cast<Variable>(Opnd);
-    auto *Offset = llvm::dyn_cast<Constant>(Opnd);
-    assert(Base || Offset);
-    if (Offset) {
-      if (!llvm::isa<ConstantRelocatable>(Offset)) {
-        if (llvm::isa<ConstantInteger64>(Offset)) {
-          // Memory operands cannot have 64-bit immediates, so they must be
-          // legalized into a register only.
-          Base = llvm::cast<Variable>(legalize(Offset, Legal_Reg));
-          Offset = nullptr;
-        } else {
-          Offset = llvm::cast<Constant>(legalize(Offset));
-
-          assert(llvm::isa<ConstantInteger32>(Offset) ||
-                 llvm::isa<ConstantRelocatable>(Offset));
-        }
-      }
-    }
-    Mem = X86OperandMem::create(Func, Ty, Base, Offset);
-  }
-  return llvm::cast<X86OperandMem>(DoLegalize ? legalize(Mem) : Mem);
-}
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::makeReg(Type Type, RegNumT RegNum) {
-  // There aren't any 64-bit integer registers for x86-32.
-  assert(Traits::Is64Bit || Type != IceType_i64);
-  Variable *Reg = Func->makeVariable(Type);
-  if (RegNum.hasValue())
-    Reg->setRegNum(RegNum);
-  else
-    Reg->setMustHaveReg();
-  return Reg;
-}
-
-const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32, IceType_f64,
-                            IceType_v16i8};
-
-template <typename TraitsType>
-Type TargetX86Base<TraitsType>::largestTypeInSize(uint32_t Size,
-                                                  uint32_t MaxSize) {
-  assert(Size != 0);
-  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
-  uint32_t MaxIndex = MaxSize == NoSizeLimit
-                          ? llvm::array_lengthof(TypeForSize) - 1
-                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
-  return TypeForSize[std::min(TyIndex, MaxIndex)];
-}
-
-template <typename TraitsType>
-Type TargetX86Base<TraitsType>::firstTypeThatFitsSize(uint32_t Size,
-                                                      uint32_t MaxSize) {
-  assert(Size != 0);
-  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
-  if (!llvm::isPowerOf2_32(Size))
-    ++TyIndex;
-  uint32_t MaxIndex = MaxSize == NoSizeLimit
-                          ? llvm::array_lengthof(TypeForSize) - 1
-                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
-  return TypeForSize[std::min(TyIndex, MaxIndex)];
-}
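-
-// Illustrative sketch, not part of Subzero: TypeForSize covers 1, 2, 4, 8
-// and 16 bytes, so largestTypeInSize indexes it with floor(log2(Size)),
-// which is what llvm::findLastSet returns, and firstTypeThatFitsSize rounds
-// up when Size is not a power of two.
-constexpr unsigned sketchFloorLog2(unsigned V) {
-  return V <= 1 ? 0 : 1 + sketchFloorLog2(V / 2);
-}
-static_assert(sketchFloorLog2(4) == 2, "4 bytes -> TypeForSize[2], i.e. i32");
-static_assert(sketchFloorLog2(6) == 2, "largest type within 6 bytes is i32");
-static_assert(sketchFloorLog2(6) + 1 == 3, "first type fitting 6 bytes: f64");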
-
-template <typename TraitsType> void TargetX86Base<TraitsType>::postLower() {
-  if (Func->getOptLevel() == Opt_m1)
-    return;
-  markRedefinitions();
-  Context.availabilityUpdate();
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::emit(const ConstantInteger32 *C) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  Str << "$" << C->getValue();
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::emit(const ConstantInteger64 *C) const {
-  if (!Traits::Is64Bit) {
-    llvm::report_fatal_error("Not expecting to emit 64-bit integers");
-  } else {
-    if (!BuildDefs::dump())
-      return;
-    Ostream &Str = Ctx->getStrEmit();
-    Str << "$" << C->getValue();
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::emit(const ConstantFloat *C) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  Str << C->getLabelName();
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::emit(const ConstantDouble *C) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  Str << C->getLabelName();
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::emit(const ConstantUndef *) const {
-  llvm::report_fatal_error("undef value encountered by emitter.");
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::emit(const ConstantRelocatable *C) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  Str << "$";
-  emitWithoutPrefix(C);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::emitJumpTable(
-    const Cfg *, const InstJumpTable *JumpTable) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  Str << "\t.section\t.rodata." << JumpTable->getSectionName()
-      << ",\"a\",@progbits\n"
-         "\t.align\t"
-      << typeWidthInBytes(getPointerType()) << "\n"
-      << JumpTable->getName() << ":";
-
-  for (SizeT I = 0; I < JumpTable->getNumTargets(); ++I)
-    Str << "\n\t.val\t" << JumpTable->getTarget(I)->getAsmName();
-  Str << "\n";
-}
-
-template <typename TraitsType>
-template <typename T>
-void TargetDataX86<TraitsType>::emitConstantPool(GlobalContext *Ctx) {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  Type Ty = T::Ty;
-  SizeT Align = typeAlignInBytes(Ty);
-  ConstantList Pool = Ctx->getConstantPool(Ty);
-
-  Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
-      << "\n";
-  Str << "\t.align\t" << Align << "\n";
-
-  for (Constant *C : Pool) {
-    if (!C->getShouldBePooled())
-      continue;
-    auto *Const = llvm::cast<typename T::IceType>(C);
-    typename T::IceType::PrimType Value = Const->getValue();
-    // Use memcpy() to copy bits from Value into RawValue in a way that avoids
-    // breaking strict-aliasing rules.
-    typename T::PrimitiveIntType RawValue;
-    memcpy(&RawValue, &Value, sizeof(Value));
-    char buf[30];
-    int CharsPrinted =
-        snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
-    assert(CharsPrinted >= 0);
-    assert((size_t)CharsPrinted < llvm::array_lengthof(buf));
-    (void)CharsPrinted; // avoid warnings if asserts are disabled
-    Str << Const->getLabelName();
-    Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t/* " << T::TypeName << " "
-        << Value << " */\n";
-  }
-}
-
-template <typename TraitsType>
-void TargetDataX86<TraitsType>::lowerConstants() {
-  if (getFlags().getDisableTranslation())
-    return;
-  switch (getFlags().getOutFileType()) {
-  case FT_Elf: {
-    ELFObjectWriter *Writer = Ctx->getObjectWriter();
-
-    Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
-    Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
-    Writer->writeConstantPool<ConstantInteger32>(IceType_i32);
-
-    Writer->writeConstantPool<ConstantFloat>(IceType_f32);
-    Writer->writeConstantPool<ConstantDouble>(IceType_f64);
-  } break;
-  case FT_Asm:
-  case FT_Iasm: {
-    OstreamLocker L(Ctx);
-
-    emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
-    emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
-    emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);
-
-    emitConstantPool<PoolTypeConverter<float>>(Ctx);
-    emitConstantPool<PoolTypeConverter<double>>(Ctx);
-  } break;
-  }
-}
-
-template <typename TraitsType>
-void TargetDataX86<TraitsType>::lowerJumpTables() {
-  const bool IsPIC = false;
-  switch (getFlags().getOutFileType()) {
-  case FT_Elf: {
-    ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    constexpr FixupKind FK_Abs64 = llvm::ELF::R_X86_64_64;
-    const FixupKind RelocationKind =
-        (getPointerType() == IceType_i32) ? Traits::FK_Abs : FK_Abs64;
-    for (const JumpTableData &JT : Ctx->getJumpTables())
-      Writer->writeJumpTable(JT, RelocationKind, IsPIC);
-  } break;
-  case FT_Asm:
-    // Already emitted from Cfg
-    break;
-  case FT_Iasm: {
-    if (!BuildDefs::dump())
-      return;
-    Ostream &Str = Ctx->getStrEmit();
-    const char *Prefix = IsPIC ? ".data.rel.ro." : ".rodata.";
-    for (const JumpTableData &JT : Ctx->getJumpTables()) {
-      Str << "\t.section\t" << Prefix << JT.getSectionName()
-          << ",\"a\",@progbits\n"
-             "\t.align\t"
-          << typeWidthInBytes(getPointerType()) << "\n"
-          << JT.getName().toString() << ":";
-
-      for (intptr_t TargetOffset : JT.getTargetOffsets())
-        Str << "\n\t.val\t" << JT.getFunctionName() << "+" << TargetOffset;
-      Str << "\n";
-    }
-  } break;
-  }
-}
-
-template <typename TraitsType>
-void TargetDataX86<TraitsType>::lowerGlobals(
-    const VariableDeclarationList &Vars, const std::string &SectionSuffix) {
-  const bool IsPIC = false;
-  switch (getFlags().getOutFileType()) {
-  case FT_Elf: {
-    ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    Writer->writeDataSection(Vars, Traits::FK_Abs, SectionSuffix, IsPIC);
-  } break;
-  case FT_Asm:
-  case FT_Iasm: {
-    OstreamLocker L(Ctx);
-    for (const VariableDeclaration *Var : Vars) {
-      if (getFlags().matchTranslateOnly(Var->getName(), 0)) {
-        emitGlobal(*Var, SectionSuffix);
-      }
-    }
-  } break;
-  }
-}
-} // namespace X8632
-} // end of namespace Ice
-
-#endif // SUBZERO_SRC_ICETARGETLOWERINGX8632BASEIMPL_H
diff --git a/third_party/subzero/src/IceTargetLoweringX8632Traits.h b/third_party/subzero/src/IceTargetLoweringX8632Traits.h
index d150933..9637d63 100644
--- a/third_party/subzero/src/IceTargetLoweringX8632Traits.h
+++ b/third_party/subzero/src/IceTargetLoweringX8632Traits.h
@@ -15,7 +15,7 @@
 #ifndef SUBZERO_SRC_ICETARGETLOWERINGX8632TRAITS_H
 #define SUBZERO_SRC_ICETARGETLOWERINGX8632TRAITS_H
 
-#include "IceAssemblerX8632.h"
+#include "IceAssembler.h"
 #include "IceConditionCodesX86.h"
 #include "IceDefs.h"
 #include "IceInst.h"
@@ -33,9 +33,9 @@
 namespace X8632 {
 using namespace ::Ice::X86;
 
+struct Insts;
+class TargetX8632;
 class AssemblerX8632;
-template <class Machine> struct Insts;
-template <class Machine> class TargetX86Base;
 
 class TargetX8632;
 
@@ -48,9 +48,6 @@
   //      \/_/\/_/\/_____/\/_/  \/_/
   //
   //----------------------------------------------------------------------------
-  static constexpr ::Ice::Assembler::AssemblerKind AsmKind =
-      ::Ice::Assembler::Asm_X8632;
-
   static constexpr bool Is64Bit = false;
   static constexpr ::Ice::RegX8632::GPRRegister Last8BitGPR =
       ::Ice::RegX8632::GPRRegister::Encoded_Reg_ebx;
@@ -259,14 +256,6 @@
   //      \/_____/\/_____/\/_/   \/_/\/_____/\/_/ /_/\/_/\/_/ \/_/\/_____/
   //
   //----------------------------------------------------------------------------
-  enum InstructionSet {
-    Begin,
-    // SSE2 is the PNaCl baseline instruction set.
-    SSE2 = Begin,
-    SSE4_1,
-    End
-  };
-
   static const char *TargetName;
   static constexpr Type WordType = IceType_i32;
 
@@ -607,6 +596,8 @@
   static constexpr uint32_t X86_MAX_XMM_ARGS = 4;
   /// The maximum number of arguments to pass in GPR registers
   static constexpr uint32_t X86_MAX_GPR_ARGS = 0;
+  /// Whether scalar floating point arguments are passed in XMM registers
+  static constexpr bool X86_PASS_SCALAR_FP_IN_XMM = false;
   /// Get the register for a given argument slot in the XMM registers.
   static RegNumT getRegisterForXmmArgNum(uint32_t ArgNum) {
     // TODO(sehr): Change to use the CCArg technique used in ARM32.
@@ -735,10 +726,8 @@
   //
   //----------------------------------------------------------------------------
   using Traits = TargetX8632Traits;
-  using Insts = ::Ice::X8632::Insts<Traits>;
 
-  using TargetLowering = ::Ice::X8632::TargetX86Base<Traits>;
-  using ConcreteTarget = ::Ice::X8632::TargetX8632;
+  using TargetLowering = ::Ice::X8632::TargetX8632;
   using Assembler = ::Ice::X8632::AssemblerX8632;
 
   /// X86Operand extends the Operand hierarchy. Its subclasses are X86OperandMem
diff --git a/third_party/subzero/src/IceTargetLoweringX8664.cpp b/third_party/subzero/src/IceTargetLoweringX8664.cpp
index 610bb3d..09cf4f9 100644
--- a/third_party/subzero/src/IceTargetLoweringX8664.cpp
+++ b/third_party/subzero/src/IceTargetLoweringX8664.cpp
@@ -14,22 +14,38 @@
 //===----------------------------------------------------------------------===//
 #include "IceTargetLoweringX8664.h"
 
+#include "IceCfg.h"
+#include "IceCfgNode.h"
+#include "IceClFlags.h"
 #include "IceDefs.h"
+#include "IceELFObjectWriter.h"
+#include "IceGlobalInits.h"
+#include "IceInstVarIter.h"
+#include "IceInstX8664.h"
+#include "IceLiveness.h"
+#include "IceOperand.h"
+#include "IcePhiLoweringImpl.h"
 #include "IceTargetLoweringX8664Traits.h"
+#include "IceUtils.h"
+#include "IceVariableSplitting.h"
+
+#include "llvm/Support/MathExtras.h"
+
+#include <stack>
 
 #if defined(_WIN64)
 extern "C" void __chkstk();
 #endif
 
 namespace X8664 {
+
 std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) {
   return ::Ice::X8664::TargetX8664::create(Func);
 }
 
 std::unique_ptr<::Ice::TargetDataLowering>
 createTargetDataLowering(::Ice::GlobalContext *Ctx) {
-  return ::Ice::X8664::TargetDataX86<::Ice::X8664::TargetX8664Traits>::create(
-      Ctx);
+  return ::Ice::X8664::TargetDataX8664::create(Ctx);
 }
 
 std::unique_ptr<::Ice::TargetHeaderLowering>
@@ -54,6 +70,7974 @@
 namespace Ice {
 namespace X8664 {
 
+template <typename T> struct PoolTypeConverter {};
+
+template <> struct PoolTypeConverter<float> {
+  using PrimitiveIntType = uint32_t;
+  using IceType = ConstantFloat;
+  static const Type Ty = IceType_f32;
+  static const char *TypeName;
+  static const char *AsmTag;
+  static const char *PrintfString;
+};
+
+template <> struct PoolTypeConverter<double> {
+  using PrimitiveIntType = uint64_t;
+  using IceType = ConstantDouble;
+  static const Type Ty = IceType_f64;
+  static const char *TypeName;
+  static const char *AsmTag;
+  static const char *PrintfString;
+};
+
+// Add converter for int type constant pooling
+template <> struct PoolTypeConverter<uint32_t> {
+  using PrimitiveIntType = uint32_t;
+  using IceType = ConstantInteger32;
+  static const Type Ty = IceType_i32;
+  static const char *TypeName;
+  static const char *AsmTag;
+  static const char *PrintfString;
+};
+
+// Add converter for int type constant pooling
+template <> struct PoolTypeConverter<uint16_t> {
+  using PrimitiveIntType = uint32_t;
+  using IceType = ConstantInteger32;
+  static const Type Ty = IceType_i16;
+  static const char *TypeName;
+  static const char *AsmTag;
+  static const char *PrintfString;
+};
+
+// Add converter for int type constant pooling
+template <> struct PoolTypeConverter<uint8_t> {
+  using PrimitiveIntType = uint32_t;
+  using IceType = ConstantInteger32;
+  static const Type Ty = IceType_i8;
+  static const char *TypeName;
+  static const char *AsmTag;
+  static const char *PrintfString;
+};
+
+const char *PoolTypeConverter<float>::TypeName = "float";
+const char *PoolTypeConverter<float>::AsmTag = ".long";
+const char *PoolTypeConverter<float>::PrintfString = "0x%x";
+
+const char *PoolTypeConverter<double>::TypeName = "double";
+const char *PoolTypeConverter<double>::AsmTag = ".quad";
+const char *PoolTypeConverter<double>::PrintfString = "0x%llx";
+
+const char *PoolTypeConverter<uint32_t>::TypeName = "i32";
+const char *PoolTypeConverter<uint32_t>::AsmTag = ".long";
+const char *PoolTypeConverter<uint32_t>::PrintfString = "0x%x";
+
+const char *PoolTypeConverter<uint16_t>::TypeName = "i16";
+const char *PoolTypeConverter<uint16_t>::AsmTag = ".short";
+const char *PoolTypeConverter<uint16_t>::PrintfString = "0x%x";
+
+const char *PoolTypeConverter<uint8_t>::TypeName = "i8";
+const char *PoolTypeConverter<uint8_t>::AsmTag = ".byte";
+const char *PoolTypeConverter<uint8_t>::PrintfString = "0x%x";
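+// Editor's illustrative sketch (not part of this change; emitPoolEntry is a
+// hypothetical helper name): these trait structs let a single pool emitter
+// format any pooled constant generically, as the removed templated
+// emitConstantPool() above did:
+//
+//   template <typename T> void emitPoolEntry(Ostream &Str, Constant *C) {
+//     auto *Const = llvm::cast<typename T::IceType>(C);
+//     typename T::IceType::PrimType Value = Const->getValue();
+//     typename T::PrimitiveIntType RawValue;
+//     memcpy(&RawValue, &Value, sizeof(Value)); // bit-copy, no aliasing UB
+//     char Buf[30];
+//     snprintf(Buf, sizeof(Buf), T::PrintfString, RawValue);
+//     Str << Const->getLabelName() << ":\n\t" << T::AsmTag << "\t" << Buf
+//         << "\t/* " << T::TypeName << " " << Value << " */\n";
+//   }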
+
+// The Microsoft x64 ABI requires the caller to allocate a minimum 32-byte
+// "shadow store" (aka "home space") so that the callee may copy the 4
+// register args to it.
+SizeT getShadowStoreSize() {
+#if defined(_WIN64)
+  static const SizeT ShadowStoreSize =
+      Traits::Is64Bit ? 4 * typeWidthInBytes(Traits::WordType) : 0;
+  return ShadowStoreSize;
+#else
+  return 0;
+#endif
+}
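+// Editor's note (illustrative, assuming Traits::WordType is IceType_i64 on
+// x86-64): on Win64 the shadow store works out to
+//   4 * typeWidthInBytes(IceType_i64) == 4 * 8 == 32 bytes,
+// the minimum the Microsoft x64 calling convention requires the caller to
+// reserve; on all other platforms getShadowStoreSize() returns 0.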
+
+BoolFoldingEntry::BoolFoldingEntry(Inst *I)
+    : Instr(I), IsComplex(BoolFolding::hasComplexLowering(I)) {}
+
+BoolFolding::BoolFoldingProducerKind
+BoolFolding::getProducerKind(const Inst *Instr) {
+  if (llvm::isa<InstIcmp>(Instr)) {
+    if (Traits::Is64Bit || Instr->getSrc(0)->getType() != IceType_i64)
+      return PK_Icmp32;
+    return PK_Icmp64;
+  }
+  if (llvm::isa<InstFcmp>(Instr))
+    return PK_Fcmp;
+  if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
+    if (Traits::Is64Bit || Arith->getSrc(0)->getType() != IceType_i64) {
+      switch (Arith->getOp()) {
+      default:
+        return PK_None;
+      case InstArithmetic::And:
+      case InstArithmetic::Or:
+        return PK_Arith;
+      }
+    }
+  }
+  return PK_None; // TODO(stichnot): remove this
+
+  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
+    switch (Cast->getCastKind()) {
+    default:
+      return PK_None;
+    case InstCast::Trunc:
+      return PK_Trunc;
+    }
+  }
+  return PK_None;
+}
+
+BoolFolding::BoolFoldingConsumerKind
+BoolFolding::getConsumerKind(const Inst *Instr) {
+  if (llvm::isa<InstBr>(Instr))
+    return CK_Br;
+  if (llvm::isa<InstSelect>(Instr))
+    return CK_Select;
+  return CK_None; // TODO(stichnot): remove this
+
+  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
+    switch (Cast->getCastKind()) {
+    default:
+      return CK_None;
+    case InstCast::Sext:
+      return CK_Sext;
+    case InstCast::Zext:
+      return CK_Zext;
+    }
+  }
+  return CK_None;
+}
+
+/// Returns true if the producing instruction has a "complex" lowering
+/// sequence. This generally means that its lowering sequence requires more
+/// than one conditional branch, namely 64-bit integer compares and some
+/// floating-point compares. When this is true and there is more than one
+/// consumer, we prefer to disable the folding optimization, since folding
+/// would duplicate the multi-branch sequence for each consumer.
+bool BoolFolding::hasComplexLowering(const Inst *Instr) {
+  switch (getProducerKind(Instr)) {
+  default:
+    return false;
+  case PK_Icmp64:
+    return !Traits::Is64Bit;
+  case PK_Fcmp:
+    return Traits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()].C2 !=
+           CondX86::Br_None;
+  }
+}
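+// Editor's illustrative note (sketch, not from the source): a "complex" fcmp
+// producer is one whose Traits::TableFcmp entry carries a second condition
+// (C2). For example, an ordered equality compare lowers to roughly:
+//
+//   ucomiss %xmm1, %xmm0   // sets ZF/PF/CF
+//   jne .Lfalse            // C1: not equal
+//   jp  .Lfalse            // C2: unordered (NaN) also means "not equal"
+//
+// Folding such a producer into multiple consumers would duplicate both
+// branches, which is what hasComplexLowering() lets callers avoid.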
+
+bool BoolFolding::isValidFolding(
+    BoolFolding::BoolFoldingProducerKind ProducerKind,
+    BoolFolding::BoolFoldingConsumerKind ConsumerKind) {
+  switch (ProducerKind) {
+  default:
+    return false;
+  case PK_Icmp32:
+  case PK_Icmp64:
+  case PK_Fcmp:
+    return (ConsumerKind == CK_Br) || (ConsumerKind == CK_Select);
+  case PK_Arith:
+    return ConsumerKind == CK_Br;
+  }
+}
+
+void BoolFolding::init(CfgNode *Node) {
+  Producers.clear();
+  for (Inst &Instr : Node->getInsts()) {
+    if (Instr.isDeleted())
+      continue;
+    invalidateProducersOnStore(&Instr);
+    // Check whether Instr is a valid producer.
+    Variable *Var = Instr.getDest();
+    if (Var) { // only consider instructions with an actual dest var
+      if (isBooleanType(Var->getType())) {        // only bool-type dest vars
+        if (getProducerKind(&Instr) != PK_None) { // white-listed instructions
+          Producers[Var->getIndex()] = BoolFoldingEntry(&Instr);
+        }
+      }
+    }
+    // Check each src variable against the map.
+    FOREACH_VAR_IN_INST(Var, Instr) {
+      SizeT VarNum = Var->getIndex();
+      if (!containsValid(VarNum))
+        continue;
+      // All valid consumers use Var as the first source operand
+      if (IndexOfVarOperandInInst(Var) != 0) {
+        setInvalid(VarNum);
+        continue;
+      }
+      // Consumer instructions must be white-listed
+      BoolFolding::BoolFoldingConsumerKind ConsumerKind =
+          getConsumerKind(&Instr);
+      if (ConsumerKind == CK_None) {
+        setInvalid(VarNum);
+        continue;
+      }
+      BoolFolding::BoolFoldingProducerKind ProducerKind =
+          getProducerKind(Producers[VarNum].Instr);
+      if (!isValidFolding(ProducerKind, ConsumerKind)) {
+        setInvalid(VarNum);
+        continue;
+      }
+      // Avoid creating multiple copies of complex producer instructions.
+      if (Producers[VarNum].IsComplex && Producers[VarNum].NumUses > 0) {
+        setInvalid(VarNum);
+        continue;
+      }
+      ++Producers[VarNum].NumUses;
+      if (Instr.isLastUse(Var)) {
+        Producers[VarNum].IsLiveOut = false;
+      }
+    }
+  }
+  for (auto &I : Producers) {
+    // Ignore entries previously marked invalid.
+    if (I.second.Instr == nullptr)
+      continue;
+    // Disable the producer if its dest may be live beyond this block.
+    if (I.second.IsLiveOut) {
+      setInvalid(I.first);
+      continue;
+    }
+    // Mark as "dead" rather than outright deleting. This is so that other
+    // peephole style optimizations during or before lowering have access to
+    // this instruction in undeleted form. See for example
+    // tryOptimizedCmpxchgCmpBr().
+    I.second.Instr->setDead();
+  }
+}
+
+const Inst *BoolFolding::getProducerFor(const Operand *Opnd) const {
+  auto *Var = llvm::dyn_cast<const Variable>(Opnd);
+  if (Var == nullptr)
+    return nullptr;
+  SizeT VarNum = Var->getIndex();
+  auto Element = Producers.find(VarNum);
+  if (Element == Producers.end())
+    return nullptr;
+  return Element->second.Instr;
+}
+
+void BoolFolding::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump() || !Func->isVerbose(IceV_Folding))
+    return;
+  OstreamLocker L(Func->getContext());
+  Ostream &Str = Func->getContext()->getStrDump();
+  for (auto &I : Producers) {
+    if (I.second.Instr == nullptr)
+      continue;
+    Str << "Found foldable producer:\n  ";
+    I.second.Instr->dump(Func);
+    Str << "\n";
+  }
+}
+
+/// If the given instruction has potential memory side effects (e.g. store, rmw,
+/// or a call instruction with potential memory side effects), then we must not
+/// allow a pre-store Producer instruction with memory operands to be folded
+/// into a post-store Consumer instruction.  If this is detected, the Producer
+/// is invalidated.
+///
+/// We use the Producer's IsLiveOut field to determine whether any potential
+/// Consumers come after this store instruction.  The IsLiveOut field is
+/// initialized to true, and BoolFolding::init() sets IsLiveOut to false when it
+/// sees the variable's definitive last use (indicating the variable is not in
+/// the node's live-out set).  Thus if we see here that IsLiveOut is false, we
+/// know that there can be no consumers after the store, and therefore we know
+/// the folding is safe despite the store instruction.
+void BoolFolding::invalidateProducersOnStore(const Inst *Instr) {
+  if (!Instr->isMemoryWrite())
+    return;
+  for (auto &ProducerPair : Producers) {
+    if (!ProducerPair.second.IsLiveOut)
+      continue;
+    Inst *PInst = ProducerPair.second.Instr;
+    if (PInst == nullptr)
+      continue;
+    bool HasMemOperand = false;
+    const SizeT SrcSize = PInst->getSrcSize();
+    for (SizeT I = 0; I < SrcSize; ++I) {
+      if (llvm::isa<Traits::X86OperandMem>(PInst->getSrc(I))) {
+        HasMemOperand = true;
+        break;
+      }
+    }
+    if (!HasMemOperand)
+      continue;
+    setInvalid(ProducerPair.first);
+  }
+}
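+// Editor's illustrative example (hypothetical IR, not from the source):
+//
+//   %cond = icmp eq i32 [mem], %x   // producer with a memory operand
+//   store i32 %y, [mem]             // may modify [mem]
+//   br %cond, ...                   // consumer after the store
+//
+// Folding the icmp into the br would move the [mem] read past the store, so
+// invalidateProducersOnStore() drops such producers unless IsLiveOut is
+// already false (i.e. the last consumer came before the store).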
+
+void TargetX8664::initNodeForLowering(CfgNode *Node) {
+  FoldingInfo.init(Node);
+  FoldingInfo.dump(Func);
+}
+
+TargetX8664::TargetX8664(Cfg *Func) : TargetX86(Func) {}
+
+void TargetX8664::staticInit(GlobalContext *Ctx) {
+  RegNumT::setLimit(Traits::RegisterSet::Reg_NUM);
+  Traits::initRegisterSet(getFlags(), &TypeToRegisterSet, &RegisterAliases);
+  for (size_t i = 0; i < TypeToRegisterSet.size(); ++i)
+    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
+  filterTypeToRegisterSet(Ctx, Traits::RegisterSet::Reg_NUM,
+                          TypeToRegisterSet.data(), TypeToRegisterSet.size(),
+                          Traits::getRegName, getRegClassName);
+}
+
+bool TargetX8664::shouldBePooled(const Constant *C) {
+  if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(C)) {
+    return !Utils::isPositiveZero(ConstFloat->getValue());
+  }
+  if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(C)) {
+    return !Utils::isPositiveZero(ConstDouble->getValue());
+  }
+  return false;
+}
+
+::Ice::Type TargetX8664::getPointerType() {
+  if (!Traits::Is64Bit) {
+    return ::Ice::IceType_i32;
+  }
+  return ::Ice::IceType_i64;
+}
+
+void TargetX8664::translateO2() {
+  TimerMarker T(TimerStack::TT_O2, Func);
+
+  genTargetHelperCalls();
+  Func->dump("After target helper call insertion");
+
+  // Merge Alloca instructions, and lay out the stack.
+  static constexpr bool SortAndCombineAllocas = true;
+  Func->processAllocas(SortAndCombineAllocas);
+  Func->dump("After Alloca processing");
+
+  // Run this early so it can be used to focus optimizations on potentially hot
+  // code.
+  // TODO(stichnot,ascull): currently this is only used for regalloc, not for
+  // the expensive high-level optimizations, which could be focused on
+  // potentially hot code.
+  Func->generateLoopInfo();
+  Func->dump("After loop analysis");
+  if (getFlags().getLoopInvariantCodeMotion()) {
+    Func->loopInvariantCodeMotion();
+    Func->dump("After LICM");
+  }
+
+  if (getFlags().getLocalCSE() != Ice::LCSE_Disabled) {
+    Func->localCSE(getFlags().getLocalCSE() == Ice::LCSE_EnabledSSA);
+    Func->dump("After Local CSE");
+    Func->floatConstantCSE();
+  }
+  if (getFlags().getEnableShortCircuit()) {
+    Func->shortCircuitJumps();
+    Func->dump("After Short Circuiting");
+  }
+
+  if (!getFlags().getEnablePhiEdgeSplit()) {
+    // Lower Phi instructions.
+    Func->placePhiLoads();
+    if (Func->hasError())
+      return;
+    Func->placePhiStores();
+    if (Func->hasError())
+      return;
+    Func->deletePhis();
+    if (Func->hasError())
+      return;
+    Func->dump("After Phi lowering");
+  }
+
+  // Address mode optimization.
+  Func->getVMetadata()->init(VMK_SingleDefs);
+  Func->doAddressOpt();
+  Func->materializeVectorShuffles();
+
+  // Find read-modify-write opportunities. Do this after address mode
+  // optimization so that doAddressOpt() doesn't need to be applied to RMW
+  // instructions as well.
+  findRMW();
+  Func->dump("After RMW transform");
+
+  // Argument lowering
+  Func->doArgLowering();
+
+  // Target lowering. This requires liveness analysis for some parts of the
+  // lowering decisions, such as compare/branch fusing. If non-lightweight
+  // liveness analysis is used, the instructions need to be renumbered first.
+  // TODO: This renumbering should only be necessary if we're actually
+  // calculating live intervals, which we only do for register allocation.
+  Func->renumberInstructions();
+  if (Func->hasError())
+    return;
+
+  // TODO: It should be sufficient to use the fastest liveness calculation,
+  // i.e. livenessLightweight(). However, for some reason that slows down the
+  // rest of the translation. Investigate.
+  Func->liveness(Liveness_Basic);
+  if (Func->hasError())
+    return;
+  Func->dump("After x86 address mode opt");
+
+  doLoadOpt();
+
+  Func->genCode();
+  if (Func->hasError())
+    return;
+  Func->dump("After x86 codegen");
+  splitBlockLocalVariables(Func);
+
+  // Register allocation. This requires instruction renumbering and full
+  // liveness analysis. Loops must be identified before liveness so variable
+  // use weights are correct.
+  Func->renumberInstructions();
+  if (Func->hasError())
+    return;
+  Func->liveness(Liveness_Intervals);
+  if (Func->hasError())
+    return;
+  // The post-codegen dump is done here, after liveness analysis and associated
+  // cleanup, to make the dump cleaner and more useful.
+  Func->dump("After initial x86 codegen");
+  // Validate the live range computations. The expensive validation call is
+  // deliberately only made when assertions are enabled.
+  assert(Func->validateLiveness());
+  Func->getVMetadata()->init(VMK_All);
+  regAlloc(RAK_Global);
+  if (Func->hasError())
+    return;
+  Func->dump("After linear scan regalloc");
+
+  if (getFlags().getEnablePhiEdgeSplit()) {
+    Func->advancedPhiLowering();
+    Func->dump("After advanced Phi lowering");
+  }
+
+  // Stack frame mapping.
+  Func->genFrame();
+  if (Func->hasError())
+    return;
+  Func->dump("After stack frame mapping");
+
+  Func->contractEmptyNodes();
+  Func->reorderNodes();
+
+  // Branch optimization.  This needs to be done just before code emission. In
+  // particular, no transformations that insert or reorder CfgNodes should be
+  // done after branch optimization. We go ahead and do it before nop insertion
+  // to reduce the amount of work needed for searching for opportunities.
+  Func->doBranchOpt();
+  Func->dump("After branch optimization");
+}
+
+void TargetX8664::translateOm1() {
+  TimerMarker T(TimerStack::TT_Om1, Func);
+
+  genTargetHelperCalls();
+
+  // Lay out the stack. Ideally Alloca instructions would not be merged at
+  // -Om1, but SortAndCombineAllocas is forced to true as a workaround.
+  // TODO(b/171222930): Fix Win32 bug when this is false.
+  static constexpr bool SortAndCombineAllocas = true;
+  Func->processAllocas(SortAndCombineAllocas);
+  Func->dump("After Alloca processing");
+
+  Func->placePhiLoads();
+  if (Func->hasError())
+    return;
+  Func->placePhiStores();
+  if (Func->hasError())
+    return;
+  Func->deletePhis();
+  if (Func->hasError())
+    return;
+  Func->dump("After Phi lowering");
+
+  Func->doArgLowering();
+  Func->genCode();
+  if (Func->hasError())
+    return;
+  Func->dump("After initial x86 codegen");
+
+  regAlloc(RAK_InfOnly);
+  if (Func->hasError())
+    return;
+  Func->dump("After regalloc of infinite-weight variables");
+
+  Func->genFrame();
+  if (Func->hasError())
+    return;
+  Func->dump("After stack frame mapping");
+}
+
+inline bool canRMW(const InstArithmetic *Arith) {
+  Type Ty = Arith->getDest()->getType();
+  // X86 vector instructions write to a register and have no RMW option.
+  if (isVectorType(Ty))
+    return false;
+  bool isI64 = Ty == IceType_i64;
+
+  switch (Arith->getOp()) {
+  // Not handled for lack of simple lowering:
+  //   shift on i64
+  //   mul, udiv, urem, sdiv, srem, frem
+  // Not handled for lack of RMW instructions:
+  //   fadd, fsub, fmul, fdiv (also vector types)
+  default:
+    return false;
+  case InstArithmetic::Add:
+  case InstArithmetic::Sub:
+  case InstArithmetic::And:
+  case InstArithmetic::Or:
+  case InstArithmetic::Xor:
+    return true;
+  case InstArithmetic::Shl:
+  case InstArithmetic::Lshr:
+  case InstArithmetic::Ashr:
+    return false; // TODO(stichnot): implement
+    return !isI64;
+  }
+}
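+// Editor's note (illustrative): when canRMW() returns true, the pattern can
+// lower to a single memory-destination instruction, e.g. for an i32 add:
+//
+//   addl %ecx, (%rax)   // [mem] = [mem] + other; no separate load/store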
+
+bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
+  if (A == B)
+    return true;
+  if (auto *MemA = llvm::dyn_cast<TargetX8664::X86OperandMem>(A)) {
+    if (auto *MemB = llvm::dyn_cast<TargetX8664::X86OperandMem>(B)) {
+      return MemA->getBase() == MemB->getBase() &&
+             MemA->getOffset() == MemB->getOffset() &&
+             MemA->getIndex() == MemB->getIndex() &&
+             MemA->getShift() == MemB->getShift() &&
+             MemA->getSegmentRegister() == MemB->getSegmentRegister();
+    }
+  }
+  return false;
+}
+
+void TargetX8664::findRMW() {
+  TimerMarker _(TimerStack::TT_findRMW, Func);
+  Func->dump("Before RMW");
+  if (Func->isVerbose(IceV_RMW))
+    Func->getContext()->lockStr();
+  for (CfgNode *Node : Func->getNodes()) {
+    // Walk through the instructions, considering each sequence of 3
+    // instructions, and look for the particular RMW pattern. Note that this
+    // search can be "broken" (false negatives) if there are intervening
+    // deleted instructions, or intervening instructions that could be safely
+    // moved out of the way to reveal an RMW pattern.
+    auto E = Node->getInsts().end();
+    auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
+    for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
+      // Make I3 skip over deleted instructions.
+      while (I3 != E && I3->isDeleted())
+        ++I3;
+      if (I1 == E || I2 == E || I3 == E)
+        continue;
+      assert(!I1->isDeleted());
+      assert(!I2->isDeleted());
+      assert(!I3->isDeleted());
+      auto *Load = llvm::dyn_cast<InstLoad>(I1);
+      auto *Arith = llvm::dyn_cast<InstArithmetic>(I2);
+      auto *Store = llvm::dyn_cast<InstStore>(I3);
+      if (!Load || !Arith || !Store)
+        continue;
+      // Look for:
+      //   a = Load addr
+      //   b = <op> a, other
+      //   Store b, addr
+      // Change to:
+      //   a = Load addr
+      //   b = <op> a, other
+      //   x = FakeDef
+      //   RMW <op>, addr, other, x
+      //   b = Store b, addr, x
+      // Note that inferTwoAddress() makes sure setDestRedefined() gets called
+      // on the updated Store instruction, to avoid liveness problems later.
+      //
+      // With this transformation, the Store instruction acquires a Dest
+      // variable and is now subject to dead code elimination if there are no
+      // more uses of "b".  Variable "x" is a beacon for determining whether the
+      // Store instruction gets dead-code eliminated.  If the Store instruction
+      // is eliminated, then it must be the case that the RMW instruction ends
+      // x's live range, and therefore the RMW instruction will be retained and
+      // later lowered.  On the other hand, if the RMW instruction does not end
+      // x's live range, then the Store instruction must still be present, and
+      // therefore the RMW instruction is ignored during lowering because it is
+      // redundant with the Store instruction.
+      //
+      // Note that if "a" has further uses, the RMW transformation may still
+      // trigger, resulting in two loads and one store, which is worse than the
+      // original one load and one store.  However, this is probably rare, and
+      // caching probably keeps it just as fast.
+      if (!isSameMemAddressOperand(Load->getLoadAddress(),
+                                   Store->getStoreAddress()))
+        continue;
+      Operand *ArithSrcFromLoad = Arith->getSrc(0);
+      Operand *ArithSrcOther = Arith->getSrc(1);
+      if (ArithSrcFromLoad != Load->getDest()) {
+        if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
+          continue;
+        std::swap(ArithSrcFromLoad, ArithSrcOther);
+      }
+      if (Arith->getDest() != Store->getData())
+        continue;
+      if (!canRMW(Arith))
+        continue;
+      if (Func->isVerbose(IceV_RMW)) {
+        Ostream &Str = Func->getContext()->getStrDump();
+        Str << "Found RMW in " << Func->getFunctionName() << ":\n  ";
+        Load->dump(Func);
+        Str << "\n  ";
+        Arith->dump(Func);
+        Str << "\n  ";
+        Store->dump(Func);
+        Str << "\n";
+      }
+      Variable *Beacon = Func->makeVariable(IceType_i32);
+      Beacon->setMustNotHaveReg();
+      Store->setRmwBeacon(Beacon);
+      auto *BeaconDef = InstFakeDef::create(Func, Beacon);
+      Node->getInsts().insert(I3, BeaconDef);
+      auto *RMW =
+          InstX86FakeRMW::create(Func, ArithSrcOther, Store->getStoreAddress(),
+                                 Beacon, Arith->getOp());
+      Node->getInsts().insert(I3, RMW);
+    }
+  }
+  if (Func->isVerbose(IceV_RMW))
+    Func->getContext()->unlockStr();
+}
+
+// Converts a ConstantInteger32 operand into its constant value, or
+// MemoryOrderInvalid if the operand is not a ConstantInteger32.
+inline uint64_t getConstantMemoryOrder(Operand *Opnd) {
+  if (auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
+    return Integer->getValue();
+  return Intrinsics::MemoryOrderInvalid;
+}
+
+/// Determines whether the dest of a Load instruction can be folded into one of
+/// the src operands of a 2-operand instruction. This is true as long as the
+/// load dest matches exactly one of the binary instruction's src operands.
+/// Replaces Src0 or Src1 with LoadSrc if the answer is true.
+inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
+                                      Operand *&Src0, Operand *&Src1) {
+  if (Src0 == LoadDest && Src1 != LoadDest) {
+    Src0 = LoadSrc;
+    return true;
+  }
+  if (Src0 != LoadDest && Src1 == LoadDest) {
+    Src1 = LoadSrc;
+    return true;
+  }
+  return false;
+}
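+// Editor's illustrative example (hypothetical operands): given
+//
+//   a = load addr
+//   b = add a, c        // ends a's live range
+//
+// canFoldLoadIntoBinaryInst(LoadSrc, a, Src0 = a, Src1 = c) replaces Src0
+// with the memory operand, letting doLoadOpt() below rewrite the pair as
+// "b = add mem(addr), c" and delete the standalone load.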
+
+void TargetX8664::doLoadOpt() {
+  TimerMarker _(TimerStack::TT_loadOpt, Func);
+  for (CfgNode *Node : Func->getNodes()) {
+    Context.init(Node);
+    while (!Context.atEnd()) {
+      Variable *LoadDest = nullptr;
+      Operand *LoadSrc = nullptr;
+      Inst *CurInst = iteratorToInst(Context.getCur());
+      Inst *Next = Context.getNextInst();
+      // Determine whether the current instruction is a Load instruction or
+      // equivalent.
+      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
+        // An InstLoad qualifies unless it uses a 64-bit absolute address,
+        // which would require legalization to insert a copy into a register.
+        // TODO(b/148272103): Fold these after legalization.
+        if (!Traits::Is64Bit || !llvm::isa<Constant>(Load->getLoadAddress())) {
+          LoadDest = Load->getDest();
+          constexpr bool DoLegalize = false;
+          LoadSrc = formMemoryOperand(Load->getLoadAddress(),
+                                      LoadDest->getType(), DoLegalize);
+        }
+      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsic>(CurInst)) {
+        // An AtomicLoad intrinsic qualifies as long as it has a valid memory
+        // ordering, and can be implemented in a single instruction (i.e., not
+        // i64 on x86-32).
+        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicID();
+        if (ID == Intrinsics::AtomicLoad &&
+            (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) &&
+            Intrinsics::isMemoryOrderValid(
+                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
+          LoadDest = Intrin->getDest();
+          constexpr bool DoLegalize = false;
+          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
+                                      DoLegalize);
+        }
+      }
+      // A Load instruction can be folded into the following instruction only
+      // if the following instruction ends the Load's Dest variable's live
+      // range.
+      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
+        assert(LoadSrc);
+        Inst *NewInst = nullptr;
+        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
+          Operand *Src0 = Arith->getSrc(0);
+          Operand *Src1 = Arith->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstArithmetic::create(Func, Arith->getOp(),
+                                             Arith->getDest(), Src0, Src1);
+          }
+        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
+          Operand *Src0 = Icmp->getSrc(0);
+          Operand *Src1 = Icmp->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
+                                       Icmp->getDest(), Src0, Src1);
+          }
+        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
+          Operand *Src0 = Fcmp->getSrc(0);
+          Operand *Src1 = Fcmp->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
+                                       Fcmp->getDest(), Src0, Src1);
+          }
+        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
+          Operand *Src0 = Select->getTrueOperand();
+          Operand *Src1 = Select->getFalseOperand();
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstSelect::create(Func, Select->getDest(),
+                                         Select->getCondition(), Src0, Src1);
+          }
+        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
+          // The load dest can always be folded into a Cast instruction.
+          auto *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
+          if (Src0 == LoadDest) {
+            NewInst = InstCast::create(Func, Cast->getCastKind(),
+                                       Cast->getDest(), LoadSrc);
+          }
+        }
+        if (NewInst) {
+          CurInst->setDeleted();
+          Next->setDeleted();
+          Context.insert(NewInst);
+          // Update NewInst->LiveRangesEnded so that target lowering may
+          // benefit. Also update NewInst->HasSideEffects.
+          NewInst->spliceLivenessInfo(Next, CurInst);
+        }
+      }
+      Context.advanceCur();
+      Context.advanceNext();
+    }
+  }
+  Func->dump("After load optimization");
+}
+
+bool TargetX8664::doBranchOpt(Inst *I, const CfgNode *NextNode) {
+  if (auto *Br = llvm::dyn_cast<InstX86Br>(I)) {
+    return Br->optimizeBranch(NextNode);
+  }
+  return false;
+}
+
+Variable *TargetX8664::getPhysicalRegister(RegNumT RegNum, Type Ty) {
+  if (Ty == IceType_void)
+    Ty = IceType_i32;
+  if (PhysicalRegisters[Ty].empty())
+    PhysicalRegisters[Ty].resize(Traits::RegisterSet::Reg_NUM);
+  assert(unsigned(RegNum) < PhysicalRegisters[Ty].size());
+  Variable *Reg = PhysicalRegisters[Ty][RegNum];
+  if (Reg == nullptr) {
+    Reg = Func->makeVariable(Ty);
+    Reg->setRegNum(RegNum);
+    PhysicalRegisters[Ty][RegNum] = Reg;
+    // Specially mark a named physical register as an "argument" so that it is
+    // considered live upon function entry.  Otherwise it's possible to get
+    // liveness validation errors for saving callee-save registers.
+    Func->addImplicitArg(Reg);
+    // Don't bother tracking the live range of a named physical register.
+    Reg->setIgnoreLiveness();
+  }
+  assert(Traits::getGprForType(Ty, RegNum) == RegNum);
+  return Reg;
+}
+
+const char *TargetX8664::getRegName(RegNumT RegNum, Type Ty) const {
+  return Traits::getRegName(Traits::getGprForType(Ty, RegNum));
+}
+
+void TargetX8664::emitVariable(const Variable *Var) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  if (Var->hasReg()) {
+    Str << "%" << getRegName(Var->getRegNum(), Var->getType());
+    return;
+  }
+  if (Var->mustHaveReg()) {
+    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
+                             ") has no register assigned - function " +
+                             Func->getFunctionName());
+  }
+  const int32_t Offset = Var->getStackOffset();
+  auto BaseRegNum = Var->getBaseRegNum();
+  if (BaseRegNum.hasNoValue())
+    BaseRegNum = getFrameOrStackReg();
+
+  // Print in the form "Offset(%reg)", omitting Offset when it is 0.
+  if (getFlags().getDecorateAsm()) {
+    Str << Var->getSymbolicStackOffset();
+  } else if (Offset != 0) {
+    Str << Offset;
+  }
+  const Type FrameSPTy = Traits::WordType;
+  Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
+}
+
+TargetX8664::X86Address
+TargetX8664::stackVarToAsmOperand(const Variable *Var) const {
+  if (Var->hasReg())
+    llvm::report_fatal_error("Stack Variable has a register assigned");
+  if (Var->mustHaveReg()) {
+    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
+                             ") has no register assigned - function " +
+                             Func->getFunctionName());
+  }
+  int32_t Offset = Var->getStackOffset();
+  auto BaseRegNum = Var->getBaseRegNum();
+  if (Var->getBaseRegNum().hasNoValue()) {
+    // If the stack pointer needs alignment, we must use the frame pointer for
+    // arguments. For locals, getFrameOrStackReg will return the stack pointer
+    // in this case.
+    if (needsStackPointerAlignment() && Var->getIsArg()) {
+      assert(hasFramePointer());
+      BaseRegNum = getFrameReg();
+    } else {
+      BaseRegNum = getFrameOrStackReg();
+    }
+  }
+  return X86Address(Traits::getEncodedGPR(BaseRegNum), Offset,
+                    AssemblerFixup::NoFixup);
+}
+
+void TargetX8664::addProlog(CfgNode *Node) {
+  // Stack frame layout:
+  //
+  // +------------------------+  ^ +
+  // | 1. return address      |  |
+  // +------------------------+  v -
+  // | 2. preserved registers |
+  // +------------------------+ <--- BasePointer (if used)
+  // | 3. padding             |
+  // +------------------------+
+  // | 4. global spill area   |
+  // +------------------------+
+  // | 5. padding             |
+  // +------------------------+
+  // | 6. local spill area    |
+  // +------------------------+
+  // | 7. padding             |
+  // +------------------------+
+  // | 7.5 shadow (WinX64)    |
+  // +------------------------+
+  // | 8. allocas             |
+  // +------------------------+
+  // | 9. padding             |
+  // +------------------------+
+  // | 10. out args           |
+  // +------------------------+ <--- StackPointer
+  //
+  // The following variables record the size in bytes of the given areas:
+  //  * X86_RET_IP_SIZE_BYTES:   area 1
+  //  * PreservedRegsSizeBytes:  area 2
+  //  * SpillAreaPaddingBytes:   area 3
+  //  * GlobalsSize:             area 4
+  //  * LocalsSlotsPaddingBytes: area 5
+  //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
+  //  * LocalsSpillAreaSize:     area 6
+  //  * FixedAllocaSizeBytes:    areas 7 - 8
+  //  * SpillAreaSizeBytes:      areas 3 - 10
+  //  * maxOutArgsSizeBytes():   areas 9 - 10
+
+  // Determine stack frame offsets for each Variable without a register
+  // assignment. This can be done as one variable per stack slot. Or, do
+  // coalescing by running the register allocator again with an infinite set of
+  // registers (as a side effect, this gives variables a second chance at
+  // physical register assignment).
+  //
+  // A middle ground approach is to leverage sparsity and allocate one block of
+  // space on the frame for globals (variables with multi-block lifetime), and
+  // one block to share for locals (single-block lifetime).
+
+  const SizeT ShadowStoreSize = getShadowStoreSize();
+
+  // StackPointer: points just past return address of calling function
+
+  Context.init(Node);
+  Context.setInsertPoint(Context.getCur());
+
+  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
+  RegsUsed = SmallBitVector(CalleeSaves.size());
+  VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
+  size_t GlobalsSize = 0;
+  // If there is a separate locals area, this represents that area. Otherwise
+  // it counts any variable not counted by GlobalsSize.
+  SpillAreaSizeBytes = 0;
+  // If there is a separate locals area, this specifies the alignment for it.
+  uint32_t LocalsSlotsAlignmentBytes = 0;
+  // The entire spill locations area gets aligned to largest natural alignment
+  // of the variables that have a spill slot.
+  uint32_t SpillAreaAlignmentBytes = 0;
+  // A spill slot linked to a variable with a stack slot should reuse that
+  // stack slot.
+  std::function<bool(Variable *)> TargetVarHook =
+      [&VariablesLinkedToSpillSlots](Variable *Var) {
+        // TODO(stichnot): Refactor this into the base class.
+        Variable *Root = Var->getLinkedToStackRoot();
+        if (Root != nullptr) {
+          assert(!Root->hasReg());
+          if (!Root->hasReg()) {
+            VariablesLinkedToSpillSlots.push_back(Var);
+            return true;
+          }
+        }
+        return false;
+      };
+
+  // Compute the list of spilled variables and bounds for GlobalsSize, etc.
+  getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
+                        &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
+                        &LocalsSlotsAlignmentBytes, TargetVarHook);
+  uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
+  SpillAreaSizeBytes += GlobalsSize;
+
+  // Add push instructions for preserved registers.
+  uint32_t NumCallee = 0;
+  size_t PreservedRegsSizeBytes = 0;
+  SmallBitVector Pushed(CalleeSaves.size());
+  for (RegNumT i : RegNumBVIter(CalleeSaves)) {
+    const auto Canonical = Traits::getBaseReg(i);
+    assert(Canonical == Traits::getBaseReg(Canonical));
+    if (RegsUsed[i]) {
+      Pushed[Canonical] = true;
+    }
+  }
+  for (RegNumT RegNum : RegNumBVIter(Pushed)) {
+    assert(RegNum == Traits::getBaseReg(RegNum));
+    ++NumCallee;
+    if (Traits::isXmm(RegNum)) {
+      PreservedRegsSizeBytes += 16;
+    } else {
+      PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
+    }
+    _push_reg(RegNum);
+  }
+  Ctx->statsUpdateRegistersSaved(NumCallee);
+
+  // StackPointer: points past preserved registers at start of spill area
+
+  // Generate "push frameptr; mov frameptr, stackptr"
+  if (IsEbpBasedFrame) {
+    assert(
+        (RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None)).count() ==
+        0);
+    PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
+    _link_bp();
+  }
+
+  // Align the variables area. SpillAreaPaddingBytes is the size of the region
+  // after the preserved registers and before the spill areas.
+  // LocalsSlotsPaddingBytes is the amount of padding between the globals and
+  // locals area if they are separate.
+  assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
+  uint32_t SpillAreaPaddingBytes = 0;
+  uint32_t LocalsSlotsPaddingBytes = 0;
+  alignStackSpillAreas(Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
+                       SpillAreaAlignmentBytes, GlobalsSize,
+                       LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
+                       &LocalsSlotsPaddingBytes);
+  SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
+  uint32_t GlobalsAndSubsequentPaddingSize =
+      GlobalsSize + LocalsSlotsPaddingBytes;
+
+  // Functions returning scalar floating point types may need to convert values
+  // from an in-register xmm value to the top of the x87 floating point stack.
+  // This is done by a movp[sd] and an fld[sd].  Ensure there is enough scratch
+  // space on the stack for this.
+  const Type ReturnType = Func->getReturnType();
+  if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
+    if (isScalarFloatingType(ReturnType)) {
+      // Avoid misaligned double-precision load/store.
+      RequiredStackAlignment = std::max<size_t>(
+          RequiredStackAlignment, Traits::X86_STACK_ALIGNMENT_BYTES);
+      SpillAreaSizeBytes =
+          std::max(typeWidthInBytesOnStack(ReturnType), SpillAreaSizeBytes);
+    }
+  }
+
+  RequiredStackAlignment =
+      std::max<size_t>(RequiredStackAlignment, SpillAreaAlignmentBytes);
+
+  if (PrologEmitsFixedAllocas) {
+    RequiredStackAlignment =
+        std::max(RequiredStackAlignment, FixedAllocaAlignBytes);
+  }
+
+  // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
+  // fixed allocations in the prolog.
+  if (PrologEmitsFixedAllocas)
+    SpillAreaSizeBytes += FixedAllocaSizeBytes;
+
+  // Win64 ABI: add space for shadow store (aka home space)
+  SpillAreaSizeBytes += ShadowStoreSize;
+
+  // Entering the function has made the stack pointer unaligned. Re-align it by
+  // adjusting the stack size.
+  // Note that StackOffset does not include the spill area. It's the offset
+  // from the base stack pointer (ebp), whether we set it or not, to the first
+  // stack arg (if any). StackSize, on the other hand, does include the spill
+  // area.
+  const uint32_t StackOffset =
+      ShadowStoreSize + Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
+  uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
+                                             RequiredStackAlignment);
+  StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
+                                    RequiredStackAlignment);
+  SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any
+
+  if (SpillAreaSizeBytes) {
+    auto *Func = Node->getCfg();
+    if (SpillAreaSizeBytes > Func->getStackSizeLimit()) {
+      Func->setError("Stack size limit exceeded");
+    }
+
+    emitStackProbe(SpillAreaSizeBytes);
+
+    // Generate "sub stackptr, SpillAreaSizeBytes"
+    _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
+  }
+
+  // StackPointer: points just past the spill area (end of stack frame)
+
+  // If the required alignment is greater than the stack pointer's guaranteed
+  // alignment, align the stack pointer accordingly.
+  if (RequiredStackAlignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
+    assert(IsEbpBasedFrame);
+    _and(getPhysicalRegister(getStackReg(), Traits::WordType),
+         Ctx->getConstantInt32(-RequiredStackAlignment));
+  }
+
+  // StackPointer: may have just been offset for alignment
+
+  // Account for known-frame-offset alloca instructions that were not already
+  // combined into the prolog.
+  if (!PrologEmitsFixedAllocas)
+    SpillAreaSizeBytes += FixedAllocaSizeBytes;
+
+  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
+
+  // Fill in stack offsets for stack args, and copy args into registers for
+  // those that were register-allocated. Args are pushed right to left, so
+  // Arg[0] is closest to the stack/frame pointer.
+  RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
+  Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, Traits::WordType);
+  size_t BasicFrameOffset = StackOffset;
+  if (!IsEbpBasedFrame)
+    BasicFrameOffset += SpillAreaSizeBytes;
+
+  const VarList &Args = Func->getArgs();
+  size_t InArgsSizeBytes = 0;
+  unsigned NumXmmArgs = 0;
+  unsigned NumGPRArgs = 0;
+  for (SizeT i = 0, NumArgs = Args.size(); i < NumArgs; ++i) {
+    Variable *Arg = Args[i];
+    // Skip arguments passed in registers.
+    if (isVectorType(Arg->getType())) {
+      if (Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
+              .hasValue()) {
+        ++NumXmmArgs;
+        continue;
+      }
+    } else if (isScalarFloatingType(Arg->getType())) {
+      if (Traits::X86_PASS_SCALAR_FP_IN_XMM &&
+          Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
+              .hasValue()) {
+        ++NumXmmArgs;
+        continue;
+      }
+    } else {
+      assert(isScalarIntegerType(Arg->getType()));
+      if (Traits::getRegisterForGprArgNum(Traits::WordType,
+                                          Traits::getArgIndex(i, NumGPRArgs))
+              .hasValue()) {
+        ++NumGPRArgs;
+        continue;
+      }
+    }
+    // For esp-based frames where the allocas are done outside the prolog, the
+    // esp value may not stabilize to its home value until after all the
+    // fixed-size alloca instructions have executed.  In this case, a stack
+    // adjustment is needed when accessing in-args in order to copy them into
+    // registers.
+    size_t StackAdjBytes = 0;
+    if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas)
+      StackAdjBytes -= FixedAllocaSizeBytes;
+    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
+                           InArgsSizeBytes);
+  }
+
+  // Fill in stack offsets for locals.
+  assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
+                      SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
+                      IsEbpBasedFrame && !needsStackPointerAlignment());
+  // Assign stack offsets to variables that have been linked to spilled
+  // variables.
+  for (Variable *Var : VariablesLinkedToSpillSlots) {
+    const Variable *Root = Var->getLinkedToStackRoot();
+    assert(Root != nullptr);
+    Var->setStackOffset(Root->getStackOffset());
+
+    // If the stack root variable is an arg, make this variable an arg too so
+    // that stackVarToAsmOperand uses the correct base pointer (e.g. ebp on
+    // x86).
+    Var->setIsArg(Root->getIsArg());
+  }
+  this->HasComputedFrame = true;
+
+  if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
+    OstreamLocker L(Func->getContext());
+    Ostream &Str = Func->getContext()->getStrDump();
+
+    Str << "Stack layout:\n";
+    uint32_t EspAdjustmentPaddingSize =
+        SpillAreaSizeBytes - LocalsSpillAreaSize -
+        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
+        maxOutArgsSizeBytes();
+    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
+        << " return address = " << Traits::X86_RET_IP_SIZE_BYTES << " bytes\n"
+        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
+        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
+        << " globals spill area = " << GlobalsSize << " bytes\n"
+        << " globals-locals spill areas intermediate padding = "
+        << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
+        << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
+        << " esp alignment padding = " << EspAdjustmentPaddingSize
+        << " bytes\n";
+
+    Str << "Stack details:\n"
+        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
+        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
+        << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n"
+        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
+        << " bytes\n"
+        << " is ebp based = " << IsEbpBasedFrame << "\n";
+  }
+}
+
+/// Helper function for addProlog().
+///
+/// This assumes Arg is an argument passed on the stack. This sets the frame
+/// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
+/// I64 arg that has been split into Lo and Hi components, it calls itself
+/// recursively on the components, taking care to handle Lo first because of the
+/// little-endian architecture. Lastly, this function generates an instruction
+/// to copy Arg into its assigned register if applicable.
+void TargetX8664::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
+                                         size_t BasicFrameOffset,
+                                         size_t StackAdjBytes,
+                                         size_t &InArgsSizeBytes) {
+  if (!Traits::Is64Bit) {
+    if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
+      Variable *Lo = Arg64On32->getLo();
+      Variable *Hi = Arg64On32->getHi();
+      finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, StackAdjBytes,
+                             InArgsSizeBytes);
+      finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, StackAdjBytes,
+                             InArgsSizeBytes);
+      return;
+    }
+  }
+  Type Ty = Arg->getType();
+  if (isVectorType(Ty)) {
+    InArgsSizeBytes = Traits::applyStackAlignment(InArgsSizeBytes);
+  }
+  Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
+  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
+  if (Arg->hasReg()) {
+    assert(Ty != IceType_i64 || Traits::Is64Bit);
+    auto *Mem = X86OperandMem::create(
+        Func, Ty, FramePtr,
+        Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes));
+    if (isVectorType(Arg->getType())) {
+      _movp(Arg, Mem);
+    } else {
+      _mov(Arg, Mem);
+    }
+    // This argument-copying instruction uses an explicit X86OperandMem
+    // operand instead of a Variable, so its fill-from-stack operation has to
+    // be tracked separately for statistics.
+    Ctx->statsUpdateFills();
+  }
+}
+
+void TargetX8664::addEpilog(CfgNode *Node) {
+  InstList &Insts = Node->getInsts();
+  InstList::reverse_iterator RI, E;
+  for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
+    if (llvm::isa<Insts::Ret>(*RI))
+      break;
+  }
+  if (RI == E)
+    return;
+
+  // Convert the reverse_iterator position into its corresponding (forward)
+  // iterator position.
+  InstList::iterator InsertPoint = reverseToForwardIterator(RI);
+  --InsertPoint;
+  Context.init(Node);
+  Context.setInsertPoint(InsertPoint);
+
+  if (IsEbpBasedFrame) {
+    _unlink_bp();
+  } else {
+    // add stackptr, SpillAreaSizeBytes
+    if (SpillAreaSizeBytes != 0) {
+      _add_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
+    }
+  }
+
+  // Add pop instructions for preserved registers.
+  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
+  SmallBitVector Popped(CalleeSaves.size());
+  for (int32_t i = CalleeSaves.size() - 1; i >= 0; --i) {
+    const auto RegNum = RegNumT::fromInt(i);
+    if (RegNum == getFrameReg() && IsEbpBasedFrame)
+      continue;
+    const RegNumT Canonical = Traits::getBaseReg(RegNum);
+    if (CalleeSaves[i] && RegsUsed[i]) {
+      Popped[Canonical] = true;
+    }
+  }
+  for (int32_t i = Popped.size() - 1; i >= 0; --i) {
+    if (!Popped[i])
+      continue;
+    const auto RegNum = RegNumT::fromInt(i);
+    assert(RegNum == Traits::getBaseReg(RegNum));
+    _pop_reg(RegNum);
+  }
+}
+
+Type TargetX8664::stackSlotType() { return Traits::WordType; }
+
+template <typename T>
+typename std::enable_if<!T::Is64Bit, Operand>::type *
+TargetX8664::loOperand(Operand *Operand) {
+  assert(Operand->getType() == IceType_i64 ||
+         Operand->getType() == IceType_f64);
+  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
+    return Operand;
+  if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
+    return Var64On32->getLo();
+  if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
+    auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
+        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue())));
+    // Check if we need to blind/pool the constant.
+    return legalize(ConstInt);
+  }
+  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
+    auto *MemOperand = X86OperandMem::create(
+        Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getIndex(),
+        Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
+    // Test whether the offset should be blinded or pooled; if so, create the
+    // mem operand with the blinded/pooled constant. Otherwise, return it as
+    // an ordinary mem operand.
+    return legalize(MemOperand);
+  }
+  llvm_unreachable("Unsupported operand type");
+  return nullptr;
+}
+
+template <typename T>
+typename std::enable_if<!T::Is64Bit, Operand>::type *
+TargetX8664::hiOperand(Operand *Operand) {
+  assert(Operand->getType() == IceType_i64 ||
+         Operand->getType() == IceType_f64);
+  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
+    return Operand;
+  if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
+    return Var64On32->getHi();
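+  // For a 64-bit constant the high word is bits 63..32; e.g. (illustrative)
+  // 0x123456789ABCDEF0 yields 0x12345678 here, while loOperand() above yields
+  // 0x9ABCDEF0.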
+  if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
+    auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
+        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue() >> 32)));
+    // Check if we need to blind/pool the constant.
+    return legalize(ConstInt);
+  }
+  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
+    Constant *Offset = Mem->getOffset();
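+    // On little-endian x86, the high dword of an i64 in memory lives 4 bytes
+    // above the low dword, so rewrite the offset as Offset + 4.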
+    if (Offset == nullptr) {
+      Offset = Ctx->getConstantInt32(4);
+    } else if (auto *IntOffset = llvm::dyn_cast<ConstantInteger32>(Offset)) {
+      Offset = Ctx->getConstantInt32(4 + IntOffset->getValue());
+    } else if (auto *SymOffset = llvm::dyn_cast<ConstantRelocatable>(Offset)) {
+      assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4));
+      Offset =
+          Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName());
+    }
+    auto *MemOperand = X86OperandMem::create(
+        Func, IceType_i32, Mem->getBase(), Offset, Mem->getIndex(),
+        Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
+    // Test whether Offset is an i32 constant eligible for randomization or
+    // pooling. Blind/pool it if so; otherwise return the mem operand as an
+    // ordinary mem operand.
+    return legalize(MemOperand);
+  }
+  llvm_unreachable("Unsupported operand type");
+  return nullptr;
+}
+
+SmallBitVector TargetX8664::getRegisterSet(RegSetMask Include,
+                                           RegSetMask Exclude) const {
+  return Traits::getRegisterSet(getFlags(), Include, Exclude);
+}
+
+void TargetX8664::lowerAlloca(const InstAlloca *Instr) {
+  // Conservatively require the stack to be aligned. Some stack adjustment
+  // operations implemented below assume that the stack is aligned before the
+  // alloca. All the alloca code ensures that the stack alignment is preserved
+  // after the alloca. The stack alignment restriction can be relaxed in some
+  // cases.
+  RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
+                                            Traits::X86_STACK_ALIGNMENT_BYTES);
+
+  // For default align=0, set it to the real value 1, to avoid any
+  // bit-manipulation problems below.
+  const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes());
+
+  // LLVM enforces power of 2 alignment.
+  assert(llvm::isPowerOf2_32(AlignmentParam));
+  assert(llvm::isPowerOf2_32(Traits::X86_STACK_ALIGNMENT_BYTES));
+
+  const uint32_t Alignment =
+      std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES);
+  const bool OverAligned = Alignment > Traits::X86_STACK_ALIGNMENT_BYTES;
+  const bool OptM1 = Func->getOptLevel() == Opt_m1;
+  const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset();
+  const bool UseFramePointer =
+      hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;
+
+  if (UseFramePointer)
+    setHasFramePointer();
+
+  Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
+  if (OverAligned) {
+    _and(esp, Ctx->getConstantInt32(-Alignment));
+  }
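+  // E.g. (illustrative) for Alignment = 64, the `and` above rounds the stack
+  // pointer down to the next 64-byte boundary (-64 is 0xFFFFFFC0).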
+
+  Variable *Dest = Instr->getDest();
+  Operand *TotalSize = legalize(Instr->getSizeInBytes());
+
+  if (const auto *ConstantTotalSize =
+          llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
+    const uint32_t Value =
+        Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
+    if (UseFramePointer) {
+      _sub_sp(Ctx->getConstantInt32(Value));
+    } else {
+      // If we don't need a frame pointer, this alloca has a known offset to
+      // the stack pointer. We don't need to adjust the stack pointer, nor
+      // assign any value to Dest, as Dest is rematerializable.
+      assert(Dest->isRematerializable());
+      FixedAllocaSizeBytes += Value;
+      Context.insert<InstFakeDef>(Dest);
+    }
+  } else {
+    // Non-constant sizes need to be adjusted to the next highest multiple of
+    // the required alignment at runtime.
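+    // E.g. (illustrative) for TotalSize = 10 and Alignment = 16:
+    //   T = 10 + 15 = 25, then T &= -16 gives T = 16.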
+    Variable *T = nullptr;
+    if (Traits::Is64Bit && TotalSize->getType() != IceType_i64) {
+      T = makeReg(IceType_i64);
+      _movzx(T, TotalSize);
+    } else {
+      T = makeReg(IceType_i32);
+      _mov(T, TotalSize);
+    }
+    _add(T, Ctx->getConstantInt32(Alignment - 1));
+    _and(T, Ctx->getConstantInt32(-Alignment));
+    _sub_sp(T);
+  }
+  // Add enough to the returned address to account for the out args area.
+  uint32_t OutArgsSize = maxOutArgsSizeBytes();
+  if (OutArgsSize > 0) {
+    Variable *T = makeReg(Dest->getType());
+    auto *CalculateOperand = X86OperandMem::create(
+        Func, IceType_void, esp, Ctx->getConstantInt(IceType_i32, OutArgsSize));
+    _lea(T, CalculateOperand);
+    _mov(Dest, T);
+  } else {
+    _mov(Dest, esp);
+  }
+}
+
+void TargetX8664::lowerArguments() {
+  const bool OptM1 = Func->getOptLevel() == Opt_m1;
+  VarList &Args = Func->getArgs();
+  unsigned NumXmmArgs = 0;
+  bool XmmSlotsRemain = true;
+  unsigned NumGprArgs = 0;
+  bool GprSlotsRemain = true;
+
+  Context.init(Func->getEntryNode());
+  Context.setInsertPoint(Context.getCur());
+
+  for (SizeT i = 0, End = Args.size();
+       i < End && (XmmSlotsRemain || GprSlotsRemain); ++i) {
+    Variable *Arg = Args[i];
+    Type Ty = Arg->getType();
+    Variable *RegisterArg = nullptr;
+    RegNumT RegNum;
+    if (isVectorType(Ty)) {
+      RegNum =
+          Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
+      if (RegNum.hasNoValue()) {
+        XmmSlotsRemain = false;
+        continue;
+      }
+      ++NumXmmArgs;
+      RegisterArg = Func->makeVariable(Ty);
+    } else if (isScalarFloatingType(Ty)) {
+      if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
+        continue;
+      }
+      RegNum =
+          Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
+      if (RegNum.hasNoValue()) {
+        XmmSlotsRemain = false;
+        continue;
+      }
+      ++NumXmmArgs;
+      RegisterArg = Func->makeVariable(Ty);
+    } else if (isScalarIntegerType(Ty)) {
+      RegNum = Traits::getRegisterForGprArgNum(
+          Ty, Traits::getArgIndex(i, NumGprArgs));
+      if (RegNum.hasNoValue()) {
+        GprSlotsRemain = false;
+        continue;
+      }
+      ++NumGprArgs;
+      RegisterArg = Func->makeVariable(Ty);
+    }
+    assert(RegNum.hasValue());
+    assert(RegisterArg != nullptr);
+    // Replace Arg in the argument list with the home register. Then generate
+    // an instruction in the prolog to copy the home register to the assigned
+    // location of Arg.
+    if (BuildDefs::dump())
+      RegisterArg->setName(Func, "home_reg:" + Arg->getName());
+    RegisterArg->setRegNum(RegNum);
+    RegisterArg->setIsArg();
+    Arg->setIsArg(false);
+
+    Args[i] = RegisterArg;
+    // When not Om1, do the assignment through a temporary, instead of directly
+    // from the pre-colored variable, so that a subsequent availabilityGet()
+    // call has a chance to work.  (In Om1, don't bother creating extra
+    // instructions with extra variables to register-allocate.)
+    if (OptM1) {
+      Context.insert<InstAssign>(Arg, RegisterArg);
+    } else {
+      Variable *Tmp = makeReg(RegisterArg->getType());
+      Context.insert<InstAssign>(Tmp, RegisterArg);
+      Context.insert<InstAssign>(Arg, Tmp);
+    }
+  }
+  if (!OptM1)
+    Context.availabilityUpdate();
+}
+
+/// Strength-reduce scalar integer multiplication by a constant (for i32 or
+/// narrower) for certain constants. The lea instruction can be used to multiply
+/// by 3, 5, or 9, and the shl instruction can be used to multiply by powers of
+/// 2. These can be combined such that e.g. multiplying by 100 can be done as 2
+/// lea-based multiplies by 5, combined with left-shifting by 2.
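+///
+/// For example (illustrative): 100 = 5 * 5 * 2 * 2, so the loop below finds
+/// Count5 = 2 and Count2 = 2 (CountOps = 3), which is emitted as two
+/// lea-based multiplies by 5 followed by a single shl by 2.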
+bool TargetX8664::optimizeScalarMul(Variable *Dest, Operand *Src0,
+                                    int32_t Src1) {
+  // Disable this optimization for Om1 and O0, just to keep things simple
+  // there.
+  if (Func->getOptLevel() < Opt_1)
+    return false;
+  Type Ty = Dest->getType();
+  if (Src1 == -1) {
+    Variable *T = nullptr;
+    _mov(T, Src0);
+    _neg(T);
+    _mov(Dest, T);
+    return true;
+  }
+  if (Src1 == 0) {
+    _mov(Dest, Ctx->getConstantZero(Ty));
+    return true;
+  }
+  if (Src1 == 1) {
+    Variable *T = nullptr;
+    _mov(T, Src0);
+    _mov(Dest, T);
+    return true;
+  }
+  // Don't bother with the edge case where Src1 == MININT.
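+  // (With two's-complement wraparound, INT_MIN is the only nonzero value
+  // equal to its own negation, so the Src1 == -Src1 test below catches it.)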
+  if (Src1 == -Src1)
+    return false;
+  const bool Src1IsNegative = Src1 < 0;
+  if (Src1IsNegative)
+    Src1 = -Src1;
+  uint32_t Count9 = 0;
+  uint32_t Count5 = 0;
+  uint32_t Count3 = 0;
+  uint32_t Count2 = 0;
+  uint32_t CountOps = 0;
+  while (Src1 > 1) {
+    if (Src1 % 9 == 0) {
+      ++CountOps;
+      ++Count9;
+      Src1 /= 9;
+    } else if (Src1 % 5 == 0) {
+      ++CountOps;
+      ++Count5;
+      Src1 /= 5;
+    } else if (Src1 % 3 == 0) {
+      ++CountOps;
+      ++Count3;
+      Src1 /= 3;
+    } else if (Src1 % 2 == 0) {
+      if (Count2 == 0)
+        ++CountOps;
+      ++Count2;
+      Src1 /= 2;
+    } else {
+      return false;
+    }
+  }
+  // The lea-based multiply only works for i32 (or i64 on x86-64), so bail out
+  // for other types whenever a factor of 3, 5, or 9 was found.
+  if (Ty != IceType_i32 && !(Traits::Is64Bit && Ty == IceType_i64) &&
+      (Count3 || Count5 || Count9))
+    return false;
+  // Limit the number of lea/shl operations for a single multiply to a
+  // somewhat arbitrary choice of 3.
+  constexpr uint32_t MaxOpsForOptimizedMul = 3;
+  if (CountOps > MaxOpsForOptimizedMul)
+    return false;
+  Variable *T = makeReg(Traits::WordType);
+  if (typeWidthInBytes(Src0->getType()) < typeWidthInBytes(T->getType())) {
+    Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    _movzx(T, Src0RM);
+  } else {
+    _mov(T, Src0);
+  }
+  Constant *Zero = Ctx->getConstantZero(IceType_i32);
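+  // Each lea below computes T = T + (T << Shift), i.e. T * (1 + (1 << Shift)):
+  // Shift 3 multiplies by 9, Shift 2 by 5, and Shift 1 by 3.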
+  for (uint32_t i = 0; i < Count9; ++i) {
+    constexpr uint16_t Shift = 3; // log2(9-1)
+    _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
+  }
+  for (uint32_t i = 0; i < Count5; ++i) {
+    constexpr uint16_t Shift = 2; // log2(5-1)
+    _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
+  }
+  for (uint32_t i = 0; i < Count3; ++i) {
+    constexpr uint16_t Shift = 1; // log2(3-1)
+    _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
+  }
+  if (Count2) {
+    _shl(T, Ctx->getConstantInt(Ty, Count2));
+  }
+  if (Src1IsNegative)
+    _neg(T);
+  _mov(Dest, T);
+  return true;
+}
+
+void TargetX8664::lowerShift64(InstArithmetic::OpKind Op, Operand *Src0Lo,
+                               Operand *Src0Hi, Operand *Src1Lo,
+                               Variable *DestLo, Variable *DestHi) {
+  // TODO: Refactor the similarities between Shl, Lshr, and Ashr.
+  Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
+  Constant *Zero = Ctx->getConstantZero(IceType_i32);
+  Constant *SignExtend = Ctx->getConstantInt32(0x1f);
+  if (auto *ConstantShiftAmount = llvm::dyn_cast<ConstantInteger32>(Src1Lo)) {
+    uint32_t ShiftAmount = ConstantShiftAmount->getValue();
+    if (ShiftAmount > 32) {
+      Constant *ReducedShift = Ctx->getConstantInt32(ShiftAmount - 32);
+      switch (Op) {
+      default:
+        assert(0 && "non-shift op");
+        break;
+      case InstArithmetic::Shl: {
+        // a=b<<c ==>
+        //   t2 = b.lo
+        //   t2 = shl t2, ShiftAmount-32
+        //   t3 = t2
+        //   t2 = 0
+        _mov(T_2, Src0Lo);
+        _shl(T_2, ReducedShift);
+        _mov(DestHi, T_2);
+        _mov(DestLo, Zero);
+      } break;
+      case InstArithmetic::Lshr: {
+        // a=b>>c (unsigned) ==>
+        //   t2 = b.hi
+        //   t2 = shr t2, ShiftAmount-32
+        //   a.lo = t2
+        //   a.hi = 0
+        _mov(T_2, Src0Hi);
+        _shr(T_2, ReducedShift);
+        _mov(DestLo, T_2);
+        _mov(DestHi, Zero);
+      } break;
+      case InstArithmetic::Ashr: {
+        // a=b>>c (signed) ==>
+        //   t3 = b.hi
+        //   t3 = sar t3, 0x1f
+        //   t2 = b.hi
+        //   t2 = shrd t2, t3, ShiftAmount-32
+        //   a.lo = t2
+        //   a.hi = t3
+        _mov(T_3, Src0Hi);
+        _sar(T_3, SignExtend);
+        _mov(T_2, Src0Hi);
+        _shrd(T_2, T_3, ReducedShift);
+        _mov(DestLo, T_2);
+        _mov(DestHi, T_3);
+      } break;
+      }
+    } else if (ShiftAmount == 32) {
+      switch (Op) {
+      default:
+        assert(0 && "non-shift op");
+        break;
+      case InstArithmetic::Shl: {
+        // a=b<<c ==>
+        //   t2 = b.lo
+        //   a.hi = t2
+        //   a.lo = 0
+        _mov(T_2, Src0Lo);
+        _mov(DestHi, T_2);
+        _mov(DestLo, Zero);
+      } break;
+      case InstArithmetic::Lshr: {
+        // a=b>>c (unsigned) ==>
+        //   t2 = b.hi
+        //   a.lo = t2
+        //   a.hi = 0
+        _mov(T_2, Src0Hi);
+        _mov(DestLo, T_2);
+        _mov(DestHi, Zero);
+      } break;
+      case InstArithmetic::Ashr: {
+        // a=b>>c (signed) ==>
+        //   t2 = b.hi
+        //   a.lo = t2
+        //   t3 = b.hi
+        //   t3 = sar t3, 0x1f
+        //   a.hi = t3
+        _mov(T_2, Src0Hi);
+        _mov(DestLo, T_2);
+        _mov(T_3, Src0Hi);
+        _sar(T_3, SignExtend);
+        _mov(DestHi, T_3);
+      } break;
+      }
+    } else {
+      // COMMON PREFIX OF: a=b SHIFT_OP c ==>
+      //   t2 = b.lo
+      //   t3 = b.hi
+      _mov(T_2, Src0Lo);
+      _mov(T_3, Src0Hi);
+      switch (Op) {
+      default:
+        assert(0 && "non-shift op");
+        break;
+      case InstArithmetic::Shl: {
+        // a=b<<c ==>
+        //   t3 = shld t3, t2, ShiftAmount
+        //   t2 = shl t2, ShiftAmount
+        _shld(T_3, T_2, ConstantShiftAmount);
+        _shl(T_2, ConstantShiftAmount);
+      } break;
+      case InstArithmetic::Lshr: {
+        // a=b>>c (unsigned) ==>
+        //   t2 = shrd t2, t3, ShiftAmount
+        //   t3 = shr t3, ShiftAmount
+        _shrd(T_2, T_3, ConstantShiftAmount);
+        _shr(T_3, ConstantShiftAmount);
+      } break;
+      case InstArithmetic::Ashr: {
+        // a=b>>c (signed) ==>
+        //   t2 = shrd t2, t3, ShiftAmount
+        //   t3 = sar t3, ShiftAmount
+        _shrd(T_2, T_3, ConstantShiftAmount);
+        _sar(T_3, ConstantShiftAmount);
+      } break;
+      }
+      // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
+      //   a.lo = t2
+      //   a.hi = t3
+      _mov(DestLo, T_2);
+      _mov(DestHi, T_3);
+    }
+  } else {
+    // NON-CONSTANT CASES.
+    Constant *BitTest = Ctx->getConstantInt32(0x20);
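+    // 0x20 tests bit 5 of the shift amount: 32-bit x86 shifts use the count
+    // modulo 32, so this bit distinguishes shifts of 32..63 from 0..31.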
+    InstX86Label *Label = InstX86Label::create(Func, this);
+    // COMMON PREFIX OF: a=b SHIFT_OP c ==>
+    //   t1:ecx = c.lo & 0xff
+    //   t2 = b.lo
+    //   t3 = b.hi
+    T_1 = copyToReg8(Src1Lo, Traits::RegisterSet::Reg_cl);
+    _mov(T_2, Src0Lo);
+    _mov(T_3, Src0Hi);
+    switch (Op) {
+    default:
+      assert(0 && "non-shift op");
+      break;
+    case InstArithmetic::Shl: {
+      // a=b<<c ==>
+      //   t3 = shld t3, t2, t1
+      //   t2 = shl t2, t1
+      //   test t1, 0x20
+      //   je L1
+      //   use(t3)
+      //   t3 = t2
+      //   t2 = 0
+      _shld(T_3, T_2, T_1);
+      _shl(T_2, T_1);
+      _test(T_1, BitTest);
+      _br(CondX86::Br_e, Label);
+      // T_2 and T_3 are being assigned again because of the intra-block control
+      // flow, so we need to use _redefined to avoid liveness problems.
+      _redefined(_mov(T_3, T_2));
+      _redefined(_mov(T_2, Zero));
+    } break;
+    case InstArithmetic::Lshr: {
+      // a=b>>c (unsigned) ==>
+      //   t2 = shrd t2, t3, t1
+      //   t3 = shr t3, t1
+      //   test t1, 0x20
+      //   je L1
+      //   use(t2)
+      //   t2 = t3
+      //   t3 = 0
+      _shrd(T_2, T_3, T_1);
+      _shr(T_3, T_1);
+      _test(T_1, BitTest);
+      _br(CondX86::Br_e, Label);
+      // T_2 and T_3 are being assigned again because of the intra-block control
+      // flow, so we need to use _redefined to avoid liveness problems.
+      _redefined(_mov(T_2, T_3));
+      _redefined(_mov(T_3, Zero));
+    } break;
+    case InstArithmetic::Ashr: {
+      // a=b>>c (signed) ==>
+      //   t2 = shrd t2, t3, t1
+      //   t3 = sar t3, t1
+      //   test t1, 0x20
+      //   je L1
+      //   use(t2)
+      //   t2 = t3
+      //   t3 = sar t3, 0x1f
+      Constant *SignExtend = Ctx->getConstantInt32(0x1f);
+      _shrd(T_2, T_3, T_1);
+      _sar(T_3, T_1);
+      _test(T_1, BitTest);
+      _br(CondX86::Br_e, Label);
+      // T_2 and T_3 are being assigned again because of the intra-block control
+      // flow, so T_2 needs to use _redefined to avoid liveness problems. T_3
+      // doesn't need special treatment because it is reassigned via _sar
+      // instead of _mov.
+      _redefined(_mov(T_2, T_3));
+      _sar(T_3, SignExtend);
+    } break;
+    }
+    // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
+    // L1:
+    //   a.lo = t2
+    //   a.hi = t3
+    Context.insert(Label);
+    _mov(DestLo, T_2);
+    _mov(DestHi, T_3);
+  }
+}
+
+void TargetX8664::lowerArithmetic(const InstArithmetic *Instr) {
+  Variable *Dest = Instr->getDest();
+  if (Dest->isRematerializable()) {
+    Context.insert<InstFakeDef>(Dest);
+    return;
+  }
+  Type Ty = Dest->getType();
+  Operand *Src0 = legalize(Instr->getSrc(0));
+  Operand *Src1 = legalize(Instr->getSrc(1));
+  if (Instr->isCommutative()) {
+    uint32_t SwapCount = 0;
+    if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) {
+      std::swap(Src0, Src1);
+      ++SwapCount;
+    }
+    if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) {
+      std::swap(Src0, Src1);
+      ++SwapCount;
+    }
+    // Improve two-address code patterns by avoiding a copy to the dest
+    // register when one of the source operands ends its lifetime here.
+    if (!Instr->isLastUse(Src0) && Instr->isLastUse(Src1)) {
+      std::swap(Src0, Src1);
+      ++SwapCount;
+    }
+    assert(SwapCount <= 1);
+    (void)SwapCount;
+  }
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
+    // These x86-32 helper-call-involved instructions are lowered in this
+    // separate switch because loOperand() and hiOperand() may insert
+    // redundant instructions for constant blinding and pooling. Such
+    // redundant instructions would fail liveness analysis under the -Om1
+    // setting. Moreover, these arguments do not need to be processed with
+    // loOperand() and hiOperand() before use.
+    switch (Instr->getOp()) {
+    case InstArithmetic::Udiv:
+    case InstArithmetic::Sdiv:
+    case InstArithmetic::Urem:
+    case InstArithmetic::Srem:
+      llvm::report_fatal_error("Helper call was expected");
+      return;
+    default:
+      break;
+    }
+
+    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    Operand *Src0Lo = loOperand(Src0);
+    Operand *Src0Hi = hiOperand(Src0);
+    Operand *Src1Lo = loOperand(Src1);
+    Operand *Src1Hi = hiOperand(Src1);
+    Variable *T_Lo = nullptr, *T_Hi = nullptr;
+    switch (Instr->getOp()) {
+    case InstArithmetic::_num:
+      llvm_unreachable("Unknown arithmetic operator");
+      break;
+    case InstArithmetic::Add:
+      _mov(T_Lo, Src0Lo);
+      _add(T_Lo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _mov(T_Hi, Src0Hi);
+      _adc(T_Hi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::And:
+      _mov(T_Lo, Src0Lo);
+      _and(T_Lo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _mov(T_Hi, Src0Hi);
+      _and(T_Hi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::Or:
+      _mov(T_Lo, Src0Lo);
+      _or(T_Lo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _mov(T_Hi, Src0Hi);
+      _or(T_Hi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::Xor:
+      _mov(T_Lo, Src0Lo);
+      _xor(T_Lo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _mov(T_Hi, Src0Hi);
+      _xor(T_Hi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::Sub:
+      _mov(T_Lo, Src0Lo);
+      _sub(T_Lo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _mov(T_Hi, Src0Hi);
+      _sbb(T_Hi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::Mul: {
+      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
+      Variable *T_4Lo = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
+      Variable *T_4Hi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
+      // gcc does the following:
+      // a=b*c ==>
+      //   t1 = b.hi; t1 *=(imul) c.lo
+      //   t2 = c.hi; t2 *=(imul) b.lo
+      //   t3:eax = b.lo
+      //   t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo
+      //   a.lo = t4.lo
+      //   t4.hi += t1
+      //   t4.hi += t2
+      //   a.hi = t4.hi
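+      // This is correct because, mod 2^64:
+      //   (b.hi*2^32 + b.lo) * (c.hi*2^32 + c.lo)
+      //     = b.lo*c.lo + 2^32*(b.hi*c.lo + b.lo*c.hi)
+      // The b.hi*c.hi term would be shifted out entirely.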
+      // The mul instruction cannot take an immediate operand.
+      Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
+      _mov(T_1, Src0Hi);
+      _imul(T_1, Src1Lo);
+      _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax);
+      _mul(T_4Lo, T_3, Src1Lo);
+      // The mul instruction produces two dest variables, edx:eax. We create a
+      // fake definition of edx to account for this.
+      Context.insert<InstFakeDef>(T_4Hi, T_4Lo);
+      Context.insert<InstFakeUse>(T_4Hi);
+      _mov(DestLo, T_4Lo);
+      _add(T_4Hi, T_1);
+      _mov(T_2, Src1Hi);
+      Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
+      _imul(T_2, Src0Lo);
+      _add(T_4Hi, T_2);
+      _mov(DestHi, T_4Hi);
+    } break;
+    case InstArithmetic::Shl:
+    case InstArithmetic::Lshr:
+    case InstArithmetic::Ashr:
+      lowerShift64(Instr->getOp(), Src0Lo, Src0Hi, Src1Lo, DestLo, DestHi);
+      break;
+    case InstArithmetic::Fadd:
+    case InstArithmetic::Fsub:
+    case InstArithmetic::Fmul:
+    case InstArithmetic::Fdiv:
+    case InstArithmetic::Frem:
+      llvm_unreachable("FP instruction with i64 type");
+      break;
+    case InstArithmetic::Udiv:
+    case InstArithmetic::Sdiv:
+    case InstArithmetic::Urem:
+    case InstArithmetic::Srem:
+      llvm_unreachable("Call-helper-involved instruction for i64 type \
+                       should have already been handled before");
+      break;
+    }
+    return;
+  }
+  if (isVectorType(Ty)) {
+    // TODO: Trap on integer divide and integer modulo by zero. See:
+    // https://code.google.com/p/nativeclient/issues/detail?id=3899
+    if (llvm::isa<X86OperandMem>(Src1))
+      Src1 = legalizeToReg(Src1);
+    switch (Instr->getOp()) {
+    case InstArithmetic::_num:
+      llvm_unreachable("Unknown arithmetic operator");
+      break;
+    case InstArithmetic::Add: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _padd(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::And: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _pand(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Or: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _por(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Xor: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _pxor(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Sub: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _psub(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Mul: {
+      bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16;
+      bool InstructionSetIsValidForPmull =
+          Ty == IceType_v8i16 || InstructionSet >= SSE4_1;
+      if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
+        Variable *T = makeReg(Ty);
+        _movp(T, Src0);
+        _pmull(T, Src0 == Src1 ? T : Src1);
+        _movp(Dest, T);
+      } else if (Ty == IceType_v4i32) {
+        // Lowering sequence:
+        // Note: The mask arguments have index 0 on the left.
+        //
+        // movups  T1, Src0
+        // pshufd  T2, Src0, {1,0,3,0}
+        // pshufd  T3, Src1, {1,0,3,0}
+        // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
+        // pmuludq T1, Src1
+        // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
+        // pmuludq T2, T3
+        // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
+        // shufps  T1, T2, {0,2,0,2}
+        // pshufd  T4, T1, {0,2,1,3}
+        // movups  Dest, T4
+
+        // Mask that directs pshufd to create a vector with entries
+        // Src[1, 0, 3, 0]
+        constexpr unsigned Constant1030 = 0x31;
+        Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
+        // Mask that directs shufps to create a vector with entries
+        // Dest[0, 2], Src[0, 2]
+        constexpr unsigned Mask0202 = 0x88;
+        // Mask that directs pshufd to create a vector with entries
+        // Src[0, 2, 1, 3]
+        constexpr unsigned Mask0213 = 0xd8;
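+        // These immediates pack one element index per 2-bit field, least-
+        // significant field first: 0x31 = 0b00'11'00'01 -> {1,0,3,0} and
+        // 0xd8 = 0b11'01'10'00 -> {0,2,1,3} for pshufd; for shufps, 0x88 =
+        // 0b10'00'10'00 takes elements {0,2} of each operand.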
+        Variable *T1 = makeReg(IceType_v4i32);
+        Variable *T2 = makeReg(IceType_v4i32);
+        Variable *T3 = makeReg(IceType_v4i32);
+        Variable *T4 = makeReg(IceType_v4i32);
+        _movp(T1, Src0);
+        _pshufd(T2, Src0, Mask1030);
+        _pshufd(T3, Src1, Mask1030);
+        _pmuludq(T1, Src1);
+        _pmuludq(T2, T3);
+        _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
+        _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
+        _movp(Dest, T4);
+      } else if (Ty == IceType_v16i8) {
+        llvm::report_fatal_error("Scalarized operation was expected");
+      } else {
+        llvm::report_fatal_error("Invalid vector multiply type");
+      }
+    } break;
+    case InstArithmetic::Shl: {
+      assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _psll(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Lshr: {
+      assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _psrl(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Ashr: {
+      assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _psra(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Udiv:
+    case InstArithmetic::Urem:
+    case InstArithmetic::Sdiv:
+    case InstArithmetic::Srem:
+      llvm::report_fatal_error("Scalarized operation was expected");
+      break;
+    case InstArithmetic::Fadd: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _addps(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Fsub: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _subps(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Fmul: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _mulps(T, Src0 == Src1 ? T : Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Fdiv: {
+      Variable *T = makeReg(Ty);
+      _movp(T, Src0);
+      _divps(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Frem:
+      llvm::report_fatal_error("Scalarized operation was expected");
+      break;
+    }
+    return;
+  }
+  Variable *T_edx = nullptr;
+  Variable *T = nullptr;
+  switch (Instr->getOp()) {
+  case InstArithmetic::_num:
+    llvm_unreachable("Unknown arithmetic operator");
+    break;
+  case InstArithmetic::Add: {
+    const bool ValidType =
+        Ty == IceType_i32 || (Ty == IceType_i64 && Traits::Is64Bit);
+    auto *Const = llvm::dyn_cast<Constant>(Instr->getSrc(1));
+    const bool ValidKind =
+        Const != nullptr && (llvm::isa<ConstantInteger32>(Const) ||
+                             llvm::isa<ConstantRelocatable>(Const));
+    if (getFlags().getAggressiveLea() && ValidType && ValidKind) {
+      auto *Var = legalizeToReg(Src0);
+      auto *Mem = Traits::X86OperandMem::create(Func, IceType_void, Var, Const);
+      T = makeReg(Ty);
+      _lea(T, Mem);
+      _mov(Dest, T);
+      break;
+    }
+    _mov(T, Src0);
+    _add(T, Src1);
+    _mov(Dest, T);
+  } break;
+  case InstArithmetic::And:
+    _mov(T, Src0);
+    _and(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Or:
+    _mov(T, Src0);
+    _or(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Xor:
+    _mov(T, Src0);
+    _xor(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Sub:
+    _mov(T, Src0);
+    _sub(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Mul:
+    if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+      if (optimizeScalarMul(Dest, Src0, C->getValue()))
+        return;
+    }
+    // The 8-bit version of imul only allows the form "imul r/m8" where T must
+    // be in al.
+    if (isByteSizedArithType(Ty)) {
+      _mov(T, Src0, Traits::RegisterSet::Reg_al);
+      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+      _imul(T, Src0 == Src1 ? T : Src1);
+      _mov(Dest, T);
+    } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+      T = makeReg(Ty);
+      Src0 = legalize(Src0, Legal_Reg | Legal_Mem);
+      _imul_imm(T, Src0, ImmConst);
+      _mov(Dest, T);
+    } else {
+      _mov(T, Src0);
+      // No need to legalize Src1 to Reg | Mem because the Imm case is handled
+      // already by the ConstantInteger32 case above.
+      _imul(T, Src0 == Src1 ? T : Src1);
+      _mov(Dest, T);
+    }
+    break;
+  case InstArithmetic::Shl:
+    _mov(T, Src0);
+    if (!llvm::isa<ConstantInteger32>(Src1) &&
+        !llvm::isa<ConstantInteger64>(Src1))
+      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
+    _shl(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Lshr:
+    _mov(T, Src0);
+    if (!llvm::isa<ConstantInteger32>(Src1) &&
+        !llvm::isa<ConstantInteger64>(Src1))
+      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
+    _shr(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Ashr:
+    _mov(T, Src0);
+    if (!llvm::isa<ConstantInteger32>(Src1) &&
+        !llvm::isa<ConstantInteger64>(Src1))
+      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
+    _sar(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Udiv: {
+    // div and idiv are the few arithmetic operators that do not allow
+    // immediates as the operand.
+    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    RegNumT Eax;
+    RegNumT Edx;
+    switch (Ty) {
+    default:
+      llvm::report_fatal_error("Bad type for udiv");
+    case IceType_i64:
+      Eax = Traits::getRaxOrDie();
+      Edx = Traits::getRdxOrDie();
+      break;
+    case IceType_i32:
+      Eax = Traits::RegisterSet::Reg_eax;
+      Edx = Traits::RegisterSet::Reg_edx;
+      break;
+    case IceType_i16:
+      Eax = Traits::RegisterSet::Reg_ax;
+      Edx = Traits::RegisterSet::Reg_dx;
+      break;
+    case IceType_i8:
+      Eax = Traits::RegisterSet::Reg_al;
+      Edx = Traits::RegisterSet::Reg_ah;
+      break;
+    }
+    T_edx = makeReg(Ty, Edx);
+    _mov(T, Src0, Eax);
+    _mov(T_edx, Ctx->getConstantZero(Ty));
+    _div(T_edx, Src1, T);
+    _redefined(Context.insert<InstFakeDef>(T, T_edx));
+    _mov(Dest, T);
+  } break;
+  case InstArithmetic::Sdiv:
+    // TODO(stichnot): Enable this after doing better performance and cross
+    // testing.
+    if (false && Func->getOptLevel() >= Opt_1) {
+      // Optimize division by constant power of 2, but not for Om1 or O0, just
+      // to keep things simple there.
+      if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+        const int32_t Divisor = C->getValue();
+        const uint32_t UDivisor = Divisor;
+        if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
+          uint32_t LogDiv = llvm::Log2_32(UDivisor);
+          // LLVM does the following for dest=src/(1<<log):
+          //   t=src
+          //   sar t,typewidth-1 // -1 if src is negative, 0 if not
+          //   shr t,typewidth-log
+          //   add t,src
+          //   sar t,log
+          //   dest=t
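+          // E.g. (illustrative) for src = -7, log = 2 (divide by 4), with
+          // typewidth 32: sar gives -1, shr by 30 gives 3 (the rounding
+          // bias), add gives -4, and the final sar gives -1 == -7/4
+          // (truncating).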
+          uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
+          _mov(T, Src0);
+          // If for some reason we are dividing by 1, just treat it like an
+          // assignment.
+          if (LogDiv > 0) {
+            // The initial sar is unnecessary when dividing by 2.
+            if (LogDiv > 1)
+              _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
+            _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
+            _add(T, Src0);
+            _sar(T, Ctx->getConstantInt(Ty, LogDiv));
+          }
+          _mov(Dest, T);
+          return;
+        }
+      }
+    }
+    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    switch (Ty) {
+    default:
+      llvm::report_fatal_error("Bad type for sdiv");
+    case IceType_i64:
+      T_edx = makeReg(Ty, Traits::getRdxOrDie());
+      _mov(T, Src0, Traits::getRaxOrDie());
+      break;
+    case IceType_i32:
+      T_edx = makeReg(Ty, Traits::RegisterSet::Reg_edx);
+      _mov(T, Src0, Traits::RegisterSet::Reg_eax);
+      break;
+    case IceType_i16:
+      T_edx = makeReg(Ty, Traits::RegisterSet::Reg_dx);
+      _mov(T, Src0, Traits::RegisterSet::Reg_ax);
+      break;
+    case IceType_i8:
+      T_edx = makeReg(IceType_i16, Traits::RegisterSet::Reg_ax);
+      _mov(T, Src0, Traits::RegisterSet::Reg_al);
+      break;
+    }
+    _cbwdq(T_edx, T);
+    _idiv(T_edx, Src1, T);
+    _redefined(Context.insert<InstFakeDef>(T, T_edx));
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Urem: {
+    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    RegNumT Eax;
+    RegNumT Edx;
+    switch (Ty) {
+    default:
+      llvm::report_fatal_error("Bad type for urem");
+    case IceType_i64:
+      Eax = Traits::getRaxOrDie();
+      Edx = Traits::getRdxOrDie();
+      break;
+    case IceType_i32:
+      Eax = Traits::RegisterSet::Reg_eax;
+      Edx = Traits::RegisterSet::Reg_edx;
+      break;
+    case IceType_i16:
+      Eax = Traits::RegisterSet::Reg_ax;
+      Edx = Traits::RegisterSet::Reg_dx;
+      break;
+    case IceType_i8:
+      Eax = Traits::RegisterSet::Reg_al;
+      Edx = Traits::RegisterSet::Reg_ah;
+      break;
+    }
+    T_edx = makeReg(Ty, Edx);
+    _mov(T_edx, Ctx->getConstantZero(Ty));
+    _mov(T, Src0, Eax);
+    _div(T, Src1, T_edx);
+    _redefined(Context.insert<InstFakeDef>(T_edx, T));
+    if (Ty == IceType_i8) {
+      // Register ah must be moved into one of {al,bl,cl,dl} before it can be
+      // moved into a general 8-bit register.
+      auto *T_AhRcvr = makeReg(Ty);
+      T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
+      _mov(T_AhRcvr, T_edx);
+      T_edx = T_AhRcvr;
+    }
+    _mov(Dest, T_edx);
+  } break;
+  case InstArithmetic::Srem: {
+    // TODO(stichnot): Enable this after doing better performance and cross
+    // testing.
+    if (false && Func->getOptLevel() >= Opt_1) {
+      // Optimize mod by constant power of 2, but not for Om1 or O0, just to
+      // keep things simple there.
+      if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+        const int32_t Divisor = C->getValue();
+        const uint32_t UDivisor = Divisor;
+        if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
+          uint32_t LogDiv = llvm::Log2_32(UDivisor);
+          // LLVM does the following for dest=src%(1<<log):
+          //   t=src
+          //   sar t,typewidth-1 // -1 if src is negative, 0 if not
+          //   shr t,typewidth-log
+          //   add t,src
+          //   and t, -(1<<log)
+          //   sub t,src
+          //   neg t
+          //   dest=t
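+          // E.g. (illustrative) for src = -7, log = 2: t becomes 3 after the
+          // shr (rounding bias), -4 after the add, -4 after the and, 3 after
+          // the sub, and -3 after the neg, matching C's truncating -7 % 4.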
+          uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
+          // If for some reason we are dividing by 1, just assign 0.
+          if (LogDiv == 0) {
+            _mov(Dest, Ctx->getConstantZero(Ty));
+            return;
+          }
+          _mov(T, Src0);
+          // The initial sar is unnecessary when dividing by 2.
+          if (LogDiv > 1)
+            _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
+          _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
+          _add(T, Src0);
+          _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
+          _sub(T, Src0);
+          _neg(T);
+          _mov(Dest, T);
+          return;
+        }
+      }
+    }
+    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    RegNumT Eax;
+    RegNumT Edx;
+    switch (Ty) {
+    default:
+      llvm::report_fatal_error("Bad type for srem");
+    case IceType_i64:
+      Eax = Traits::getRaxOrDie();
+      Edx = Traits::getRdxOrDie();
+      break;
+    case IceType_i32:
+      Eax = Traits::RegisterSet::Reg_eax;
+      Edx = Traits::RegisterSet::Reg_edx;
+      break;
+    case IceType_i16:
+      Eax = Traits::RegisterSet::Reg_ax;
+      Edx = Traits::RegisterSet::Reg_dx;
+      break;
+    case IceType_i8:
+      Eax = Traits::RegisterSet::Reg_al;
+      Edx = Traits::RegisterSet::Reg_ah;
+      break;
+    }
+    T_edx = makeReg(Ty, Edx);
+    _mov(T, Src0, Eax);
+    _cbwdq(T_edx, T);
+    _idiv(T, Src1, T_edx);
+    _redefined(Context.insert<InstFakeDef>(T_edx, T));
+    if (Ty == IceType_i8) {
+      // Register ah must be moved into one of {al,bl,cl,dl} before it can be
+      // moved into a general 8-bit register.
+      auto *T_AhRcvr = makeReg(Ty);
+      T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
+      _mov(T_AhRcvr, T_edx);
+      T_edx = T_AhRcvr;
+    }
+    _mov(Dest, T_edx);
+  } break;
+  case InstArithmetic::Fadd:
+    _mov(T, Src0);
+    _addss(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Fsub:
+    _mov(T, Src0);
+    _subss(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Fmul:
+    _mov(T, Src0);
+    _mulss(T, Src0 == Src1 ? T : Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Fdiv:
+    _mov(T, Src0);
+    _divss(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Frem:
+    llvm::report_fatal_error("Helper call was expected");
+    break;
+  }
+}
+
+void TargetX8664::lowerAssign(const InstAssign *Instr) {
+  Variable *Dest = Instr->getDest();
+  if (Dest->isRematerializable()) {
+    Context.insert<InstFakeDef>(Dest);
+    return;
+  }
+  Operand *Src = Instr->getSrc(0);
+  assert(Dest->getType() == Src->getType());
+  lowerMove(Dest, Src, false);
+}
+
+void TargetX8664::lowerBr(const InstBr *Br) {
+  if (Br->isUnconditional()) {
+    _br(Br->getTargetUnconditional());
+    return;
+  }
+  Operand *Cond = Br->getCondition();
+
+  // Handle folding opportunities.
+  if (const Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
+    assert(Producer->isDeleted());
+    switch (BoolFolding::getProducerKind(Producer)) {
+    default:
+      break;
+    case BoolFolding::PK_Icmp32:
+    case BoolFolding::PK_Icmp64: {
+      lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Br);
+      return;
+    }
+    case BoolFolding::PK_Fcmp: {
+      lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Br);
+      return;
+    }
+    case BoolFolding::PK_Arith: {
+      lowerArithAndConsumer(llvm::cast<InstArithmetic>(Producer), Br);
+      return;
+    }
+    }
+  }
+  Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
+  Constant *Zero = Ctx->getConstantZero(IceType_i32);
+  _cmp(Src0, Zero);
+  _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
+}
+
+// constexprMax returns a (constexpr) max(S0, S1), and it is used for defining
+// OperandList in lowerCall. std::max() is supposed to work, but it is not
+// constexpr until C++14, so it cannot be used in a constant expression here.
+inline constexpr SizeT constexprMax(SizeT S0, SizeT S1) {
+  return S0 < S1 ? S1 : S0;
+}
+
+void TargetX8664::lowerCall(const InstCall *Instr) {
+  // Common x86 calling convention lowering:
+  //
+  // * At the point before the call, the stack must be aligned to 16 bytes.
+  //
+  // * Non-register arguments are pushed onto the stack in right-to-left order,
+  // such that the left-most argument ends up on the top of the stack at the
+  // lowest memory address.
+  //
+  // * Stack arguments of vector type are aligned to start at the next highest
+  // multiple of 16 bytes. Other stack arguments are aligned to the next word
+  // size boundary (4 bytes on x86-32, 8 bytes on x86-64).
+  RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
+                                            Traits::X86_STACK_ALIGNMENT_BYTES);
+
+  constexpr SizeT MaxOperands =
+      constexprMax(Traits::X86_MAX_XMM_ARGS, Traits::X86_MAX_GPR_ARGS);
+  using OperandList = llvm::SmallVector<Operand *, MaxOperands>;
+
+  OperandList XmmArgs;
+  llvm::SmallVector<SizeT, MaxOperands> XmmArgIndices;
+  CfgVector<std::pair<const Type, Operand *>> GprArgs;
+  CfgVector<SizeT> GprArgIndices;
+  OperandList StackArgs, StackArgLocations;
+  uint32_t ParameterAreaSizeBytes = 0;
+
+  ParameterAreaSizeBytes += getShadowStoreSize();
+
+  // Classify each argument operand according to the location where the argument
+  // is passed.
+  for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
+    Operand *Arg = Instr->getArg(i);
+    const Type Ty = Arg->getType();
+    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
+    assert(typeWidthInBytes(Ty) >= 4);
+    if (isVectorType(Ty) &&
+        Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgs.size()))
+            .hasValue()) {
+      XmmArgs.push_back(Arg);
+      XmmArgIndices.push_back(i);
+    } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
+               Traits::getRegisterForXmmArgNum(
+                   Traits::getArgIndex(i, XmmArgs.size()))
+                   .hasValue()) {
+      XmmArgs.push_back(Arg);
+      XmmArgIndices.push_back(i);
+    } else if (isScalarIntegerType(Ty) &&
+               Traits::getRegisterForGprArgNum(
+                   Ty, Traits::getArgIndex(i, GprArgs.size()))
+                   .hasValue()) {
+      GprArgs.emplace_back(Ty, Arg);
+      GprArgIndices.push_back(i);
+    } else {
+      // Place on stack.
+      StackArgs.push_back(Arg);
+      if (isVectorType(Arg->getType())) {
+        ParameterAreaSizeBytes =
+            Traits::applyStackAlignment(ParameterAreaSizeBytes);
+      }
+      Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
+      Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
+      StackArgLocations.push_back(
+          Traits::X86OperandMem::create(Func, Ty, esp, Loc));
+      ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
+    }
+  }
+  // Ensure there is enough space for the fstp/movs for floating returns.
+  Variable *Dest = Instr->getDest();
+  const Type DestTy = Dest ? Dest->getType() : IceType_void;
+  if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
+    if (isScalarFloatingType(DestTy)) {
+      ParameterAreaSizeBytes =
+          std::max(static_cast<size_t>(ParameterAreaSizeBytes),
+                   typeWidthInBytesOnStack(DestTy));
+    }
+  }
+  // Adjust the parameter area so that the stack is aligned. It is assumed that
+  // the stack is already aligned at the start of the calling sequence.
+  ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes);
+  assert(ParameterAreaSizeBytes <= maxOutArgsSizeBytes());
+  // Copy arguments that are passed on the stack to the appropriate stack
+  // locations.  We make sure legalize() is called on each argument at this
+  // point, to allow availabilityGet() to work.
+  for (SizeT i = 0, NumStackArgs = StackArgs.size(); i < NumStackArgs; ++i) {
+    lowerStore(
+        InstStore::create(Func, legalize(StackArgs[i]), StackArgLocations[i]));
+  }
+  // Copy arguments to be passed in registers to the appropriate registers.
+  for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
+    XmmArgs[i] = legalizeToReg(legalize(XmmArgs[i]),
+                               Traits::getRegisterForXmmArgNum(
+                                   Traits::getArgIndex(XmmArgIndices[i], i)));
+  }
+  // Materialize moves for arguments passed in GPRs.
+  for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
+    const Type SignatureTy = GprArgs[i].first;
+    Operand *Arg =
+        legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable);
+    GprArgs[i].second = legalizeToReg(
+        Arg, Traits::getRegisterForGprArgNum(
+                 Arg->getType(), Traits::getArgIndex(GprArgIndices[i], i)));
+    assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32);
+    assert(SignatureTy == Arg->getType());
+    (void)SignatureTy;
+  }
+  // Generate a FakeUse of register arguments so that they do not get dead code
+  // eliminated as a result of the FakeKill of scratch registers after the call.
+  // These need to be right before the call instruction.
+  for (auto *Arg : XmmArgs) {
+    Context.insert<InstFakeUse>(llvm::cast<Variable>(Arg));
+  }
+  for (auto &ArgPair : GprArgs) {
+    Context.insert<InstFakeUse>(llvm::cast<Variable>(ArgPair.second));
+  }
+  // Generate the call instruction. Assign its result to a temporary with high
+  // register allocation weight.
+  // ReturnReg doubles as ReturnRegLo as necessary.
+  Variable *ReturnReg = nullptr;
+  Variable *ReturnRegHi = nullptr;
+  if (Dest) {
+    switch (DestTy) {
+    case IceType_NUM:
+    case IceType_void:
+    case IceType_i1:
+    case IceType_i8:
+    case IceType_i16:
+      llvm::report_fatal_error("Invalid Call dest type");
+      break;
+    case IceType_i32:
+      ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_eax);
+      break;
+    case IceType_i64:
+      if (Traits::Is64Bit) {
+        ReturnReg = makeReg(IceType_i64, Traits::getRaxOrDie());
+      } else {
+        ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
+        ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
+      }
+      break;
+    case IceType_f32:
+    case IceType_f64:
+      if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
+        // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with
+        // the fstp instruction.
+        break;
+      }
+    // Fallthrough intended.
+    case IceType_v4i1:
+    case IceType_v8i1:
+    case IceType_v16i1:
+    case IceType_v16i8:
+    case IceType_v8i16:
+    case IceType_v4i32:
+    case IceType_v4f32:
+      ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_xmm0);
+      break;
+    }
+  }
+  // Emit the call to the function.
+  Operand *CallTarget =
+      legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm | Legal_AddrAbs);
+  size_t NumVariadicFpArgs = Instr->isVariadic() ? XmmArgs.size() : 0;
+  Inst *NewCall = emitCallToTarget(CallTarget, ReturnReg, NumVariadicFpArgs);
+  // Keep the upper return register live on 32-bit platform.
+  if (ReturnRegHi)
+    Context.insert<InstFakeDef>(ReturnRegHi);
+  // Mark the call as killing all the caller-save registers.
+  Context.insert<InstFakeKill>(NewCall);
+  // Generate a FakeUse to keep the call live if necessary.
+  if (Instr->hasSideEffects() && ReturnReg) {
+    Context.insert<InstFakeUse>(ReturnReg);
+  }
+  // Process the return value, if any.
+  if (Dest == nullptr)
+    return;
+  // Assign the result of the call to Dest.  Route it through a temporary so
+  // that the local register availability peephole can be subsequently used.
+  Variable *Tmp = nullptr;
+  if (isVectorType(DestTy)) {
+    assert(ReturnReg && "Vector type requires a return register");
+    Tmp = makeReg(DestTy);
+    _movp(Tmp, ReturnReg);
+    _movp(Dest, Tmp);
+  } else if (isScalarFloatingType(DestTy)) {
+    assert(ReturnReg && "FP type requires a return register");
+    _mov(Tmp, ReturnReg);
+    _mov(Dest, Tmp);
+  } else {
+    assert(isScalarIntegerType(DestTy));
+    assert(ReturnReg && "Integer type requires a return register");
+    if (DestTy == IceType_i64 && !Traits::Is64Bit) {
+      assert(ReturnRegHi && "64-bit type requires two return registers");
+      auto *Dest64On32 = llvm::cast<Variable64On32>(Dest);
+      Variable *DestLo = Dest64On32->getLo();
+      Variable *DestHi = Dest64On32->getHi();
+      _mov(Tmp, ReturnReg);
+      _mov(DestLo, Tmp);
+      Variable *TmpHi = nullptr;
+      _mov(TmpHi, ReturnRegHi);
+      _mov(DestHi, TmpHi);
+    } else {
+      _mov(Tmp, ReturnReg);
+      _mov(Dest, Tmp);
+    }
+  }
+}
+
+void TargetX8664::lowerCast(const InstCast *Instr) {
+  // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
+  InstCast::OpKind CastKind = Instr->getCastKind();
+  Variable *Dest = Instr->getDest();
+  Type DestTy = Dest->getType();
+  switch (CastKind) {
+  default:
+    Func->setError("Cast type not supported");
+    return;
+  case InstCast::Sext: {
+    // Src0RM is the source operand legalized to physical register or memory,
+    // but not immediate, since the relevant x86 native instructions don't
+    // allow an immediate operand. If the operand is an immediate, we could
+    // consider computing the strength-reduced result at translation time, but
+    // we're unlikely to see something like that in the bitcode that the
+    // optimizer wouldn't have already taken care of.
+    Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
+    if (isVectorType(DestTy)) {
+      if (DestTy == IceType_v16i8) {
+        // onemask = materialize(1,1,...); dst = (src & onemask) > 0
+        Variable *OneMask = makeVectorOfOnes(DestTy);
+        Variable *T = makeReg(DestTy);
+        _movp(T, Src0RM);
+        _pand(T, OneMask);
+        Variable *Zeros = makeVectorOfZeros(DestTy);
+        _pcmpgt(T, Zeros);
+        _movp(Dest, T);
+      } else {
+        // width = width(elty) - 1; dest = (src << width) >> width
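+        // Shifting each lane's low bit up into the sign position and
+        // arithmetically shifting back replicates that bit across the lane;
+        // e.g. for v8i16 the shift amount is 15, turning lane value 1 into
+        // 0xffff and 0 into 0x0000.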
+        SizeT ShiftAmount =
+            Traits::X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) -
+            1;
+        Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
+        Variable *T = makeReg(DestTy);
+        _movp(T, Src0RM);
+        _psll(T, ShiftConstant);
+        _psra(T, ShiftConstant);
+        _movp(Dest, T);
+      }
+    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
+      // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
+      Constant *Shift = Ctx->getConstantInt32(31);
+      auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+      auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      Variable *T_Lo = makeReg(DestLo->getType());
+      if (Src0RM->getType() == IceType_i32) {
+        _mov(T_Lo, Src0RM);
+      } else if (Src0RM->getType() == IceType_i1) {
+        _movzx(T_Lo, Src0RM);
+        _shl(T_Lo, Shift);
+        _sar(T_Lo, Shift);
+      } else {
+        _movsx(T_Lo, Src0RM);
+      }
+      _mov(DestLo, T_Lo);
+      Variable *T_Hi = nullptr;
+      _mov(T_Hi, T_Lo);
+      if (Src0RM->getType() != IceType_i1)
+        // For i1, the sar instruction is already done above.
+        _sar(T_Hi, Shift);
+      _mov(DestHi, T_Hi);
+    } else if (Src0RM->getType() == IceType_i1) {
+      // t1 = src
+      // shl t1, dst_bitwidth - 1
+      // sar t1, dst_bitwidth - 1
+      // dst = t1
+      size_t DestBits = Traits::X86_CHAR_BIT * typeWidthInBytes(DestTy);
+      Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
+      Variable *T = makeReg(DestTy);
+      if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) {
+        _mov(T, Src0RM);
+      } else {
+        // Widen the source using movsx or movzx. (It doesn't matter which one,
+        // since the following shl/sar overwrite the bits.)
+        _movzx(T, Src0RM);
+      }
+      _shl(T, ShiftAmount);
+      _sar(T, ShiftAmount);
+      _mov(Dest, T);
+    } else {
+      // t1 = movsx src; dst = t1
+      Variable *T = makeReg(DestTy);
+      _movsx(T, Src0RM);
+      _mov(Dest, T);
+    }
+    break;
+  }
+  case InstCast::Zext: {
+    Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
+    if (isVectorType(DestTy)) {
+      // onemask = materialize(1,1,...); dest = onemask & src
+      Variable *OneMask = makeVectorOfOnes(DestTy);
+      Variable *T = makeReg(DestTy);
+      _movp(T, Src0RM);
+      _pand(T, OneMask);
+      _movp(Dest, T);
+    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
+      // t1=movzx src; dst.lo=t1; dst.hi=0
+      Constant *Zero = Ctx->getConstantZero(IceType_i32);
+      auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+      auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      Variable *Tmp = makeReg(DestLo->getType());
+      if (Src0RM->getType() == IceType_i32) {
+        _mov(Tmp, Src0RM);
+      } else {
+        _movzx(Tmp, Src0RM);
+      }
+      _mov(DestLo, Tmp);
+      _mov(DestHi, Zero);
+    } else if (Src0RM->getType() == IceType_i1) {
+      // t = Src0RM; Dest = t
+      Variable *T = nullptr;
+      if (DestTy == IceType_i8) {
+        _mov(T, Src0RM);
+      } else {
+        assert(DestTy != IceType_i1);
+        assert(Traits::Is64Bit || DestTy != IceType_i64);
+        // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
+        // In x86-64 we need to widen T to 64 bits to ensure that T, if
+        // written to the stack (i.e., in -Om1), will be fully zero-extended.
+        T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32);
+        _movzx(T, Src0RM);
+      }
+      _mov(Dest, T);
+    } else {
+      // t1 = movzx src; dst = t1
+      Variable *T = makeReg(DestTy);
+      _movzx(T, Src0RM);
+      _mov(Dest, T);
+    }
+    break;
+  }
+  case InstCast::Trunc: {
+    if (isVectorType(DestTy)) {
+      // onemask = materialize(1,1,...); dst = src & onemask
+      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
+      Type Src0Ty = Src0RM->getType();
+      Variable *OneMask = makeVectorOfOnes(Src0Ty);
+      Variable *T = makeReg(DestTy);
+      _movp(T, Src0RM);
+      _pand(T, OneMask);
+      _movp(Dest, T);
+    } else if (DestTy == IceType_i1 || DestTy == IceType_i8) {
+      // Make sure we truncate from and into valid registers.
+      Operand *Src0 = legalizeUndef(Instr->getSrc(0));
+      if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
+        Src0 = loOperand(Src0);
+      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      Variable *T = copyToReg8(Src0RM);
+      if (DestTy == IceType_i1)
+        _and(T, Ctx->getConstantInt1(1));
+      _mov(Dest, T);
+    } else {
+      Operand *Src0 = legalizeUndef(Instr->getSrc(0));
+      if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
+        Src0 = loOperand(Src0);
+      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      // t1 = trunc Src0RM; Dest = t1
+      Variable *T = makeReg(DestTy);
+      _mov(T, Src0RM);
+      _mov(Dest, T);
+    }
+    break;
+  }
+  case InstCast::Fptrunc:
+  case InstCast::Fpext: {
+    Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
+    // t1 = cvt Src0RM; Dest = t1
+    Variable *T = makeReg(DestTy);
+    _cvt(T, Src0RM, Insts::Cvt::Float2float);
+    _mov(Dest, T);
+    break;
+  }
+  case InstCast::Fptosi:
+    if (isVectorType(DestTy)) {
+      assert(DestTy == IceType_v4i32);
+      assert(Instr->getSrc(0)->getType() == IceType_v4f32);
+      Operand *Src0R = legalizeToReg(Instr->getSrc(0));
+      Variable *T = makeReg(DestTy);
+      _cvt(T, Src0R, Insts::Cvt::Tps2dq);
+      _movp(Dest, T);
+    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
+      llvm::report_fatal_error("Helper call was expected");
+    } else {
+      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
+      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && DestTy == IceType_i64) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(DestTy != IceType_i64);
+        T_1 = makeReg(IceType_i32);
+      }
+      // cvt() requires its integer argument to be a GPR.
+      Variable *T_2 = makeReg(DestTy);
+      if (isByteSizedType(DestTy)) {
+        assert(T_1->getType() == IceType_i32);
+        T_1->setRegClass(RCX86_Is32To8);
+        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
+      }
+      _cvt(T_1, Src0RM, Insts::Cvt::Tss2si);
+      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
+      if (DestTy == IceType_i1)
+        _and(T_2, Ctx->getConstantInt1(1));
+      _mov(Dest, T_2);
+    }
+    break;
+  case InstCast::Fptoui:
+    if (isVectorType(DestTy)) {
+      llvm::report_fatal_error("Helper call was expected");
+    } else if (DestTy == IceType_i64 ||
+               (!Traits::Is64Bit && DestTy == IceType_i32)) {
+      llvm::report_fatal_error("Helper call was expected");
+    } else {
+      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
+      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
+      assert(DestTy != IceType_i64);
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && DestTy == IceType_i32) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(DestTy != IceType_i32);
+        T_1 = makeReg(IceType_i32);
+      }
+      Variable *T_2 = makeReg(DestTy);
+      if (isByteSizedType(DestTy)) {
+        assert(T_1->getType() == IceType_i32);
+        T_1->setRegClass(RCX86_Is32To8);
+        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
+      }
+      _cvt(T_1, Src0RM, Insts::Cvt::Tss2si);
+      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
+      if (DestTy == IceType_i1)
+        _and(T_2, Ctx->getConstantInt1(1));
+      _mov(Dest, T_2);
+    }
+    break;
+  case InstCast::Sitofp:
+    if (isVectorType(DestTy)) {
+      assert(DestTy == IceType_v4f32);
+      assert(Instr->getSrc(0)->getType() == IceType_v4i32);
+      Operand *Src0R = legalizeToReg(Instr->getSrc(0));
+      Variable *T = makeReg(DestTy);
+      _cvt(T, Src0R, Insts::Cvt::Dq2ps);
+      _movp(Dest, T);
+    } else if (!Traits::Is64Bit && Instr->getSrc(0)->getType() == IceType_i64) {
+      llvm::report_fatal_error("Helper call was expected");
+    } else {
+      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
+      // Sign-extend the operand.
+      // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && Src0RM->getType() == IceType_i64) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(Src0RM->getType() != IceType_i64);
+        T_1 = makeReg(IceType_i32);
+      }
+      Variable *T_2 = makeReg(DestTy);
+      if (Src0RM->getType() == T_1->getType())
+        _mov(T_1, Src0RM);
+      else
+        _movsx(T_1, Src0RM);
+      _cvt(T_2, T_1, Insts::Cvt::Si2ss);
+      _mov(Dest, T_2);
+    }
+    break;
+  case InstCast::Uitofp: {
+    Operand *Src0 = Instr->getSrc(0);
+    if (isVectorType(Src0->getType())) {
+      llvm::report_fatal_error("Helper call was expected");
+    } else if (Src0->getType() == IceType_i64 ||
+               (!Traits::Is64Bit && Src0->getType() == IceType_i32)) {
+      llvm::report_fatal_error("Helper call was expected");
+    } else {
+      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      // Zero-extend the operand.
+      // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && Src0RM->getType() == IceType_i32) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(Src0RM->getType() != IceType_i64);
+        assert(Traits::Is64Bit || Src0RM->getType() != IceType_i32);
+        T_1 = makeReg(IceType_i32);
+      }
+      Variable *T_2 = makeReg(DestTy);
+      if (Src0RM->getType() == T_1->getType())
+        _mov(T_1, Src0RM);
+      else
+        _movzx(T_1, Src0RM)->setMustKeep();
+      _cvt(T_2, T_1, Insts::Cvt::Si2ss);
+      _mov(Dest, T_2);
+    }
+    break;
+  }
+  case InstCast::Bitcast: {
+    Operand *Src0 = Instr->getSrc(0);
+    if (DestTy == Src0->getType()) {
+      auto *Assign = InstAssign::create(Func, Dest, Src0);
+      lowerAssign(Assign);
+      return;
+    }
+    switch (DestTy) {
+    default:
+      llvm_unreachable("Unexpected Bitcast dest type");
+    case IceType_i8: {
+      llvm::report_fatal_error("Helper call was expected");
+    } break;
+    case IceType_i16: {
+      llvm::report_fatal_error("Helper call was expected");
+    } break;
+    case IceType_i32:
+    case IceType_f32: {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *T = makeReg(DestTy);
+      _movd(T, Src0R);
+      _mov(Dest, T);
+    } break;
+    case IceType_i64: {
+      assert(Src0->getType() == IceType_f64);
+      if (Traits::Is64Bit) {
+        Variable *Src0R = legalizeToReg(Src0);
+        Variable *T = makeReg(IceType_i64);
+        _movd(T, Src0R);
+        _mov(Dest, T);
+      } else {
+        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+        // a.i64 = bitcast b.f64 ==>
+        //   s.f64 = spill b.f64
+        //   t_lo.i32 = lo(s.f64)
+        //   a_lo.i32 = t_lo.i32
+        //   t_hi.i32 = hi(s.f64)
+        //   a_hi.i32 = t_hi.i32
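+        // (On x86-32 there is no direct way to move both halves of an XMM
+        // register into a GPR pair, so the value is bounced through memory.)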
+        Operand *SpillLo, *SpillHi;
+        if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
+          Variable *Spill = Func->makeVariable(IceType_f64);
+          Spill->setLinkedTo(Src0Var);
+          Spill->setMustNotHaveReg();
+          _movq(Spill, Src0RM);
+          SpillLo = Traits::VariableSplit::create(Func, Spill,
+                                                  Traits::VariableSplit::Low);
+          SpillHi = Traits::VariableSplit::create(Func, Spill,
+                                                  Traits::VariableSplit::High);
+        } else {
+          SpillLo = loOperand(Src0RM);
+          SpillHi = hiOperand(Src0RM);
+        }
+
+        auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+        auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+        Variable *T_Lo = makeReg(IceType_i32);
+        Variable *T_Hi = makeReg(IceType_i32);
+
+        _mov(T_Lo, SpillLo);
+        _mov(DestLo, T_Lo);
+        _mov(T_Hi, SpillHi);
+        _mov(DestHi, T_Hi);
+      }
+    } break;
+    case IceType_f64: {
+      assert(Src0->getType() == IceType_i64);
+      if (Traits::Is64Bit) {
+        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+        Variable *T = makeReg(IceType_f64);
+        _movd(T, Src0RM);
+        _mov(Dest, T);
+      } else {
+        Src0 = legalize(Src0);
+        if (llvm::isa<X86OperandMem>(Src0)) {
+          Variable *T = makeReg(DestTy);
+          _movq(T, Src0);
+          _movq(Dest, T);
+          break;
+        }
+        // a.f64 = bitcast b.i64 ==>
+        //   t_lo.i32 = b_lo.i32
+        //   FakeDef(s.f64)
+        //   lo(s.f64) = t_lo.i32
+        //   t_hi.i32 = b_hi.i32
+        //   hi(s.f64) = t_hi.i32
+        //   a.f64 = s.f64
+        Variable *Spill = Func->makeVariable(IceType_f64);
+        Spill->setLinkedTo(Dest);
+        Spill->setMustNotHaveReg();
+
+        Variable *T_Lo = nullptr, *T_Hi = nullptr;
+        auto *SpillLo = Traits::VariableSplit::create(
+            Func, Spill, Traits::VariableSplit::Low);
+        auto *SpillHi = Traits::VariableSplit::create(
+            Func, Spill, Traits::VariableSplit::High);
+        _mov(T_Lo, loOperand(Src0));
+        // Technically, the Spill is defined after the _store happens, but
+        // SpillLo is considered a "use" of Spill so define Spill before it is
+        // used.
+        Context.insert<InstFakeDef>(Spill);
+        _store(T_Lo, SpillLo);
+        _mov(T_Hi, hiOperand(Src0));
+        _store(T_Hi, SpillHi);
+        _movq(Dest, Spill);
+      }
+    } break;
+    case IceType_v8i1: {
+      llvm::report_fatal_error("Helper call was expected");
+    } break;
+    case IceType_v16i1: {
+      llvm::report_fatal_error("Helper call was expected");
+    } break;
+    case IceType_v8i16:
+    case IceType_v16i8:
+    case IceType_v4i32:
+    case IceType_v4f32: {
+      if (Src0->getType() == IceType_i32) {
+        // Bitcast requires equal type sizes, which isn't strictly the case
+        // between scalars and vectors, but to emulate v4i8 vectors one has to
+        // use v16i8 vectors.
+        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+        Variable *T = makeReg(DestTy);
+        _movd(T, Src0RM);
+        _mov(Dest, T);
+      } else {
+        _movp(Dest, legalizeToReg(Src0));
+      }
+    } break;
+    }
+    break;
+  }
+  }
+}
+
+void TargetX8664::lowerExtractElement(const InstExtractElement *Instr) {
+  Operand *SourceVectNotLegalized = Instr->getSrc(0);
+  auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(1));
+  // Only constant indices are allowed in PNaCl IR.
+  assert(ElementIndex);
+
+  unsigned Index = ElementIndex->getValue();
+  Type Ty = SourceVectNotLegalized->getType();
+  Type ElementTy = typeElementType(Ty);
+  Type InVectorElementTy = Traits::getInVectorElementType(Ty);
+
+  // TODO(wala): Determine the best lowering sequences for each type.
+  bool CanUsePextr = Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
+                     (InstructionSet >= SSE4_1 && Ty != IceType_v4f32);
+  Variable *ExtractedElementR =
+      makeReg(CanUsePextr ? IceType_i32 : InVectorElementTy);
+  if (CanUsePextr) {
+    // Use pextrb, pextrw, or pextrd.  The "b" and "w" versions clear the upper
+    // bits of the destination register, so we represent this by always
+    // extracting into an i32 register.  The _mov into Dest below will do
+    // truncation as necessary.
+    Constant *Mask = Ctx->getConstantInt32(Index);
+    Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized);
+    _pextr(ExtractedElementR, SourceVectR, Mask);
+  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
+    // Use pshufd and movd/movss.
+    Variable *T = nullptr;
+    if (Index) {
+      // The shuffle only needs to occur if the element to be extracted is not
+      // at the lowest index.
+      Constant *Mask = Ctx->getConstantInt32(Index);
+      T = makeReg(Ty);
+      _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
+    } else {
+      T = legalizeToReg(SourceVectNotLegalized);
+    }
+
+    if (InVectorElementTy == IceType_i32) {
+      _movd(ExtractedElementR, T);
+    } else { // InVectorElementTy == IceType_f32
+      // TODO(wala): _movss is only used here because _mov does not allow a
+      // vector source and a scalar destination.  _mov should be able to be
+      // used here.
+      // _movss is a binary instruction, so the FakeDef is needed to keep the
+      // live range analysis consistent.
+      Context.insert<InstFakeDef>(ExtractedElementR);
+      _movss(ExtractedElementR, T);
+    }
+  } else {
+    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
+    // Spill the value to a stack slot and do the extraction in memory.
+    //
+    // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
+    // for legalizing to mem is implemented.
+    Variable *Slot = Func->makeVariable(Ty);
+    Slot->setMustNotHaveReg();
+    _movp(Slot, legalizeToReg(SourceVectNotLegalized));
+
+    // Compute the location of the element in memory.
+    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
+    X86OperandMem *Loc =
+        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
+    _mov(ExtractedElementR, Loc);
+  }
+
+  if (ElementTy == IceType_i1) {
+    // Truncate extracted integers to i1s if necessary.
+    Variable *T = makeReg(IceType_i1);
+    InstCast *Cast =
+        InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
+    lowerCast(Cast);
+    ExtractedElementR = T;
+  }
+
+  // Copy the element to the destination.
+  Variable *Dest = Instr->getDest();
+  _mov(Dest, ExtractedElementR);
+}
+
+void TargetX8664::lowerFcmp(const InstFcmp *Fcmp) {
+  Variable *Dest = Fcmp->getDest();
+
+  if (isVectorType(Dest->getType())) {
+    lowerFcmpVector(Fcmp);
+  } else {
+    constexpr Inst *Consumer = nullptr;
+    lowerFcmpAndConsumer(Fcmp, Consumer);
+  }
+}
+
+void TargetX8664::lowerFcmpAndConsumer(const InstFcmp *Fcmp,
+                                       const Inst *Consumer) {
+  Operand *Src0 = Fcmp->getSrc(0);
+  Operand *Src1 = Fcmp->getSrc(1);
+  Variable *Dest = Fcmp->getDest();
+
+  if (Consumer != nullptr) {
+    if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
+      if (lowerOptimizeFcmpSelect(Fcmp, Select))
+        return;
+    }
+  }
+
+  if (isVectorType(Dest->getType())) {
+    lowerFcmp(Fcmp);
+    if (Consumer != nullptr)
+      lowerSelectVector(llvm::cast<InstSelect>(Consumer));
+    return;
+  }
+
+  // Lowering a = fcmp cond, b, c
+  //   ucomiss b, c       /* only if C1 != Br_None */
+  //                      /* but swap b,c order if SwapOperands==true */
+  //   mov a, <default>
+  //   j<C1> label        /* only if C1 != Br_None */
+  //   j<C2> label        /* only if C2 != Br_None */
+  //   FakeUse(a)         /* only if C1 != Br_None */
+  //   mov a, !<default>  /* only if C1 != Br_None */
+  //   label:             /* only if C1 != Br_None */
+  //
+  // setcc lowering when C1 != Br_None && C2 == Br_None:
+  //   ucomiss b, c       /* but swap b,c order if SwapOperands==true */
+  //   setcc a, C1
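+  //
+  // For reference: ucomiss sets ZF/PF/CF (and clears OF/SF/AF), and an
+  // unordered result (a NaN operand) sets all three, which is why some
+  // conditions need the second branch C2 in addition to C1.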
+  InstFcmp::FCond Condition = Fcmp->getCondition();
+  assert(static_cast<size_t>(Condition) < Traits::TableFcmpSize);
+  if (Traits::TableFcmp[Condition].SwapScalarOperands)
+    std::swap(Src0, Src1);
+  const bool HasC1 = (Traits::TableFcmp[Condition].C1 != CondX86::Br_None);
+  const bool HasC2 = (Traits::TableFcmp[Condition].C2 != CondX86::Br_None);
+  if (HasC1) {
+    Src0 = legalize(Src0);
+    Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    Variable *T = nullptr;
+    _mov(T, Src0);
+    _ucomiss(T, Src1RM);
+    if (!HasC2) {
+      assert(Traits::TableFcmp[Condition].Default);
+      setccOrConsumer(Traits::TableFcmp[Condition].C1, Dest, Consumer);
+      return;
+    }
+  }
+  int32_t IntDefault = Traits::TableFcmp[Condition].Default;
+  if (Consumer == nullptr) {
+    Constant *Default = Ctx->getConstantInt(Dest->getType(), IntDefault);
+    _mov(Dest, Default);
+    if (HasC1) {
+      InstX86Label *Label = InstX86Label::create(Func, this);
+      _br(Traits::TableFcmp[Condition].C1, Label);
+      if (HasC2) {
+        _br(Traits::TableFcmp[Condition].C2, Label);
+      }
+      Constant *NonDefault = Ctx->getConstantInt(Dest->getType(), !IntDefault);
+      _redefined(_mov(Dest, NonDefault));
+      Context.insert(Label);
+    }
+    return;
+  }
+  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
+    CfgNode *TrueSucc = Br->getTargetTrue();
+    CfgNode *FalseSucc = Br->getTargetFalse();
+    if (IntDefault != 0)
+      std::swap(TrueSucc, FalseSucc);
+    if (HasC1) {
+      _br(Traits::TableFcmp[Condition].C1, FalseSucc);
+      if (HasC2) {
+        _br(Traits::TableFcmp[Condition].C2, FalseSucc);
+      }
+      _br(TrueSucc);
+      return;
+    }
+    _br(FalseSucc);
+    return;
+  }
+  if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
+    Operand *SrcT = Select->getTrueOperand();
+    Operand *SrcF = Select->getFalseOperand();
+    Variable *SelectDest = Select->getDest();
+    if (IntDefault != 0)
+      std::swap(SrcT, SrcF);
+    lowerMove(SelectDest, SrcF, false);
+    if (HasC1) {
+      InstX86Label *Label = InstX86Label::create(Func, this);
+      _br(Traits::TableFcmp[Condition].C1, Label);
+      if (HasC2) {
+        _br(Traits::TableFcmp[Condition].C2, Label);
+      }
+      static constexpr bool IsRedefinition = true;
+      lowerMove(SelectDest, SrcT, IsRedefinition);
+      Context.insert(Label);
+    }
+    return;
+  }
+  llvm::report_fatal_error("Unexpected consumer type");
+}
+
+void TargetX8664::lowerFcmpVector(const InstFcmp *Fcmp) {
+  Operand *Src0 = Fcmp->getSrc(0);
+  Operand *Src1 = Fcmp->getSrc(1);
+  Variable *Dest = Fcmp->getDest();
+
+  if (!isVectorType(Dest->getType()))
+    llvm::report_fatal_error("Expected vector compare");
+
+  InstFcmp::FCond Condition = Fcmp->getCondition();
+  assert(static_cast<size_t>(Condition) < Traits::TableFcmpSize);
+
+  if (Traits::TableFcmp[Condition].SwapVectorOperands)
+    std::swap(Src0, Src1);
+
+  Variable *T = nullptr;
+
+  if (Condition == InstFcmp::True) {
+    // makeVectorOfMinusOnes() requires an integer vector type.
+    T = makeVectorOfMinusOnes(IceType_v4i32);
+  } else if (Condition == InstFcmp::False) {
+    T = makeVectorOfZeros(Dest->getType());
+  } else {
+    Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    if (llvm::isa<X86OperandMem>(Src1RM))
+      Src1RM = legalizeToReg(Src1RM);
+
+    switch (Condition) {
+    default: {
+      const CmppsCond Predicate = Traits::TableFcmp[Condition].Predicate;
+      assert(Predicate != CondX86::Cmpps_Invalid);
+      T = makeReg(Src0RM->getType());
+      _movp(T, Src0RM);
+      _cmpps(T, Src1RM, Predicate);
+    } break;
+    case InstFcmp::One: {
+      // Check both unequal and ordered.
+      T = makeReg(Src0RM->getType());
+      Variable *T2 = makeReg(Src0RM->getType());
+      _movp(T, Src0RM);
+      _cmpps(T, Src1RM, CondX86::Cmpps_neq);
+      _movp(T2, Src0RM);
+      _cmpps(T2, Src1RM, CondX86::Cmpps_ord);
+      _pand(T, T2);
+    } break;
+    case InstFcmp::Ueq: {
+      // Check both equal or unordered.
+      T = makeReg(Src0RM->getType());
+      Variable *T2 = makeReg(Src0RM->getType());
+      _movp(T, Src0RM);
+      _cmpps(T, Src1RM, CondX86::Cmpps_eq);
+      _movp(T2, Src0RM);
+      _cmpps(T2, Src1RM, CondX86::Cmpps_unord);
+      _por(T, T2);
+    } break;
+    }
+  }
+
+  assert(T != nullptr);
+  _movp(Dest, T);
+  eliminateNextVectorSextInstruction(Dest);
+}
+
+inline bool isZero(const Operand *Opnd) {
+  if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Opnd))
+    return C64->getValue() == 0;
+  if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(Opnd))
+    return C32->getValue() == 0;
+  return false;
+}
+
+void TargetX8664::lowerIcmpAndConsumer(const InstIcmp *Icmp,
+                                       const Inst *Consumer) {
+  Operand *Src0 = legalize(Icmp->getSrc(0));
+  Operand *Src1 = legalize(Icmp->getSrc(1));
+  Variable *Dest = Icmp->getDest();
+
+  if (isVectorType(Dest->getType())) {
+    lowerIcmp(Icmp);
+    if (Consumer != nullptr)
+      lowerSelectVector(llvm::cast<InstSelect>(Consumer));
+    return;
+  }
+
+  if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
+    lowerIcmp64(Icmp, Consumer);
+    return;
+  }
+
+  // cmp b, c
+  if (isZero(Src1)) {
+    switch (Icmp->getCondition()) {
+    default:
+      break;
+    case InstIcmp::Uge:
+      movOrConsumer(true, Dest, Consumer);
+      return;
+    case InstIcmp::Ult:
+      movOrConsumer(false, Dest, Consumer);
+      return;
+    }
+  }
+  Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
+  _cmp(Src0RM, Src1);
+  setccOrConsumer(Traits::getIcmp32Mapping(Icmp->getCondition()), Dest,
+                  Consumer);
+}
+
+void TargetX8664::lowerIcmpVector(const InstIcmp *Icmp) {
+  Operand *Src0 = legalize(Icmp->getSrc(0));
+  Operand *Src1 = legalize(Icmp->getSrc(1));
+  Variable *Dest = Icmp->getDest();
+
+  if (!isVectorType(Dest->getType()))
+    llvm::report_fatal_error("Expected a vector compare");
+
+  Type Ty = Src0->getType();
+  // Promote i1 vectors to 128-bit integer vector types.
+  if (typeElementType(Ty) == IceType_i1) {
+    Type NewTy = IceType_NUM;
+    switch (Ty) {
+    default:
+      llvm::report_fatal_error("unexpected type");
+      break;
+    case IceType_v4i1:
+      NewTy = IceType_v4i32;
+      break;
+    case IceType_v8i1:
+      NewTy = IceType_v8i16;
+      break;
+    case IceType_v16i1:
+      NewTy = IceType_v16i8;
+      break;
+    }
+    Variable *NewSrc0 = Func->makeVariable(NewTy);
+    Variable *NewSrc1 = Func->makeVariable(NewTy);
+    lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
+    lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
+    Src0 = NewSrc0;
+    Src1 = NewSrc1;
+    Ty = NewTy;
+  }
+
+  InstIcmp::ICond Condition = Icmp->getCondition();
+
+  Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+  Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+
+  // SSE2 only has signed comparison operations. Transform unsigned inputs so
+  // that signed comparisons can be used instead, by flipping the high-order
+  // bit of each lane.
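+  // E.g., for 32-bit lanes: x <u y  <=>  (x ^ 0x80000000) <s (y ^ 0x80000000).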
+  if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
+      Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
+    Variable *T0 = makeReg(Ty);
+    Variable *T1 = makeReg(Ty);
+    Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
+    _movp(T0, Src0RM);
+    _pxor(T0, HighOrderBits);
+    _movp(T1, Src1RM);
+    _pxor(T1, HighOrderBits);
+    Src0RM = T0;
+    Src1RM = T1;
+  }
+
+  Variable *T = makeReg(Ty);
+  switch (Condition) {
+  default:
+    llvm_unreachable("unexpected condition");
+    break;
+  case InstIcmp::Eq: {
+    if (llvm::isa<X86OperandMem>(Src1RM))
+      Src1RM = legalizeToReg(Src1RM);
+    _movp(T, Src0RM);
+    _pcmpeq(T, Src1RM);
+  } break;
+  case InstIcmp::Ne: {
+    if (llvm::isa<X86OperandMem>(Src1RM))
+      Src1RM = legalizeToReg(Src1RM);
+    _movp(T, Src0RM);
+    _pcmpeq(T, Src1RM);
+    Variable *MinusOne = makeVectorOfMinusOnes(Ty);
+    _pxor(T, MinusOne);
+  } break;
+  case InstIcmp::Ugt:
+  case InstIcmp::Sgt: {
+    if (llvm::isa<X86OperandMem>(Src1RM))
+      Src1RM = legalizeToReg(Src1RM);
+    _movp(T, Src0RM);
+    _pcmpgt(T, Src1RM);
+  } break;
+  case InstIcmp::Uge:
+  case InstIcmp::Sge: {
+    // !(Src1RM > Src0RM)
+    if (llvm::isa<X86OperandMem>(Src0RM))
+      Src0RM = legalizeToReg(Src0RM);
+    _movp(T, Src1RM);
+    _pcmpgt(T, Src0RM);
+    Variable *MinusOne = makeVectorOfMinusOnes(Ty);
+    _pxor(T, MinusOne);
+  } break;
+  case InstIcmp::Ult:
+  case InstIcmp::Slt: {
+    if (llvm::isa<X86OperandMem>(Src0RM))
+      Src0RM = legalizeToReg(Src0RM);
+    _movp(T, Src1RM);
+    _pcmpgt(T, Src0RM);
+  } break;
+  case InstIcmp::Ule:
+  case InstIcmp::Sle: {
+    // !(Src0RM > Src1RM)
+    if (llvm::isa<X86OperandMem>(Src1RM))
+      Src1RM = legalizeToReg(Src1RM);
+    _movp(T, Src0RM);
+    _pcmpgt(T, Src1RM);
+    Variable *MinusOne = makeVectorOfMinusOnes(Ty);
+    _pxor(T, MinusOne);
+  } break;
+  }
+
+  _movp(Dest, T);
+  eliminateNextVectorSextInstruction(Dest);
+}
+
+template <typename T>
+typename std::enable_if<!T::Is64Bit, void>::type
+TargetX8664::lowerIcmp64(const InstIcmp *Icmp, const Inst *Consumer) {
+  // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
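+  //
+  // For 64-bit operands this becomes a two-step compare, roughly:
+  //   cmp b.hi, c.hi;  br C1, L_true;  br C2, L_false
+  //   cmp b.lo, c.lo;  br C3, L_true
+  // with C1/C2/C3 taken from TableIcmp64.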
+  Operand *Src0 = legalize(Icmp->getSrc(0));
+  Operand *Src1 = legalize(Icmp->getSrc(1));
+  Variable *Dest = Icmp->getDest();
+  InstIcmp::ICond Condition = Icmp->getCondition();
+  assert(static_cast<size_t>(Condition) < Traits::TableIcmp64Size);
+  Operand *Src0LoRM = nullptr;
+  Operand *Src0HiRM = nullptr;
+  // Legalize the portions of Src0 that are going to be needed.
+  if (isZero(Src1)) {
+    switch (Condition) {
+    default:
+      llvm_unreachable("unexpected condition");
+      break;
+    // These two are not optimized, so we fall through to the general case,
+    // which needs the upper and lower halves legalized.
+    case InstIcmp::Sgt:
+    case InstIcmp::Sle:
+    // These four compare after performing an "or" of the high and low half, so
+    // they need the upper and lower halves legalized.
+    case InstIcmp::Eq:
+    case InstIcmp::Ule:
+    case InstIcmp::Ne:
+    case InstIcmp::Ugt:
+      Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
+    // These two test only the high half's sign bit, so they need only
+    // the upper half legalized.
+    case InstIcmp::Sge:
+    case InstIcmp::Slt:
+      Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
+      break;
+
+    // These two move constants and hence need no legalization.
+    case InstIcmp::Uge:
+    case InstIcmp::Ult:
+      break;
+    }
+  } else {
+    Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
+    Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
+  }
+  // Optimize comparisons with zero.
+  if (isZero(Src1)) {
+    Constant *SignMask = Ctx->getConstantInt32(0x80000000);
+    Variable *Temp = nullptr;
+    switch (Condition) {
+    default:
+      llvm_unreachable("unexpected condition");
+      break;
+    case InstIcmp::Eq:
+    case InstIcmp::Ule:
+      // Move Src0HiRM first because it was legalized most recently; this
+      // sometimes avoids a move before the OR.
+      _mov(Temp, Src0HiRM);
+      _or(Temp, Src0LoRM);
+      Context.insert<InstFakeUse>(Temp);
+      setccOrConsumer(CondX86::Br_e, Dest, Consumer);
+      return;
+    case InstIcmp::Ne:
+    case InstIcmp::Ugt:
+      // Move Src0HiRM first because it was legalized most recently; this
+      // sometimes avoids a move before the OR.
+      _mov(Temp, Src0HiRM);
+      _or(Temp, Src0LoRM);
+      Context.insert<InstFakeUse>(Temp);
+      setccOrConsumer(CondX86::Br_ne, Dest, Consumer);
+      return;
+    case InstIcmp::Uge:
+      movOrConsumer(true, Dest, Consumer);
+      return;
+    case InstIcmp::Ult:
+      movOrConsumer(false, Dest, Consumer);
+      return;
+    case InstIcmp::Sgt:
+      break;
+    case InstIcmp::Sge:
+      _test(Src0HiRM, SignMask);
+      setccOrConsumer(CondX86::Br_e, Dest, Consumer);
+      return;
+    case InstIcmp::Slt:
+      _test(Src0HiRM, SignMask);
+      setccOrConsumer(CondX86::Br_ne, Dest, Consumer);
+      return;
+    case InstIcmp::Sle:
+      break;
+    }
+  }
+  // Handle general compares.
+  Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
+  Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
+  if (Consumer == nullptr) {
+    Constant *Zero = Ctx->getConstantInt(Dest->getType(), 0);
+    Constant *One = Ctx->getConstantInt(Dest->getType(), 1);
+    InstX86Label *LabelFalse = InstX86Label::create(Func, this);
+    InstX86Label *LabelTrue = InstX86Label::create(Func, this);
+    _mov(Dest, One);
+    _cmp(Src0HiRM, Src1HiRI);
+    if (Traits::TableIcmp64[Condition].C1 != CondX86::Br_None)
+      _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
+    if (Traits::TableIcmp64[Condition].C2 != CondX86::Br_None)
+      _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
+    _cmp(Src0LoRM, Src1LoRI);
+    _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
+    Context.insert(LabelFalse);
+    _redefined(_mov(Dest, Zero));
+    Context.insert(LabelTrue);
+    return;
+  }
+  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
+    _cmp(Src0HiRM, Src1HiRI);
+    if (Traits::TableIcmp64[Condition].C1 != CondX86::Br_None)
+      _br(Traits::TableIcmp64[Condition].C1, Br->getTargetTrue());
+    if (Traits::TableIcmp64[Condition].C2 != CondX86::Br_None)
+      _br(Traits::TableIcmp64[Condition].C2, Br->getTargetFalse());
+    _cmp(Src0LoRM, Src1LoRI);
+    _br(Traits::TableIcmp64[Condition].C3, Br->getTargetTrue(),
+        Br->getTargetFalse());
+    return;
+  }
+  if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
+    Operand *SrcT = Select->getTrueOperand();
+    Operand *SrcF = Select->getFalseOperand();
+    Variable *SelectDest = Select->getDest();
+    InstX86Label *LabelFalse = InstX86Label::create(Func, this);
+    InstX86Label *LabelTrue = InstX86Label::create(Func, this);
+    lowerMove(SelectDest, SrcT, false);
+    _cmp(Src0HiRM, Src1HiRI);
+    if (Traits::TableIcmp64[Condition].C1 != CondX86::Br_None)
+      _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
+    if (Traits::TableIcmp64[Condition].C2 != CondX86::Br_None)
+      _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
+    _cmp(Src0LoRM, Src1LoRI);
+    _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
+    Context.insert(LabelFalse);
+    static constexpr bool IsRedefinition = true;
+    lowerMove(SelectDest, SrcF, IsRedefinition);
+    Context.insert(LabelTrue);
+    return;
+  }
+  llvm::report_fatal_error("Unexpected consumer type");
+}
+
+void TargetX8664::setccOrConsumer(BrCond Condition, Variable *Dest,
+                                  const Inst *Consumer) {
+  if (Consumer == nullptr) {
+    _setcc(Dest, Condition);
+    return;
+  }
+  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
+    _br(Condition, Br->getTargetTrue(), Br->getTargetFalse());
+    return;
+  }
+  if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
+    Operand *SrcT = Select->getTrueOperand();
+    Operand *SrcF = Select->getFalseOperand();
+    Variable *SelectDest = Select->getDest();
+    lowerSelectMove(SelectDest, Condition, SrcT, SrcF);
+    return;
+  }
+  llvm::report_fatal_error("Unexpected consumer type");
+}
+
+void TargetX8664::movOrConsumer(bool IcmpResult, Variable *Dest,
+                                const Inst *Consumer) {
+  if (Consumer == nullptr) {
+    _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
+    return;
+  }
+  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
+    // TODO(sehr,stichnot): This could be done with a single unconditional
+    // branch instruction, but subzero doesn't currently know how to handle
+    // the resulting control flow graph changes. Teach it to do so, to
+    // eliminate the mov and cmp.
+    _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
+    _cmp(Dest, Ctx->getConstantInt(Dest->getType(), 0));
+    _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
+    return;
+  }
+  if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
+    Operand *Src = nullptr;
+    if (IcmpResult) {
+      Src = legalize(Select->getTrueOperand(), Legal_Reg | Legal_Imm);
+    } else {
+      Src = legalize(Select->getFalseOperand(), Legal_Reg | Legal_Imm);
+    }
+    Variable *SelectDest = Select->getDest();
+    lowerMove(SelectDest, Src, false);
+    return;
+  }
+  llvm::report_fatal_error("Unexpected consumer type");
+}
+
+void TargetX8664::lowerArithAndConsumer(const InstArithmetic *Arith,
+                                        const Inst *Consumer) {
+  Variable *T = nullptr;
+  Operand *Src0 = legalize(Arith->getSrc(0));
+  Operand *Src1 = legalize(Arith->getSrc(1));
+  Variable *Dest = Arith->getDest();
+  switch (Arith->getOp()) {
+  default:
+    llvm_unreachable("arithmetic operator not AND or OR");
+    break;
+  case InstArithmetic::And:
+    _mov(T, Src0);
+    // Test cannot have an address in the second position.  Since T is
+    // guaranteed to be a register and Src1 could be a memory load, ensure
+    // that the second argument is a register.
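+    // (test just ANDs its operands to set flags and is commutative, so
+    // swapping the operand order is safe.)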
+    if (llvm::isa<Constant>(Src1))
+      _test(T, Src1);
+    else
+      _test(Src1, T);
+    break;
+  case InstArithmetic::Or:
+    _mov(T, Src0);
+    _or(T, Src1);
+    break;
+  }
+
+  if (Consumer == nullptr) {
+    llvm::report_fatal_error("Expected a consumer instruction");
+  }
+  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
+    Context.insert<InstFakeUse>(T);
+    Context.insert<InstFakeDef>(Dest);
+    _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
+    return;
+  }
+  llvm::report_fatal_error("Unexpected consumer type");
+}
+
+void TargetX8664::lowerInsertElement(const InstInsertElement *Instr) {
+  Operand *SourceVectNotLegalized = Instr->getSrc(0);
+  Operand *ElementToInsertNotLegalized = Instr->getSrc(1);
+  auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(2));
+  // Only constant indices are allowed in PNaCl IR.
+  assert(ElementIndex);
+  unsigned Index = ElementIndex->getValue();
+  assert(Index < typeNumElements(SourceVectNotLegalized->getType()));
+
+  Type Ty = SourceVectNotLegalized->getType();
+  Type ElementTy = typeElementType(Ty);
+  Type InVectorElementTy = Traits::getInVectorElementType(Ty);
+
+  if (ElementTy == IceType_i1) {
+    // Expand the element to the appropriate size for it to be inserted in the
+    // vector.
+    Variable *Expanded = Func->makeVariable(InVectorElementTy);
+    auto *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
+                                  ElementToInsertNotLegalized);
+    lowerCast(Cast);
+    ElementToInsertNotLegalized = Expanded;
+  }
+
+  if (Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
+      InstructionSet >= SSE4_1) {
+    // Use insertps, pinsrb, pinsrw, or pinsrd.
+    Operand *ElementRM =
+        legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
+    Operand *SourceVectRM =
+        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
+    Variable *T = makeReg(Ty);
+    _movp(T, SourceVectRM);
+    if (Ty == IceType_v4f32) {
+      _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
+    } else {
+      // For the pinsrb and pinsrw instructions, when the source operand is a
+      // register, it must be a full r32 register like eax, and not ax/al/ah.
+      // For filetype=asm, InstX86Pinsr::emit() compensates for the use of r16
+      // and r8 by converting them through getBaseReg(), while emitIAS()
+      // validates that the original and base register encodings are the same.
+      if (ElementRM->getType() == IceType_i8 &&
+          llvm::isa<Variable>(ElementRM)) {
+        // Don't use ah/bh/ch/dh for pinsrb.
+        ElementRM = copyToReg8(ElementRM);
+      }
+      _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
+    }
+    _movp(Instr->getDest(), T);
+  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
+    // Use shufps or movss.
+    Variable *ElementR = nullptr;
+    Operand *SourceVectRM =
+        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
+
+    if (InVectorElementTy == IceType_f32) {
+      // ElementR will be in an XMM register since it is floating point.
+      ElementR = legalizeToReg(ElementToInsertNotLegalized);
+    } else {
+      // Copy an integer to an XMM register.
+      Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
+      ElementR = makeReg(Ty);
+      _movd(ElementR, T);
+    }
+
+    if (Index == 0) {
+      Variable *T = makeReg(Ty);
+      _movp(T, SourceVectRM);
+      _movss(T, ElementR);
+      _movp(Instr->getDest(), T);
+      return;
+    }
+
+    // shufps treats the source and destination operands as vectors of four
+    // doublewords. The destination's two high doublewords are selected from
+    // the source operand and the two low doublewords are selected from the
+    // (original value of the) destination operand. An insertelement operation
+    // can be effected with a sequence of two shufps operations with
+    // appropriate masks. In all cases below, Element[0] is being inserted into
+    // SourceVectOperand. Indices are ordered from left to right.
+    //
+    // insertelement into index 1 (result is stored in ElementR):
+    //   ElementR := ElementR[0, 0] SourceVectRM[0, 0]
+    //   ElementR := ElementR[3, 0] SourceVectRM[2, 3]
+    //
+    // insertelement into index 2 (result is stored in T):
+    //   T := SourceVectRM
+    //   ElementR := ElementR[0, 0] T[0, 3]
+    //   T := T[0, 1] ElementR[0, 3]
+    //
+    // insertelement into index 3 (result is stored in T):
+    //   T := SourceVectRM
+    //   ElementR := ElementR[0, 0] T[0, 2]
+    //   T := T[0, 1] ElementR[3, 0]
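+    //
+    // For reference, the shufps immediate encodes four 2-bit lane selectors:
+    // bits [1:0] and [3:2] pick the destination's low two doublewords from
+    // the destination itself, and bits [5:4] and [7:6] pick the high two from
+    // the source. E.g. Mask2[0] = 227 = 0b11100011 selects dest[3], dest[0],
+    // src[2], src[3], matching the "index 1" sequence above.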
+    const unsigned char Mask1[3] = {0, 192, 128};
+    const unsigned char Mask2[3] = {227, 196, 52};
+
+    Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
+    Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);
+
+    if (Index == 1) {
+      _shufps(ElementR, SourceVectRM, Mask1Constant);
+      _shufps(ElementR, SourceVectRM, Mask2Constant);
+      _movp(Instr->getDest(), ElementR);
+    } else {
+      Variable *T = makeReg(Ty);
+      _movp(T, SourceVectRM);
+      _shufps(ElementR, T, Mask1Constant);
+      _shufps(T, ElementR, Mask2Constant);
+      _movp(Instr->getDest(), T);
+    }
+  } else {
+    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
+    // Spill the value to a stack slot and perform the insertion in memory.
+    //
+    // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
+    // for legalizing to mem is implemented.
+    Variable *Slot = Func->makeVariable(Ty);
+    Slot->setMustNotHaveReg();
+    _movp(Slot, legalizeToReg(SourceVectNotLegalized));
+
+    // Compute the location of the position to insert in memory.
+    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
+    X86OperandMem *Loc =
+        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
+    _store(legalizeToReg(ElementToInsertNotLegalized), Loc);
+
+    Variable *T = makeReg(Ty);
+    _movp(T, Slot);
+    _movp(Instr->getDest(), T);
+  }
+}
+
+void TargetX8664::lowerIntrinsic(const InstIntrinsic *Instr) {
+  switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicID()) {
+  case Intrinsics::AtomicCmpxchg: {
+    if (!Intrinsics::isMemoryOrderValid(
+            ID, getConstantMemoryOrder(Instr->getArg(3)),
+            getConstantMemoryOrder(Instr->getArg(4)))) {
+      Func->setError("Unexpected memory ordering for AtomicCmpxchg");
+      return;
+    }
+    Variable *DestPrev = Instr->getDest();
+    Operand *PtrToMem = legalize(Instr->getArg(0));
+    Operand *Expected = legalize(Instr->getArg(1));
+    Operand *Desired = legalize(Instr->getArg(2));
+    if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
+      return;
+    lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
+    return;
+  }
+  case Intrinsics::AtomicFence:
+    if (!Intrinsics::isMemoryOrderValid(
+            ID, getConstantMemoryOrder(Instr->getArg(0)))) {
+      Func->setError("Unexpected memory ordering for AtomicFence");
+      return;
+    }
+    _mfence();
+    return;
+  case Intrinsics::AtomicFenceAll:
+    // NOTE: FenceAll should prevent any load/store from being moved across
+    // the fence (both atomic and non-atomic). The InstX86Mfence instruction
+    // is currently marked coarsely as "HasSideEffects".
+    _mfence();
+    return;
+  case Intrinsics::AtomicIsLockFree: {
+    // X86 is always lock-free for 8/16/32/64-bit accesses.
+    // TODO(jvoung): Since the result is constant when given a constant byte
+    // size, this opens up DCE opportunities.
+    Operand *ByteSize = Instr->getArg(0);
+    Variable *Dest = Instr->getDest();
+    if (auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
+      Constant *Result;
+      switch (CI->getValue()) {
+      default:
+        // Some x86-64 processors support the cmpxchg16b instruction, which can
+        // make 16-byte operations lock free (when used with the LOCK prefix).
+        // However, that's not supported in 32-bit mode, so just return 0 even
+        // for large sizes.
+        Result = Ctx->getConstantZero(IceType_i32);
+        break;
+      case 1:
+      case 2:
+      case 4:
+      case 8:
+        Result = Ctx->getConstantInt32(1);
+        break;
+      }
+      _mov(Dest, Result);
+      return;
+    }
+    // The PNaCl ABI requires the byte size to be a compile-time constant.
+    Func->setError("AtomicIsLockFree byte size should be compile-time const");
+    return;
+  }
+  case Intrinsics::AtomicLoad: {
+    // We require the memory address to be naturally aligned. Given that,
+    // normal loads are atomic.
+    if (!Intrinsics::isMemoryOrderValid(
+            ID, getConstantMemoryOrder(Instr->getArg(1)))) {
+      Func->setError("Unexpected memory ordering for AtomicLoad");
+      return;
+    }
+    Variable *Dest = Instr->getDest();
+    if (!Traits::Is64Bit) {
+      if (auto *Dest64On32 = llvm::dyn_cast<Variable64On32>(Dest)) {
+        // Follow what GCC does and use a movq instead of what lowerLoad()
+        // normally does (split the load into two). Thus, this skips
+        // load/arithmetic op folding. Load/arithmetic folding can't happen
+        // anyway, since this is x86-32 and integer arithmetic only happens on
+        // 32-bit quantities.
+        Variable *T = makeReg(IceType_f64);
+        X86OperandMem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64);
+        _movq(T, Addr);
+        // Then cast the bits back out of the XMM register to the i64 Dest.
+        auto *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
+        lowerCast(Cast);
+        // Make sure that the atomic load isn't elided when unused.
+        Context.insert<InstFakeUse>(Dest64On32->getLo());
+        Context.insert<InstFakeUse>(Dest64On32->getHi());
+        return;
+      }
+    }
+    auto *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
+    lowerLoad(Load);
+    // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
+    // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert
+    // the FakeUse on the last-inserted instruction's dest.
+    Context.insert<InstFakeUse>(Context.getLastInserted()->getDest());
+    return;
+  }
+  case Intrinsics::AtomicRMW:
+    if (!Intrinsics::isMemoryOrderValid(
+            ID, getConstantMemoryOrder(Instr->getArg(3)))) {
+      Func->setError("Unexpected memory ordering for AtomicRMW");
+      return;
+    }
+    lowerAtomicRMW(
+        Instr->getDest(),
+        static_cast<uint32_t>(
+            llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
+        Instr->getArg(1), Instr->getArg(2));
+    return;
+  case Intrinsics::AtomicStore: {
+    if (!Intrinsics::isMemoryOrderValid(
+            ID, getConstantMemoryOrder(Instr->getArg(2)))) {
+      Func->setError("Unexpected memory ordering for AtomicStore");
+      return;
+    }
+    // We require the memory address to be naturally aligned. Given that,
+    // normal stores are atomic. Add a fence after the store to make it
+    // visible.
+    Operand *Value = Instr->getArg(0);
+    Operand *Ptr = Instr->getArg(1);
+    if (!Traits::Is64Bit && Value->getType() == IceType_i64) {
+      // Use a movq instead of what lowerStore() normally does (split the
+      // store into two), following what GCC does. Cast the bits from the i64
+      // into an XMM register first.
+      Variable *T = makeReg(IceType_f64);
+      auto *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value);
+      lowerCast(Cast);
+      // Then store XMM w/ a movq.
+      X86OperandMem *Addr = formMemoryOperand(Ptr, IceType_f64);
+      _storeq(T, Addr);
+      _mfence();
+      return;
+    }
+    auto *Store = InstStore::create(Func, Value, Ptr);
+    lowerStore(Store);
+    _mfence();
+    return;
+  }
+  case Intrinsics::Bswap: {
+    Variable *Dest = Instr->getDest();
+    Operand *Val = Instr->getArg(0);
+    // In 32-bit mode, bswap only works on 32-bit arguments, and the argument
+    // must be a register. Use rotate left for 16-bit bswap.
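+    // For example, a 16-bit bswap of %ax is simply "rol ax, 8", which swaps
+    // its two bytes.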
+    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
+      Val = legalizeUndef(Val);
+      Variable *T_Lo = legalizeToReg(loOperand(Val));
+      Variable *T_Hi = legalizeToReg(hiOperand(Val));
+      auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+      auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      _bswap(T_Lo);
+      _bswap(T_Hi);
+      _mov(DestLo, T_Hi);
+      _mov(DestHi, T_Lo);
+    } else if ((Traits::Is64Bit && Val->getType() == IceType_i64) ||
+               Val->getType() == IceType_i32) {
+      Variable *T = legalizeToReg(Val);
+      _bswap(T);
+      _mov(Dest, T);
+    } else {
+      assert(Val->getType() == IceType_i16);
+      Constant *Eight = Ctx->getConstantInt16(8);
+      Variable *T = nullptr;
+      Val = legalize(Val);
+      _mov(T, Val);
+      _rol(T, Eight);
+      _mov(Dest, T);
+    }
+    return;
+  }
+  case Intrinsics::Ctpop: {
+    Variable *Dest = Instr->getDest();
+    Variable *T = nullptr;
+    Operand *Val = Instr->getArg(0);
+    Type ValTy = Val->getType();
+    assert(ValTy == IceType_i32 || ValTy == IceType_i64);
+
+    if (!Traits::Is64Bit) {
+      T = Dest;
+    } else {
+      T = makeReg(IceType_i64);
+      if (ValTy == IceType_i32) {
+        // In x86-64, __popcountsi2 is not defined, so we cheat a bit by
+        // converting the value to 64 bits and using ctpop_i64. _movzx should
+        // ensure we will not have any bits set in Val's upper 32 bits.
+        Variable *V = makeReg(IceType_i64);
+        Operand *ValRM = legalize(Val, Legal_Reg | Legal_Mem);
+        _movzx(V, ValRM);
+        Val = V;
+      }
+      ValTy = IceType_i64;
+    }
+
+    InstCall *Call =
+        makeHelperCall(ValTy == IceType_i32 ? RuntimeHelper::H_call_ctpop_i32
+                                            : RuntimeHelper::H_call_ctpop_i64,
+                       T, 1);
+    Call->addArg(Val);
+    lowerCall(Call);
+    // The popcount helpers always return 32-bit values, while the intrinsic's
+    // signature matches the native POPCNT instruction and fills a 64-bit reg
+    // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
+    // the user doesn't do that in the IR. If the user does that in the IR,
+    // then this zero'ing instruction is dead and gets optimized out.
+    if (!Traits::Is64Bit) {
+      assert(T == Dest);
+      if (Val->getType() == IceType_i64) {
+        auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+        Constant *Zero = Ctx->getConstantZero(IceType_i32);
+        _mov(DestHi, Zero);
+      }
+    } else {
+      assert(Val->getType() == IceType_i64);
+      // T is 64-bit and needs to be copied to Dest. We need to:
+      //
+      // T_1.32 = trunc T.64 to i32
+      // T_2.64 = zext T_1.32 to i64
+      // Dest.<<right_size>> = T_2.<<right_size>>
+      //
+      // which ensures the upper 32 bits will always be cleared. Just doing a
+      //
+      // mov Dest.32 = trunc T.32 to i32
+      //
+      // is dangerous because there's a chance the compiler will optimize this
+      // copy out. To use _movzx we need two new registers (one 32-bit and
+      // another 64-bit wide).
+      Variable *T_1 = makeReg(IceType_i32);
+      _mov(T_1, T);
+      Variable *T_2 = makeReg(IceType_i64);
+      _movzx(T_2, T_1);
+      _mov(Dest, T_2);
+    }
+    return;
+  }
+  case Intrinsics::Ctlz: {
+    // The "is zero undef" parameter is ignored and we always return a
+    // well-defined value.
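+    // (x86's bsr leaves its destination undefined when the input is zero, so
+    // the lowering must handle that case explicitly; see lowerCountZeros.)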
+    Operand *Val = legalize(Instr->getArg(0));
+    Operand *FirstVal;
+    Operand *SecondVal = nullptr;
+    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
+      FirstVal = loOperand(Val);
+      SecondVal = hiOperand(Val);
+    } else {
+      FirstVal = Val;
+    }
+    constexpr bool IsCttz = false;
+    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
+                    SecondVal);
+    return;
+  }
+  case Intrinsics::Cttz: {
+    // The "is zero undef" parameter is ignored and we always return a
+    // well-defined value.
+    Operand *Val = legalize(Instr->getArg(0));
+    Operand *FirstVal;
+    Operand *SecondVal = nullptr;
+    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
+      FirstVal = hiOperand(Val);
+      SecondVal = loOperand(Val);
+    } else {
+      FirstVal = Val;
+    }
+    constexpr bool IsCttz = true;
+    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
+                    SecondVal);
+    return;
+  }
+  case Intrinsics::Fabs: {
+    Operand *Src = legalize(Instr->getArg(0));
+    Type Ty = Src->getType();
+    Variable *Dest = Instr->getDest();
+    Variable *T = makeVectorOfFabsMask(Ty);
+    // The pand instruction operates on an m128 memory operand, so if Src is an
+    // f32 or f64, we need to make sure it's in a register.
+    if (isVectorType(Ty)) {
+      if (llvm::isa<X86OperandMem>(Src))
+        Src = legalizeToReg(Src);
+    } else {
+      Src = legalizeToReg(Src);
+    }
+    _pand(T, Src);
+    if (isVectorType(Ty))
+      _movp(Dest, T);
+    else
+      _mov(Dest, T);
+    return;
+  }
+  case Intrinsics::Longjmp: {
+    InstCall *Call = makeHelperCall(RuntimeHelper::H_call_longjmp, nullptr, 2);
+    Call->addArg(Instr->getArg(0));
+    Call->addArg(Instr->getArg(1));
+    lowerCall(Call);
+    return;
+  }
+  case Intrinsics::Memcpy: {
+    lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
+    return;
+  }
+  case Intrinsics::Memmove: {
+    lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
+    return;
+  }
+  case Intrinsics::Memset: {
+    lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
+    return;
+  }
+  case Intrinsics::Setjmp: {
+    InstCall *Call =
+        makeHelperCall(RuntimeHelper::H_call_setjmp, Instr->getDest(), 1);
+    Call->addArg(Instr->getArg(0));
+    lowerCall(Call);
+    return;
+  }
+  case Intrinsics::Sqrt: {
+    Operand *Src = legalize(Instr->getArg(0));
+    Variable *Dest = Instr->getDest();
+    Variable *T = makeReg(Dest->getType());
+    _sqrt(T, Src);
+    if (isVectorType(Dest->getType())) {
+      _movp(Dest, T);
+    } else {
+      _mov(Dest, T);
+    }
+    return;
+  }
+  case Intrinsics::Stacksave: {
+    Variable *rsp =
+        Func->getTarget()->getPhysicalRegister(getStackReg(), Traits::WordType);
+    Variable *Dest = Instr->getDest();
+    _mov(Dest, rsp);
+    return;
+  }
+  case Intrinsics::Stackrestore: {
+    Operand *Src = Instr->getArg(0);
+    _mov_sp(Src);
+    return;
+  }
+
+  case Intrinsics::Trap:
+    _ud2();
+    return;
+  case Intrinsics::LoadSubVector: {
+    assert(llvm::isa<ConstantInteger32>(Instr->getArg(1)) &&
+           "LoadSubVector second argument must be a constant");
+    Variable *Dest = Instr->getDest();
+    Type Ty = Dest->getType();
+    auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(1));
+    Operand *Addr = Instr->getArg(0);
+    X86OperandMem *Src = formMemoryOperand(Addr, Ty);
+    doMockBoundsCheck(Src);
+
+    if (Dest->isRematerializable()) {
+      Context.insert<InstFakeDef>(Dest);
+      return;
+    }
+
+    auto *T = makeReg(Ty);
+    switch (SubVectorSize->getValue()) {
+    case 4:
+      _movd(T, Src);
+      break;
+    case 8:
+      _movq(T, Src);
+      break;
+    default:
+      Func->setError("Unexpected size for LoadSubVector");
+      return;
+    }
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::StoreSubVector: {
+    assert(llvm::isa<ConstantInteger32>(Instr->getArg(2)) &&
+           "StoreSubVector third argument must be a constant");
+    auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(2));
+    Operand *Value = Instr->getArg(0);
+    Operand *Addr = Instr->getArg(1);
+    X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
+    doMockBoundsCheck(NewAddr);
+
+    Value = legalizeToReg(Value);
+
+    switch (SubVectorSize->getValue()) {
+    case 4:
+      _stored(Value, NewAddr);
+      break;
+    case 8:
+      _storeq(Value, NewAddr);
+      break;
+    default:
+      Func->setError("Unexpected size for StoreSubVector");
+      return;
+    }
+    return;
+  }
+  case Intrinsics::VectorPackSigned: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Src0->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _packss(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::VectorPackUnsigned: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Src0->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _packus(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::SignMask: {
+    Operand *SrcReg = legalizeToReg(Instr->getArg(0));
+    Variable *Dest = Instr->getDest();
+    Variable *T = makeReg(IceType_i32);
+    if (SrcReg->getType() == IceType_v4f32 ||
+        SrcReg->getType() == IceType_v4i32 ||
+        SrcReg->getType() == IceType_v16i8) {
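+      // movmskps/pmovmskb copy each lane's sign bit into the low bits of a
+      // GPR, which is exactly the sign mask.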
+      _movmsk(T, SrcReg);
+    } else {
+      // TODO(capn): We could implement v8i16 sign mask using packsswb/pmovmskb
+      llvm::report_fatal_error("Invalid type for SignMask intrinsic");
+    }
+    _mov(Dest, T);
+    return;
+  }
+  case Intrinsics::MultiplyHighSigned: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Dest->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _pmulhw(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::MultiplyHighUnsigned: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Dest->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _pmulhuw(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::MultiplyAddPairs: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Dest->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _pmaddwd(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::AddSaturateSigned: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Dest->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _padds(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::SubtractSaturateSigned: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Dest->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _psubs(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::AddSaturateUnsigned: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Dest->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _paddus(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::SubtractSaturateUnsigned: {
+    Operand *Src0 = Instr->getArg(0);
+    Operand *Src1 = Instr->getArg(1);
+    Variable *Dest = Instr->getDest();
+    auto *T = makeReg(Dest->getType());
+    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T, Src0RM);
+    _psubus(T, Src1RM);
+    _movp(Dest, T);
+    return;
+  }
+  case Intrinsics::Nearbyint: {
+    Operand *Src = Instr->getArg(0);
+    Variable *Dest = Instr->getDest();
+    Type DestTy = Dest->getType();
+    if (isVectorType(DestTy)) {
+      assert(DestTy == IceType_v4i32);
+      assert(Src->getType() == IceType_v4f32);
+      Operand *Src0R = legalizeToReg(Src);
+      Variable *T = makeReg(DestTy);
+      _cvt(T, Src0R, Insts::Cvt::Ps2dq);
+      _movp(Dest, T);
+    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
+      llvm::report_fatal_error("Helper call was expected");
+    } else {
+      Operand *Src0RM = legalize(Src, Legal_Reg | Legal_Mem);
+      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && DestTy == IceType_i64) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(DestTy != IceType_i64);
+        T_1 = makeReg(IceType_i32);
+      }
+      // cvt() requires its integer argument to be a GPR.
+      Variable *T_2 = makeReg(DestTy);
+      if (isByteSizedType(DestTy)) {
+        assert(T_1->getType() == IceType_i32);
+        T_1->setRegClass(RCX86_Is32To8);
+        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
+      }
+      _cvt(T_1, Src0RM, Insts::Cvt::Ss2si);
+      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
+      if (DestTy == IceType_i1)
+        _and(T_2, Ctx->getConstantInt1(1));
+      _mov(Dest, T_2);
+    }
+    return;
+  }
+  case Intrinsics::Round: {
+    assert(InstructionSet >= SSE4_1);
+    Variable *Dest = Instr->getDest();
+    Operand *Src = Instr->getArg(0);
+    Operand *Mode = Instr->getArg(1);
+    assert(llvm::isa<ConstantInteger32>(Mode) &&
+           "Round last argument must be a constant");
+    auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
+    int32_t Imm = llvm::cast<ConstantInteger32>(Mode)->getValue();
+    (void)Imm;
+    assert(Imm >= 0 && Imm < 4 && "Invalid rounding mode");
+    auto *T = makeReg(Dest->getType());
+    _round(T, SrcRM, Mode);
+    _movp(Dest, T);
+    return;
+  }
+  default: // UnknownIntrinsic
+    Func->setError("Unexpected intrinsic");
+    return;
+  }
+  return;
+}
+
+void TargetX8664::lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr,
+                                     Operand *Expected, Operand *Desired) {
+  Type Ty = Expected->getType();
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
+    // Reserve the pre-colored registers first, before adding any more
+    // infinite-weight variables from formMemoryOperand's legalization.
+    Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
+    Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
+    Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
+    Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
+    _mov(T_eax, loOperand(Expected));
+    _mov(T_edx, hiOperand(Expected));
+    _mov(T_ebx, loOperand(Desired));
+    _mov(T_ecx, hiOperand(Desired));
+    X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
+    constexpr bool Locked = true;
+    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
+    auto *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
+    auto *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
+    _mov(DestLo, T_eax);
+    _mov(DestHi, T_edx);
+    return;
+  }
+  RegNumT Eax;
+  switch (Ty) {
+  default:
+    llvm::report_fatal_error("Bad type for cmpxchg");
+  case IceType_i64:
+    Eax = Traits::getRaxOrDie();
+    break;
+  case IceType_i32:
+    Eax = Traits::RegisterSet::Reg_eax;
+    break;
+  case IceType_i16:
+    Eax = Traits::RegisterSet::Reg_ax;
+    break;
+  case IceType_i8:
+    Eax = Traits::RegisterSet::Reg_al;
+    break;
+  }
+  Variable *T_eax = makeReg(Ty, Eax);
+  _mov(T_eax, Expected);
+  X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
+  Variable *DesiredReg = legalizeToReg(Desired);
+  constexpr bool Locked = true;
+  _cmpxchg(Addr, T_eax, DesiredReg, Locked);
+  _mov(DestPrev, T_eax);
+}
+
+bool TargetX8664::tryOptimizedCmpxchgCmpBr(Variable *Dest, Operand *PtrToMem,
+                                           Operand *Expected,
+                                           Operand *Desired) {
+  if (Func->getOptLevel() == Opt_m1)
+    return false;
+  // Peek ahead a few instructions and see how Dest is used.
+  // It's very common to have:
+  //
+  // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
+  // [%y_phi = ...] // list of phi stores
+  // %p = icmp eq i32 %x, %expected
+  // br i1 %p, label %l1, label %l2
+  //
+  // which we can optimize into:
+  //
+  // %x = <cmpxchg code>
+  // [%y_phi = ...] // list of phi stores
+  // br eq, %l1, %l2
+  InstList::iterator I = Context.getCur();
+  // I is currently the InstIntrinsic. Peek past that.
+  // This assumes that the atomic cmpxchg has not been lowered yet,
+  // so that the instructions seen in the scan from "Cur" are simple.
+  assert(llvm::isa<InstIntrinsic>(*I));
+  Inst *NextInst = Context.getNextInst(I);
+  if (!NextInst)
+    return false;
+  // There might be phi assignments right before the compare+branch, since this
+  // could be a backward branch for a loop. This placement of assignments is
+  // determined by placePhiStores().
+  CfgVector<InstAssign *> PhiAssigns;
+  while (auto *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
+    if (PhiAssign->getDest() == Dest)
+      return false;
+    PhiAssigns.push_back(PhiAssign);
+    NextInst = Context.getNextInst(I);
+    if (!NextInst)
+      return false;
+  }
+  if (auto *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
+    if (!(NextCmp->getCondition() == InstIcmp::Eq &&
+          ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
+           (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
+      return false;
+    }
+    NextInst = Context.getNextInst(I);
+    if (!NextInst)
+      return false;
+    if (auto *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
+      if (!NextBr->isUnconditional() &&
+          NextCmp->getDest() == NextBr->getCondition() &&
+          NextBr->isLastUse(NextCmp->getDest())) {
+        lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
+        for (size_t i = 0; i < PhiAssigns.size(); ++i) {
+          // Lower the phi assignments now, before the branch (same placement
+          // as before).
+          InstAssign *PhiAssign = PhiAssigns[i];
+          PhiAssign->setDeleted();
+          lowerAssign(PhiAssign);
+          Context.advanceNext();
+        }
+        _br(CondX86::Br_e, NextBr->getTargetTrue(), NextBr->getTargetFalse());
+        // Skip over the old compare and branch by deleting them.
+        NextCmp->setDeleted();
+        NextBr->setDeleted();
+        Context.advanceNext();
+        Context.advanceNext();
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+void TargetX8664::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
+                                 Operand *Ptr, Operand *Val) {
+  bool NeedsCmpxchg = false;
+  LowerBinOp Op_Lo = nullptr;
+  LowerBinOp Op_Hi = nullptr;
+  switch (Operation) {
+  default:
+    Func->setError("Unknown AtomicRMW operation");
+    return;
+  case Intrinsics::AtomicAdd: {
+    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+      // All the fall-through paths must set this to true; the variable
+      // itself is only used for asserting.
+      NeedsCmpxchg = true;
+      Op_Lo = &TargetX8664::_add;
+      Op_Hi = &TargetX8664::_adc;
+      break;
+    }
+    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
+    constexpr bool Locked = true;
+    Variable *T = nullptr;
+    _mov(T, Val);
+    _xadd(Addr, T, Locked);
+    _mov(Dest, T);
+    return;
+  }
+  case Intrinsics::AtomicSub: {
+    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+      NeedsCmpxchg = true;
+      Op_Lo = &TargetX8664::_sub;
+      Op_Hi = &TargetX8664::_sbb;
+      break;
+    }
+    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
+    constexpr bool Locked = true;
+    Variable *T = nullptr;
+    _mov(T, Val);
+    _neg(T);
+    _xadd(Addr, T, Locked);
+    _mov(Dest, T);
+    return;
+  }
+  case Intrinsics::AtomicOr:
+    // TODO(jvoung): If Dest is null or dead, then some of these
+    // operations do not need an "exchange", but just a locked op.
+    // That appears to be "worth" it for sub/or/and/xor.
+    // xadd is probably fine vs lock add for add, and xchg is fine
+    // vs an atomic store.
+    NeedsCmpxchg = true;
+    Op_Lo = &TargetX8664::_or;
+    Op_Hi = &TargetX8664::_or;
+    break;
+  case Intrinsics::AtomicAnd:
+    NeedsCmpxchg = true;
+    Op_Lo = &TargetX8664::_and;
+    Op_Hi = &TargetX8664::_and;
+    break;
+  case Intrinsics::AtomicXor:
+    NeedsCmpxchg = true;
+    Op_Lo = &TargetX8664::_xor;
+    Op_Hi = &TargetX8664::_xor;
+    break;
+  case Intrinsics::AtomicExchange:
+    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+      NeedsCmpxchg = true;
+      // A cmpxchg loop is still needed, but there are no real Op_Lo/Op_Hi
+      // operations; the values just need to be moved to the ecx and ebx
+      // registers.
+      Op_Lo = nullptr;
+      Op_Hi = nullptr;
+      break;
+    }
+    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
+    Variable *T = nullptr;
+    _mov(T, Val);
+    _xchg(Addr, T);
+    _mov(Dest, T);
+    return;
+  }
+  // Otherwise, we need a cmpxchg loop.
+  (void)NeedsCmpxchg;
+  assert(NeedsCmpxchg);
+  expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
+}
+
+void TargetX8664::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo, LowerBinOp Op_Hi,
+                                           Variable *Dest, Operand *Ptr,
+                                           Operand *Val) {
+  // Expand a more complex RMW operation as a cmpxchg loop:
+  // For 64-bit:
+  //   mov     eax, [ptr]
+  //   mov     edx, [ptr + 4]
+  // .LABEL:
+  //   mov     ebx, eax
+  //   <Op_Lo> ebx, <desired_adj_lo>
+  //   mov     ecx, edx
+  //   <Op_Hi> ecx, <desired_adj_hi>
+  //   lock cmpxchg8b [ptr]
+  //   jne     .LABEL
+  //   mov     <dest_lo>, eax
+  //   mov     <dest_hi>, edx
+  //
+  // For 32-bit:
+  //   mov     eax, [ptr]
+  // .LABEL:
+  //   mov     <reg>, eax
+  //   op      <reg>, [desired_adj]
+  //   lock cmpxchg [ptr], <reg>
+  //   jne     .LABEL
+  //   mov     <dest>, eax
+  //
+  // If Op_{Lo,Hi} are nullptr, then just copy the value.
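+  // E.g., for a 64-bit atomic add, <Op_Lo> is add and <Op_Hi> is adc, so
+  // each loop iteration computes the full 64-bit sum into ecx:ebx
+  // (high:low) before attempting the lock cmpxchg8b.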
+  Val = legalize(Val);
+  Type Ty = Val->getType();
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
+    Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
+    Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
+    X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
+    _mov(T_eax, loOperand(Addr));
+    _mov(T_edx, hiOperand(Addr));
+    Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
+    Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
+    InstX86Label *Label = InstX86Label::create(Func, this);
+    const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr;
+    if (!IsXchg8b) {
+      Context.insert(Label);
+      _mov(T_ebx, T_eax);
+      (this->*Op_Lo)(T_ebx, loOperand(Val));
+      _mov(T_ecx, T_edx);
+      (this->*Op_Hi)(T_ecx, hiOperand(Val));
+    } else {
+      // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
+      // It just needs the Val loaded into ebx and ecx.
+      // That can also be done before the loop.
+      _mov(T_ebx, loOperand(Val));
+      _mov(T_ecx, hiOperand(Val));
+      Context.insert(Label);
+    }
+    constexpr bool Locked = true;
+    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
+    _br(CondX86::Br_ne, Label);
+    if (!IsXchg8b) {
+      // If Val is a variable, model the extended live range of Val through
+      // the end of the loop, since it will be re-used by the loop.
+      if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
+        auto *ValLo = llvm::cast<Variable>(loOperand(ValVar));
+        auto *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
+        Context.insert<InstFakeUse>(ValLo);
+        Context.insert<InstFakeUse>(ValHi);
+      }
+    } else {
+      // For xchg, the loop is slightly smaller and ebx/ecx are used.
+      Context.insert<InstFakeUse>(T_ebx);
+      Context.insert<InstFakeUse>(T_ecx);
+    }
+    // The address base (if any) is also reused in the loop.
+    if (Variable *Base = Addr->getBase())
+      Context.insert<InstFakeUse>(Base);
+    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    _mov(DestLo, T_eax);
+    _mov(DestHi, T_edx);
+    return;
+  }
+  X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
+  RegNumT Eax;
+  switch (Ty) {
+  default:
+    llvm::report_fatal_error("Bad type for atomicRMW");
+  case IceType_i64:
+    Eax = Traits::getRaxOrDie();
+    break;
+  case IceType_i32:
+    Eax = Traits::RegisterSet::Reg_eax;
+    break;
+  case IceType_i16:
+    Eax = Traits::RegisterSet::Reg_ax;
+    break;
+  case IceType_i8:
+    Eax = Traits::RegisterSet::Reg_al;
+    break;
+  }
+  Variable *T_eax = makeReg(Ty, Eax);
+  _mov(T_eax, Addr);
+  auto *Label = Context.insert<InstX86Label>(this);
+  // We want to pick a different register for T than Eax, so don't use
+  // _mov(T == nullptr, T_eax).
+  Variable *T = makeReg(Ty);
+  _mov(T, T_eax);
+  (this->*Op_Lo)(T, Val);
+  constexpr bool Locked = true;
+  _cmpxchg(Addr, T_eax, T, Locked);
+  _br(CondX86::Br_ne, Label);
+  // If Val is a variable, model the extended live range of Val through
+  // the end of the loop, since it will be re-used by the loop.
+  if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
+    Context.insert<InstFakeUse>(ValVar);
+  }
+  // The address base (if any) is also reused in the loop.
+  if (Variable *Base = Addr->getBase())
+    Context.insert<InstFakeUse>(Base);
+  _mov(Dest, T_eax);
+}
+
+/// Lowers count {trailing, leading} zeros intrinsic.
+///
+/// We could do constant folding here, but that should have
+/// been done by the front-end/middle-end optimizations.
+void TargetX8664::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
+                                  Operand *FirstVal, Operand *SecondVal) {
+  // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
+  // Then the instructions will handle the Val == 0 case much more simply
+  // and won't require conversion from bit position to number of zeros.
+  //
+  // Otherwise:
+  //   bsr IF_NOT_ZERO, Val
+  //   mov T_DEST, ((Ty == i32) ? 63 : 127)
+  //   cmovne T_DEST, IF_NOT_ZERO
+  //   xor T_DEST, ((Ty == i32) ? 31 : 63)
+  //   mov DEST, T_DEST
+  //
+  // NOTE: T_DEST must be a register because cmov requires its dest to be a
+  // register. Also, bsf and bsr require their dest to be a register.
+  //
+  // The xor DEST, C(31|63) converts a bit position to # of leading zeroes.
+  // E.g., for 000... 00001100, bsr will say that the most significant bit
+  // set is at position 3, while the number of leading zeros is 28. Xor is
+  // like (M - N) for N <= M, and converts 63 to 32, and 127 to 64 (for the
+  // all-zeros case).
+  //
+  // X8632 only: Similar for 64-bit, but start by speculating that the upper 32
+  // bits are all zero, and compute the result for that case (checking the
+  // lower 32 bits). Then actually compute the result for the upper bits and
+  // cmov in the result from the lower computation if the earlier speculation
+  // was correct.
+  //
+  // Cttz is similar, but uses bsf instead, doesn't require the xor
+  // bit-position conversion, and reverses the speculation.
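+  //
+  // For example (i32 ctlz): Val = 0x00001000 has bit 12 as its highest set
+  // bit, so bsr yields 12 and 12 xor 31 = 19, the number of leading zeros.
+  // For cttz, bsf on the same value yields 12, which is already the count
+  // of trailing zeros.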
+
+  // TODO(jpp): refactor this method.
+  assert(Ty == IceType_i32 || Ty == IceType_i64);
+  const Type DestTy = Traits::Is64Bit ? Dest->getType() : IceType_i32;
+  Variable *T = makeReg(DestTy);
+  Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
+  if (Cttz) {
+    _bsf(T, FirstValRM);
+  } else {
+    _bsr(T, FirstValRM);
+  }
+  Variable *T_Dest = makeReg(DestTy);
+  Constant *_31 = Ctx->getConstantInt32(31);
+  Constant *_32 = Ctx->getConstantInt(DestTy, 32);
+  Constant *_63 = Ctx->getConstantInt(DestTy, 63);
+  Constant *_64 = Ctx->getConstantInt(DestTy, 64);
+  if (Cttz) {
+    if (DestTy == IceType_i64) {
+      _mov(T_Dest, _64);
+    } else {
+      _mov(T_Dest, _32);
+    }
+  } else {
+    Constant *_127 = Ctx->getConstantInt(DestTy, 127);
+    if (DestTy == IceType_i64) {
+      _mov(T_Dest, _127);
+    } else {
+      _mov(T_Dest, _63);
+    }
+  }
+  _cmov(T_Dest, T, CondX86::Br_ne);
+  if (!Cttz) {
+    if (DestTy == IceType_i64) {
+      // Even though there's a _63 available at this point, that constant might
+      // not be an i32, which will cause the xor emission to fail.
+      Constant *_63 = Ctx->getConstantInt32(63);
+      _xor(T_Dest, _63);
+    } else {
+      _xor(T_Dest, _31);
+    }
+  }
+  if (Traits::Is64Bit || Ty == IceType_i32) {
+    _mov(Dest, T_Dest);
+    return;
+  }
+  _add(T_Dest, _32);
+  auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+  auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+  // Will be using "test" on this, so we need a registerized variable.
+  Variable *SecondVar = legalizeToReg(SecondVal);
+  Variable *T_Dest2 = makeReg(IceType_i32);
+  if (Cttz) {
+    _bsf(T_Dest2, SecondVar);
+  } else {
+    _bsr(T_Dest2, SecondVar);
+    _xor(T_Dest2, _31);
+  }
+  _test(SecondVar, SecondVar);
+  _cmov(T_Dest2, T_Dest, CondX86::Br_e);
+  _mov(DestLo, T_Dest2);
+  _mov(DestHi, Ctx->getConstantZero(IceType_i32));
+}
+
+void TargetX8664::typedLoad(Type Ty, Variable *Dest, Variable *Base,
+                            Constant *Offset) {
+  // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
+  // legalize Mem properly.
+  if (Offset)
+    assert(!llvm::isa<ConstantRelocatable>(Offset));
+
+  auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
+
+  if (isVectorType(Ty))
+    _movp(Dest, Mem);
+  else if (Ty == IceType_f64)
+    _movq(Dest, Mem);
+  else
+    _mov(Dest, Mem);
+}
+
+void TargetX8664::typedStore(Type Ty, Variable *Value, Variable *Base,
+                             Constant *Offset) {
+  // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
+  // legalize Mem properly.
+  if (Offset)
+    assert(!llvm::isa<ConstantRelocatable>(Offset));
+
+  auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
+
+  if (isVectorType(Ty))
+    _storep(Value, Mem);
+  else if (Ty == IceType_f64)
+    _storeq(Value, Mem);
+  else
+    _store(Value, Mem);
+}
+
+void TargetX8664::copyMemory(Type Ty, Variable *Dest, Variable *Src,
+                             int32_t OffsetAmt) {
+  Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
+  // TODO(ascull): this or add nullptr test to _movp, _movq
+  Variable *Data = makeReg(Ty);
+
+  typedLoad(Ty, Data, Src, Offset);
+  typedStore(Ty, Data, Dest, Offset);
+}
+
+void TargetX8664::lowerMemcpy(Operand *Dest, Operand *Src, Operand *Count) {
+  // There is a load and store for each chunk in the unroll
+  constexpr uint32_t BytesPerStorep = 16;
+
+  // Check if the operands are constants
+  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
+  const bool IsCountConst = CountConst != nullptr;
+  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
+
+  if (shouldOptimizeMemIntrins() && IsCountConst &&
+      CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) {
+    // Unlikely, but nothing to do if it does happen
+    if (CountValue == 0)
+      return;
+
+    Variable *SrcBase = legalizeToReg(Src);
+    Variable *DestBase = legalizeToReg(Dest);
+
+    // Find the largest type that can be used and use it as much as possible in
+    // reverse order. Then handle any remainder with overlapping copies. Since
+    // the remainder will be at the end, there will be reduced pressure on the
+    // memory unit as the accesses to the same memory are far apart.
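+    // For example, a 13-byte copy becomes one 8-byte chunk at offset 0 plus
+    // an overlapping 8-byte chunk at offset 5 (bytes 5..12).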
+    Type Ty = largestTypeInSize(CountValue);
+    uint32_t TyWidth = typeWidthInBytes(Ty);
+
+    uint32_t RemainingBytes = CountValue;
+    int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
+    while (RemainingBytes >= TyWidth) {
+      copyMemory(Ty, DestBase, SrcBase, Offset);
+      RemainingBytes -= TyWidth;
+      Offset -= TyWidth;
+    }
+
+    if (RemainingBytes == 0)
+      return;
+
+    // Lower the remaining bytes. Adjust to larger types in order to make use
+    // of overlaps in the copies.
+    Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
+    Offset = CountValue - typeWidthInBytes(LeftOverTy);
+    copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
+    return;
+  }
+
+  // Fall back on a function call
+  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memcpy, nullptr, 3);
+  Call->addArg(Dest);
+  Call->addArg(Src);
+  Call->addArg(Count);
+  lowerCall(Call);
+}
+
+void TargetX8664::lowerMemmove(Operand *Dest, Operand *Src, Operand *Count) {
+  // There is a load and store for each chunk in the unroll
+  constexpr uint32_t BytesPerStorep = 16;
+
+  // Check if the operands are constants
+  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
+  const bool IsCountConst = CountConst != nullptr;
+  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
+
+  if (shouldOptimizeMemIntrins() && IsCountConst &&
+      CountValue <= BytesPerStorep * Traits::MEMMOVE_UNROLL_LIMIT) {
+    // Unlikely, but nothing to do if it does happen
+    if (CountValue == 0)
+      return;
+
+    Variable *SrcBase = legalizeToReg(Src);
+    Variable *DestBase = legalizeToReg(Dest);
+
+    std::tuple<Type, Constant *, Variable *>
+        Moves[Traits::MEMMOVE_UNROLL_LIMIT];
+    Constant *Offset;
+    Variable *Reg;
+
+    // Copy the data into registers as the source and destination could overlap
+    // so make sure not to clobber the memory. This also means overlapping
+    // moves can be used as we are taking a safe snapshot of the memory.
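+    // For example, an 11-byte move loads 8 bytes at offset 0 and 4 bytes at
+    // offset 7 into registers, then performs both stores; even if source and
+    // destination overlap, nothing is clobbered before it has been read.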
+    Type Ty = largestTypeInSize(CountValue);
+    uint32_t TyWidth = typeWidthInBytes(Ty);
+
+    uint32_t RemainingBytes = CountValue;
+    int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
+    size_t N = 0;
+    while (RemainingBytes >= TyWidth) {
+      assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
+      Offset = Ctx->getConstantInt32(OffsetAmt);
+      Reg = makeReg(Ty);
+      typedLoad(Ty, Reg, SrcBase, Offset);
+      RemainingBytes -= TyWidth;
+      OffsetAmt -= TyWidth;
+      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
+    }
+
+    if (RemainingBytes != 0) {
+      // Lower the remaining bytes. Adjust to larger types in order to make use
+      // of overlaps in the copies.
+      assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
+      Ty = firstTypeThatFitsSize(RemainingBytes);
+      Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
+      Reg = makeReg(Ty);
+      typedLoad(Ty, Reg, SrcBase, Offset);
+      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
+    }
+
+    // Copy the data out into the destination memory
+    for (size_t i = 0; i < N; ++i) {
+      std::tie(Ty, Offset, Reg) = Moves[i];
+      typedStore(Ty, Reg, DestBase, Offset);
+    }
+
+    return;
+  }
+
+  // Fall back on a function call
+  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memmove, nullptr, 3);
+  Call->addArg(Dest);
+  Call->addArg(Src);
+  Call->addArg(Count);
+  lowerCall(Call);
+}
+
+void TargetX8664::lowerMemset(Operand *Dest, Operand *Val, Operand *Count) {
+  constexpr uint32_t BytesPerStorep = 16;
+  constexpr uint32_t BytesPerStoreq = 8;
+  constexpr uint32_t BytesPerStorei32 = 4;
+  assert(Val->getType() == IceType_i8);
+
+  // Check if the operands are constants
+  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
+  const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
+  const bool IsCountConst = CountConst != nullptr;
+  const bool IsValConst = ValConst != nullptr;
+  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
+  const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
+
+  // Unlikely, but nothing to do if it does happen
+  if (IsCountConst && CountValue == 0)
+    return;
+
+  // TODO(ascull): if the count is constant but val is not it would be possible
+  // to inline by spreading the value across 4 bytes and accessing subregs e.g.
+  // eax, ax and al.
+  if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
+    Variable *Base = nullptr;
+    Variable *VecReg = nullptr;
+    const uint32_t MaskValue = (ValValue & 0xff);
+    const uint32_t SpreadValue =
+        (MaskValue << 24) | (MaskValue << 16) | (MaskValue << 8) | MaskValue;
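+    // E.g., ValValue = 0xAB gives SpreadValue = 0xABABABAB.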
+
+    auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
+                                                        uint32_t OffsetAmt) {
+      assert(Base != nullptr);
+      Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
+
+      // TODO(ascull): is 64-bit better with vector or scalar movq?
+      auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
+      if (isVectorType(Ty)) {
+        assert(VecReg != nullptr);
+        _storep(VecReg, Mem);
+      } else if (Ty == IceType_f64) {
+        assert(VecReg != nullptr);
+        _storeq(VecReg, Mem);
+      } else {
+        assert(Ty != IceType_i64);
+        _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
+      }
+    };
+
+    // Find the largest type that can be used and use it as much as possible in
+    // reverse order. Then handle any remainder with overlapping copies. Since
+    // the remainder will be at the end, there will be reduced pressure on the
+    // memory unit as the accesses to the same memory are far apart.
+    Type Ty = IceType_void;
+    if (ValValue == 0 && CountValue >= BytesPerStoreq &&
+        CountValue <= BytesPerStorep * Traits::MEMSET_UNROLL_LIMIT) {
+      // When the value is zero it can be loaded into a vector register cheaply
+      // using the xor trick.
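+      // (A vector register of zeros can typically be materialized with a
+      // pxor of the register against itself, avoiding a memory load.)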
+      Base = legalizeToReg(Dest);
+      VecReg = makeVectorOfZeros(IceType_v16i8);
+      Ty = largestTypeInSize(CountValue);
+    } else if (CountValue <= BytesPerStorei32 * Traits::MEMSET_UNROLL_LIMIT) {
+      // When the value is non-zero or the count is small we can't use vector
+      // instructions so are limited to 32-bit stores.
+      Base = legalizeToReg(Dest);
+      constexpr uint32_t MaxSize = 4;
+      Ty = largestTypeInSize(CountValue, MaxSize);
+    }
+
+    if (Base) {
+      uint32_t TyWidth = typeWidthInBytes(Ty);
+
+      uint32_t RemainingBytes = CountValue;
+      uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
+      while (RemainingBytes >= TyWidth) {
+        lowerSet(Ty, Offset);
+        RemainingBytes -= TyWidth;
+        Offset -= TyWidth;
+      }
+
+      if (RemainingBytes == 0)
+        return;
+
+      // Lower the remaining bytes. Adjust to larger types in order to make use
+      // of overlaps in the copies.
+      Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
+      Offset = CountValue - typeWidthInBytes(LeftOverTy);
+      lowerSet(LeftOverTy, Offset);
+      return;
+    }
+  }
+
+  // Fall back on calling the memset function. The value operand needs to be
+  // extended to a stack slot size because the PNaCl ABI requires arguments to
+  // be at least 32 bits wide.
+  Operand *ValExt;
+  if (IsValConst) {
+    ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
+  } else {
+    Variable *ValExtVar = Func->makeVariable(stackSlotType());
+    lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val));
+    ValExt = ValExtVar;
+  }
+  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memset, nullptr, 3);
+  Call->addArg(Dest);
+  Call->addArg(ValExt);
+  Call->addArg(Count);
+  lowerCall(Call);
+}
+
+class AddressOptimizer {
+  AddressOptimizer() = delete;
+  AddressOptimizer(const AddressOptimizer &) = delete;
+  AddressOptimizer &operator=(const AddressOptimizer &) = delete;
+
+public:
+  explicit AddressOptimizer(const Cfg *Func)
+      : Func(Func), VMetadata(Func->getVMetadata()) {}
+
+  inline void dumpAddressOpt(const ConstantRelocatable *const Relocatable,
+                             int32_t Offset, const Variable *Base,
+                             const Variable *Index, uint16_t Shift,
+                             const Inst *Reason) const;
+
+  inline const Inst *matchAssign(Variable **Var,
+                                 ConstantRelocatable **Relocatable,
+                                 int32_t *Offset);
+
+  inline const Inst *matchCombinedBaseIndex(Variable **Base, Variable **Index,
+                                            uint16_t *Shift);
+
+  inline const Inst *matchShiftedIndex(Variable **Index, uint16_t *Shift);
+
+  inline const Inst *matchOffsetIndexOrBase(Variable **IndexOrBase,
+                                            const uint16_t Shift,
+                                            ConstantRelocatable **Relocatable,
+                                            int32_t *Offset);
+
+private:
+  const Cfg *const Func;
+  const VariablesMetadata *const VMetadata;
+
+  static bool isAdd(const Inst *Instr) {
+    if (auto *Arith = llvm::dyn_cast_or_null<const InstArithmetic>(Instr)) {
+      return (Arith->getOp() == InstArithmetic::Add);
+    }
+    return false;
+  }
+};
+
+void AddressOptimizer::dumpAddressOpt(
+    const ConstantRelocatable *const Relocatable, int32_t Offset,
+    const Variable *Base, const Variable *Index, uint16_t Shift,
+    const Inst *Reason) const {
+  if (!BuildDefs::dump())
+    return;
+  if (!Func->isVerbose(IceV_AddrOpt))
+    return;
+  OstreamLocker L(Func->getContext());
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "Instruction: ";
+  Reason->dumpDecorated(Func);
+  Str << "  results in Base=";
+  if (Base)
+    Base->dump(Func);
+  else
+    Str << "<null>";
+  Str << ", Index=";
+  if (Index)
+    Index->dump(Func);
+  else
+    Str << "<null>";
+  Str << ", Shift=" << Shift << ", Offset=" << Offset
+      << ", Relocatable=" << Relocatable << "\n";
+}
+
+const Inst *AddressOptimizer::matchAssign(Variable **Var,
+                                          ConstantRelocatable **Relocatable,
+                                          int32_t *Offset) {
+  // Var originates from Var=SrcVar ==> set Var:=SrcVar
+  if (*Var == nullptr)
+    return nullptr;
+  if (const Inst *VarAssign = VMetadata->getSingleDefinition(*Var)) {
+    assert(!VMetadata->isMultiDef(*Var));
+    if (llvm::isa<InstAssign>(VarAssign)) {
+      Operand *SrcOp = VarAssign->getSrc(0);
+      assert(SrcOp);
+      if (auto *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
+        if (!VMetadata->isMultiDef(SrcVar) &&
+            // TODO: ensure SrcVar stays single-BB
+            true) {
+          *Var = SrcVar;
+          return VarAssign;
+        }
+      } else if (auto *Const = llvm::dyn_cast<ConstantInteger32>(SrcOp)) {
+        int32_t MoreOffset = Const->getValue();
+        if (Utils::WouldOverflowAdd(*Offset, MoreOffset))
+          return nullptr;
+        *Var = nullptr;
+        *Offset += MoreOffset;
+        return VarAssign;
+      } else if (auto *AddReloc = llvm::dyn_cast<ConstantRelocatable>(SrcOp)) {
+        if (*Relocatable == nullptr) {
+          // It is always safe to fold a relocatable through assignment -- the
+          // assignment frees a slot in the address operand that can be used to
+          // hold the Sandbox Pointer -- if any.
+          *Var = nullptr;
+          *Relocatable = AddReloc;
+          return VarAssign;
+        }
+      }
+    }
+  }
+  return nullptr;
+}
+
+const Inst *AddressOptimizer::matchCombinedBaseIndex(Variable **Base,
+                                                     Variable **Index,
+                                                     uint16_t *Shift) {
+  // Index==nullptr && Base is Base=Var1+Var2 ==>
+  //   set Base=Var1, Index=Var2, Shift=0
+  if (*Base == nullptr)
+    return nullptr;
+  if (*Index != nullptr)
+    return nullptr;
+  auto *BaseInst = VMetadata->getSingleDefinition(*Base);
+  if (BaseInst == nullptr)
+    return nullptr;
+  assert(!VMetadata->isMultiDef(*Base));
+  if (BaseInst->getSrcSize() < 2)
+    return nullptr;
+  if (auto *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
+    if (VMetadata->isMultiDef(Var1))
+      return nullptr;
+    if (auto *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
+      if (VMetadata->isMultiDef(Var2))
+        return nullptr;
+      if (isAdd(BaseInst) &&
+          // TODO: ensure Var1 and Var2 stay single-BB
+          true) {
+        *Base = Var1;
+        *Index = Var2;
+        *Shift = 0; // should already have been 0
+        return BaseInst;
+      }
+    }
+  }
+  return nullptr;
+}
+
+const Inst *AddressOptimizer::matchShiftedIndex(Variable **Index,
+                                                uint16_t *Shift) {
+  // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
+  //   Index=Var, Shift+=log2(Const)
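+  // E.g., Index=Var*4 with Shift==0 yields Index=Var, Shift=2, and
+  // Index=Var<<1 with Shift==2 yields Index=Var, Shift=3.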
+  if (*Index == nullptr)
+    return nullptr;
+  auto *IndexInst = VMetadata->getSingleDefinition(*Index);
+  if (IndexInst == nullptr)
+    return nullptr;
+  assert(!VMetadata->isMultiDef(*Index));
+
+  // When using an unsigned 32-bit array index on x64, it gets zero-extended
+  // before the shift & add. The explicit zero extension can be eliminated
+  // because x86 32-bit operations automatically get zero-extended into the
+  // corresponding 64-bit register.
+  if (auto *CastInst = llvm::dyn_cast<InstCast>(IndexInst)) {
+    if (CastInst->getCastKind() == InstCast::Zext) {
+      if (auto *Var = llvm::dyn_cast<Variable>(CastInst->getSrc(0))) {
+        if (Var->getType() == IceType_i32 &&
+            CastInst->getDest()->getType() == IceType_i64) {
+          IndexInst = VMetadata->getSingleDefinition(Var);
+        }
+      }
+    }
+  }
+
+  if (IndexInst->getSrcSize() < 2)
+    return nullptr;
+  if (auto *ArithInst = llvm::dyn_cast<InstArithmetic>(IndexInst)) {
+    if (auto *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
+      if (auto *Const =
+              llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
+        if (VMetadata->isMultiDef(Var) || Const->getType() != IceType_i32)
+          return nullptr;
+        switch (ArithInst->getOp()) {
+        default:
+          return nullptr;
+        case InstArithmetic::Mul: {
+          uint32_t Mult = Const->getValue();
+          uint32_t LogMult;
+          switch (Mult) {
+          case 1:
+            LogMult = 0;
+            break;
+          case 2:
+            LogMult = 1;
+            break;
+          case 4:
+            LogMult = 2;
+            break;
+          case 8:
+            LogMult = 3;
+            break;
+          default:
+            return nullptr;
+          }
+          if (*Shift + LogMult <= 3) {
+            *Index = Var;
+            *Shift += LogMult;
+            return IndexInst;
+          }
+        }
+        case InstArithmetic::Shl: {
+          uint32_t ShiftAmount = Const->getValue();
+          switch (ShiftAmount) {
+          case 0:
+          case 1:
+          case 2:
+          case 3:
+            break;
+          default:
+            return nullptr;
+          }
+          if (*Shift + ShiftAmount <= 3) {
+            *Index = Var;
+            *Shift += ShiftAmount;
+            return IndexInst;
+          }
+        }
+        }
+      }
+    }
+  }
+  return nullptr;
+}
+
+const Inst *AddressOptimizer::matchOffsetIndexOrBase(
+    Variable **IndexOrBase, const uint16_t Shift,
+    ConstantRelocatable **Relocatable, int32_t *Offset) {
+  // Base is Base=Var+Const || Base is Base=Const+Var ==>
+  //   set Base=Var, Offset+=Const
+  // Base is Base=Var-Const ==>
+  //   set Base=Var, Offset-=Const
+  // Index is Index=Var+Const ==>
+  //   set Index=Var, Offset+=(Const<<Shift)
+  // Index is Index=Const+Var ==>
+  //   set Index=Var, Offset+=(Const<<Shift)
+  // Index is Index=Var-Const ==>
+  //   set Index=Var, Offset-=(Const<<Shift)
+  // Treat Index=Var Or Const as Index=Var + Const
+  //    when Var = Var' << N and log2(Const) <= N
+  // or when Var = (2^M) * (2^N) and log2(Const) <= (M+N)
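+  // E.g., Index is Index=Var+4 with Shift==1 ==>
+  //   set Index=Var, Offset+=(4<<1)==8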
+
+  if (*IndexOrBase == nullptr) {
+    return nullptr;
+  }
+  const Inst *Definition = VMetadata->getSingleDefinition(*IndexOrBase);
+  if (Definition == nullptr) {
+    return nullptr;
+  }
+  assert(!VMetadata->isMultiDef(*IndexOrBase));
+  if (auto *ArithInst = llvm::dyn_cast<const InstArithmetic>(Definition)) {
+    switch (ArithInst->getOp()) {
+    case InstArithmetic::Add:
+    case InstArithmetic::Sub:
+    case InstArithmetic::Or:
+      break;
+    default:
+      return nullptr;
+    }
+
+    Operand *Src0 = ArithInst->getSrc(0);
+    Operand *Src1 = ArithInst->getSrc(1);
+    auto *Var0 = llvm::dyn_cast<Variable>(Src0);
+    auto *Var1 = llvm::dyn_cast<Variable>(Src1);
+    auto *Const0 = llvm::dyn_cast<ConstantInteger32>(Src0);
+    auto *Const1 = llvm::dyn_cast<ConstantInteger32>(Src1);
+    auto *Reloc0 = llvm::dyn_cast<ConstantRelocatable>(Src0);
+    auto *Reloc1 = llvm::dyn_cast<ConstantRelocatable>(Src1);
+
+    bool IsAdd = false;
+    if (ArithInst->getOp() == InstArithmetic::Or) {
+      Variable *Var = nullptr;
+      ConstantInteger32 *Const = nullptr;
+      if (Var0 && Const1) {
+        Var = Var0;
+        Const = Const1;
+      } else if (Const0 && Var1) {
+        Var = Var1;
+        Const = Const0;
+      } else {
+        return nullptr;
+      }
+      auto *VarDef =
+          llvm::dyn_cast<InstArithmetic>(VMetadata->getSingleDefinition(Var));
+      if (VarDef == nullptr)
+        return nullptr;
+
+      SizeT ZeroesAvailable = 0;
+      if (VarDef->getOp() == InstArithmetic::Shl) {
+        if (auto *ConstInt =
+                llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
+          ZeroesAvailable = ConstInt->getValue();
+        }
+      } else if (VarDef->getOp() == InstArithmetic::Mul) {
+        SizeT PowerOfTwo = 0;
+        if (auto *MultConst =
+                llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(0))) {
+          if (llvm::isPowerOf2_32(MultConst->getValue())) {
+            PowerOfTwo += MultConst->getValue();
+          }
+        }
+        if (auto *MultConst =
+                llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
+          if (llvm::isPowerOf2_32(MultConst->getValue())) {
+            PowerOfTwo += MultConst->getValue();
+          }
+        }
+        ZeroesAvailable = llvm::Log2_32(PowerOfTwo) + 1;
+      }
+      SizeT ZeroesNeeded = llvm::Log2_32(Const->getValue()) + 1;
+      if (ZeroesNeeded == 0 || ZeroesNeeded > ZeroesAvailable)
+        return nullptr;
+      IsAdd = true; // treat it as an add if the above conditions hold
+    } else {
+      IsAdd = ArithInst->getOp() == InstArithmetic::Add;
+    }
+
+    Variable *NewIndexOrBase = nullptr;
+    int32_t NewOffset = 0;
+    ConstantRelocatable *NewRelocatable = *Relocatable;
+    if (Var0 && Var1)
+      // TODO(sehr): merge base/index splitting into here.
+      return nullptr;
+    if (!IsAdd && Var1)
+      return nullptr;
+    if (Var0)
+      NewIndexOrBase = Var0;
+    else if (Var1)
+      NewIndexOrBase = Var1;
+    // Don't know how to add/subtract two relocatables.
+    if ((*Relocatable && (Reloc0 || Reloc1)) || (Reloc0 && Reloc1))
+      return nullptr;
+    // Don't know how to subtract a relocatable.
+    if (!IsAdd && Reloc1)
+      return nullptr;
+    // Incorporate ConstantRelocatables.
+    if (Reloc0)
+      NewRelocatable = Reloc0;
+    else if (Reloc1)
+      NewRelocatable = Reloc1;
+    // Compute the updated constant offset.
+    if (Const0) {
+      const int32_t MoreOffset =
+          IsAdd ? Const0->getValue() : -Const0->getValue();
+      if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
+        return nullptr;
+      NewOffset += MoreOffset;
+    }
+    if (Const1) {
+      const int32_t MoreOffset =
+          IsAdd ? Const1->getValue() : -Const1->getValue();
+      if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
+        return nullptr;
+      NewOffset += MoreOffset;
+    }
+    if (Utils::WouldOverflowAdd(*Offset, NewOffset << Shift))
+      return nullptr;
+    *IndexOrBase = NewIndexOrBase;
+    *Offset += (NewOffset << Shift);
+    // Shift is always zero if this is called with the base
+    *Relocatable = NewRelocatable;
+    return Definition;
+  }
+  return nullptr;
+}
+
+typename TargetX8664::X86OperandMem *
+TargetX8664::computeAddressOpt(const Inst *Instr, Type MemType, Operand *Addr) {
+  Func->resetCurrentNode();
+  if (Func->isVerbose(IceV_AddrOpt)) {
+    OstreamLocker L(Func->getContext());
+    Ostream &Str = Func->getContext()->getStrDump();
+    Str << "\nStarting computeAddressOpt for instruction:\n  ";
+    Instr->dumpDecorated(Func);
+  }
+
+  OptAddr NewAddr;
+  NewAddr.Base = llvm::dyn_cast<Variable>(Addr);
+  if (NewAddr.Base == nullptr)
+    return nullptr;
+
+  // If the Base has more than one use or is live across multiple blocks, then
+  // don't go further. Alternatively (?), never consider a transformation that
+  // would change a variable that is currently *not* live across basic block
+  // boundaries into one that *is*.
+  if (!getFlags().getLoopInvariantCodeMotion()) {
+    // Multi-block address optimization is needed when LICM is enabled.
+    // It might make sense to restrict it to the current node and loop header.
+    if (Func->getVMetadata()->isMultiBlock(
+            NewAddr.Base) /* || Base->getUseCount() > 1*/)
+      return nullptr;
+  }
+  AddressOptimizer AddrOpt(Func);
+  const bool MockBounds = getFlags().getMockBoundsCheck();
+  const Inst *Reason = nullptr;
+  bool AddressWasOptimized = false;
+  // The following unnamed struct identifies the address mode formation steps
+  // that could potentially create an invalid memory operand (i.e., no free
+  // slots for RebasePtr.) We add all those variables to this struct so that we
+  // can use memset() to reset all members to false.
+  struct {
+    bool AssignBase = false;
+    bool AssignIndex = false;
+    bool OffsetFromBase = false;
+    bool OffsetFromIndex = false;
+    bool CombinedBaseIndex = false;
+  } Skip;
+  // NewAddrCheckpoint is used to roll back to the last valid address if an
+  // invalid one is formed.
+  OptAddr NewAddrCheckpoint;
+  Reason = Instr;
+  do {
+    if (Reason) {
+      AddrOpt.dumpAddressOpt(NewAddr.Relocatable, NewAddr.Offset, NewAddr.Base,
+                             NewAddr.Index, NewAddr.Shift, Reason);
+      AddressWasOptimized = true;
+      Reason = nullptr;
+      memset(reinterpret_cast<void *>(&Skip), 0, sizeof(Skip));
+    }
+
+    NewAddrCheckpoint = NewAddr;
+
+    // Update Base and Index to follow through assignments to definitions.
+    if (!Skip.AssignBase &&
+        (Reason = AddrOpt.matchAssign(&NewAddr.Base, &NewAddr.Relocatable,
+                                      &NewAddr.Offset))) {
+      // Assignments of Base from a Relocatable or ConstantInt32 can result
+      // in Base becoming nullptr.  To avoid code duplication in this loop we
+      // prefer that Base be non-nullptr if possible.
+      if ((NewAddr.Base == nullptr) && (NewAddr.Index != nullptr) &&
+          NewAddr.Shift == 0) {
+        std::swap(NewAddr.Base, NewAddr.Index);
+      }
+      continue;
+    }
+    if (!Skip.AssignIndex &&
+        (Reason = AddrOpt.matchAssign(&NewAddr.Index, &NewAddr.Relocatable,
+                                      &NewAddr.Offset))) {
+      continue;
+    }
+
+    if (!MockBounds) {
+      // Transition from:
+      //   <Relocatable + Offset>(Base) to
+      //   <Relocatable + Offset>(Base, Index)
+      if (!Skip.CombinedBaseIndex &&
+          (Reason = AddrOpt.matchCombinedBaseIndex(
+               &NewAddr.Base, &NewAddr.Index, &NewAddr.Shift))) {
+        continue;
+      }
+
+      // Recognize multiply/shift and update Shift amount.
+      // Index becomes Index=Var<<Const && Const+Shift<=3 ==>
+      //   Index=Var, Shift+=Const
+      // Index becomes Index=Const*Var && log2(Const)+Shift<=3 ==>
+      //   Index=Var, Shift+=log2(Const)
+      if ((Reason =
+               AddrOpt.matchShiftedIndex(&NewAddr.Index, &NewAddr.Shift))) {
+        continue;
+      }
+
+      // If Shift is zero, the choice of Base and Index was purely arbitrary.
+      // Recognize multiply/shift and set Shift amount.
+      // Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
+      //   swap(Index,Base)
+      // Similar for Base=Const*Var and Base=Var<<Const
+      if (NewAddr.Shift == 0 &&
+          (Reason = AddrOpt.matchShiftedIndex(&NewAddr.Base, &NewAddr.Shift))) {
+        std::swap(NewAddr.Base, NewAddr.Index);
+        continue;
+      }
+    }
+
+    // Update Offset to reflect additions/subtractions with constants and
+    // relocatables.
+    // TODO: consider overflow issues with respect to Offset.
+    if (!Skip.OffsetFromBase && (Reason = AddrOpt.matchOffsetIndexOrBase(
+                                     &NewAddr.Base, /*Shift =*/0,
+                                     &NewAddr.Relocatable, &NewAddr.Offset))) {
+      continue;
+    }
+    if (!Skip.OffsetFromIndex && (Reason = AddrOpt.matchOffsetIndexOrBase(
+                                      &NewAddr.Index, NewAddr.Shift,
+                                      &NewAddr.Relocatable, &NewAddr.Offset))) {
+      continue;
+    }
+
+    break;
+  } while (Reason);
+
+  if (!AddressWasOptimized) {
+    return nullptr;
+  }
+
+  Constant *OffsetOp = nullptr;
+  if (NewAddr.Relocatable == nullptr) {
+    OffsetOp = Ctx->getConstantInt32(NewAddr.Offset);
+  } else {
+    OffsetOp =
+        Ctx->getConstantSym(NewAddr.Relocatable->getOffset() + NewAddr.Offset,
+                            NewAddr.Relocatable->getName());
+  }
+  // Vanilla ICE load instructions should not use the segment registers, and
+  // computeAddressOpt only works at the level of Variables and Constants, not
+  // other X86OperandMem, so there should be no mention of segment
+  // registers there either.
+  static constexpr auto SegmentReg =
+      X86OperandMem::SegmentRegisters::DefaultSegment;
+
+  return X86OperandMem::create(Func, MemType, NewAddr.Base, OffsetOp,
+                               NewAddr.Index, NewAddr.Shift, SegmentReg);
+}
+
+/// Add a mock bounds check on the memory address before using it as a load or
+/// store operand.  The basic idea is that given a memory operand [reg], we
+/// would first add bounds-check code something like:
+///
+///   cmp reg, <lb>
+///   jl out_of_line_error
+///   cmp reg, <ub>
+///   jg out_of_line_error
+///
+/// In reality, the specific code will depend on how <lb> and <ub> are
+/// represented, e.g. an immediate, a global, or a function argument.
+///
+/// As such, we need to enforce that the memory operand does not have the form
+/// [reg1+reg2], because then there is no simple cmp instruction that would
+/// suffice.  However, we consider [reg+offset] to be OK because the offset is
+/// usually small, and so <ub> could have a safety buffer built in and then we
+/// could instead branch to a custom out_of_line_error that does the precise
+/// check and jumps back if it turns out OK.
+///
+/// For the purpose of mocking the bounds check, we'll do something like this:
+///
+///   cmp reg, 0
+///   je label
+///   cmp reg, 1
+///   je label
+///   label:
+///
+/// Also note that we don't need to add a bounds check to a dereference of a
+/// simple global variable address.
+void TargetX8664::doMockBoundsCheck(Operand *Opnd) {
+  if (!getFlags().getMockBoundsCheck())
+    return;
+  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd)) {
+    if (Mem->getIndex()) {
+      llvm::report_fatal_error("doMockBoundsCheck: Opnd contains index reg");
+    }
+    Opnd = Mem->getBase();
+  }
+  // At this point Opnd could be nullptr, or Variable, or Constant, or perhaps
+  // something else.  We only care if it is Variable.
+  auto *Var = llvm::dyn_cast_or_null<Variable>(Opnd);
+  if (Var == nullptr)
+    return;
+  // We use lowerStore() to copy out-args onto the stack.  This creates a memory
+  // operand with the stack pointer as the base register.  Don't do bounds
+  // checks on that.
+  if (Var->getRegNum() == getStackReg())
+    return;
+
+  auto *Label = InstX86Label::create(Func, this);
+  _cmp(Opnd, Ctx->getConstantZero(IceType_i32));
+  _br(CondX86::Br_e, Label);
+  _cmp(Opnd, Ctx->getConstantInt32(1));
+  _br(CondX86::Br_e, Label);
+  Context.insert(Label);
+}
+
+void TargetX8664::lowerLoad(const InstLoad *Load) {
+  // A Load instruction can be treated the same as an Assign instruction, after
+  // the source operand is transformed into an X86OperandMem operand.  Note that
+  // the address mode optimization already creates an X86OperandMem operand, so
+  // it doesn't need another level of transformation.
+  Variable *DestLoad = Load->getDest();
+  Type Ty = DestLoad->getType();
+  Operand *Src0 = formMemoryOperand(Load->getLoadAddress(), Ty);
+  doMockBoundsCheck(Src0);
+  auto *Assign = InstAssign::create(Func, DestLoad, Src0);
+  lowerAssign(Assign);
+}
+
+void TargetX8664::doAddressOptOther() {
+  // Inverts some Icmp instructions, which helps doAddressOptLoad later.
+  // TODO(manasijm): Refactor to unify the conditions for Var0 and Var1
+  Inst *Instr = iteratorToInst(Context.getCur());
+  auto *VMetadata = Func->getVMetadata();
+  if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Instr)) {
+    if (llvm::isa<Constant>(Icmp->getSrc(0)) ||
+        llvm::isa<Constant>(Icmp->getSrc(1)))
+      return;
+    auto *Var0 = llvm::dyn_cast<Variable>(Icmp->getSrc(0));
+    if (Var0 == nullptr)
+      return;
+    if (!VMetadata->isTracked(Var0))
+      return;
+    auto *Op0Def = VMetadata->getFirstDefinitionSingleBlock(Var0);
+    if (Op0Def == nullptr || !llvm::isa<InstLoad>(Op0Def))
+      return;
+    if (VMetadata->getLocalUseNode(Var0) != Context.getNode())
+      return;
+
+    auto *Var1 = llvm::dyn_cast<Variable>(Icmp->getSrc(1));
+    if (Var1 != nullptr && VMetadata->isTracked(Var1)) {
+      auto *Op1Def = VMetadata->getFirstDefinitionSingleBlock(Var1);
+      if (Op1Def != nullptr && !VMetadata->isMultiBlock(Var1) &&
+          llvm::isa<InstLoad>(Op1Def)) {
+        return; // Both are loads
+      }
+    }
+    Icmp->reverseConditionAndOperands();
+  }
+}
+
+void TargetX8664::doAddressOptLoad() {
+  Inst *Instr = iteratorToInst(Context.getCur());
+  Operand *Addr = Instr->getSrc(0);
+  Variable *Dest = Instr->getDest();
+  if (auto *OptAddr = computeAddressOpt(Instr, Dest->getType(), Addr)) {
+    Instr->setDeleted();
+    Context.insert<InstLoad>(Dest, OptAddr);
+  }
+}
+
+void TargetX8664::doAddressOptLoadSubVector() {
+  auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
+  Operand *Addr = Intrinsic->getArg(0);
+  Variable *Dest = Intrinsic->getDest();
+  if (auto *OptAddr = computeAddressOpt(Intrinsic, Dest->getType(), Addr)) {
+    Intrinsic->setDeleted();
+    const Ice::Intrinsics::IntrinsicInfo Info = {
+        Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F,
+        Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+    auto *NewLoad = Context.insert<InstIntrinsic>(2, Dest, Info);
+    NewLoad->addArg(OptAddr);
+    NewLoad->addArg(Intrinsic->getArg(1));
+  }
+}
+
+void TargetX8664::lowerPhi(const InstPhi * /*Instr*/) {
+  Func->setError("Phi found in regular instruction list");
+}
+
+void TargetX8664::lowerRet(const InstRet *Instr) {
+  Variable *Reg = nullptr;
+  if (Instr->hasRetValue()) {
+    Operand *RetValue = legalize(Instr->getRetValue());
+    const Type ReturnType = RetValue->getType();
+    assert(isVectorType(ReturnType) || isScalarFloatingType(ReturnType) ||
+           (ReturnType == IceType_i32) || (ReturnType == IceType_i64));
+    Reg = moveReturnValueToRegister(RetValue, ReturnType);
+  }
+  // Add a ret instruction even if sandboxing is enabled, because addEpilog
+  // explicitly looks for a ret instruction as a marker for where to insert the
+  // frame removal instructions.
+  _ret(Reg);
+  // Add a fake use of esp to make sure esp stays alive for the entire
+  // function. Otherwise post-call esp adjustments get dead-code eliminated.
+  keepEspLiveAtExit();
+}
+
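+// For example, makePshufdMask(0, 1, 2, 3) yields the identity shuffle mask
+// 0b11100100 = 0xE4 (two bits per output element, element 0 in the low bits).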
+inline uint32_t makePshufdMask(SizeT Index0, SizeT Index1, SizeT Index2,
+                               SizeT Index3) {
+  const SizeT Mask = (Index0 & 0x3) | ((Index1 & 0x3) << 2) |
+                     ((Index2 & 0x3) << 4) | ((Index3 & 0x3) << 6);
+  assert(Mask < 256);
+  return Mask;
+}
+
+Variable *TargetX8664::lowerShuffleVector_AllFromSameSrc(
+    Operand *Src, SizeT Index0, SizeT Index1, SizeT Index2, SizeT Index3) {
+  constexpr SizeT SrcBit = 1 << 2;
+  assert((Index0 & SrcBit) == (Index1 & SrcBit));
+  assert((Index0 & SrcBit) == (Index2 & SrcBit));
+  assert((Index0 & SrcBit) == (Index3 & SrcBit));
+  (void)SrcBit;
+
+  const Type SrcTy = Src->getType();
+  auto *T = makeReg(SrcTy);
+  auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
+  auto *Mask =
+      Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
+  _pshufd(T, SrcRM, Mask);
+  return T;
+}
+
+Variable *
+TargetX8664::lowerShuffleVector_TwoFromSameSrc(Operand *Src0, SizeT Index0,
+                                               SizeT Index1, Operand *Src1,
+                                               SizeT Index2, SizeT Index3) {
+  constexpr SizeT SrcBit = 1 << 2;
+  assert((Index0 & SrcBit) == (Index1 & SrcBit) || (Index1 == IGNORE_INDEX));
+  assert((Index2 & SrcBit) == (Index3 & SrcBit) || (Index3 == IGNORE_INDEX));
+  (void)SrcBit;
+
+  const Type SrcTy = Src0->getType();
+  assert(Src1->getType() == SrcTy);
+  auto *T = makeReg(SrcTy);
+  auto *Src0R = legalizeToReg(Src0);
+  auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+  auto *Mask =
+      Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
+  _movp(T, Src0R);
+  _shufps(T, Src1RM, Mask);
+  return T;
+}
+
+Variable *TargetX8664::lowerShuffleVector_UnifyFromDifferentSrcs(Operand *Src0,
+                                                                 SizeT Index0,
+                                                                 Operand *Src1,
+                                                                 SizeT Index1) {
+  return lowerShuffleVector_TwoFromSameSrc(Src0, Index0, IGNORE_INDEX, Src1,
+                                           Index1, IGNORE_INDEX);
+}
+
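+// For example, makeSrcSwitchMask(0, 4, 1, 5) yields 0b1010: elements 1 and 3
+// have their SrcBit set (they come from the second source), while elements 0
+// and 2 do not.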
+inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2,
+                               SizeT Index3) {
+  constexpr SizeT SrcBit = 1 << 2;
+  const SizeT Index0Bits = ((Index0 & SrcBit) == 0) ? 0 : (1 << 0);
+  const SizeT Index1Bits = ((Index1 & SrcBit) == 0) ? 0 : (1 << 1);
+  const SizeT Index2Bits = ((Index2 & SrcBit) == 0) ? 0 : (1 << 2);
+  const SizeT Index3Bits = ((Index3 & SrcBit) == 0) ? 0 : (1 << 3);
+  return Index0Bits | Index1Bits | Index2Bits | Index3Bits;
+}
+
+GlobalString TargetX8664::lowerShuffleVector_NewMaskName() {
+  GlobalString FuncName = Func->getFunctionName();
+  const SizeT Id = PshufbMaskCount++;
+  if (!BuildDefs::dump() || !FuncName.hasStdString()) {
+    return GlobalString::createWithString(
+        Ctx,
+        "$PS" + std::to_string(FuncName.getID()) + "_" + std::to_string(Id));
+  }
+  return GlobalString::createWithString(
+      Ctx, "Pshufb$" + Func->getFunctionName() + "$" + std::to_string(Id));
+}
+
+ConstantRelocatable *TargetX8664::lowerShuffleVector_CreatePshufbMask(
+    int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
+    int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
+    int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
+    int8_t Idx15) {
+  static constexpr uint8_t NumElements = 16;
+  const char Initializer[NumElements] = {
+      Idx0, Idx1, Idx2,  Idx3,  Idx4,  Idx5,  Idx6,  Idx7,
+      Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15,
+  };
+
+  static constexpr Type V4VectorType = IceType_v4i32;
+  const uint32_t MaskAlignment = typeWidthInBytesOnStack(V4VectorType);
+  auto *Mask = VariableDeclaration::create(Func->getGlobalPool());
+  GlobalString MaskName = lowerShuffleVector_NewMaskName();
+  Mask->setIsConstant(true);
+  Mask->addInitializer(VariableDeclaration::DataInitializer::create(
+      Func->getGlobalPool(), Initializer, NumElements));
+  Mask->setName(MaskName);
+  // Mask needs to be 16-byte aligned, or pshufb will seg fault.
+  Mask->setAlignment(MaskAlignment);
+  Func->addGlobal(Mask);
+
+  constexpr RelocOffsetT Offset = 0;
+  return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName));
+}
+
+void TargetX8664::lowerShuffleVector_UsingPshufb(
+    Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1,
+    int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6,
+    int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11,
+    int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) {
+  const Type DestTy = Dest->getType();
+  static constexpr bool NotRebased = false;
+  static constexpr Variable *NoBase = nullptr;
+  // We use void for the memory operand instead of DestTy because using the
+  // latter causes a validation failure: the X86 Inst layer complains that
+  // vector mem operands could be under-aligned. By using void we avoid the
+  // validation error. Note that the mask global declaration is aligned, so it
+  // can be used as an XMM mem operand.
+  static constexpr Type MaskType = IceType_void;
+#define IDX_IN_SRC(N, S)                                                       \
+  ((((N) & (1 << 4)) == (S << 4)) ? ((N)&0xf) : CLEAR_ALL_BITS)
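+  // For illustration: Idx = 18 names byte 2 of source 1, so IDX_IN_SRC(18, 1)
+  // yields 2 while IDX_IN_SRC(18, 0) yields CLEAR_ALL_BITS, whose set high
+  // bit tells pshufb to zero that output byte so the later por can merge the
+  // two halves.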
+  auto *Mask0M = X86OperandMem::create(
+      Func, MaskType, NoBase,
+      lowerShuffleVector_CreatePshufbMask(
+          IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0),
+          IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0),
+          IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0),
+          IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0),
+          IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0),
+          IDX_IN_SRC(Idx15, 0)),
+      NotRebased);
+
+  auto *T0 = makeReg(DestTy);
+  auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+  _movp(T0, Src0RM);
+
+  _pshufb(T0, Mask0M);
+
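+  // Any index >= 16 selects from Src1, so Src1 must be shuffled with a second
+  // mask; the two partial results, each zero where the other contributes, are
+  // then merged with por.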
+  if (Idx0 >= 16 || Idx1 >= 16 || Idx2 >= 16 || Idx3 >= 16 || Idx4 >= 16 ||
+      Idx5 >= 16 || Idx6 >= 16 || Idx7 >= 16 || Idx8 >= 16 || Idx9 >= 16 ||
+      Idx10 >= 16 || Idx11 >= 16 || Idx12 >= 16 || Idx13 >= 16 || Idx14 >= 16 ||
+      Idx15 >= 16) {
+    auto *Mask1M = X86OperandMem::create(
+        Func, MaskType, NoBase,
+        lowerShuffleVector_CreatePshufbMask(
+            IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1),
+            IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1),
+            IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1),
+            IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1),
+            IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1),
+            IDX_IN_SRC(Idx15, 1)),
+        NotRebased);
+#undef IDX_IN_SRC
+    auto *T1 = makeReg(DestTy);
+    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    _movp(T1, Src1RM);
+    _pshufb(T1, Mask1M);
+    _por(T0, T1);
+  }
+
+  _movp(Dest, T0);
+}
+
+void TargetX8664::lowerShuffleVector(const InstShuffleVector *Instr) {
+  auto *Dest = Instr->getDest();
+  const Type DestTy = Dest->getType();
+  auto *Src0 = Instr->getSrc(0);
+  auto *Src1 = Instr->getSrc(1);
+  const SizeT NumElements = typeNumElements(DestTy);
+
+  auto *T = makeReg(DestTy);
+
+  switch (DestTy) {
+  default:
+    llvm::report_fatal_error("Unexpected vector type.");
+  case IceType_v16i1:
+  case IceType_v16i8: {
+    static constexpr SizeT ExpectedNumElements = 16;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    (void)ExpectedNumElements;
+
+    if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckl(T, Src0RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
+                          23)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckl(T, Src1RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
+                          15, 15)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckh(T, Src0RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30,
+                          15, 31)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckh(T, Src1RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (InstructionSet < SSE4_1) {
+      // TODO(jpp): figure out how to lower with sse2.
+      break;
+    }
+
+    const SizeT Index0 = Instr->getIndexValue(0);
+    const SizeT Index1 = Instr->getIndexValue(1);
+    const SizeT Index2 = Instr->getIndexValue(2);
+    const SizeT Index3 = Instr->getIndexValue(3);
+    const SizeT Index4 = Instr->getIndexValue(4);
+    const SizeT Index5 = Instr->getIndexValue(5);
+    const SizeT Index6 = Instr->getIndexValue(6);
+    const SizeT Index7 = Instr->getIndexValue(7);
+    const SizeT Index8 = Instr->getIndexValue(8);
+    const SizeT Index9 = Instr->getIndexValue(9);
+    const SizeT Index10 = Instr->getIndexValue(10);
+    const SizeT Index11 = Instr->getIndexValue(11);
+    const SizeT Index12 = Instr->getIndexValue(12);
+    const SizeT Index13 = Instr->getIndexValue(13);
+    const SizeT Index14 = Instr->getIndexValue(14);
+    const SizeT Index15 = Instr->getIndexValue(15);
+
+    lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
+                                   Index3, Index4, Index5, Index6, Index7,
+                                   Index8, Index9, Index10, Index11, Index12,
+                                   Index13, Index14, Index15);
+    return;
+  }
+  case IceType_v8i1:
+  case IceType_v8i16: {
+    static constexpr SizeT ExpectedNumElements = 8;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    (void)ExpectedNumElements;
+
+    if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckl(T, Src0RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckl(T, Src1RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(4, 4, 5, 5, 6, 6, 7, 7)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckh(T, Src0RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(4, 12, 5, 13, 6, 14, 7, 15)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckh(T, Src1RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (InstructionSet < SSE4_1) {
+      // TODO(jpp): figure out how to lower with sse2.
+      break;
+    }
+
+    const SizeT Index0 = Instr->getIndexValue(0);
+    const SizeT Index1 = Instr->getIndexValue(1);
+    const SizeT Index2 = Instr->getIndexValue(2);
+    const SizeT Index3 = Instr->getIndexValue(3);
+    const SizeT Index4 = Instr->getIndexValue(4);
+    const SizeT Index5 = Instr->getIndexValue(5);
+    const SizeT Index6 = Instr->getIndexValue(6);
+    const SizeT Index7 = Instr->getIndexValue(7);
+
+#define TO_BYTE_INDEX(I) ((I) << 1)
+    lowerShuffleVector_UsingPshufb(
+        Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
+        TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2),
+        TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3),
+        TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4),
+        TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5),
+        TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6),
+        TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7),
+        TO_BYTE_INDEX(Index7) + 1);
+#undef TO_BYTE_INDEX
+    return;
+  }
+  case IceType_v4i1:
+  case IceType_v4i32:
+  case IceType_v4f32: {
+    static constexpr SizeT ExpectedNumElements = 4;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    const SizeT Index0 = Instr->getIndexValue(0);
+    const SizeT Index1 = Instr->getIndexValue(1);
+    const SizeT Index2 = Instr->getIndexValue(2);
+    const SizeT Index3 = Instr->getIndexValue(3);
+    Variable *T = nullptr;
+    switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
+#define CASE_SRCS_IN(S0, S1, S2, S3)                                           \
+  case (((S0) << 0) | ((S1) << 1) | ((S2) << 2) | ((S3) << 3))
+      CASE_SRCS_IN(0, 0, 0, 0) : {
+        T = lowerShuffleVector_AllFromSameSrc(Src0, Index0, Index1, Index2,
+                                              Index3);
+      }
+      break;
+      CASE_SRCS_IN(0, 0, 0, 1) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
+                                                                  Src1, Index3);
+        T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
+                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+      }
+      break;
+      CASE_SRCS_IN(0, 0, 1, 0) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
+                                                                  Src0, Index3);
+        T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
+                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+      }
+      break;
+      CASE_SRCS_IN(0, 0, 1, 1) : {
+        T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Src1,
+                                              Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(0, 1, 0, 0) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
+                                                                  Src1, Index1);
+        T = lowerShuffleVector_TwoFromSameSrc(
+            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(0, 1, 0, 1) : {
+        if (Index0 == 0 && (Index1 - ExpectedNumElements) == 0 && Index2 == 1 &&
+            (Index3 - ExpectedNumElements) == 1) {
+          auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+          auto *Src0R = legalizeToReg(Src0);
+          T = makeReg(DestTy);
+          _movp(T, Src0R);
+          _punpckl(T, Src1RM);
+        } else if (Index0 == Index2 && Index1 == Index3) {
+          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index0, Src1, Index1);
+          T = lowerShuffleVector_AllFromSameSrc(
+              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
+              UNIFIED_INDEX_1);
+        } else {
+          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index0, Src1, Index1);
+          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index2, Src1, Index3);
+          T = lowerShuffleVector_TwoFromSameSrc(
+              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
+              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+        }
+      }
+      break;
+      CASE_SRCS_IN(0, 1, 1, 0) : {
+        if (Index0 == Index3 && Index1 == Index2) {
+          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index0, Src1, Index1);
+          T = lowerShuffleVector_AllFromSameSrc(
+              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
+              UNIFIED_INDEX_0);
+        } else {
+          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index0, Src1, Index1);
+          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index2, Src0, Index3);
+          T = lowerShuffleVector_TwoFromSameSrc(
+              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
+              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+        }
+      }
+      break;
+      CASE_SRCS_IN(0, 1, 1, 1) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
+                                                                  Src1, Index1);
+        T = lowerShuffleVector_TwoFromSameSrc(
+            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(1, 0, 0, 0) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
+                                                                  Src0, Index1);
+        T = lowerShuffleVector_TwoFromSameSrc(
+            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(1, 0, 0, 1) : {
+        if (Index0 == Index3 && Index1 == Index2) {
+          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index0, Src0, Index1);
+          T = lowerShuffleVector_AllFromSameSrc(
+              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
+              UNIFIED_INDEX_0);
+        } else {
+          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index0, Src0, Index1);
+          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index2, Src1, Index3);
+          T = lowerShuffleVector_TwoFromSameSrc(
+              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
+              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+        }
+      }
+      break;
+      CASE_SRCS_IN(1, 0, 1, 0) : {
+        if ((Index0 - ExpectedNumElements) == 0 && Index1 == 0 &&
+            (Index2 - ExpectedNumElements) == 1 && Index3 == 1) {
+          auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+          auto *Src1R = legalizeToReg(Src1);
+          T = makeReg(DestTy);
+          _movp(T, Src1R);
+          _punpckl(T, Src0RM);
+        } else if (Index0 == Index2 && Index1 == Index3) {
+          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index0, Src0, Index1);
+          T = lowerShuffleVector_AllFromSameSrc(
+              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
+              UNIFIED_INDEX_1);
+        } else {
+          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index0, Src0, Index1);
+          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index2, Src0, Index3);
+          T = lowerShuffleVector_TwoFromSameSrc(
+              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
+              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+        }
+      }
+      break;
+      CASE_SRCS_IN(1, 0, 1, 1) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
+                                                                  Src0, Index1);
+        T = lowerShuffleVector_TwoFromSameSrc(
+            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(1, 1, 0, 0) : {
+        T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Src0,
+                                              Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(1, 1, 0, 1) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
+                                                                  Src1, Index3);
+        T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
+                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+      }
+      break;
+      CASE_SRCS_IN(1, 1, 1, 0) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
+                                                                  Src0, Index3);
+        T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
+                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+      }
+      break;
+      CASE_SRCS_IN(1, 1, 1, 1) : {
+        T = lowerShuffleVector_AllFromSameSrc(Src1, Index0, Index1, Index2,
+                                              Index3);
+      }
+      break;
+#undef CASE_SRCS_IN
+    }
+
+    assert(T != nullptr);
+    assert(T->getType() == DestTy);
+    _movp(Dest, T);
+    return;
+  }
+  }
+
+  // Unoptimized shuffle. Perform a series of inserts and extracts.
+  Context.insert<InstFakeDef>(T);
+  const Type ElementType = typeElementType(DestTy);
+  for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
+    auto *Index = Instr->getIndex(I);
+    const SizeT Elem = Index->getValue();
+    auto *ExtElmt = makeReg(ElementType);
+    if (Elem < NumElements) {
+      lowerExtractElement(
+          InstExtractElement::create(Func, ExtElmt, Src0, Index));
+    } else {
+      lowerExtractElement(InstExtractElement::create(
+          Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements)));
+    }
+    auto *NewT = makeReg(DestTy);
+    lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
+                                                 Ctx->getConstantInt32(I)));
+    T = NewT;
+  }
+  _movp(Dest, T);
+}
+
+void TargetX8664::lowerSelect(const InstSelect *Select) {
+  Variable *Dest = Select->getDest();
+
+  Operand *Condition = Select->getCondition();
+  // Handle folding opportunities.
+  if (const Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
+    assert(Producer->isDeleted());
+    switch (BoolFolding::getProducerKind(Producer)) {
+    default:
+      break;
+    case BoolFolding::PK_Icmp32:
+    case BoolFolding::PK_Icmp64: {
+      lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Select);
+      return;
+    }
+    case BoolFolding::PK_Fcmp: {
+      lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Select);
+      return;
+    }
+    }
+  }
+
+  if (isVectorType(Dest->getType())) {
+    lowerSelectVector(Select);
+    return;
+  }
+
+  Operand *CmpResult = legalize(Condition, Legal_Reg | Legal_Mem);
+  Operand *Zero = Ctx->getConstantZero(IceType_i32);
+  _cmp(CmpResult, Zero);
+  Operand *SrcT = Select->getTrueOperand();
+  Operand *SrcF = Select->getFalseOperand();
+  const BrCond Cond = CondX86::Br_ne;
+  lowerSelectMove(Dest, Cond, SrcT, SrcF);
+}
+
+void TargetX8664::lowerSelectMove(Variable *Dest, BrCond Cond, Operand *SrcT,
+                                  Operand *SrcF) {
+  Type DestTy = Dest->getType();
+  if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
+    // The cmov instruction doesn't allow 8-bit or FP operands, so we need
+    // explicit control flow.
+    // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
+    auto *Label = InstX86Label::create(Func, this);
+    SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
+    _mov(Dest, SrcT);
+    _br(Cond, Label);
+    SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
+    _redefined(_mov(Dest, SrcF));
+    Context.insert(Label);
+    return;
+  }
+  // mov t, SrcF; cmov_cond t, SrcT; mov dest, t
+  // But if SrcT is immediate, we might be able to do better, as the cmov
+  // instruction doesn't allow an immediate operand:
+  // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
+  if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
+    std::swap(SrcT, SrcF);
+    Cond = InstX86Base::getOppositeCondition(Cond);
+  }
+  if (!Traits::Is64Bit && DestTy == IceType_i64) {
+    SrcT = legalizeUndef(SrcT);
+    SrcF = legalizeUndef(SrcF);
+    // Set the low portion.
+    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    lowerSelectIntMove(DestLo, Cond, loOperand(SrcT), loOperand(SrcF));
+    // Set the high portion.
+    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    lowerSelectIntMove(DestHi, Cond, hiOperand(SrcT), hiOperand(SrcF));
+    return;
+  }
+
+  assert(DestTy == IceType_i16 || DestTy == IceType_i32 ||
+         (Traits::Is64Bit && DestTy == IceType_i64));
+  lowerSelectIntMove(Dest, Cond, SrcT, SrcF);
+}
+
+void TargetX8664::lowerSelectIntMove(Variable *Dest, BrCond Cond, Operand *SrcT,
+                                     Operand *SrcF) {
+  Variable *T = nullptr;
+  SrcF = legalize(SrcF);
+  _mov(T, SrcF);
+  SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
+  _cmov(T, SrcT, Cond);
+  _mov(Dest, T);
+}
+
+void TargetX8664::lowerMove(Variable *Dest, Operand *Src, bool IsRedefinition) {
+  assert(Dest->getType() == Src->getType());
+  assert(!Dest->isRematerializable());
+  if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+    Src = legalize(Src);
+    Operand *SrcLo = loOperand(Src);
+    Operand *SrcHi = hiOperand(Src);
+    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    Variable *T_Lo = nullptr, *T_Hi = nullptr;
+    _mov(T_Lo, SrcLo);
+    _redefined(_mov(DestLo, T_Lo), IsRedefinition);
+    _mov(T_Hi, SrcHi);
+    _redefined(_mov(DestHi, T_Hi), IsRedefinition);
+  } else {
+    Operand *SrcLegal;
+    if (Dest->hasReg()) {
+      // If Dest already has a physical register, then only basic legalization
+      // is needed, as the source operand can be a register, immediate, or
+      // memory.
+      SrcLegal = legalize(Src, Legal_Reg, Dest->getRegNum());
+    } else {
+      // If Dest could be a stack operand, then SrcLegal must be a physical
+      // register or a scalar integer immediate.
+      SrcLegal = legalize(Src, Legal_Reg | Legal_Imm);
+    }
+    if (isVectorType(Dest->getType())) {
+      _redefined(_movp(Dest, SrcLegal), IsRedefinition);
+    } else {
+      _redefined(_mov(Dest, SrcLegal), IsRedefinition);
+    }
+  }
+}
+
+bool TargetX8664::lowerOptimizeFcmpSelect(const InstFcmp *Fcmp,
+                                          const InstSelect *Select) {
+  Operand *CmpSrc0 = Fcmp->getSrc(0);
+  Operand *CmpSrc1 = Fcmp->getSrc(1);
+  Operand *SelectSrcT = Select->getTrueOperand();
+  Operand *SelectSrcF = Select->getFalseOperand();
+  Variable *SelectDest = Select->getDest();
+
+  // TODO(capn): also handle swapped compare/select operand order.
+  if (CmpSrc0 != SelectSrcT || CmpSrc1 != SelectSrcF)
+    return false;
+
+  // TODO(sehr, stichnot): fcmp/select patterns (e.g., minsd/maxss) go here.
+  InstFcmp::FCond Condition = Fcmp->getCondition();
+  switch (Condition) {
+  default:
+    return false;
+  case InstFcmp::True:
+    break;
+  case InstFcmp::False:
+    break;
+  case InstFcmp::Ogt: {
+    Variable *T = makeReg(SelectDest->getType());
+    if (isScalarFloatingType(SelectSrcT->getType())) {
+      _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
+      _maxss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
+      _mov(SelectDest, T);
+    } else {
+      _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
+      _maxps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
+      _movp(SelectDest, T);
+    }
+    return true;
+  }
+  case InstFcmp::Olt: {
+    Variable *T = makeReg(SelectSrcT->getType());
+    if (isScalarFloatingType(SelectSrcT->getType())) {
+      _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
+      _minss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
+      _mov(SelectDest, T);
+    } else {
+      _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
+      _minps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
+      _movp(SelectDest, T);
+    }
+    return true;
+  }
+  }
+  return false;
+}
+
+void TargetX8664::lowerIcmp(const InstIcmp *Icmp) {
+  Variable *Dest = Icmp->getDest();
+  if (isVectorType(Dest->getType())) {
+    lowerIcmpVector(Icmp);
+  } else {
+    constexpr Inst *Consumer = nullptr;
+    lowerIcmpAndConsumer(Icmp, Consumer);
+  }
+}
+
+void TargetX8664::lowerSelectVector(const InstSelect *Instr) {
+  Variable *Dest = Instr->getDest();
+  Type DestTy = Dest->getType();
+  Operand *SrcT = Instr->getTrueOperand();
+  Operand *SrcF = Instr->getFalseOperand();
+  Operand *Condition = Instr->getCondition();
+
+  if (!isVectorType(DestTy))
+    llvm::report_fatal_error("Expected a vector select");
+
+  Type SrcTy = SrcT->getType();
+  Variable *T = makeReg(SrcTy);
+  Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
+  Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
+
+  if (InstructionSet >= SSE4_1) {
+    // TODO(wala): If the condition operand is a constant, use blendps or
+    // pblendw.
+    //
+    // Use blendvps or pblendvb to implement select.
+    if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
+        SrcTy == IceType_v4f32) {
+      Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
+      Variable *xmm0 = makeReg(IceType_v4i32, Traits::RegisterSet::Reg_xmm0);
+      _movp(xmm0, ConditionRM);
+      _psll(xmm0, Ctx->getConstantInt8(31));
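+      // The shift moves each i1 condition into its lane's sign bit, which is
+      // the bit blendvps tests when choosing between T (SrcF) and SrcT.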
+      _movp(T, SrcFRM);
+      _blendvps(T, SrcTRM, xmm0);
+      _movp(Dest, T);
+    } else {
+      assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
+      Type SignExtTy =
+          Condition->getType() == IceType_v8i1 ? IceType_v8i16 : IceType_v16i8;
+      Variable *xmm0 = makeReg(SignExtTy, Traits::RegisterSet::Reg_xmm0);
+      lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
+      _movp(T, SrcFRM);
+      _pblendvb(T, SrcTRM, xmm0);
+      _movp(Dest, T);
+    }
+    return;
+  }
+  // Lower select without SSE4.1:
+  // a=d?b:c ==>
+  //   if elementtype(d) != i1:
+  //      d=sext(d);
+  //   a=(b&d)|(c&~d);
+  Variable *T2 = makeReg(SrcTy);
+  // Sign extend the condition operand if applicable.
+  if (SrcTy == IceType_v4f32) {
+    // The sext operation takes only integer arguments.
+    Variable *T3 = Func->makeVariable(IceType_v4i32);
+    lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
+    _movp(T, T3);
+  } else if (typeElementType(SrcTy) != IceType_i1) {
+    lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
+  } else {
+    Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
+    _movp(T, ConditionRM);
+  }
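+  // T now holds the sign-extended condition d. Compute b&d in T and c&~d in
+  // T2 (pandn complements its first operand), then combine them with por.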
+  _movp(T2, T);
+  _pand(T, SrcTRM);
+  _pandn(T2, SrcFRM);
+  _por(T, T2);
+  _movp(Dest, T);
+}
+
+void TargetX8664::lowerStore(const InstStore *Instr) {
+  Operand *Value = Instr->getData();
+  Operand *Addr = Instr->getStoreAddress();
+  X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
+  doMockBoundsCheck(NewAddr);
+  Type Ty = NewAddr->getType();
+
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
+    Value = legalizeUndef(Value);
+    Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
+    _store(ValueHi, llvm::cast<X86OperandMem>(hiOperand(NewAddr)));
+    Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
+    _store(ValueLo, llvm::cast<X86OperandMem>(loOperand(NewAddr)));
+  } else if (isVectorType(Ty)) {
+    _storep(legalizeToReg(Value), NewAddr);
+  } else {
+    Value = legalize(Value, Legal_Reg | Legal_Imm);
+    _store(Value, NewAddr);
+  }
+}
+
+void TargetX8664::doAddressOptStore() {
+  auto *Instr = llvm::cast<InstStore>(Context.getCur());
+  Operand *Addr = Instr->getStoreAddress();
+  Operand *Data = Instr->getData();
+  if (auto *OptAddr = computeAddressOpt(Instr, Data->getType(), Addr)) {
+    Instr->setDeleted();
+    auto *NewStore = Context.insert<InstStore>(Data, OptAddr);
+    if (Instr->getDest())
+      NewStore->setRmwBeacon(Instr->getRmwBeacon());
+  }
+}
+
+void TargetX8664::doAddressOptStoreSubVector() {
+  auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
+  Operand *Addr = Intrinsic->getArg(1);
+  Operand *Data = Intrinsic->getArg(0);
+  if (auto *OptAddr = computeAddressOpt(Intrinsic, Data->getType(), Addr)) {
+    Intrinsic->setDeleted();
+    const Ice::Intrinsics::IntrinsicInfo Info = {
+        Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T,
+        Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
+    auto *NewStore = Context.insert<InstIntrinsic>(3, nullptr, Info);
+    NewStore->addArg(Data);
+    NewStore->addArg(OptAddr);
+    NewStore->addArg(Intrinsic->getArg(2));
+  }
+}
+
+Operand *TargetX8664::lowerCmpRange(Operand *Comparison, uint64_t Min,
+                                    uint64_t Max) {
+  // TODO(ascull): 64-bit should not reach here but only because it is not
+  // implemented yet. This should be able to handle the 64-bit case.
+  assert(Traits::Is64Bit || Comparison->getType() != IceType_i64);
+  // Subtracting 0 is a nop, so don't do it.
+  if (Min != 0) {
+    // Avoid clobbering the comparison by copying it
+    Variable *T = nullptr;
+    _mov(T, Comparison);
+    _sub(T, Ctx->getConstantInt32(Min));
+    Comparison = T;
+  }
+
+  _cmp(Comparison, Ctx->getConstantInt32(Max - Min));
+
+  return Comparison;
+}
+
+void TargetX8664::lowerCaseCluster(const CaseCluster &Case, Operand *Comparison,
+                                   bool DoneCmp, CfgNode *DefaultTarget) {
+  switch (Case.getKind()) {
+  case CaseCluster::JumpTable: {
+    InstX86Label *SkipJumpTable;
+
+    Operand *RangeIndex =
+        lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
+    if (DefaultTarget == nullptr) {
+      // Skip over jump table logic if comparison not in range and no default
+      SkipJumpTable = InstX86Label::create(Func, this);
+      _br(CondX86::Br_a, SkipJumpTable);
+    } else {
+      _br(CondX86::Br_a, DefaultTarget);
+    }
+
+    InstJumpTable *JumpTable = Case.getJumpTable();
+    Context.insert(JumpTable);
+
+    // Make sure the index is a register of the same width as the base
+    Variable *Index;
+    const Type PointerType = getPointerType();
+    if (RangeIndex->getType() != PointerType) {
+      Index = makeReg(PointerType);
+      if (RangeIndex->getType() == IceType_i64) {
+        assert(Traits::Is64Bit);
+        _mov(Index, RangeIndex); // trunc
+      } else {
+        Operand *RangeIndexRM = legalize(RangeIndex, Legal_Reg | Legal_Mem);
+        _movzx(Index, RangeIndexRM);
+      }
+    } else {
+      Index = legalizeToReg(RangeIndex);
+    }
+
+    constexpr RelocOffsetT RelocOffset = 0;
+    constexpr Variable *NoBase = nullptr;
+    constexpr Constant *NoOffset = nullptr;
+    auto JTName = GlobalString::createWithString(Ctx, JumpTable->getName());
+    Constant *Offset = Ctx->getConstantSym(RelocOffset, JTName);
+    uint16_t Shift = typeWidthInBytesLog2(PointerType);
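+    // Shift becomes the scale of the indexed address, so entry I is loaded
+    // from the table base plus I * sizeof(void *).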
+    constexpr auto Segment = X86OperandMem::SegmentRegisters::DefaultSegment;
+
+    Variable *Target = nullptr;
+    if (PointerType == IceType_i32) {
+      _mov(Target, X86OperandMem::create(Func, PointerType, NoBase, Offset,
+                                         Index, Shift, Segment));
+    } else {
+      auto *Base = makeReg(IceType_i64);
+      _lea(Base, X86OperandMem::create(Func, IceType_void, NoBase, Offset));
+      _mov(Target, X86OperandMem::create(Func, PointerType, Base, NoOffset,
+                                         Index, Shift, Segment));
+    }
+
+    lowerIndirectJump(Target);
+
+    if (DefaultTarget == nullptr)
+      Context.insert(SkipJumpTable);
+    return;
+  }
+  case CaseCluster::Range: {
+    if (Case.isUnitRange()) {
+      // Single item
+      if (!DoneCmp) {
+        Constant *Value = Ctx->getConstantInt32(Case.getLow());
+        _cmp(Comparison, Value);
+      }
+      _br(CondX86::Br_e, Case.getTarget());
+    } else if (DoneCmp && Case.isPairRange()) {
+      // Range of two items with the first item already compared against.
+      _br(CondX86::Br_e, Case.getTarget());
+      Constant *Value = Ctx->getConstantInt32(Case.getHigh());
+      _cmp(Comparison, Value);
+      _br(CondX86::Br_e, Case.getTarget());
+    } else {
+      // Range
+      lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
+      _br(CondX86::Br_be, Case.getTarget());
+    }
+    if (DefaultTarget != nullptr)
+      _br(DefaultTarget);
+    return;
+  }
+  }
+}
+
+void TargetX8664::lowerSwitch(const InstSwitch *Instr) {
+  // Group cases together and navigate through them with a binary search
+  CaseClusterArray CaseClusters = CaseCluster::clusterizeSwitch(Func, Instr);
+  Operand *Src0 = Instr->getComparison();
+  CfgNode *DefaultTarget = Instr->getLabelDefault();
+
+  assert(CaseClusters.size() != 0); // Should always be at least one
+
+  if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
+    Src0 = legalize(Src0); // get Base/Index into physical registers
+    Operand *Src0Lo = loOperand(Src0);
+    Operand *Src0Hi = hiOperand(Src0);
+    if (CaseClusters.back().getHigh() > UINT32_MAX) {
+      // TODO(ascull): handle 64-bit case properly (currently naive version)
+      // This might be handled by a higher level lowering of switches.
+      SizeT NumCases = Instr->getNumCases();
+      if (NumCases >= 2) {
+        Src0Lo = legalizeToReg(Src0Lo);
+        Src0Hi = legalizeToReg(Src0Hi);
+      } else {
+        Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
+        Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
+      }
+      for (SizeT I = 0; I < NumCases; ++I) {
+        Constant *ValueLo = Ctx->getConstantInt32(Instr->getValue(I));
+        Constant *ValueHi = Ctx->getConstantInt32(Instr->getValue(I) >> 32);
+        InstX86Label *Label = InstX86Label::create(Func, this);
+        _cmp(Src0Lo, ValueLo);
+        _br(CondX86::Br_ne, Label);
+        _cmp(Src0Hi, ValueHi);
+        _br(CondX86::Br_e, Instr->getLabel(I));
+        Context.insert(Label);
+      }
+      _br(Instr->getLabelDefault());
+      return;
+    } else {
+      // All the values are 32-bit so just check the operand is too and then
+      // fall through to the 32-bit implementation. This is a common case.
+      Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
+      Constant *Zero = Ctx->getConstantInt32(0);
+      _cmp(Src0Hi, Zero);
+      _br(CondX86::Br_ne, DefaultTarget);
+      Src0 = Src0Lo;
+    }
+  }
+
+  // 32-bit lowering
+
+  if (CaseClusters.size() == 1) {
+    // Jump straight to default if needed. Currently a common case as jump
+    // tables occur on their own.
+    constexpr bool DoneCmp = false;
+    lowerCaseCluster(CaseClusters.front(), Src0, DoneCmp, DefaultTarget);
+    return;
+  }
+
+  // Going to be using multiple times so get it in a register early
+  Variable *Comparison = legalizeToReg(Src0);
+
+  // A span is over the clusters
+  struct SearchSpan {
+    SearchSpan(SizeT Begin, SizeT Size, InstX86Label *Label)
+        : Begin(Begin), Size(Size), Label(Label) {}
+
+    SizeT Begin;
+    SizeT Size;
+    InstX86Label *Label;
+  };
+  // The stack will only grow to the height of the tree so 12 should be plenty
+  std::stack<SearchSpan, llvm::SmallVector<SearchSpan, 12>> SearchSpanStack;
+  SearchSpanStack.emplace(0, CaseClusters.size(), nullptr);
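+  // Each iteration either lowers a small span or splits one at its pivot.
+  // E.g. with five clusters the first pivot is cluster 2: spans [0, 2) and
+  // [2, 5) are pushed, and [2, 5) (the fall-through side) is popped next.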
+  bool DoneCmp = false;
+
+  while (!SearchSpanStack.empty()) {
+    SearchSpan Span = SearchSpanStack.top();
+    SearchSpanStack.pop();
+
+    if (Span.Label != nullptr)
+      Context.insert(Span.Label);
+
+    switch (Span.Size) {
+    case 0:
+      llvm::report_fatal_error("Invalid SearchSpan size");
+      break;
+
+    case 1:
+      lowerCaseCluster(CaseClusters[Span.Begin], Comparison, DoneCmp,
+                       SearchSpanStack.empty() ? nullptr : DefaultTarget);
+      DoneCmp = false;
+      break;
+
+    case 2: {
+      const CaseCluster *CaseA = &CaseClusters[Span.Begin];
+      const CaseCluster *CaseB = &CaseClusters[Span.Begin + 1];
+
+      // Placing a range last may allow register clobbering during the range
+      // test. That means there is no need to clone the register. If it is a
+      // unit range the comparison may have already been done in the binary
+      // search (DoneCmp) and so it should be placed first. If this is a range
+      // of two items and the comparison with the low value has already been
+      // done, comparing with the other element is cheaper than a range test.
+      // If the low end of the range is zero then there is no subtraction and
+      // nothing to be gained.
+      if (!CaseA->isUnitRange() &&
+          !(CaseA->getLow() == 0 || (DoneCmp && CaseA->isPairRange()))) {
+        std::swap(CaseA, CaseB);
+        DoneCmp = false;
+      }
+
+      lowerCaseCluster(*CaseA, Comparison, DoneCmp);
+      DoneCmp = false;
+      lowerCaseCluster(*CaseB, Comparison, DoneCmp,
+                       SearchSpanStack.empty() ? nullptr : DefaultTarget);
+    } break;
+
+    default:
+      // Pick the middle item and branch b or ae
+      SizeT PivotIndex = Span.Begin + (Span.Size / 2);
+      const CaseCluster &Pivot = CaseClusters[PivotIndex];
+      Constant *Value = Ctx->getConstantInt32(Pivot.getLow());
+      InstX86Label *Label = InstX86Label::create(Func, this);
+      _cmp(Comparison, Value);
+      // TODO(ascull): does it always have to be far?
+      _br(CondX86::Br_b, Label, InstX86Br::Far);
+      // Lower the left and (pivot+right) sides, falling through to the right
+      SearchSpanStack.emplace(Span.Begin, Span.Size / 2, Label);
+      SearchSpanStack.emplace(PivotIndex, Span.Size - (Span.Size / 2), nullptr);
+      DoneCmp = true;
+      break;
+    }
+  }
+
+  _br(DefaultTarget);
+}
+
+/// The following pattern occurs often in lowered C and C++ code:
+///
+///   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
+///   %cmp.ext = sext <n x i1> %cmp to <n x ty>
+///
+/// We can eliminate the sext operation by copying the result of pcmpeqd,
+/// pcmpgtd, or cmpps (which produce sign-extended results) to the result of the
+/// sext operation.
+
+void TargetX8664::eliminateNextVectorSextInstruction(
+    Variable *SignExtendedResult) {
+  if (auto *NextCast =
+          llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
+    if (NextCast->getCastKind() == InstCast::Sext &&
+        NextCast->getSrc(0) == SignExtendedResult) {
+      NextCast->setDeleted();
+      _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult));
+      // Skip over the instruction.
+      Context.advanceNext();
+    }
+  }
+}
+
+void TargetX8664::lowerUnreachable(const InstUnreachable * /*Instr*/) {
+  _ud2();
+  // Add a fake use of esp to make sure esp adjustments after the unreachable
+  // do not get dead-code eliminated.
+  keepEspLiveAtExit();
+}
+
+void TargetX8664::lowerBreakpoint(const InstBreakpoint * /*Instr*/) { _int3(); }
+
+void TargetX8664::lowerRMW(const InstX86FakeRMW *RMW) {
+  // If the beacon variable's live range does not end in this instruction, then
+  // it must end in the modified Store instruction that follows. This means
+  // that the original Store instruction is still there, either because the
+  // value being stored is used beyond the Store instruction, or because dead
+  // code elimination did not happen. In either case, we cancel RMW lowering
+  // (and the caller deletes the RMW instruction).
+  if (!RMW->isLastUse(RMW->getBeacon()))
+    return;
+  Operand *Src = RMW->getData();
+  Type Ty = Src->getType();
+  X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
+  doMockBoundsCheck(Addr);
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
+    Src = legalizeUndef(Src);
+    Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm);
+    Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm);
+    auto *AddrLo = llvm::cast<X86OperandMem>(loOperand(Addr));
+    auto *AddrHi = llvm::cast<X86OperandMem>(hiOperand(Addr));
+    switch (RMW->getOp()) {
+    default:
+      // TODO(stichnot): Implement other arithmetic operators.
+      break;
+    case InstArithmetic::Add:
+      _add_rmw(AddrLo, SrcLo);
+      _adc_rmw(AddrHi, SrcHi);
+      return;
+    case InstArithmetic::Sub:
+      _sub_rmw(AddrLo, SrcLo);
+      _sbb_rmw(AddrHi, SrcHi);
+      return;
+    case InstArithmetic::And:
+      _and_rmw(AddrLo, SrcLo);
+      _and_rmw(AddrHi, SrcHi);
+      return;
+    case InstArithmetic::Or:
+      _or_rmw(AddrLo, SrcLo);
+      _or_rmw(AddrHi, SrcHi);
+      return;
+    case InstArithmetic::Xor:
+      _xor_rmw(AddrLo, SrcLo);
+      _xor_rmw(AddrHi, SrcHi);
+      return;
+    }
+  } else {
+    // x86-32: i8, i16, i32
+    // x86-64: i8, i16, i32, i64
+    switch (RMW->getOp()) {
+    default:
+      // TODO(stichnot): Implement other arithmetic operators.
+      break;
+    case InstArithmetic::Add:
+      Src = legalize(Src, Legal_Reg | Legal_Imm);
+      _add_rmw(Addr, Src);
+      return;
+    case InstArithmetic::Sub:
+      Src = legalize(Src, Legal_Reg | Legal_Imm);
+      _sub_rmw(Addr, Src);
+      return;
+    case InstArithmetic::And:
+      Src = legalize(Src, Legal_Reg | Legal_Imm);
+      _and_rmw(Addr, Src);
+      return;
+    case InstArithmetic::Or:
+      Src = legalize(Src, Legal_Reg | Legal_Imm);
+      _or_rmw(Addr, Src);
+      return;
+    case InstArithmetic::Xor:
+      Src = legalize(Src, Legal_Reg | Legal_Imm);
+      _xor_rmw(Addr, Src);
+      return;
+    }
+  }
+  llvm::report_fatal_error("Couldn't lower RMW instruction");
+}
+
+void TargetX8664::lowerOther(const Inst *Instr) {
+  if (const auto *RMW = llvm::dyn_cast<InstX86FakeRMW>(Instr)) {
+    lowerRMW(RMW);
+  } else {
+    TargetLowering::lowerOther(Instr);
+  }
+}
+
+/// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to preserve
+/// integrity of liveness analysis. Undef values are also turned into zeroes,
+/// since loOperand() and hiOperand() don't expect Undef input.
+void TargetX8664::prelowerPhis() {
+  if (Traits::Is64Bit) {
+    // On x86-64 we don't need to prelower phis -- the architecture can handle
+    // 64-bit integers natively.
+    return;
+  }
+
+  PhiLowering::prelowerPhis32Bit<TargetX8664>(this, Context.getNode(), Func);
+}
+
+void TargetX8664::genTargetHelperCallFor(Inst *Instr) {
+  uint32_t StackArgumentsSize = 0;
+  if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
+    RuntimeHelper HelperID = RuntimeHelper::H_Num;
+    Variable *Dest = Arith->getDest();
+    Type DestTy = Dest->getType();
+    if (!Traits::Is64Bit && DestTy == IceType_i64) {
+      switch (Arith->getOp()) {
+      default:
+        return;
+      case InstArithmetic::Udiv:
+        HelperID = RuntimeHelper::H_udiv_i64;
+        break;
+      case InstArithmetic::Sdiv:
+        HelperID = RuntimeHelper::H_sdiv_i64;
+        break;
+      case InstArithmetic::Urem:
+        HelperID = RuntimeHelper::H_urem_i64;
+        break;
+      case InstArithmetic::Srem:
+        HelperID = RuntimeHelper::H_srem_i64;
+        break;
+      }
+    } else if (isVectorType(DestTy)) {
+      Variable *Dest = Arith->getDest();
+      Operand *Src0 = Arith->getSrc(0);
+      Operand *Src1 = Arith->getSrc(1);
+      switch (Arith->getOp()) {
+      default:
+        return;
+      case InstArithmetic::Mul:
+        if (DestTy == IceType_v16i8) {
+          scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
+          Arith->setDeleted();
+        }
+        return;
+      case InstArithmetic::Shl:
+      case InstArithmetic::Lshr:
+      case InstArithmetic::Ashr:
+        if (llvm::isa<Constant>(Src1)) {
+          return;
+        }
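+        // Fall through: vector shifts by a non-constant amount are
+        // scalarized below, like the division and remainder cases.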
+      case InstArithmetic::Udiv:
+      case InstArithmetic::Urem:
+      case InstArithmetic::Sdiv:
+      case InstArithmetic::Srem:
+      case InstArithmetic::Frem:
+        scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
+        Arith->setDeleted();
+        return;
+      }
+    } else {
+      switch (Arith->getOp()) {
+      default:
+        return;
+      case InstArithmetic::Frem:
+        if (isFloat32Asserting32Or64(DestTy))
+          HelperID = RuntimeHelper::H_frem_f32;
+        else
+          HelperID = RuntimeHelper::H_frem_f64;
+      }
+    }
+    constexpr SizeT MaxSrcs = 2;
+    InstCall *Call = makeHelperCall(HelperID, Dest, MaxSrcs);
+    Call->addArg(Arith->getSrc(0));
+    Call->addArg(Arith->getSrc(1));
+    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
+    Context.insert(Call);
+    Arith->setDeleted();
+  } else if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
+    InstCast::OpKind CastKind = Cast->getCastKind();
+    Operand *Src0 = Cast->getSrc(0);
+    const Type SrcType = Src0->getType();
+    Variable *Dest = Cast->getDest();
+    const Type DestTy = Dest->getType();
+    RuntimeHelper HelperID = RuntimeHelper::H_Num;
+    Variable *CallDest = Dest;
+    switch (CastKind) {
+    default:
+      return;
+    case InstCast::Fptosi:
+      if (!Traits::Is64Bit && DestTy == IceType_i64) {
+        HelperID = isFloat32Asserting32Or64(SrcType)
+                       ? RuntimeHelper::H_fptosi_f32_i64
+                       : RuntimeHelper::H_fptosi_f64_i64;
+      } else {
+        return;
+      }
+      break;
+    case InstCast::Fptoui:
+      if (isVectorType(DestTy)) {
+        assert(DestTy == IceType_v4i32);
+        assert(SrcType == IceType_v4f32);
+        HelperID = RuntimeHelper::H_fptoui_4xi32_f32;
+      } else if (DestTy == IceType_i64 ||
+                 (!Traits::Is64Bit && DestTy == IceType_i32)) {
+        if (Traits::Is64Bit) {
+          HelperID = isFloat32Asserting32Or64(SrcType)
+                         ? RuntimeHelper::H_fptoui_f32_i64
+                         : RuntimeHelper::H_fptoui_f64_i64;
+        } else if (isInt32Asserting32Or64(DestTy)) {
+          HelperID = isFloat32Asserting32Or64(SrcType)
+                         ? RuntimeHelper::H_fptoui_f32_i32
+                         : RuntimeHelper::H_fptoui_f64_i32;
+        } else {
+          HelperID = isFloat32Asserting32Or64(SrcType)
+                         ? RuntimeHelper::H_fptoui_f32_i64
+                         : RuntimeHelper::H_fptoui_f64_i64;
+        }
+      } else {
+        return;
+      }
+      break;
+    case InstCast::Sitofp:
+      if (!Traits::Is64Bit && SrcType == IceType_i64) {
+        HelperID = isFloat32Asserting32Or64(DestTy)
+                       ? RuntimeHelper::H_sitofp_i64_f32
+                       : RuntimeHelper::H_sitofp_i64_f64;
+      } else {
+        return;
+      }
+      break;
+    case InstCast::Uitofp:
+      if (isVectorType(SrcType)) {
+        assert(DestTy == IceType_v4f32);
+        assert(SrcType == IceType_v4i32);
+        HelperID = RuntimeHelper::H_uitofp_4xi32_4xf32;
+      } else if (SrcType == IceType_i64 ||
+                 (!Traits::Is64Bit && SrcType == IceType_i32)) {
+        if (isInt32Asserting32Or64(SrcType)) {
+          HelperID = isFloat32Asserting32Or64(DestTy)
+                         ? RuntimeHelper::H_uitofp_i32_f32
+                         : RuntimeHelper::H_uitofp_i32_f64;
+        } else {
+          HelperID = isFloat32Asserting32Or64(DestTy)
+                         ? RuntimeHelper::H_uitofp_i64_f32
+                         : RuntimeHelper::H_uitofp_i64_f64;
+        }
+      } else {
+        return;
+      }
+      break;
+    case InstCast::Bitcast: {
+      if (DestTy == Src0->getType())
+        return;
+      switch (DestTy) {
+      default:
+        return;
+      case IceType_i8:
+        assert(Src0->getType() == IceType_v8i1);
+        HelperID = RuntimeHelper::H_bitcast_8xi1_i8;
+        CallDest = Func->makeVariable(IceType_i32);
+        break;
+      case IceType_i16:
+        assert(Src0->getType() == IceType_v16i1);
+        HelperID = RuntimeHelper::H_bitcast_16xi1_i16;
+        CallDest = Func->makeVariable(IceType_i32);
+        break;
+      case IceType_v8i1: {
+        assert(Src0->getType() == IceType_i8);
+        HelperID = RuntimeHelper::H_bitcast_i8_8xi1;
+        Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
+        // Arguments to functions are required to be at least 32 bits wide.
+        Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
+        Src0 = Src0AsI32;
+      } break;
+      case IceType_v16i1: {
+        assert(Src0->getType() == IceType_i16);
+        HelperID = RuntimeHelper::H_bitcast_i16_16xi1;
+        Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
+        // Arguments to functions are required to be at least 32 bits wide.
+        Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
+        Src0 = Src0AsI32;
+      } break;
+      }
+    } break;
+    }
+    constexpr SizeT MaxSrcs = 1;
+    InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs);
+    Call->addArg(Src0);
+    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
+    Context.insert(Call);
+    // The PNaCl ABI disallows i8/i16 return types, so truncate the helper call
+    // result to the appropriate type as necessary.
+    if (CallDest->getType() != Dest->getType())
+      Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest);
+    Cast->setDeleted();
+  } else if (auto *Intrinsic = llvm::dyn_cast<InstIntrinsic>(Instr)) {
+    CfgVector<Type> ArgTypes;
+    Type ReturnType = IceType_void;
+    switch (Intrinsic->getIntrinsicID()) {
+    default:
+      return;
+    case Intrinsics::Ctpop: {
+      Operand *Val = Intrinsic->getArg(0);
+      Type ValTy = Val->getType();
+      if (ValTy == IceType_i64)
+        ArgTypes = {IceType_i64};
+      else
+        ArgTypes = {IceType_i32};
+      ReturnType = IceType_i32;
+    } break;
+    case Intrinsics::Longjmp:
+      ArgTypes = {IceType_i32, IceType_i32};
+      ReturnType = IceType_void;
+      break;
+    case Intrinsics::Memcpy:
+      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
+      ReturnType = IceType_void;
+      break;
+    case Intrinsics::Memmove:
+      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
+      ReturnType = IceType_void;
+      break;
+    case Intrinsics::Memset:
+      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
+      ReturnType = IceType_void;
+      break;
+    case Intrinsics::Setjmp:
+      ArgTypes = {IceType_i32};
+      ReturnType = IceType_i32;
+      break;
+    }
+    StackArgumentsSize = getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
+  } else if (auto *Call = llvm::dyn_cast<InstCall>(Instr)) {
+    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
+  } else if (auto *Ret = llvm::dyn_cast<InstRet>(Instr)) {
+    if (!Ret->hasRetValue())
+      return;
+    Operand *RetValue = Ret->getRetValue();
+    Type ReturnType = RetValue->getType();
+    if (!isScalarFloatingType(ReturnType))
+      return;
+    StackArgumentsSize = typeWidthInBytes(ReturnType);
+  } else {
+    return;
+  }
+  StackArgumentsSize = Traits::applyStackAlignment(StackArgumentsSize);
+  updateMaxOutArgsSizeBytes(StackArgumentsSize);
+}
+
+uint32_t
+TargetX8664::getCallStackArgumentsSizeBytes(const CfgVector<Type> &ArgTypes,
+                                            Type ReturnType) {
+  uint32_t OutArgumentsSizeBytes = 0;
+  uint32_t XmmArgCount = 0;
+  uint32_t GprArgCount = 0;
+  for (SizeT i = 0, NumArgTypes = ArgTypes.size(); i < NumArgTypes; ++i) {
+    Type Ty = ArgTypes[i];
+    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
+    assert(typeWidthInBytes(Ty) >= 4);
+    if (isVectorType(Ty) &&
+        Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgCount))
+            .hasValue()) {
+      ++XmmArgCount;
+    } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
+               Traits::getRegisterForXmmArgNum(
+                   Traits::getArgIndex(i, XmmArgCount))
+                   .hasValue()) {
+      ++XmmArgCount;
+    } else if (isScalarIntegerType(Ty) &&
+               Traits::getRegisterForGprArgNum(
+                   Ty, Traits::getArgIndex(i, GprArgCount))
+                   .hasValue()) {
+      // The 64-bit ABI allows some integers to be passed in GPRs.
+      ++GprArgCount;
+    } else {
+      if (isVectorType(Ty)) {
+        OutArgumentsSizeBytes =
+            Traits::applyStackAlignment(OutArgumentsSizeBytes);
+      }
+      OutArgumentsSizeBytes += typeWidthInBytesOnStack(Ty);
+    }
+  }
+  if (Traits::Is64Bit)
+    return OutArgumentsSizeBytes;
+  // The 32-bit ABI requires floating-point values to be returned on the x87 FP
+  // stack. Ensure there is enough space for the fstp/mov sequence used to copy
+  // the return value out.
+  if (isScalarFloatingType(ReturnType)) {
+    OutArgumentsSizeBytes =
+        std::max(OutArgumentsSizeBytes,
+                 static_cast<uint32_t>(typeWidthInBytesOnStack(ReturnType)));
+  }
+  return OutArgumentsSizeBytes;
+}
+
+uint32_t TargetX8664::getCallStackArgumentsSizeBytes(const InstCall *Instr) {
+  // Build a vector of the arguments' types.
+  const SizeT NumArgs = Instr->getNumArgs();
+  CfgVector<Type> ArgTypes;
+  ArgTypes.reserve(NumArgs);
+  for (SizeT i = 0; i < NumArgs; ++i) {
+    Operand *Arg = Instr->getArg(i);
+    ArgTypes.emplace_back(Arg->getType());
+  }
+  // Compute the return type (if any).
+  Type ReturnType = IceType_void;
+  Variable *Dest = Instr->getDest();
+  if (Dest != nullptr)
+    ReturnType = Dest->getType();
+  return getShadowStoreSize() +
+         getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
+}
+
+Variable *TargetX8664::makeZeroedRegister(Type Ty, RegNumT RegNum) {
+  Variable *Reg = makeReg(Ty, RegNum);
+  switch (Ty) {
+  case IceType_i1:
+  case IceType_i8:
+  case IceType_i16:
+  case IceType_i32:
+  case IceType_i64:
+    // Conservatively do "mov reg, 0" to avoid modifying FLAGS.
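+    // (The shorter "xor reg, reg" idiom would clobber FLAGS.)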
+    _mov(Reg, Ctx->getConstantZero(Ty));
+    break;
+  case IceType_f32:
+  case IceType_f64:
+    Context.insert<InstFakeDef>(Reg);
+    _xorps(Reg, Reg);
+    break;
+  default:
+    // All vector types use the same pxor instruction.
+    assert(isVectorType(Ty));
+    Context.insert<InstFakeDef>(Reg);
+    _pxor(Reg, Reg);
+    break;
+  }
+  return Reg;
+}
+
+// There is no support for loading or emitting vector constants, so the vector
+// values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are
+// initialized with register operations.
+//
+// TODO(wala): Add limited support for vector constants so that complex
+// initialization in registers is unnecessary.
+
+Variable *TargetX8664::makeVectorOfZeros(Type Ty, RegNumT RegNum) {
+  return makeZeroedRegister(Ty, RegNum);
+}
+
+Variable *TargetX8664::makeVectorOfMinusOnes(Type Ty, RegNumT RegNum) {
+  Variable *MinusOnes = makeReg(Ty, RegNum);
+  // Insert a FakeDef so the live range of MinusOnes is not overestimated.
+  Context.insert<InstFakeDef>(MinusOnes);
+  if (Ty == IceType_f64)
+    // Making a vector of minus ones of type f64 is currently only used for the
+    // fabs intrinsic.  To use the f64 type to create this mask with pcmpeqq
+    // requires SSE 4.1.  Since we're just creating a mask, pcmpeqd does the
+    // same job and only requires SSE2.
+    _pcmpeq(MinusOnes, MinusOnes, IceType_f32);
+  else
+    _pcmpeq(MinusOnes, MinusOnes);
+  return MinusOnes;
+}
+
+Variable *TargetX8664::makeVectorOfOnes(Type Ty, RegNumT RegNum) {
+  Variable *Dest = makeVectorOfZeros(Ty, RegNum);
+  Variable *MinusOne = makeVectorOfMinusOnes(Ty);
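+  // 0 - (-1) == 1 in every lane, so subtracting all-ones from zero produces a
+  // vector of ones without a constant-pool load.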
+  _psub(Dest, MinusOne);
+  return Dest;
+}
+
+Variable *TargetX8664::makeVectorOfHighOrderBits(Type Ty, RegNumT RegNum) {
+  assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
+         Ty == IceType_v16i8);
+  if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
+    Variable *Reg = makeVectorOfOnes(Ty, RegNum);
+    SizeT Shift =
+        typeWidthInBytes(typeElementType(Ty)) * Traits::X86_CHAR_BIT - 1;
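+    // Shift is the element width in bits minus one (e.g. 31 for v4i32), so
+    // only each lane's sign bit survives the shift.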
+    _psll(Reg, Ctx->getConstantInt8(Shift));
+    return Reg;
+  } else {
+    // SSE has no left shift operation for vectors of 8 bit integers.
+    constexpr uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
+    Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
+    Variable *Reg = makeReg(Ty, RegNum);
+    _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
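+    // pshufd with control 0 broadcasts the low doubleword, replicating the
+    // mask across all four lanes.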
+    _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
+    return Reg;
+  }
+}
+
+/// Construct a mask in a register that can be and'ed with a floating-point
+/// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32
+/// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as vector of
+/// ones logically right shifted one bit.
+// TODO(stichnot): Fix the wala TODO above, to represent vector constants in
+// memory.
+
+Variable *TargetX8664::makeVectorOfFabsMask(Type Ty, RegNumT RegNum) {
+  Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
+  _psrl(Reg, Ctx->getConstantInt8(1));
+  return Reg;
+}
+
+TargetX8664::X86OperandMem *
+TargetX8664::getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
+                                          uint32_t Offset) {
+  // Ensure that Loc is a stack slot.
+  assert(Slot->mustNotHaveReg());
+  assert(Slot->getRegNum().hasNoValue());
+  // Compute the location of Loc in memory.
+  // TODO(wala,stichnot): lea should not be required. The address of the stack
+  // slot is known at compile time (although not until after addProlog()).
+  const Type PointerType = getPointerType();
+  Variable *Loc = makeReg(PointerType);
+  _lea(Loc, Slot);
+  Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
+  return X86OperandMem::create(Func, Ty, Loc, ConstantOffset);
+}
+
+/// Lowering helper to copy a scalar integer source operand into some 8-bit GPR.
+/// Src is assumed to already be legalized.  If the source operand is known to
+/// be a memory or immediate operand, a simple mov will suffice.  But if the
+/// source operand can be a physical register, then it must first be copied into
+/// a physical register that is truncatable to 8-bit, then truncated into a
+/// physical register that can receive a truncation, and finally copied into the
+/// result 8-bit register (which in general can be any 8-bit register).  For
+/// example, moving %ebp into %ah may be accomplished as:
+///   movl %ebp, %edx
+///   mov_trunc %edx, %dl  // this redundant assignment is ultimately elided
+///   movb %dl, %ah
+/// On the other hand, moving a memory or immediate operand into ah:
+///   movb 4(%ebp), %ah
+///   movb $my_imm, %ah
+///
+/// Note #1.  On a 64-bit target, the "movb 4(%ebp), %ah" is likely not
+/// encodable, so RegNum=Reg_ah should NOT be given as an argument.  Instead,
+/// use RegNum=RegNumT() and then let the caller do a separate copy into
+/// Reg_ah.
+///
+/// Note #2.  ConstantRelocatable operands are also put through this process
+/// (not truncated directly) because our ELF emitter does R_386_32 relocations
+/// but not R_386_8 relocations.
+///
+/// Note #3.  If Src is a Variable, the result will be an infinite-weight i8
+/// Variable with the RCX86_IsTrunc8Rcvr register class.  As such, this helper
+/// is a convenient way to prevent ah/bh/ch/dh from being an (invalid) argument
+/// to the pinsrb instruction.
+
+Variable *TargetX8664::copyToReg8(Operand *Src, RegNumT RegNum) {
+  Type Ty = Src->getType();
+  assert(isScalarIntegerType(Ty));
+  assert(Ty != IceType_i1);
+  Variable *Reg = makeReg(IceType_i8, RegNum);
+  Reg->setRegClass(RCX86_IsTrunc8Rcvr);
+  if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) {
+    Variable *SrcTruncable = makeReg(Ty);
+    switch (Ty) {
+    case IceType_i64:
+      SrcTruncable->setRegClass(RCX86_Is64To8);
+      break;
+    case IceType_i32:
+      SrcTruncable->setRegClass(RCX86_Is32To8);
+      break;
+    case IceType_i16:
+      SrcTruncable->setRegClass(RCX86_Is16To8);
+      break;
+    default:
+      // i8 - just use default register class
+      break;
+    }
+    Variable *SrcRcvr = makeReg(IceType_i8);
+    SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr);
+    _mov(SrcTruncable, Src);
+    _mov(SrcRcvr, SrcTruncable);
+    Src = SrcRcvr;
+  }
+  _mov(Reg, Src);
+  return Reg;
+}
+
+/// Helper for legalize() to emit the right code to lower an operand to a
+/// register of the appropriate type.
+
+Variable *TargetX8664::copyToReg(Operand *Src, RegNumT RegNum) {
+  Type Ty = Src->getType();
+  Variable *Reg = makeReg(Ty, RegNum);
+  if (isVectorType(Ty)) {
+    _movp(Reg, Src);
+  } else {
+    _mov(Reg, Src);
+  }
+  return Reg;
+}
+
+Operand *TargetX8664::legalize(Operand *From, LegalMask Allowed,
+                               RegNumT RegNum) {
+  const Type Ty = From->getType();
+  // Assert that a physical register is allowed. To date, all calls to
+  // legalize() allow a physical register. If a physical register needs to be
+  // explicitly disallowed, then new code will need to be written to force a
+  // spill.
+  assert(Allowed & Legal_Reg);
+  // If we're asking for a specific physical register, make sure we're not
+  // allowing any other operand kinds. (This could be future work, e.g. allow
+  // the shl shift amount to be either an immediate or in ecx.)
+  assert(RegNum.hasNoValue() || Allowed == Legal_Reg);
+
+  // Substitute with an available infinite-weight variable if possible.  Only do
+  // this when we are not asking for a specific register, and when the
+  // substitution is not locked to a specific register, and when the types
+  // match, in order to capture the vast majority of opportunities and avoid
+  // corner cases in the lowering.
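+  // Illustrative case: if the block contains "A = B" followed by a use of A,
+  // then legalize(A) may return B here, provided B is guaranteed a register
+  // and the types match.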
+  if (RegNum.hasNoValue()) {
+    if (Variable *Subst = getContext().availabilityGet(From)) {
+      // At this point we know there is a potential substitution available.
+      if (Subst->mustHaveReg() && !Subst->hasReg()) {
+        // At this point we know the substitution will have a register.
+        if (From->getType() == Subst->getType()) {
+          // At this point we know the substitution's register is compatible.
+          return Subst;
+        }
+      }
+    }
+  }
+
+  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(From)) {
+    // Before doing anything with a Mem operand, we need to ensure that the
+    // Base and Index components are in physical registers.
+    Variable *Base = Mem->getBase();
+    Variable *Index = Mem->getIndex();
+    Constant *Offset = Mem->getOffset();
+    Variable *RegBase = nullptr;
+    Variable *RegIndex = nullptr;
+    uint16_t Shift = Mem->getShift();
+    if (Base) {
+      RegBase = llvm::cast<Variable>(
+          legalize(Base, Legal_Reg | Legal_Rematerializable));
+    }
+    if (Index) {
+      // TODO(jpp): perhaps we should only allow Legal_Reg if
+      // Base->isRematerializable.
+      RegIndex = llvm::cast<Variable>(
+          legalize(Index, Legal_Reg | Legal_Rematerializable));
+    }
+
+    if (Base != RegBase || Index != RegIndex) {
+      Mem = X86OperandMem::create(Func, Ty, RegBase, Offset, RegIndex, Shift,
+                                  Mem->getSegmentRegister());
+    }
+
+    From = Mem;
+
+    if (!(Allowed & Legal_Mem)) {
+      From = copyToReg(From, RegNum);
+    }
+    return From;
+  }
+
+  if (auto *Const = llvm::dyn_cast<Constant>(From)) {
+    if (llvm::isa<ConstantUndef>(Const)) {
+      From = legalizeUndef(Const, RegNum);
+      if (isVectorType(Ty))
+        return From;
+      Const = llvm::cast<Constant>(From);
+    }
+    // There should be no constants of vector type (other than undef).
+    assert(!isVectorType(Ty));
+
+    // If the operand is a 64-bit constant integer, we need to legalize it to
+    // a register in x86-64.
+    if (Traits::Is64Bit) {
+      if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Const)) {
+        if (!Utils::IsInt(32, C64->getValue())) {
+          if (RegNum.hasValue()) {
+            assert(Traits::getGprForType(IceType_i64, RegNum) == RegNum);
+          }
+          return copyToReg(Const, RegNum);
+        }
+      }
+    }
+
+    if (!llvm::isa<ConstantRelocatable>(Const)) {
+      if (isScalarFloatingType(Ty)) {
+        // Convert a scalar floating point constant into an explicit memory
+        // operand.
+        if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(Const)) {
+          if (Utils::isPositiveZero(ConstFloat->getValue()))
+            return makeZeroedRegister(Ty, RegNum);
+        } else if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(Const)) {
+          if (Utils::isPositiveZero(ConstDouble->getValue()))
+            return makeZeroedRegister(Ty, RegNum);
+        }
+
+        auto *CFrom = llvm::cast<Constant>(From);
+        assert(CFrom->getShouldBePooled());
+        Constant *Offset = Ctx->getConstantSym(0, CFrom->getLabelName());
+        auto *Mem = X86OperandMem::create(Func, Ty, nullptr, Offset);
+        From = Mem;
+      }
+    }
+
+    bool NeedsReg = false;
+    if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
+      // Immediate specifically not allowed.
+      NeedsReg = true;
+    if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
+      // On x86, FP constants are lowered to mem operands.
+      NeedsReg = true;
+    if (NeedsReg) {
+      From = copyToReg(From, RegNum);
+    }
+    return From;
+  }
+
+  if (auto *Var = llvm::dyn_cast<Variable>(From)) {
+    // Check if the variable is guaranteed a physical register. This can happen
+    // either when the variable is pre-colored or when it is assigned infinite
+    // weight.
+    bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
+    bool MustRematerialize =
+        (Var->isRematerializable() && !(Allowed & Legal_Rematerializable));
+    // We need a new physical register for the operand if:
+    // - Mem is not allowed and Var isn't guaranteed a physical register, or
+    // - RegNum is required and Var->getRegNum() doesn't match, or
+    // - Var is a rematerializable variable and rematerializable pass-through is
+    //   not allowed (in which case we need a lea instruction).
+    if (MustRematerialize) {
+      Variable *NewVar = makeReg(Ty, RegNum);
+      // Since Var is rematerializable, the offset will be added when the lea is
+      // emitted.
+      constexpr Constant *NoOffset = nullptr;
+      auto *Mem = X86OperandMem::create(Func, Ty, Var, NoOffset);
+      _lea(NewVar, Mem);
+      From = NewVar;
+    } else if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
+               (RegNum.hasValue() && RegNum != Var->getRegNum())) {
+      From = copyToReg(From, RegNum);
+    }
+    return From;
+  }
+
+  llvm::report_fatal_error("Unhandled operand kind in legalize()");
+  return From;
+}
+
+/// Provide a trivial wrapper to legalize() for this common usage.
+
+Variable *TargetX8664::legalizeToReg(Operand *From, RegNumT RegNum) {
+  return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
+}
+
+/// Legalize undef values to concrete values.
+
+Operand *TargetX8664::legalizeUndef(Operand *From, RegNumT RegNum) {
+  Type Ty = From->getType();
+  if (llvm::isa<ConstantUndef>(From)) {
+    // Lower undefs to zero.  Another option is to lower undefs to an
+    // uninitialized register; however, using an uninitialized register results
+    // in less predictable code.
+    //
+    // If in the future the implementation is changed to lower undef values to
+    // uninitialized registers, a FakeDef will be needed:
+    //     Context.insert<InstFakeDef>(Reg);
+    // This is in order to ensure that the live range of Reg is not
+    // overestimated.  If the constant being lowered is a 64-bit value, then
+    // the result should be split and the lo and hi components will need to go
+    // in uninitialized registers.
+    if (isVectorType(Ty))
+      return makeVectorOfZeros(Ty, RegNum);
+    return Ctx->getConstantZero(Ty);
+  }
+  return From;
+}
+
+/// For the cmp instruction, if Src1 is an immediate, or known to be a physical
+/// register, we can allow Src0 to be a memory operand. Otherwise, Src0 must be
+/// copied into a physical register. (Actually, either Src0 or Src1 can be
+/// chosen for the physical register, but unfortunately we have to commit to one
+/// or the other before register allocation.)
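+/// For example, "cmpl $4, (%rax)" is encodable but "cmpl (%rbx), (%rax)" is
+/// not, since cmp accepts at most one memory operand.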
+
+Operand *TargetX8664::legalizeSrc0ForCmp(Operand *Src0, Operand *Src1) {
+  bool IsSrc1ImmOrReg = false;
+  if (llvm::isa<Constant>(Src1)) {
+    IsSrc1ImmOrReg = true;
+  } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) {
+    if (Var->hasReg())
+      IsSrc1ImmOrReg = true;
+  }
+  return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
+}
+
+typename TargetX8664::X86OperandMem *
+TargetX8664::formMemoryOperand(Operand *Opnd, Type Ty, bool DoLegalize) {
+  auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd);
+  // Address mode optimization may already have created an X86OperandMem, in
+  // which case another level of transformation isn't needed.
+  if (!Mem) {
+    auto *Base = llvm::dyn_cast<Variable>(Opnd);
+    auto *Offset = llvm::dyn_cast<Constant>(Opnd);
+    assert(Base || Offset);
+    if (Offset) {
+      if (!llvm::isa<ConstantRelocatable>(Offset)) {
+        if (llvm::isa<ConstantInteger64>(Offset)) {
+          // Memory operands cannot have 64-bit immediates, so they must be
+          // legalized into a register only.
+          Base = llvm::cast<Variable>(legalize(Offset, Legal_Reg));
+          Offset = nullptr;
+        } else {
+          Offset = llvm::cast<Constant>(legalize(Offset));
+
+          assert(llvm::isa<ConstantInteger32>(Offset) ||
+                 llvm::isa<ConstantRelocatable>(Offset));
+        }
+      }
+    }
+    Mem = X86OperandMem::create(Func, Ty, Base, Offset);
+  }
+  return llvm::cast<X86OperandMem>(DoLegalize ? legalize(Mem) : Mem);
+}
+
+Variable *TargetX8664::makeReg(Type Type, RegNumT RegNum) {
+  // There aren't any 64-bit integer registers for x86-32.
+  assert(Traits::Is64Bit || Type != IceType_i64);
+  Variable *Reg = Func->makeVariable(Type);
+  if (RegNum.hasValue())
+    Reg->setRegNum(RegNum);
+  else
+    Reg->setMustHaveReg();
+  return Reg;
+}
+
+const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32, IceType_f64,
+                            IceType_v16i8};
+
+Type TargetX8664::largestTypeInSize(uint32_t Size, uint32_t MaxSize) {
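+  // e.g. largestTypeInSize(6): findLastSet(6) == 2, so the result is
+  // TypeForSize[2] == IceType_i32, the largest type no bigger than 6 bytes.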
+  assert(Size != 0);
+  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
+  uint32_t MaxIndex = MaxSize == NoSizeLimit
+                          ? llvm::array_lengthof(TypeForSize) - 1
+                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
+  return TypeForSize[std::min(TyIndex, MaxIndex)];
+}
+
+Type TargetX8664::firstTypeThatFitsSize(uint32_t Size, uint32_t MaxSize) {
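+  // e.g. firstTypeThatFitsSize(6): findLastSet(6) == 2 and 6 is not a power
+  // of two, so TyIndex becomes 3 and the result is IceType_f64 (8 bytes),
+  // the smallest type that can hold 6 bytes.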
+  assert(Size != 0);
+  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
+  if (!llvm::isPowerOf2_32(Size))
+    ++TyIndex;
+  uint32_t MaxIndex = MaxSize == NoSizeLimit
+                          ? llvm::array_lengthof(TypeForSize) - 1
+                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
+  return TypeForSize[std::min(TyIndex, MaxIndex)];
+}
+
+void TargetX8664::postLower() {
+  if (Func->getOptLevel() == Opt_m1)
+    return;
+  markRedefinitions();
+  Context.availabilityUpdate();
+}
+
+void TargetX8664::emit(const ConstantInteger32 *C) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  Str << "$" << C->getValue();
+}
+
+void TargetX8664::emit(const ConstantInteger64 *C) const {
+  if (!Traits::Is64Bit) {
+    llvm::report_fatal_error("Not expecting to emit 64-bit integers");
+  } else {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Ctx->getStrEmit();
+    Str << "$" << C->getValue();
+  }
+}
+
+void TargetX8664::emit(const ConstantFloat *C) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  Str << C->getLabelName();
+}
+
+void TargetX8664::emit(const ConstantDouble *C) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  Str << C->getLabelName();
+}
+
+void TargetX8664::emit(const ConstantUndef *) const {
+  llvm::report_fatal_error("undef value encountered by emitter.");
+}
+
+void TargetX8664::emit(const ConstantRelocatable *C) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  Str << "$";
+  emitWithoutPrefix(C);
+}
+
+void TargetX8664::emitJumpTable(const Cfg *,
+                                const InstJumpTable *JumpTable) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  Str << "\t.section\t.rodata." << JumpTable->getSectionName()
+      << ",\"a\",@progbits\n"
+         "\t.align\t"
+      << typeWidthInBytes(getPointerType()) << "\n"
+      << JumpTable->getName() << ":";
+
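+  // The emitted table looks roughly like this (names illustrative):
+  //   .section .rodata.<fn>$jumptable,"a",@progbits
+  //   .align 4
+  //   <fn>$jumptable:
+  //   .long <target0>
+  //   .long <target1>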
+  // On X8664 ILP32, pointers are 32-bit, hence the use of .long.
+  for (SizeT I = 0; I < JumpTable->getNumTargets(); ++I)
+    Str << "\n\t.long\t" << JumpTable->getTarget(I)->getAsmName();
+  Str << "\n";
+}
+
+template <typename T>
+void TargetDataX8664::emitConstantPool(GlobalContext *Ctx) {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  Type Ty = T::Ty;
+  SizeT Align = typeAlignInBytes(Ty);
+  ConstantList Pool = Ctx->getConstantPool(Ty);
+
+  Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
+      << "\n";
+  Str << "\t.align\t" << Align << "\n";
+
+  for (Constant *C : Pool) {
+    if (!C->getShouldBePooled())
+      continue;
+    auto *Const = llvm::cast<typename T::IceType>(C);
+    typename T::IceType::PrimType Value = Const->getValue();
+    // Use memcpy() to copy bits from Value into RawValue in a way that avoids
+    // breaking strict-aliasing rules.
+    typename T::PrimitiveIntType RawValue;
+    memcpy(&RawValue, &Value, sizeof(Value));
+    char buf[30];
+    int CharsPrinted =
+        snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
+    assert(CharsPrinted >= 0);
+    assert((size_t)CharsPrinted < llvm::array_lengthof(buf));
+    (void)CharsPrinted; // avoid warnings if asserts are disabled
+    Str << Const->getLabelName();
+    Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t/* " << T::TypeName << " "
+        << Value << " */\n";
+  }
+}
+
+void TargetDataX8664::lowerConstants() {
+  if (getFlags().getDisableTranslation())
+    return;
+  switch (getFlags().getOutFileType()) {
+  case FT_Elf: {
+    ELFObjectWriter *Writer = Ctx->getObjectWriter();
+
+    Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
+    Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
+    Writer->writeConstantPool<ConstantInteger32>(IceType_i32);
+
+    Writer->writeConstantPool<ConstantFloat>(IceType_f32);
+    Writer->writeConstantPool<ConstantDouble>(IceType_f64);
+  } break;
+  case FT_Asm:
+  case FT_Iasm: {
+    OstreamLocker L(Ctx);
+
+    emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
+    emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
+    emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);
+
+    emitConstantPool<PoolTypeConverter<float>>(Ctx);
+    emitConstantPool<PoolTypeConverter<double>>(Ctx);
+  } break;
+  }
+}
+
+void TargetDataX8664::lowerJumpTables() {
+  const bool IsPIC = false;
+  switch (getFlags().getOutFileType()) {
+  case FT_Elf: {
+    ELFObjectWriter *Writer = Ctx->getObjectWriter();
+    constexpr FixupKind FK_Abs64 = llvm::ELF::R_X86_64_64;
+    const FixupKind RelocationKind =
+        (getPointerType() == IceType_i32) ? Traits::FK_Abs : FK_Abs64;
+    for (const JumpTableData &JT : Ctx->getJumpTables())
+      Writer->writeJumpTable(JT, RelocationKind, IsPIC);
+  } break;
+  case FT_Asm:
+    // Already emitted from Cfg
+    break;
+  case FT_Iasm: {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Ctx->getStrEmit();
+    const char *Prefix = IsPIC ? ".data.rel.ro." : ".rodata.";
+    for (const JumpTableData &JT : Ctx->getJumpTables()) {
+      Str << "\t.section\t" << Prefix << JT.getSectionName()
+          << ",\"a\",@progbits\n"
+             "\t.align\t"
+          << typeWidthInBytes(getPointerType()) << "\n"
+          << JT.getName().toString() << ":";
+
+      // On X8664 ILP32, pointers are 32-bit, hence the use of .long.
+      for (intptr_t TargetOffset : JT.getTargetOffsets())
+        Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset;
+      Str << "\n";
+    }
+  } break;
+  }
+}
+
+void TargetDataX8664::lowerGlobals(const VariableDeclarationList &Vars,
+                                   const std::string &SectionSuffix) {
+  const bool IsPIC = false;
+  switch (getFlags().getOutFileType()) {
+  case FT_Elf: {
+    ELFObjectWriter *Writer = Ctx->getObjectWriter();
+    Writer->writeDataSection(Vars, Traits::FK_Abs, SectionSuffix, IsPIC);
+  } break;
+  case FT_Asm:
+  case FT_Iasm: {
+    OstreamLocker L(Ctx);
+    for (const VariableDeclaration *Var : Vars) {
+      if (getFlags().matchTranslateOnly(Var->getName(), 0)) {
+        emitGlobal(*Var, SectionSuffix);
+      }
+    }
+  } break;
+  }
+}
+
 //------------------------------------------------------------------------------
 //      ______   ______     ______     __     ______   ______
 //     /\__  _\ /\  == \   /\  __ \   /\ \   /\__  _\ /\  ___\
@@ -104,18 +8088,13 @@
 const uint32_t TargetX8664Traits::X86_STACK_ALIGNMENT_BYTES = 16;
 const char *TargetX8664Traits::TargetName = "X8664";
 
-template <>
-std::array<SmallBitVector, RCX86_NUM>
-    TargetX86Base<X8664::Traits>::TypeToRegisterSet = {{}};
+std::array<SmallBitVector, RCX86_NUM> TargetX8664::TypeToRegisterSet = {{}};
 
-template <>
-std::array<SmallBitVector, RCX86_NUM>
-    TargetX86Base<X8664::Traits>::TypeToRegisterSetUnfiltered = {{}};
+std::array<SmallBitVector, RCX86_NUM> TargetX8664::TypeToRegisterSetUnfiltered =
+    {{}};
 
-template <>
-std::array<SmallBitVector,
-           TargetX86Base<X8664::Traits>::Traits::RegisterSet::Reg_NUM>
-    TargetX86Base<X8664::Traits>::RegisterAliases = {{}};
+std::array<SmallBitVector, TargetX8664::Traits::RegisterSet::Reg_NUM>
+    TargetX8664::RegisterAliases = {{}};
 
 //------------------------------------------------------------------------------
 //     __      ______  __     __  ______  ______  __  __   __  ______
@@ -132,15 +8111,9 @@
 }
 
 void TargetX8664::_mov_sp(Operand *NewValue) {
-  assert(NewValue->getType() == IceType_i32);
-
-  Variable *esp = getPhysicalRegister(Traits::RegisterSet::Reg_esp);
   Variable *rsp =
       getPhysicalRegister(Traits::RegisterSet::Reg_rsp, IceType_i64);
-
-  _redefined(Context.insert<InstFakeDef>(esp, rsp));
-  _redefined(_mov(esp, NewValue));
-  _redefined(Context.insert<InstFakeDef>(rsp, esp));
+  _redefined(_mov(rsp, NewValue));
 }
 
 void TargetX8664::_link_bp() {
@@ -151,7 +8124,6 @@
 
   _push(rbp);
   _mov(rbp, rsp);
-
   // Keep ebp live for late-stage liveness analysis (e.g. asm-verbose mode).
   Context.insert<InstFakeUse>(rbp);
 }
@@ -161,11 +8133,11 @@
       getPhysicalRegister(Traits::RegisterSet::Reg_rsp, IceType_i64);
   Variable *rbp =
       getPhysicalRegister(Traits::RegisterSet::Reg_rbp, IceType_i64);
-
   // For late-stage liveness analysis (e.g. asm-verbose mode), adding a fake
   // use of rsp before the assignment of rsp=rbp keeps previous rsp
   // adjustments from being dead-code eliminated.
   Context.insert<InstFakeUse>(rsp);
+
   _mov(rsp, rbp);
   _pop(rbp);
 }
@@ -225,8 +8197,6 @@
 
 Inst *TargetX8664::emitCallToTarget(Operand *CallTarget, Variable *ReturnReg,
                                     size_t NumVariadicFpArgs) {
-  Inst *NewCall = nullptr;
-
   if (CallTarget->getType() == IceType_i64) {
     // x86-64 does not support 64-bit direct calls, so write the value to a
     // register and make an indirect call for Constant call targets.
@@ -235,7 +8205,7 @@
     // System V: force r11 when calling a variadic function so that rax isn't
     // used, since rax stores the number of FP args (see NumVariadicFpArgs
     // usage below).
-#if !defined(_WIN64)
+#if !defined(SUBZERO_USE_MICROSOFT_ABI)
     if (NumVariadicFpArgs > 0)
       TargetReg = Traits::RegisterSet::Reg_r11;
 #endif
@@ -251,7 +8221,7 @@
   }
 
   // System V: store number of FP args in RAX for variadic calls
-#if !defined(_WIN64)
+#if !defined(SUBZERO_USE_MICROSOFT_ABI)
   if (NumVariadicFpArgs > 0) {
     // Store number of FP args (stored in XMM registers) in RAX for variadic
     // calls
@@ -262,9 +8232,7 @@
   }
 #endif
 
-  NewCall = Context.insert<Traits::Insts::Call>(ReturnReg, CallTarget);
-
-  return NewCall;
+  return Context.insert<Insts::Call>(ReturnReg, CallTarget);
 }
 
 Variable *TargetX8664::moveReturnValueToRegister(Operand *Value,
@@ -402,6 +8370,7 @@
                 "Inconsistency between ICETYPEX86_TABLE and ICETYPE_TABLE");
 ICETYPE_TABLE
 #undef X
+
 } // end of namespace dummy3
 } // end of anonymous namespace
 
diff --git a/third_party/subzero/src/IceTargetLoweringX8664.h b/third_party/subzero/src/IceTargetLoweringX8664.h
index bac931b..727c067 100644
--- a/third_party/subzero/src/IceTargetLoweringX8664.h
+++ b/third_party/subzero/src/IceTargetLoweringX8664.h
@@ -18,23 +18,974 @@
 
 #include "IceAssemblerX8664.h"
 #include "IceCfg.h"
+#include "IceDefs.h"
 #include "IceGlobalContext.h"
+#include "IceInst.h"
 #include "IceInstX8664.h"
-#include "IceTargetLowering.h"
-#include "IceTargetLoweringX8664Base.h"
+#include "IceSwitchLowering.h"
+#include "IceTargetLoweringX86.h"
 #include "IceTargetLoweringX8664Traits.h"
+#include "IceTargetLoweringX86RegClass.h"
+#include "IceUtils.h"
+
+#include <array>
+#include <type_traits>
+#include <utility>
 
 namespace Ice {
 namespace X8664 {
 
-class TargetX8664 final : public X8664::TargetX86Base<X8664::Traits> {
+using namespace ::Ice::X86;
+
+class BoolFoldingEntry {
+  BoolFoldingEntry(const BoolFoldingEntry &) = delete;
+
+public:
+  BoolFoldingEntry() = default;
+  explicit BoolFoldingEntry(Inst *I);
+  BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;
+  /// Instr is the instruction producing the i1-type variable of interest.
+  Inst *Instr = nullptr;
+  /// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr).
+  bool IsComplex = false;
+  /// IsLiveOut is initialized conservatively to true, and is set to false when
+  /// we encounter an instruction that ends Var's live range. We disable the
+  /// folding optimization when Var is live beyond this basic block. Note that
+  /// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will
+  /// always be true and the folding optimization will never be performed.
+  bool IsLiveOut = true;
+  /// NumUses counts the number of times Var is used as a source operand in
+  /// the basic block. If IsComplex is true and there is more than one use of
+  /// Var, then the folding optimization is disabled for Var.
+  uint32_t NumUses = 0;
+};
+
+class BoolFolding {
+public:
+  enum BoolFoldingProducerKind {
+    PK_None,
+    // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative.
+    PK_Icmp32,
+    PK_Icmp64,
+    PK_Fcmp,
+    PK_Trunc,
+    PK_Arith // A flag-setting arithmetic instruction.
+  };
+
+  /// Currently the actual enum values are not used (other than CK_None), but we
+  /// go ahead and produce them anyway for symmetry with the
+  /// BoolFoldingProducerKind.
+  enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };
+
+private:
+  BoolFolding(const BoolFolding &) = delete;
+  BoolFolding &operator=(const BoolFolding &) = delete;
+
+public:
+  BoolFolding() = default;
+  static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
+  static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
+  static bool hasComplexLowering(const Inst *Instr);
+  static bool isValidFolding(BoolFoldingProducerKind ProducerKind,
+                             BoolFoldingConsumerKind ConsumerKind);
+  void init(CfgNode *Node);
+  const Inst *getProducerFor(const Operand *Opnd) const;
+  void dump(const Cfg *Func) const;
+
+private:
+  /// Returns true if Producers contains a valid entry for the given VarNum.
+  bool containsValid(SizeT VarNum) const {
+    auto Element = Producers.find(VarNum);
+    return Element != Producers.end() && Element->second.Instr != nullptr;
+  }
+  void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
+  void invalidateProducersOnStore(const Inst *Instr);
+  /// Producers maps Variable::Number to a BoolFoldingEntry.
+  CfgUnorderedMap<SizeT, BoolFoldingEntry> Producers;
+};
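+
+// An example of the folding BoolFolding enables (a sketch): given
+//   %cond = icmp slt i32 %a, %b
+//   br i1 %cond, label %T, label %F
+// the icmp (producer) and the br (consumer) are fused, so the lowering can
+// emit
+//   cmp a, b
+//   jl T; jmp F
+// instead of materializing %cond with setcc and branching on it again.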
+
+/// TargetX8664 is the x86-64 target lowering class. It was formerly the
+/// CRTP-based (curiously recurring template pattern) TargetX86Base template
+/// shared by all X86 targets, which delegated target-specific lowerings
+/// (e.g., call, ret, and intrinsics) to the concrete backend; it is now a
+/// free-standing class deriving from TargetX86.
+class TargetX8664 : public TargetX86 {
   TargetX8664() = delete;
   TargetX8664(const TargetX8664 &) = delete;
   TargetX8664 &operator=(const TargetX8664 &) = delete;
 
 public:
-  ~TargetX8664() = default;
+  using Traits = TargetX8664Traits;
+  using TargetLowering = typename Traits::TargetLowering;
 
+  using BrCond = CondX86::BrCond;
+  using CmppsCond = CondX86::CmppsCond;
+
+  using X86Address = typename Traits::Address;
+  using X86Operand = typename Traits::X86Operand;
+  using X86OperandMem = typename Traits::X86OperandMem;
+  using SegmentRegisters = typename Traits::X86OperandMem::SegmentRegisters;
+
+  using InstX86Br = Insts::Br;
+  using InstX86FakeRMW = Insts::FakeRMW;
+  using InstX86Label = Insts::Label;
+
+  ~TargetX8664() override = default;
+
+  static void staticInit(GlobalContext *Ctx);
+  static bool shouldBePooled(const Constant *C);
+  static ::Ice::Type getPointerType();
+
+  static FixupKind getPcRelFixup() { return PcRelFixup; }
+  static FixupKind getAbsFixup() { return AbsFixup; }
+
+  void translateOm1() override;
+  void translateO2() override;
+  void doLoadOpt();
+  bool doBranchOpt(Inst *I, const CfgNode *NextNode) override;
+
+  SizeT getNumRegisters() const override {
+    return Traits::RegisterSet::Reg_NUM;
+  }
+
+  Inst *createLoweredMove(Variable *Dest, Variable *SrcVar) override {
+    if (isVectorType(Dest->getType())) {
+      return Insts::Movp::create(Func, Dest, SrcVar);
+    }
+    return Insts::Mov::create(Func, Dest, SrcVar);
+  }
+
+  Variable *getPhysicalRegister(RegNumT RegNum,
+                                Type Ty = IceType_void) override;
+  const char *getRegName(RegNumT RegNum, Type Ty) const override;
+  static const char *getRegClassName(RegClass C) {
+    auto ClassNum = static_cast<RegClassX86>(C);
+    assert(ClassNum < RCX86_NUM);
+    switch (ClassNum) {
+    default:
+      assert(C < RC_Target);
+      return regClassString(C);
+    case RCX86_Is64To8:
+      return "i64to8"; // 64-bit GPR truncable to i8
+    case RCX86_Is32To8:
+      return "i32to8"; // 32-bit GPR truncable to i8
+    case RCX86_Is16To8:
+      return "i16to8"; // 16-bit GPR truncable to i8
+    case RCX86_IsTrunc8Rcvr:
+      return "i8from"; // 8-bit GPR truncable from wider GPRs
+    case RCX86_IsAhRcvr:
+      return "i8fromah"; // 8-bit GPR that ah can be assigned to
+    }
+  }
+  SmallBitVector getRegisterSet(RegSetMask Include,
+                                RegSetMask Exclude) const override;
+  const SmallBitVector &
+  getRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass();
+    assert(static_cast<RegClassX86>(RC) < RCX86_NUM);
+    return TypeToRegisterSet[RC];
+  }
+
+  const SmallBitVector &
+  getAllRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass();
+    assert(static_cast<RegClassX86>(RC) < RCX86_NUM);
+    return TypeToRegisterSetUnfiltered[RC];
+  }
+
+  const SmallBitVector &getAliasesForRegister(RegNumT Reg) const override {
+    Reg.assertIsValid();
+    return RegisterAliases[Reg];
+  }
+
+  bool hasFramePointer() const override { return IsEbpBasedFrame; }
+  void setHasFramePointer() override { IsEbpBasedFrame = true; }
+  RegNumT getStackReg() const override { return Traits::StackPtr; }
+  RegNumT getFrameReg() const override { return Traits::FramePtr; }
+  RegNumT getFrameOrStackReg() const override {
+    // If the stack pointer needs to be aligned, then the frame pointer is
+    // unaligned, so always use the stack pointer.
+    if (needsStackPointerAlignment())
+      return getStackReg();
+    return IsEbpBasedFrame ? getFrameReg() : getStackReg();
+  }
+  size_t typeWidthInBytesOnStack(Type Ty) const override {
+    // Round up to the next multiple of WordType bytes.
+    const uint32_t WordSizeInBytes = typeWidthInBytes(Traits::WordType);
+    return Utils::applyAlignment(typeWidthInBytes(Ty), WordSizeInBytes);
+  }
+  uint32_t getStackAlignment() const override {
+    return Traits::X86_STACK_ALIGNMENT_BYTES;
+  }
+  bool needsStackPointerAlignment() const override {
+    // If the ABI's stack alignment is smaller than the vector size (16 bytes),
+    // use the (realigned) stack pointer for addressing any stack variables.
+    return Traits::X86_STACK_ALIGNMENT_BYTES < 16;
+  }
+  void reserveFixedAllocaArea(size_t Size, size_t Align) override {
+    FixedAllocaSizeBytes = Size;
+    assert(llvm::isPowerOf2_32(Align));
+    FixedAllocaAlignBytes = Align;
+    PrologEmitsFixedAllocas = true;
+  }
+  /// Returns the (negative) offset from ebp/rbp where the fixed Allocas start.
+  int32_t getFrameFixedAllocaOffset() const override {
+    return FixedAllocaSizeBytes - (SpillAreaSizeBytes - maxOutArgsSizeBytes());
+  }
+  virtual uint32_t maxOutArgsSizeBytes() const override {
+    return MaxOutArgsSizeBytes;
+  }
+  virtual void updateMaxOutArgsSizeBytes(uint32_t Size) {
+    MaxOutArgsSizeBytes = std::max(MaxOutArgsSizeBytes, Size);
+  }
+
+  bool shouldSplitToVariable64On32(Type Ty) const override {
+    return Traits::Is64Bit ? false : Ty == IceType_i64;
+  }
+
+  SizeT getMinJumpTableSize() const override { return 4; }
+
+  void emitVariable(const Variable *Var) const override;
+
+  void emit(const ConstantInteger32 *C) const final;
+  void emit(const ConstantInteger64 *C) const final;
+  void emit(const ConstantFloat *C) const final;
+  void emit(const ConstantDouble *C) const final;
+  void emit(const ConstantUndef *C) const final;
+  void emit(const ConstantRelocatable *C) const final;
+
+  void initNodeForLowering(CfgNode *Node) override;
+
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, Operand>::type *
+  loOperand(Operand *Operand);
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, Operand>::type *loOperand(Operand *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (loOperand)");
+  }
+
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, Operand>::type *
+  hiOperand(Operand *Operand);
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, Operand>::type *hiOperand(Operand *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (hiOperand)");
+  }
+
+  void addProlog(CfgNode *Node) override;
+  void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
+                              size_t BasicFrameOffset, size_t StackAdjBytes,
+                              size_t &InArgsSizeBytes);
+  void addEpilog(CfgNode *Node) override;
+  X86Address stackVarToAsmOperand(const Variable *Var) const;
+
+  Operand *legalizeUndef(Operand *From, RegNumT RegNum = RegNumT());
+
+protected:
+  void postLower() override;
+
+  void lowerAlloca(const InstAlloca *Instr) override;
+  void lowerArguments() override;
+  void lowerArithmetic(const InstArithmetic *Instr) override;
+  void lowerAssign(const InstAssign *Instr) override;
+  void lowerBr(const InstBr *Instr) override;
+  void lowerBreakpoint(const InstBreakpoint *Instr) override;
+  void lowerCall(const InstCall *Instr) override;
+  void lowerCast(const InstCast *Instr) override;
+  void lowerExtractElement(const InstExtractElement *Instr) override;
+  void lowerFcmp(const InstFcmp *Instr) override;
+  void lowerIcmp(const InstIcmp *Instr) override;
+
+  void lowerIntrinsic(const InstIntrinsic *Instr) override;
+  void lowerInsertElement(const InstInsertElement *Instr) override;
+  void lowerLoad(const InstLoad *Instr) override;
+  void lowerPhi(const InstPhi *Instr) override;
+  void lowerRet(const InstRet *Instr) override;
+  void lowerSelect(const InstSelect *Instr) override;
+  void lowerShuffleVector(const InstShuffleVector *Instr) override;
+  void lowerStore(const InstStore *Instr) override;
+  void lowerSwitch(const InstSwitch *Instr) override;
+  void lowerUnreachable(const InstUnreachable *Instr) override;
+  void lowerOther(const Inst *Instr) override;
+  void lowerRMW(const InstX86FakeRMW *RMW);
+  void prelowerPhis() override;
+  uint32_t getCallStackArgumentsSizeBytes(const CfgVector<Type> &ArgTypes,
+                                          Type ReturnType);
+  uint32_t getCallStackArgumentsSizeBytes(const InstCall *Instr) override;
+  void genTargetHelperCallFor(Inst *Instr) override;
+
+  /// OptAddr wraps all the possible operands that an x86 address might have.
+  struct OptAddr {
+    Variable *Base = nullptr;
+    Variable *Index = nullptr;
+    uint16_t Shift = 0;
+    int32_t Offset = 0;
+    ConstantRelocatable *Relocatable = nullptr;
+  };
+
+  // Builds information for a canonical address expression:
+  //   <Relocatable + Offset>(Base, Index, Shift)
+  X86OperandMem *computeAddressOpt(const Inst *Instr, Type MemType,
+                                   Operand *Addr);
+  void doAddressOptOther() override;
+  void doAddressOptLoad() override;
+  void doAddressOptStore() override;
+  void doAddressOptLoadSubVector() override;
+  void doAddressOptStoreSubVector() override;
+  void doMockBoundsCheck(Operand *Opnd) override;
+
+  /// Naive lowering of cmpxchg.
+  void lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr, Operand *Expected,
+                          Operand *Desired);
+  /// Attempt a more optimized lowering of cmpxchg. Returns true if optimized.
+  bool tryOptimizedCmpxchgCmpBr(Variable *DestPrev, Operand *Ptr,
+                                Operand *Expected, Operand *Desired);
+  void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
+                      Operand *Val);
+  void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal,
+                       Operand *SecondVal);
+  /// Load from memory for a given type.
+  void typedLoad(Type Ty, Variable *Dest, Variable *Base, Constant *Offset);
+  /// Store to memory for a given type.
+  void typedStore(Type Ty, Variable *Value, Variable *Base, Constant *Offset);
+  /// Copy memory of given type from Src to Dest using OffsetAmt on both.
+  void copyMemory(Type Ty, Variable *Dest, Variable *Src, int32_t OffsetAmt);
+  /// Replace some calls to memcpy with inline instructions.
+  void lowerMemcpy(Operand *Dest, Operand *Src, Operand *Count);
+  /// Replace some calls to memmove with inline instructions.
+  void lowerMemmove(Operand *Dest, Operand *Src, Operand *Count);
+  /// Replace some calls to memset with inline instructions.
+  void lowerMemset(Operand *Dest, Operand *Val, Operand *Count);
+
+  /// Lower an indirect jump.
+  void lowerIndirectJump(Variable *JumpTarget);
+
+  /// Check that Comparison is in [Min,Max]. The flags register will be
+  /// modified with:
+  ///   - below or equal, if in range
+  ///   - above, if not in range
+  /// The index into the range is returned.
+  Operand *lowerCmpRange(Operand *Comparison, uint64_t Min, uint64_t Max);
+  /// Lowering of a cluster of switch cases. If the case is not matched control
+  /// will pass to the default label provided. If the default label is nullptr
+  /// then control will fall through to the next instruction. DoneCmp should be
+  /// true if the flags contain the result of a comparison with the Comparison.
+  void lowerCaseCluster(const CaseCluster &Case, Operand *Src0, bool DoneCmp,
+                        CfgNode *DefaultLabel = nullptr);
+
+  using LowerBinOp = void (TargetX8664::*)(Variable *, Operand *);
+  void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi,
+                                Variable *Dest, Operand *Ptr, Operand *Val);
+
+  void eliminateNextVectorSextInstruction(Variable *SignExtendedResult);
+
+  void emitStackProbe(size_t StackSizeBytes);
+
+  /// Emit just the call instruction (without argument or return variable
+  /// processing).
+  Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg,
+                         size_t NumVariadicFpArgs = 0);
+  /// Materialize the moves needed to return a value of the specified type.
+  Variable *moveReturnValueToRegister(Operand *Value, Type ReturnType);
+
+  /// Emit a jump table to the constant pool.
+  void emitJumpTable(const Cfg *Func,
+                     const InstJumpTable *JumpTable) const override;
+
+  /// Emit a fake use of the stack pointer (esp/rsp) to make sure it stays
+  /// alive for the entire function. Otherwise some stack-pointer adjustments
+  /// get dead-code eliminated.
+  void keepEspLiveAtExit() {
+    Variable *esp =
+        Func->getTarget()->getPhysicalRegister(getStackReg(), Traits::WordType);
+    Context.insert<InstFakeUse>(esp);
+  }
+
+  /// Operand legalization helpers. To deal with address mode constraints, the
+  /// helpers will create a new Operand and emit instructions that guarantee
+  /// that the Operand kind is one of those indicated by the LegalMask (a
+  /// bitmask of allowed kinds). If the input Operand is known to already meet
+  /// the constraints, it may be simply returned as the result, without creating
+  /// any new instructions or operands.
+  enum OperandLegalization {
+    Legal_None = 0,
+    Legal_Reg = 1 << 0, // physical register, not stack location
+    Legal_Imm = 1 << 1,
+    Legal_Mem = 1 << 2, // includes [eax+4*ecx] as well as [esp+12]
+    Legal_Rematerializable = 1 << 3,
+    Legal_AddrAbs = 1 << 4, // ConstantRelocatable doesn't have to add RebasePtr
+    Legal_Default = ~(Legal_Rematerializable | Legal_AddrAbs)
+    // TODO(stichnot): Figure out whether this default works for x86-64.
+  };
+  using LegalMask = uint32_t;
+  Operand *legalize(Operand *From, LegalMask Allowed = Legal_Default,
+                    RegNumT RegNum = RegNumT());
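+  // Usage sketch: constrain a source operand to register-or-memory form,
+  // e.g. for an instruction that cannot encode an immediate:
+  //   Operand *Src1 = legalize(Instr->getSrc(1), Legal_Reg | Legal_Mem);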
+  Variable *legalizeToReg(Operand *From, RegNumT RegNum = RegNumT());
+  /// Legalize the first source operand for use in the cmp instruction.
+  Operand *legalizeSrc0ForCmp(Operand *Src0, Operand *Src1);
+  /// Turn a pointer operand into a memory operand that can be used by a real
+  /// load/store operation. Legalizes the operand as well. This is a nop if the
+  /// operand is already a legal memory operand.
+  X86OperandMem *formMemoryOperand(Operand *Ptr, Type Ty,
+                                   bool DoLegalize = true);
+
+  Variable *makeReg(Type Ty, RegNumT RegNum = RegNumT());
+  static Type stackSlotType();
+
+  static constexpr uint32_t NoSizeLimit = 0;
+  /// Returns the largest type which is equal to or larger than Size bytes. The
+  /// type is suitable for copying memory i.e. a load and store will be a single
+  /// instruction (for example x86 will get f64 not i64).
+  static Type largestTypeInSize(uint32_t Size, uint32_t MaxSize = NoSizeLimit);
+  /// Returns the smallest type which is equal to or larger than Size bytes. If
+  /// one doesn't exist then the largest type smaller than Size bytes is
+  /// returned. The type is suitable for memory copies as described at
+  /// largestTypeInSize.
+  static Type firstTypeThatFitsSize(uint32_t Size,
+                                    uint32_t MaxSize = NoSizeLimit);
+
+  Variable *copyToReg8(Operand *Src, RegNumT RegNum = RegNumT());
+  Variable *copyToReg(Operand *Src, RegNumT RegNum = RegNumT());
+
+  /// Returns a register containing all zeros, without affecting the FLAGS
+  /// register, using the best instruction for the type.
+  Variable *makeZeroedRegister(Type Ty, RegNumT RegNum = RegNumT());
+
+  /// \name Returns a vector in a register with the given constant entries.
+  /// @{
+  Variable *makeVectorOfZeros(Type Ty, RegNumT RegNum = RegNumT());
+  Variable *makeVectorOfOnes(Type Ty, RegNumT RegNum = RegNumT());
+  Variable *makeVectorOfMinusOnes(Type Ty, RegNumT RegNum = RegNumT());
+  Variable *makeVectorOfHighOrderBits(Type Ty, RegNumT RegNum = RegNumT());
+  Variable *makeVectorOfFabsMask(Type Ty, RegNumT RegNum = RegNumT());
+  /// @}
+
+  /// Return a memory operand corresponding to a stack allocated Variable.
+  X86OperandMem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
+                                              uint32_t Offset = 0);
+
+  /// The following are helpers that insert lowered x86 instructions with
+  /// minimal syntactic overhead, so that the lowering code can look as close to
+  /// assembly as practical.
+  void _adc(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Adc>(Dest, Src0);
+  }
+  void _adc_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
+    Context.insert<Insts::AdcRMW>(DestSrc0, Src1);
+  }
+  void _add(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Add>(Dest, Src0);
+  }
+  void _add_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
+    Context.insert<Insts::AddRMW>(DestSrc0, Src1);
+  }
+  void _addps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Addps>(Dest, Src0);
+  }
+  void _addss(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Addss>(Dest, Src0);
+  }
+  void _add_sp(Operand *Adjustment);
+  void _and(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::And>(Dest, Src0);
+  }
+  void _andnps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Andnps>(Dest, Src0);
+  }
+  void _andps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Andps>(Dest, Src0);
+  }
+  void _and_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
+    Context.insert<Insts::AndRMW>(DestSrc0, Src1);
+  }
+  void _blendvps(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Blendvps>(Dest, Src0, Src1);
+  }
+  void _br(BrCond Condition, CfgNode *TargetTrue, CfgNode *TargetFalse) {
+    Context.insert<InstX86Br>(TargetTrue, TargetFalse, Condition,
+                              InstX86Br::Far);
+  }
+  void _br(CfgNode *Target) {
+    Context.insert<InstX86Br>(Target, InstX86Br::Far);
+  }
+  void _br(BrCond Condition, CfgNode *Target) {
+    Context.insert<InstX86Br>(Target, Condition, InstX86Br::Far);
+  }
+  void _br(BrCond Condition, InstX86Label *Label,
+           typename InstX86Br::Mode Kind = InstX86Br::Near) {
+    Context.insert<InstX86Br>(Label, Condition, Kind);
+  }
+  void _bsf(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Bsf>(Dest, Src0);
+  }
+  void _bsr(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Bsr>(Dest, Src0);
+  }
+  void _bswap(Variable *SrcDest) { Context.insert<Insts::Bswap>(SrcDest); }
+  void _cbwdq(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Cbwdq>(Dest, Src0);
+  }
+  void _cmov(Variable *Dest, Operand *Src0, BrCond Condition) {
+    Context.insert<Insts::Cmov>(Dest, Src0, Condition);
+  }
+  void _cmp(Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Icmp>(Src0, Src1);
+  }
+  void _cmpps(Variable *Dest, Operand *Src0, CmppsCond Condition) {
+    Context.insert<Insts::Cmpps>(Dest, Src0, Condition);
+  }
+  void _cmpxchg(Operand *DestOrAddr, Variable *Eax, Variable *Desired,
+                bool Locked) {
+    Context.insert<Insts::Cmpxchg>(DestOrAddr, Eax, Desired, Locked);
+    // Mark eax as possibly modified by cmpxchg.
+    Context.insert<InstFakeDef>(Eax, llvm::dyn_cast<Variable>(DestOrAddr));
+    _set_dest_redefined();
+    Context.insert<InstFakeUse>(Eax);
+  }
+  void _cmpxchg8b(X86OperandMem *Addr, Variable *Edx, Variable *Eax,
+                  Variable *Ecx, Variable *Ebx, bool Locked) {
+    Context.insert<Insts::Cmpxchg8b>(Addr, Edx, Eax, Ecx, Ebx, Locked);
+    // Mark edx and eax as possibly modified by cmpxchg8b.
+    Context.insert<InstFakeDef>(Edx);
+    _set_dest_redefined();
+    Context.insert<InstFakeUse>(Edx);
+    Context.insert<InstFakeDef>(Eax);
+    _set_dest_redefined();
+    Context.insert<InstFakeUse>(Eax);
+  }
+  void _cvt(Variable *Dest, Operand *Src0, Insts::Cvt::CvtVariant Variant) {
+    Context.insert<Insts::Cvt>(Dest, Src0, Variant);
+  }
+  void _round(Variable *Dest, Operand *Src0, Operand *Imm) {
+    Context.insert<Insts::Round>(Dest, Src0, Imm);
+  }
+  void _div(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Div>(Dest, Src0, Src1);
+  }
+  void _divps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Divps>(Dest, Src0);
+  }
+  void _divss(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Divss>(Dest, Src0);
+  }
+  void _idiv(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Idiv>(Dest, Src0, Src1);
+  }
+  void _imul(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Imul>(Dest, Src0);
+  }
+  void _imul_imm(Variable *Dest, Operand *Src0, Constant *Imm) {
+    Context.insert<Insts::ImulImm>(Dest, Src0, Imm);
+  }
+  void _insertps(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Insertps>(Dest, Src0, Src1);
+  }
+  void _int3() { Context.insert<Insts::Int3>(); }
+  void _jmp(Operand *Target) { Context.insert<Insts::Jmp>(Target); }
+  void _lea(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Lea>(Dest, Src0);
+  }
+  void _link_bp();
+  void _push_reg(RegNumT RegNum);
+  void _pop_reg(RegNumT RegNum);
+  void _mfence() { Context.insert<Insts::Mfence>(); }
+  /// Moves can be used to redefine registers, creating "partial kills" for
+  /// liveness.  Mark where moves are used in this way.
+  void _redefined(Inst *MovInst, bool IsRedefinition = true) {
+    if (IsRedefinition)
+      MovInst->setDestRedefined();
+  }
+  /// If Dest=nullptr is passed in, then a new variable is created, marked as
+  /// infinite register allocation weight, and returned through the in/out Dest
+  /// argument.
+  Insts::Mov *_mov(Variable *&Dest, Operand *Src0, RegNumT RegNum = RegNumT()) {
+    if (Dest == nullptr)
+      Dest = makeReg(Src0->getType(), RegNum);
+    return Context.insert<Insts::Mov>(Dest, Src0);
+  }
+  void _mov_sp(Operand *NewValue);
+  Insts::Movp *_movp(Variable *Dest, Operand *Src0) {
+    return Context.insert<Insts::Movp>(Dest, Src0);
+  }
+  void _movd(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Movd>(Dest, Src0);
+  }
+  void _movq(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Movq>(Dest, Src0);
+  }
+  void _movss(Variable *Dest, Variable *Src0) {
+    Context.insert<Insts::MovssRegs>(Dest, Src0);
+  }
+  void _movsx(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Movsx>(Dest, Src0);
+  }
+  Insts::Movzx *_movzx(Variable *Dest, Operand *Src0) {
+    return Context.insert<Insts::Movzx>(Dest, Src0);
+  }
+  void _maxss(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Maxss>(Dest, Src0);
+  }
+  void _minss(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Minss>(Dest, Src0);
+  }
+  void _maxps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Maxps>(Dest, Src0);
+  }
+  void _minps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Minps>(Dest, Src0);
+  }
+  void _mul(Variable *Dest, Variable *Src0, Operand *Src1) {
+    Context.insert<Insts::Mul>(Dest, Src0, Src1);
+  }
+  void _mulps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Mulps>(Dest, Src0);
+  }
+  void _mulss(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Mulss>(Dest, Src0);
+  }
+  void _neg(Variable *SrcDest) { Context.insert<Insts::Neg>(SrcDest); }
+  void _nop(SizeT Variant) { Context.insert<Insts::Nop>(Variant); }
+  void _or(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Or>(Dest, Src0);
+  }
+  void _orps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Orps>(Dest, Src0);
+  }
+  void _or_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
+    Context.insert<Insts::OrRMW>(DestSrc0, Src1);
+  }
+  void _padd(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Padd>(Dest, Src0);
+  }
+  void _padds(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Padds>(Dest, Src0);
+  }
+  void _paddus(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Paddus>(Dest, Src0);
+  }
+  void _pand(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pand>(Dest, Src0);
+  }
+  void _pandn(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pandn>(Dest, Src0);
+  }
+  void _pblendvb(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Pblendvb>(Dest, Src0, Src1);
+  }
+  void _pcmpeq(Variable *Dest, Operand *Src0,
+               Type ArithmeticTypeOverride = IceType_void) {
+    Context.insert<Insts::Pcmpeq>(Dest, Src0, ArithmeticTypeOverride);
+  }
+  void _pcmpgt(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pcmpgt>(Dest, Src0);
+  }
+  void _pextr(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Pextr>(Dest, Src0, Src1);
+  }
+  void _pinsr(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Pinsr>(Dest, Src0, Src1);
+  }
+  void _pmull(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pmull>(Dest, Src0);
+  }
+  void _pmulhw(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pmulhw>(Dest, Src0);
+  }
+  void _pmulhuw(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pmulhuw>(Dest, Src0);
+  }
+  void _pmaddwd(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pmaddwd>(Dest, Src0);
+  }
+  void _pmuludq(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pmuludq>(Dest, Src0);
+  }
+  void _pop(Variable *Dest) { Context.insert<Insts::Pop>(Dest); }
+  void _por(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Por>(Dest, Src0);
+  }
+  void _punpckl(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Punpckl>(Dest, Src0);
+  }
+  void _punpckh(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Punpckh>(Dest, Src0);
+  }
+  void _packss(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Packss>(Dest, Src0);
+  }
+  void _packus(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Packus>(Dest, Src0);
+  }
+  void _pshufb(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pshufb>(Dest, Src0);
+  }
+  void _pshufd(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Pshufd>(Dest, Src0, Src1);
+  }
+  void _psll(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Psll>(Dest, Src0);
+  }
+  void _psra(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Psra>(Dest, Src0);
+  }
+  void _psrl(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Psrl>(Dest, Src0);
+  }
+  void _psub(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Psub>(Dest, Src0);
+  }
+  void _psubs(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Psubs>(Dest, Src0);
+  }
+  void _psubus(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Psubus>(Dest, Src0);
+  }
+  void _push(Operand *Src0) { Context.insert<Insts::Push>(Src0); }
+  void _pxor(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Pxor>(Dest, Src0);
+  }
+  void _ret(Variable *Src0 = nullptr) { Context.insert<Insts::Ret>(Src0); }
+  void _rol(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Rol>(Dest, Src0);
+  }
+  void _round(Variable *Dest, Operand *Src, Constant *Imm) {
+    Context.insert<Insts::Round>(Dest, Src, Imm);
+  }
+  void _sar(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Sar>(Dest, Src0);
+  }
+  void _sbb(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Sbb>(Dest, Src0);
+  }
+  void _sbb_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
+    Context.insert<Insts::SbbRMW>(DestSrc0, Src1);
+  }
+  void _setcc(Variable *Dest, BrCond Condition) {
+    Context.insert<Insts::Setcc>(Dest, Condition);
+  }
+  void _shl(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Shl>(Dest, Src0);
+  }
+  void _shld(Variable *Dest, Variable *Src0, Operand *Src1) {
+    Context.insert<Insts::Shld>(Dest, Src0, Src1);
+  }
+  void _shr(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Shr>(Dest, Src0);
+  }
+  void _shrd(Variable *Dest, Variable *Src0, Operand *Src1) {
+    Context.insert<Insts::Shrd>(Dest, Src0, Src1);
+  }
+  void _shufps(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Shufps>(Dest, Src0, Src1);
+  }
+  void _movmsk(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Movmsk>(Dest, Src0);
+  }
+  void _sqrt(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Sqrt>(Dest, Src0);
+  }
+  void _store(Operand *Value, X86Operand *Mem) {
+    Context.insert<Insts::Store>(Value, Mem);
+  }
+  void _storep(Variable *Value, X86OperandMem *Mem) {
+    Context.insert<Insts::StoreP>(Value, Mem);
+  }
+  void _storeq(Operand *Value, X86OperandMem *Mem) {
+    Context.insert<Insts::StoreQ>(Value, Mem);
+  }
+  void _stored(Operand *Value, X86OperandMem *Mem) {
+    Context.insert<Insts::StoreD>(Value, Mem);
+  }
+  void _sub(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Sub>(Dest, Src0);
+  }
+  void _sub_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
+    Context.insert<Insts::SubRMW>(DestSrc0, Src1);
+  }
+  void _sub_sp(Operand *Adjustment);
+  void _subps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Subps>(Dest, Src0);
+  }
+  void _subss(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Subss>(Dest, Src0);
+  }
+  void _test(Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Test>(Src0, Src1);
+  }
+  void _ucomiss(Operand *Src0, Operand *Src1) {
+    Context.insert<Insts::Ucomiss>(Src0, Src1);
+  }
+  void _ud2() { Context.insert<Insts::UD2>(); }
+  void _unlink_bp();
+  void _xadd(Operand *Dest, Variable *Src, bool Locked) {
+    Context.insert<Insts::Xadd>(Dest, Src, Locked);
+    // The xadd exchanges Dest and Src (modifying Src). Model that update with
+    // a FakeDef followed by a FakeUse.
+    Context.insert<InstFakeDef>(Src, llvm::dyn_cast<Variable>(Dest));
+    _set_dest_redefined();
+    Context.insert<InstFakeUse>(Src);
+  }
+  void _xchg(Operand *Dest, Variable *Src) {
+    Context.insert<Insts::Xchg>(Dest, Src);
+    // The xchg modifies Dest and Src -- model that update with a
+    // FakeDef/FakeUse.
+    Context.insert<InstFakeDef>(Src, llvm::dyn_cast<Variable>(Dest));
+    _set_dest_redefined();
+    Context.insert<InstFakeUse>(Src);
+  }
+  void _xor(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Xor>(Dest, Src0);
+  }
+  void _xorps(Variable *Dest, Operand *Src0) {
+    Context.insert<Insts::Xorps>(Dest, Src0);
+  }
+  void _xor_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
+    Context.insert<Insts::XorRMW>(DestSrc0, Src1);
+  }
+
+  void _iaca_start() {
+    if (!BuildDefs::minimal())
+      Context.insert<Insts::IacaStart>();
+  }
+  void _iaca_end() {
+    if (!BuildDefs::minimal())
+      Context.insert<Insts::IacaEnd>();
+  }
+
+  /// This class wraps IACA markers around the code generated within the
+  /// current scope, so an end marker does not need to be emitted before each
+  /// return.
+  class ScopedIacaMark {
+    ScopedIacaMark(const ScopedIacaMark &) = delete;
+    ScopedIacaMark &operator=(const ScopedIacaMark &) = delete;
+
+  public:
+    ScopedIacaMark(TargetX8664 *Lowering) : Lowering(Lowering) {
+      Lowering->_iaca_start();
+    }
+    ~ScopedIacaMark() { end(); }
+    void end() {
+      if (!Lowering)
+        return;
+      Lowering->_iaca_end();
+      Lowering = nullptr;
+    }
+
+  private:
+    TargetX8664 *Lowering;
+  };
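+  // Illustrative use of ScopedIacaMark (hypothetical method, shown for
+  // exposition only, not part of this change):
+  //
+  //   void lowerSomething() {
+  //     ScopedIacaMark Mark(this);
+  //     ...                // generated code is wrapped in IACA markers
+  //   }                    // _iaca_end() is emitted here and on early returns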
+
+  bool optimizeScalarMul(Variable *Dest, Operand *Src0, int32_t Src1);
+  void findRMW();
+
+  bool IsEbpBasedFrame = false;
+  size_t RequiredStackAlignment = sizeof(Traits::WordType);
+  size_t SpillAreaSizeBytes = 0;
+  size_t FixedAllocaSizeBytes = 0;
+  size_t FixedAllocaAlignBytes = 0;
+  bool PrologEmitsFixedAllocas = false;
+  uint32_t MaxOutArgsSizeBytes = 0;
+  static std::array<SmallBitVector, RCX86_NUM> TypeToRegisterSet;
+  static std::array<SmallBitVector, RCX86_NUM> TypeToRegisterSetUnfiltered;
+  static std::array<SmallBitVector, Traits::RegisterSet::Reg_NUM>
+      RegisterAliases;
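+  // These static tables (allowed registers per register class, and aliasing
+  // between overlapping registers such as al/ax/eax) are computed once during
+  // static initialization (see staticInit) and shared by all functions.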
+  SmallBitVector RegsUsed;
+  std::array<VarList, IceType_NUM> PhysicalRegisters;
+
+private:
+  void lowerShift64(InstArithmetic::OpKind Op, Operand *Src0Lo, Operand *Src0Hi,
+                    Operand *Src1Lo, Variable *DestLo, Variable *DestHi);
+
+  /// Emit the code for a combined operation and consumer instruction, or set
+  /// the destination variable of the operation if Consumer == nullptr.
+  void lowerIcmpAndConsumer(const InstIcmp *Icmp, const Inst *Consumer);
+  void lowerFcmpAndConsumer(const InstFcmp *Fcmp, const Inst *Consumer);
+  void lowerArithAndConsumer(const InstArithmetic *Arith, const Inst *Consumer);
+
+  /// Emit a setcc instruction if Consumer == nullptr; otherwise emit a
+  /// specialized version of Consumer.
+  void setccOrConsumer(BrCond Condition, Variable *Dest, const Inst *Consumer);
+
+  /// Emit a mov [1|0] instruction if Consumer == nullptr; otherwise emit a
+  /// specialized version of Consumer.
+  void movOrConsumer(bool IcmpResult, Variable *Dest, const Inst *Consumer);
+
+  /// Emit the code for instructions with a vector type.
+  void lowerIcmpVector(const InstIcmp *Icmp);
+  void lowerFcmpVector(const InstFcmp *Fcmp);
+  void lowerSelectVector(const InstSelect *Instr);
+
+  /// Helpers for select lowering.
+  void lowerSelectMove(Variable *Dest, BrCond Cond, Operand *SrcT,
+                       Operand *SrcF);
+  void lowerSelectIntMove(Variable *Dest, BrCond Cond, Operand *SrcT,
+                          Operand *SrcF);
+  /// Generic helper to move an arbitrary type from Src to Dest.
+  void lowerMove(Variable *Dest, Operand *Src, bool IsRedefinition);
+
+  /// Optimizations for idiom recognition.
+  bool lowerOptimizeFcmpSelect(const InstFcmp *Fcmp, const InstSelect *Select);
+
+  /// Complains loudly if invoked because the CPU can handle 64-bit types
+  /// natively.
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, void>::type lowerIcmp64(const InstIcmp *,
+                                                              const Inst *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (lowerIcmp64)");
+  }
+  /// lowerIcmp64 handles 64-bit icmp lowering on 32-bit targets.
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, void>::type
+  lowerIcmp64(const InstIcmp *Icmp, const Inst *Consumer);
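+  // The default template argument (T = Traits) defers the enable_if check to
+  // the point of instantiation, so exactly one of the two lowerIcmp64
+  // overloads above is viable for the target's word size.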
+
+  BoolFolding FoldingInfo;
+
+  /// Helpers for lowering ShuffleVector
+  /// @{
+  Variable *lowerShuffleVector_AllFromSameSrc(Operand *Src, SizeT Index0,
+                                              SizeT Index1, SizeT Index2,
+                                              SizeT Index3);
+  static constexpr SizeT IGNORE_INDEX = 0x80000000u;
+  Variable *lowerShuffleVector_TwoFromSameSrc(Operand *Src0, SizeT Index0,
+                                              SizeT Index1, Operand *Src1,
+                                              SizeT Index2, SizeT Index3);
+  static constexpr SizeT UNIFIED_INDEX_0 = 0;
+  static constexpr SizeT UNIFIED_INDEX_1 = 2;
+  Variable *lowerShuffleVector_UnifyFromDifferentSrcs(Operand *Src0,
+                                                      SizeT Index0,
+                                                      Operand *Src1,
+                                                      SizeT Index1);
+  static constexpr SizeT CLEAR_ALL_BITS = 0x80;
+  SizeT PshufbMaskCount = 0;
+  GlobalString lowerShuffleVector_NewMaskName();
+  ConstantRelocatable *lowerShuffleVector_CreatePshufbMask(
+      int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
+      int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
+      int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
+      int8_t Idx15);
+  void lowerShuffleVector_UsingPshufb(Variable *Dest, Operand *Src0,
+                                      Operand *Src1, int8_t Idx0, int8_t Idx1,
+                                      int8_t Idx2, int8_t Idx3, int8_t Idx4,
+                                      int8_t Idx5, int8_t Idx6, int8_t Idx7,
+                                      int8_t Idx8, int8_t Idx9, int8_t Idx10,
+                                      int8_t Idx11, int8_t Idx12, int8_t Idx13,
+                                      int8_t Idx14, int8_t Idx15);
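+  // pshufb background: each byte of the mask register selects one source byte
+  // by index, and a mask byte with its high bit set (CLEAR_ALL_BITS, 0x80)
+  // zeroes the corresponding destination byte instead.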
+  /// @}
+
+  static constexpr FixupKind PcRelFixup = Traits::FK_PcRel;
+  static constexpr FixupKind AbsFixup = Traits::FK_Abs;
+
+public:
   static std::unique_ptr<::Ice::TargetLowering> create(Cfg *Func) {
     return makeUnique<TargetX8664>(Func);
   }
@@ -43,50 +994,54 @@
     return makeUnique<X8664::AssemblerX8664>();
   }
 
-protected:
-  void _add_sp(Operand *Adjustment);
-  void _mov_sp(Operand *NewValue);
-  void _sub_sp(Operand *Adjustment);
-  void _link_bp();
-  void _unlink_bp();
-  void _push_reg(RegNumT RegNum);
-  void _pop_reg(RegNumT RegNum);
+private:
+  ENABLE_MAKE_UNIQUE;
 
-  void emitStackProbe(size_t StackSizeBytes);
-  void lowerIndirectJump(Variable *JumpTarget);
-  Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg,
-                         size_t NumVariadicFpArgs = 0) override;
-  Variable *moveReturnValueToRegister(Operand *Value, Type ReturnType) override;
+  explicit TargetX8664(Cfg *Func);
+};
+
+class TargetDataX8664 final : public TargetDataLowering {
+  using Traits = TargetX8664Traits;
+  TargetDataX8664() = delete;
+  TargetDataX8664(const TargetDataX8664 &) = delete;
+  TargetDataX8664 &operator=(const TargetDataX8664 &) = delete;
+
+public:
+  ~TargetDataX8664() override = default;
+
+  static std::unique_ptr<TargetDataLowering> create(GlobalContext *Ctx) {
+    return makeUnique<TargetDataX8664>(Ctx);
+  }
+
+  void lowerGlobals(const VariableDeclarationList &Vars,
+                    const std::string &SectionSuffix) override;
+  void lowerConstants() override;
+  void lowerJumpTables() override;
 
 private:
   ENABLE_MAKE_UNIQUE;
-  friend class X8664::TargetX86Base<X8664::Traits>;
 
-  explicit TargetX8664(Cfg *Func) : TargetX86Base(Func) {}
+  explicit TargetDataX8664(GlobalContext *Ctx) : TargetDataLowering(Ctx) {}
+  template <typename T> static void emitConstantPool(GlobalContext *Ctx);
 };
 
-// The -Wundefined-var-template warning requires forward-declaring static
-// members of template class specializations. Note that "An explicit
-// specialization of a static data member of a template is a definition if the
-// declaration includes an initializer; otherwise, it is a declaration."
-// Visual Studio has a bug which treats these declarations as definitions,
-// leading to multiple definition errors. Since we only enable
-// -Wundefined-var-template for Clang, omit these declarations on other
-// compilers.
-#if defined(__clang__)
-template <>
-std::array<SmallBitVector, RCX86_NUM>
-    TargetX86Base<X8664::Traits>::TypeToRegisterSet;
+class TargetHeaderX86 : public TargetHeaderLowering {
+  TargetHeaderX86() = delete;
+  TargetHeaderX86(const TargetHeaderX86 &) = delete;
+  TargetHeaderX86 &operator=(const TargetHeaderX86 &) = delete;
 
-template <>
-std::array<SmallBitVector, RCX86_NUM>
-    TargetX86Base<X8664::Traits>::TypeToRegisterSetUnfiltered;
+public:
+  ~TargetHeaderX86() = default;
 
-template <>
-std::array<SmallBitVector,
-           TargetX86Base<X8664::Traits>::Traits::RegisterSet::Reg_NUM>
-    TargetX86Base<X8664::Traits>::RegisterAliases;
-#endif
+  static std::unique_ptr<TargetHeaderLowering> create(GlobalContext *Ctx) {
+    return makeUnique<TargetHeaderX86>(Ctx);
+  }
+
+private:
+  ENABLE_MAKE_UNIQUE;
+
+  explicit TargetHeaderX86(GlobalContext *Ctx) : TargetHeaderLowering(Ctx) {}
+};
 
 } // end of namespace X8664
 } // end of namespace Ice
diff --git a/third_party/subzero/src/IceTargetLoweringX8664Base.h b/third_party/subzero/src/IceTargetLoweringX8664Base.h
deleted file mode 100644
index dd7a10b..0000000
--- a/third_party/subzero/src/IceTargetLoweringX8664Base.h
+++ /dev/null
@@ -1,1036 +0,0 @@
-//===- subzero/src/IceTargetLoweringX8664Base.h - x86 lowering ----*- C++
-//-*-===//
-//
-//                        The Subzero Code Generator
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// \brief Declares the TargetX86Base template class, which implements the
-/// TargetLowering base interface for the x86 architecture.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef SUBZERO_SRC_ICETARGETLOWERINGX8664BASE_H
-#define SUBZERO_SRC_ICETARGETLOWERINGX8664BASE_H
-
-#include "IceDefs.h"
-#include "IceInst.h"
-#include "IceSwitchLowering.h"
-#include "IceTargetLowering.h"
-#include "IceTargetLoweringX86RegClass.h"
-#include "IceUtils.h"
-
-#include <array>
-#include <type_traits>
-#include <utility>
-
-namespace Ice {
-namespace X8664 {
-
-using namespace ::Ice::X86;
-
-template <typename Traits> class BoolFolding;
-
-/// TargetX86Base is a template for all X86 targets. It relies on the CRTP
-/// (curiously recurring template pattern) for generating code, delegating
-/// target-specific lowerings (e.g., call, ret, and intrinsics) to the actual
-/// backends.
-///
-/// Note: Ideally, we should be able to
-///
-///  static_assert(std::is_base_of<TargetX86Base<TraitsType>,
-///  Machine>::value);
-///
-/// but that does not work: the compiler does not know that Machine inherits
-/// from TargetX86Base at this point in translation.
-template <typename TraitsType> class TargetX86Base : public TargetLowering {
-  TargetX86Base() = delete;
-  TargetX86Base(const TargetX86Base &) = delete;
-  TargetX86Base &operator=(const TargetX86Base &) = delete;
-
-public:
-  using Traits = TraitsType;
-  using ConcreteTarget = typename Traits::ConcreteTarget;
-  using InstructionSetEnum = typename Traits::InstructionSet;
-
-  using BrCond = CondX86::BrCond;
-  using CmppsCond = CondX86::CmppsCond;
-
-  using X86Address = typename Traits::Address;
-  using X86Operand = typename Traits::X86Operand;
-  using X86OperandMem = typename Traits::X86OperandMem;
-  using SegmentRegisters = typename Traits::X86OperandMem::SegmentRegisters;
-
-  using InstX86Br = typename Traits::Insts::Br;
-  using InstX86FakeRMW = typename Traits::Insts::FakeRMW;
-  using InstX86Label = typename Traits::Insts::Label;
-
-  ~TargetX86Base() override = default;
-
-  static void staticInit(GlobalContext *Ctx);
-  static bool shouldBePooled(const Constant *C);
-  static ::Ice::Type getPointerType();
-
-  static FixupKind getPcRelFixup() { return PcRelFixup; }
-  static FixupKind getAbsFixup() { return AbsFixup; }
-
-  void translateOm1() override;
-  void translateO2() override;
-  void doLoadOpt();
-  bool doBranchOpt(Inst *I, const CfgNode *NextNode) override;
-
-  SizeT getNumRegisters() const override {
-    return Traits::RegisterSet::Reg_NUM;
-  }
-
-  Inst *createLoweredMove(Variable *Dest, Variable *SrcVar) override {
-    if (isVectorType(Dest->getType())) {
-      return Traits::Insts::Movp::create(Func, Dest, SrcVar);
-    }
-    return Traits::Insts::Mov::create(Func, Dest, SrcVar);
-  }
-
-  Variable *getPhysicalRegister(RegNumT RegNum,
-                                Type Ty = IceType_void) override;
-  const char *getRegName(RegNumT RegNum, Type Ty) const override;
-  static const char *getRegClassName(RegClass C) {
-    auto ClassNum = static_cast<RegClassX86>(C);
-    assert(ClassNum < RCX86_NUM);
-    switch (ClassNum) {
-    default:
-      assert(C < RC_Target);
-      return regClassString(C);
-    case RCX86_Is64To8:
-      return "i64to8"; // 64-bit GPR truncable to i8
-    case RCX86_Is32To8:
-      return "i32to8"; // 32-bit GPR truncable to i8
-    case RCX86_Is16To8:
-      return "i16to8"; // 16-bit GPR truncable to i8
-    case RCX86_IsTrunc8Rcvr:
-      return "i8from"; // 8-bit GPR truncable from wider GPRs
-    case RCX86_IsAhRcvr:
-      return "i8fromah"; // 8-bit GPR that ah can be assigned to
-    }
-  }
-  SmallBitVector getRegisterSet(RegSetMask Include,
-                                RegSetMask Exclude) const override;
-  const SmallBitVector &
-  getRegistersForVariable(const Variable *Var) const override {
-    RegClass RC = Var->getRegClass();
-    assert(static_cast<RegClassX86>(RC) < RCX86_NUM);
-    return TypeToRegisterSet[RC];
-  }
-
-  const SmallBitVector &
-  getAllRegistersForVariable(const Variable *Var) const override {
-    RegClass RC = Var->getRegClass();
-    assert(static_cast<RegClassX86>(RC) < RCX86_NUM);
-    return TypeToRegisterSetUnfiltered[RC];
-  }
-
-  const SmallBitVector &getAliasesForRegister(RegNumT Reg) const override {
-    Reg.assertIsValid();
-    return RegisterAliases[Reg];
-  }
-
-  bool hasFramePointer() const override { return IsEbpBasedFrame; }
-  void setHasFramePointer() override { IsEbpBasedFrame = true; }
-  RegNumT getStackReg() const override { return Traits::StackPtr; }
-  RegNumT getFrameReg() const override { return Traits::FramePtr; }
-  RegNumT getFrameOrStackReg() const override {
-    // If the stack pointer needs to be aligned, then the frame pointer is
-    // unaligned, so always use the stack pointer.
-    if (needsStackPointerAlignment())
-      return getStackReg();
-    return IsEbpBasedFrame ? getFrameReg() : getStackReg();
-  }
-  size_t typeWidthInBytesOnStack(Type Ty) const override {
-    // Round up to the next multiple of WordType bytes.
-    const uint32_t WordSizeInBytes = typeWidthInBytes(Traits::WordType);
-    return Utils::applyAlignment(typeWidthInBytes(Ty), WordSizeInBytes);
-  }
-  uint32_t getStackAlignment() const override {
-    return Traits::X86_STACK_ALIGNMENT_BYTES;
-  }
-  bool needsStackPointerAlignment() const override {
-    // If the ABI's stack alignment is smaller than the vector size (16 bytes),
-    // use the (realigned) stack pointer for addressing any stack variables.
-    return Traits::X86_STACK_ALIGNMENT_BYTES < 16;
-  }
-  void reserveFixedAllocaArea(size_t Size, size_t Align) override {
-    FixedAllocaSizeBytes = Size;
-    assert(llvm::isPowerOf2_32(Align));
-    FixedAllocaAlignBytes = Align;
-    PrologEmitsFixedAllocas = true;
-  }
-  /// Returns the (negative) offset from ebp/rbp where the fixed Allocas start.
-  int32_t getFrameFixedAllocaOffset() const override {
-    return FixedAllocaSizeBytes - (SpillAreaSizeBytes - maxOutArgsSizeBytes());
-  }
-  virtual uint32_t maxOutArgsSizeBytes() const override {
-    return MaxOutArgsSizeBytes;
-  }
-  virtual void updateMaxOutArgsSizeBytes(uint32_t Size) {
-    MaxOutArgsSizeBytes = std::max(MaxOutArgsSizeBytes, Size);
-  }
-
-  bool shouldSplitToVariable64On32(Type Ty) const override {
-    return Traits::Is64Bit ? false : Ty == IceType_i64;
-  }
-
-  SizeT getMinJumpTableSize() const override { return 4; }
-
-  void emitVariable(const Variable *Var) const override;
-
-  void emit(const ConstantInteger32 *C) const final;
-  void emit(const ConstantInteger64 *C) const final;
-  void emit(const ConstantFloat *C) const final;
-  void emit(const ConstantDouble *C) const final;
-  void emit(const ConstantUndef *C) const final;
-  void emit(const ConstantRelocatable *C) const final;
-
-  void initNodeForLowering(CfgNode *Node) override;
-
-  template <typename T = Traits>
-  typename std::enable_if<!T::Is64Bit, Operand>::type *
-  loOperand(Operand *Operand);
-  template <typename T = Traits>
-  typename std::enable_if<T::Is64Bit, Operand>::type *loOperand(Operand *) {
-    llvm::report_fatal_error(
-        "Hey, yo! This is x86-64. Watcha doin'? (loOperand)");
-  }
-
-  template <typename T = Traits>
-  typename std::enable_if<!T::Is64Bit, Operand>::type *
-  hiOperand(Operand *Operand);
-  template <typename T = Traits>
-  typename std::enable_if<T::Is64Bit, Operand>::type *hiOperand(Operand *) {
-    llvm::report_fatal_error(
-        "Hey, yo! This is x86-64. Watcha doin'? (hiOperand)");
-  }
-
-  void addProlog(CfgNode *Node) override;
-  void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
-                              size_t BasicFrameOffset, size_t StackAdjBytes,
-                              size_t &InArgsSizeBytes);
-  void addEpilog(CfgNode *Node) override;
-  X86Address stackVarToAsmOperand(const Variable *Var) const;
-
-  InstructionSetEnum getInstructionSet() const { return InstructionSet; }
-  Operand *legalizeUndef(Operand *From, RegNumT RegNum = RegNumT());
-
-protected:
-  explicit TargetX86Base(Cfg *Func);
-
-  void postLower() override;
-
-  void lowerAlloca(const InstAlloca *Instr) override;
-  void lowerArguments() override;
-  void lowerArithmetic(const InstArithmetic *Instr) override;
-  void lowerAssign(const InstAssign *Instr) override;
-  void lowerBr(const InstBr *Instr) override;
-  void lowerBreakpoint(const InstBreakpoint *Instr) override;
-  void lowerCall(const InstCall *Instr) override;
-  void lowerCast(const InstCast *Instr) override;
-  void lowerExtractElement(const InstExtractElement *Instr) override;
-  void lowerFcmp(const InstFcmp *Instr) override;
-  void lowerIcmp(const InstIcmp *Instr) override;
-
-  void lowerIntrinsic(const InstIntrinsic *Instr) override;
-  void lowerInsertElement(const InstInsertElement *Instr) override;
-  void lowerLoad(const InstLoad *Instr) override;
-  void lowerPhi(const InstPhi *Instr) override;
-  void lowerRet(const InstRet *Instr) override;
-  void lowerSelect(const InstSelect *Instr) override;
-  void lowerShuffleVector(const InstShuffleVector *Instr) override;
-  void lowerStore(const InstStore *Instr) override;
-  void lowerSwitch(const InstSwitch *Instr) override;
-  void lowerUnreachable(const InstUnreachable *Instr) override;
-  void lowerOther(const Inst *Instr) override;
-  void lowerRMW(const InstX86FakeRMW *RMW);
-  void prelowerPhis() override;
-  uint32_t getCallStackArgumentsSizeBytes(const CfgVector<Type> &ArgTypes,
-                                          Type ReturnType);
-  uint32_t getCallStackArgumentsSizeBytes(const InstCall *Instr) override;
-  void genTargetHelperCallFor(Inst *Instr) override;
-
-  /// OptAddr wraps all the possible operands that an x86 address might have.
-  struct OptAddr {
-    Variable *Base = nullptr;
-    Variable *Index = nullptr;
-    uint16_t Shift = 0;
-    int32_t Offset = 0;
-    ConstantRelocatable *Relocatable = nullptr;
-  };
-
-  // Builds information for a canonical address expression:
-  //   <Relocatable + Offset>(Base, Index, Shift)
-  X86OperandMem *computeAddressOpt(const Inst *Instr, Type MemType,
-                                   Operand *Addr);
-  void doAddressOptOther() override;
-  void doAddressOptLoad() override;
-  void doAddressOptStore() override;
-  void doAddressOptLoadSubVector() override;
-  void doAddressOptStoreSubVector() override;
-  void doMockBoundsCheck(Operand *Opnd) override;
-
-  /// Naive lowering of cmpxchg.
-  void lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr, Operand *Expected,
-                          Operand *Desired);
-  /// Attempt a more optimized lowering of cmpxchg. Returns true if optimized.
-  bool tryOptimizedCmpxchgCmpBr(Variable *DestPrev, Operand *Ptr,
-                                Operand *Expected, Operand *Desired);
-  void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
-                      Operand *Val);
-  void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal,
-                       Operand *SecondVal);
-  /// Load from memory for a given type.
-  void typedLoad(Type Ty, Variable *Dest, Variable *Base, Constant *Offset);
-  /// Store to memory for a given type.
-  void typedStore(Type Ty, Variable *Value, Variable *Base, Constant *Offset);
-  /// Copy memory of given type from Src to Dest using OffsetAmt on both.
-  void copyMemory(Type Ty, Variable *Dest, Variable *Src, int32_t OffsetAmt);
-  /// Replace some calls to memcpy with inline instructions.
-  void lowerMemcpy(Operand *Dest, Operand *Src, Operand *Count);
-  /// Replace some calls to memmove with inline instructions.
-  void lowerMemmove(Operand *Dest, Operand *Src, Operand *Count);
-  /// Replace some calls to memset with inline instructions.
-  void lowerMemset(Operand *Dest, Operand *Val, Operand *Count);
-
-  void lowerIndirectJump(Variable *JumpTarget) {
-    // Without std::move below, the compiler deduces that the argument to
-    // lowerIndirectJump is a Variable *&, not a Variable *.
-    dispatchToConcrete(&Traits::ConcreteTarget::lowerIndirectJump,
-                       std::move(JumpTarget));
-  }
-
-  /// Check that the comparison value is in [Min,Max]. The flags register will
-  /// be modified with:
-  ///   - below equal, if in range
-  ///   - above, set if not in range
-  /// The index into the range is returned.
-  Operand *lowerCmpRange(Operand *Comparison, uint64_t Min, uint64_t Max);
-  /// Lowering of a cluster of switch cases. If the case is not matched,
-  /// control passes to the default label provided; if the default label is
-  /// nullptr, control falls through to the next instruction. DoneCmp should be
-  /// true if the flags contain the result of a comparison with the Comparison.
-  void lowerCaseCluster(const CaseCluster &Case, Operand *Src0, bool DoneCmp,
-                        CfgNode *DefaultLabel = nullptr);
-
-  using LowerBinOp = void (TargetX86Base::*)(Variable *, Operand *);
-  void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi,
-                                Variable *Dest, Operand *Ptr, Operand *Val);
-
-  void eliminateNextVectorSextInstruction(Variable *SignExtendedResult);
-
-  void emitStackProbe(size_t StackSizeBytes) {
-    dispatchToConcrete(&Traits::ConcreteTarget::emitStackProbe,
-                       std::move(StackSizeBytes));
-  }
-
-  /// Emit just the call instruction (without argument or return variable
-  /// processing).
-  virtual Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg,
-                                 size_t NumVariadicFpArgs = 0) = 0;
-  /// Materialize the moves needed to return a value of the specified type.
-  virtual Variable *moveReturnValueToRegister(Operand *Value,
-                                              Type ReturnType) = 0;
-
-  /// Emit a jump table to the constant pool.
-  void emitJumpTable(const Cfg *Func,
-                     const InstJumpTable *JumpTable) const override;
-
-  /// Emit a fake use of esp to make sure esp stays alive for the entire
-  /// function. Otherwise some esp adjustments get dead-code eliminated.
-  void keepEspLiveAtExit() {
-    Variable *esp =
-        Func->getTarget()->getPhysicalRegister(getStackReg(), Traits::WordType);
-    Context.insert<InstFakeUse>(esp);
-  }
-
-  /// Operand legalization helpers. To deal with address mode constraints, the
-  /// helpers will create a new Operand and emit instructions that guarantee
-  /// that the Operand kind is one of those indicated by the LegalMask (a
-  /// bitmask of allowed kinds). If the input Operand is known to already meet
-  /// the constraints, it may be simply returned as the result, without creating
-  /// any new instructions or operands.
-  enum OperandLegalization {
-    Legal_None = 0,
-    Legal_Reg = 1 << 0, // physical register, not stack location
-    Legal_Imm = 1 << 1,
-    Legal_Mem = 1 << 2, // includes [eax+4*ecx] as well as [esp+12]
-    Legal_Rematerializable = 1 << 3,
-    Legal_AddrAbs = 1 << 4, // ConstantRelocatable doesn't have to add RebasePtr
-    Legal_Default = ~(Legal_Rematerializable | Legal_AddrAbs)
-    // TODO(stichnot): Figure out whether this default works for x86-64.
-  };
-  using LegalMask = uint32_t;
-  Operand *legalize(Operand *From, LegalMask Allowed = Legal_Default,
-                    RegNumT RegNum = RegNumT());
-  Variable *legalizeToReg(Operand *From, RegNumT RegNum = RegNumT());
-  /// Legalize the first source operand for use in the cmp instruction.
-  Operand *legalizeSrc0ForCmp(Operand *Src0, Operand *Src1);
-  /// Turn a pointer operand into a memory operand that can be used by a real
-  /// load/store operation. Legalizes the operand as well. This is a nop if the
-  /// operand is already a legal memory operand.
-  X86OperandMem *formMemoryOperand(Operand *Ptr, Type Ty,
-                                   bool DoLegalize = true);
-
-  Variable *makeReg(Type Ty, RegNumT RegNum = RegNumT());
-  static Type stackSlotType();
-
-  static constexpr uint32_t NoSizeLimit = 0;
-  /// Returns the largest type which is equal to or larger than Size bytes. The
-  /// type is suitable for copying memory i.e. a load and store will be a single
-  /// instruction (for example x86 will get f64 not i64).
-  static Type largestTypeInSize(uint32_t Size, uint32_t MaxSize = NoSizeLimit);
-  /// Returns the smallest type which is equal to or larger than Size bytes. If
-  /// one doesn't exist then the largest type smaller than Size bytes is
-  /// returned. The type is suitable for memory copies as described at
-  /// largestTypeInSize.
-  static Type firstTypeThatFitsSize(uint32_t Size,
-                                    uint32_t MaxSize = NoSizeLimit);
-
-  Variable *copyToReg8(Operand *Src, RegNumT RegNum = RegNumT());
-  Variable *copyToReg(Operand *Src, RegNumT RegNum = RegNumT());
-
-  /// Returns a register containing all zeros, without affecting the FLAGS
-  /// register, using the best instruction for the type.
-  Variable *makeZeroedRegister(Type Ty, RegNumT RegNum = RegNumT());
-
-  /// \name Returns a vector in a register with the given constant entries.
-  /// @{
-  Variable *makeVectorOfZeros(Type Ty, RegNumT RegNum = RegNumT());
-  Variable *makeVectorOfOnes(Type Ty, RegNumT RegNum = RegNumT());
-  Variable *makeVectorOfMinusOnes(Type Ty, RegNumT RegNum = RegNumT());
-  Variable *makeVectorOfHighOrderBits(Type Ty, RegNumT RegNum = RegNumT());
-  Variable *makeVectorOfFabsMask(Type Ty, RegNumT RegNum = RegNumT());
-  /// @}
-
-  /// Return a memory operand corresponding to a stack allocated Variable.
-  X86OperandMem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
-                                              uint32_t Offset = 0);
-
-  /// The following are helpers that insert lowered x86 instructions with
-  /// minimal syntactic overhead, so that the lowering code can look as close to
-  /// assembly as practical.
-  void _adc(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Adc>(Dest, Src0);
-  }
-  void _adc_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::AdcRMW>(DestSrc0, Src1);
-  }
-  void _add(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Add>(Dest, Src0);
-  }
-  void _add_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::AddRMW>(DestSrc0, Src1);
-  }
-  void _addps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Addps>(Dest, Src0);
-  }
-  void _addss(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Addss>(Dest, Src0);
-  }
-  void _add_sp(Operand *Adjustment) {
-    dispatchToConcrete(&Traits::ConcreteTarget::_add_sp, std::move(Adjustment));
-  }
-  void _and(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::And>(Dest, Src0);
-  }
-  void _andnps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Andnps>(Dest, Src0);
-  }
-  void _andps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Andps>(Dest, Src0);
-  }
-  void _and_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::AndRMW>(DestSrc0, Src1);
-  }
-  void _blendvps(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Blendvps>(Dest, Src0, Src1);
-  }
-  void _br(BrCond Condition, CfgNode *TargetTrue, CfgNode *TargetFalse) {
-    Context.insert<InstX86Br>(TargetTrue, TargetFalse, Condition,
-                              InstX86Br::Far);
-  }
-  void _br(CfgNode *Target) {
-    Context.insert<InstX86Br>(Target, InstX86Br::Far);
-  }
-  void _br(BrCond Condition, CfgNode *Target) {
-    Context.insert<InstX86Br>(Target, Condition, InstX86Br::Far);
-  }
-  void _br(BrCond Condition, InstX86Label *Label,
-           typename InstX86Br::Mode Kind = InstX86Br::Near) {
-    Context.insert<InstX86Br>(Label, Condition, Kind);
-  }
-  void _bsf(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Bsf>(Dest, Src0);
-  }
-  void _bsr(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Bsr>(Dest, Src0);
-  }
-  void _bswap(Variable *SrcDest) {
-    Context.insert<typename Traits::Insts::Bswap>(SrcDest);
-  }
-  void _cbwdq(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Cbwdq>(Dest, Src0);
-  }
-  void _cmov(Variable *Dest, Operand *Src0, BrCond Condition) {
-    Context.insert<typename Traits::Insts::Cmov>(Dest, Src0, Condition);
-  }
-  void _cmp(Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Icmp>(Src0, Src1);
-  }
-  void _cmpps(Variable *Dest, Operand *Src0, CmppsCond Condition) {
-    Context.insert<typename Traits::Insts::Cmpps>(Dest, Src0, Condition);
-  }
-  void _cmpxchg(Operand *DestOrAddr, Variable *Eax, Variable *Desired,
-                bool Locked) {
-    Context.insert<typename Traits::Insts::Cmpxchg>(DestOrAddr, Eax, Desired,
-                                                    Locked);
-    // Mark eax as possibly modified by cmpxchg.
-    Context.insert<InstFakeDef>(Eax, llvm::dyn_cast<Variable>(DestOrAddr));
-    _set_dest_redefined();
-    Context.insert<InstFakeUse>(Eax);
-  }
-  void _cmpxchg8b(X86OperandMem *Addr, Variable *Edx, Variable *Eax,
-                  Variable *Ecx, Variable *Ebx, bool Locked) {
-    Context.insert<typename Traits::Insts::Cmpxchg8b>(Addr, Edx, Eax, Ecx, Ebx,
-                                                      Locked);
-    // Mark edx, and eax as possibly modified by cmpxchg8b.
-    Context.insert<InstFakeDef>(Edx);
-    _set_dest_redefined();
-    Context.insert<InstFakeUse>(Edx);
-    Context.insert<InstFakeDef>(Eax);
-    _set_dest_redefined();
-    Context.insert<InstFakeUse>(Eax);
-  }
-  void _cvt(Variable *Dest, Operand *Src0,
-            typename Traits::Insts::Cvt::CvtVariant Variant) {
-    Context.insert<typename Traits::Insts::Cvt>(Dest, Src0, Variant);
-  }
-  void _round(Variable *Dest, Operand *Src0, Operand *Imm) {
-    Context.insert<typename Traits::Insts::Round>(Dest, Src0, Imm);
-  }
-  void _div(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Div>(Dest, Src0, Src1);
-  }
-  void _divps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Divps>(Dest, Src0);
-  }
-  void _divss(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Divss>(Dest, Src0);
-  }
-  void _idiv(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Idiv>(Dest, Src0, Src1);
-  }
-  void _imul(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Imul>(Dest, Src0);
-  }
-  void _imul_imm(Variable *Dest, Operand *Src0, Constant *Imm) {
-    Context.insert<typename Traits::Insts::ImulImm>(Dest, Src0, Imm);
-  }
-  void _insertps(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Insertps>(Dest, Src0, Src1);
-  }
-  void _int3() { Context.insert<typename Traits::Insts::Int3>(); }
-  void _jmp(Operand *Target) {
-    Context.insert<typename Traits::Insts::Jmp>(Target);
-  }
-  void _lea(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Lea>(Dest, Src0);
-  }
-  void _link_bp() { dispatchToConcrete(&Traits::ConcreteTarget::_link_bp); }
-  void _push_reg(RegNumT RegNum) {
-    dispatchToConcrete(&Traits::ConcreteTarget::_push_reg, std::move(RegNum));
-  }
-  void _pop_reg(RegNumT RegNum) {
-    dispatchToConcrete(&Traits::ConcreteTarget::_pop_reg, std::move(RegNum));
-  }
-  void _mfence() { Context.insert<typename Traits::Insts::Mfence>(); }
-  /// Moves can be used to redefine registers, creating "partial kills" for
-  /// liveness.  Mark where moves are used in this way.
-  void _redefined(Inst *MovInst, bool IsRedefinition = true) {
-    if (IsRedefinition)
-      MovInst->setDestRedefined();
-  }
-  /// If Dest=nullptr is passed in, then a new variable is created, marked as
-  /// infinite register allocation weight, and returned through the in/out Dest
-  /// argument.
-  typename Traits::Insts::Mov *_mov(Variable *&Dest, Operand *Src0,
-                                    RegNumT RegNum = RegNumT()) {
-    if (Dest == nullptr)
-      Dest = makeReg(Src0->getType(), RegNum);
-    return Context.insert<typename Traits::Insts::Mov>(Dest, Src0);
-  }
-  void _mov_sp(Operand *NewValue) {
-    dispatchToConcrete(&Traits::ConcreteTarget::_mov_sp, std::move(NewValue));
-  }
-  typename Traits::Insts::Movp *_movp(Variable *Dest, Operand *Src0) {
-    return Context.insert<typename Traits::Insts::Movp>(Dest, Src0);
-  }
-  void _movd(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Movd>(Dest, Src0);
-  }
-  void _movq(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Movq>(Dest, Src0);
-  }
-  void _movss(Variable *Dest, Variable *Src0) {
-    Context.insert<typename Traits::Insts::MovssRegs>(Dest, Src0);
-  }
-  void _movsx(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Movsx>(Dest, Src0);
-  }
-  typename Traits::Insts::Movzx *_movzx(Variable *Dest, Operand *Src0) {
-    return Context.insert<typename Traits::Insts::Movzx>(Dest, Src0);
-  }
-  void _maxss(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Maxss>(Dest, Src0);
-  }
-  void _minss(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Minss>(Dest, Src0);
-  }
-  void _maxps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Maxps>(Dest, Src0);
-  }
-  void _minps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Minps>(Dest, Src0);
-  }
-  void _mul(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Mul>(Dest, Src0, Src1);
-  }
-  void _mulps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Mulps>(Dest, Src0);
-  }
-  void _mulss(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Mulss>(Dest, Src0);
-  }
-  void _neg(Variable *SrcDest) {
-    Context.insert<typename Traits::Insts::Neg>(SrcDest);
-  }
-  void _nop(SizeT Variant) {
-    Context.insert<typename Traits::Insts::Nop>(Variant);
-  }
-  void _or(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Or>(Dest, Src0);
-  }
-  void _orps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Orps>(Dest, Src0);
-  }
-  void _or_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::OrRMW>(DestSrc0, Src1);
-  }
-  void _padd(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Padd>(Dest, Src0);
-  }
-  void _padds(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Padds>(Dest, Src0);
-  }
-  void _paddus(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Paddus>(Dest, Src0);
-  }
-  void _pand(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pand>(Dest, Src0);
-  }
-  void _pandn(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pandn>(Dest, Src0);
-  }
-  void _pblendvb(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Pblendvb>(Dest, Src0, Src1);
-  }
-  void _pcmpeq(Variable *Dest, Operand *Src0,
-               Type ArithmeticTypeOverride = IceType_void) {
-    Context.insert<typename Traits::Insts::Pcmpeq>(Dest, Src0,
-                                                   ArithmeticTypeOverride);
-  }
-  void _pcmpgt(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pcmpgt>(Dest, Src0);
-  }
-  void _pextr(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Pextr>(Dest, Src0, Src1);
-  }
-  void _pinsr(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Pinsr>(Dest, Src0, Src1);
-  }
-  void _pmull(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pmull>(Dest, Src0);
-  }
-  void _pmulhw(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pmulhw>(Dest, Src0);
-  }
-  void _pmulhuw(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pmulhuw>(Dest, Src0);
-  }
-  void _pmaddwd(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pmaddwd>(Dest, Src0);
-  }
-  void _pmuludq(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pmuludq>(Dest, Src0);
-  }
-  void _pop(Variable *Dest) {
-    Context.insert<typename Traits::Insts::Pop>(Dest);
-  }
-  void _por(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Por>(Dest, Src0);
-  }
-  void _punpckl(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Punpckl>(Dest, Src0);
-  }
-  void _punpckh(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Punpckh>(Dest, Src0);
-  }
-  void _packss(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Packss>(Dest, Src0);
-  }
-  void _packus(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Packus>(Dest, Src0);
-  }
-  void _pshufb(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pshufb>(Dest, Src0);
-  }
-  void _pshufd(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Pshufd>(Dest, Src0, Src1);
-  }
-  void _psll(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Psll>(Dest, Src0);
-  }
-  void _psra(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Psra>(Dest, Src0);
-  }
-  void _psrl(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Psrl>(Dest, Src0);
-  }
-  void _psub(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Psub>(Dest, Src0);
-  }
-  void _psubs(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Psubs>(Dest, Src0);
-  }
-  void _psubus(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Psubus>(Dest, Src0);
-  }
-  void _push(Operand *Src0) {
-    Context.insert<typename Traits::Insts::Push>(Src0);
-  }
-  void _pxor(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Pxor>(Dest, Src0);
-  }
-  void _ret(Variable *Src0 = nullptr) {
-    Context.insert<typename Traits::Insts::Ret>(Src0);
-  }
-  void _rol(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Rol>(Dest, Src0);
-  }
-  void _round(Variable *Dest, Operand *Src, Constant *Imm) {
-    Context.insert<typename Traits::Insts::Round>(Dest, Src, Imm);
-  }
-  void _sar(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Sar>(Dest, Src0);
-  }
-  void _sbb(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Sbb>(Dest, Src0);
-  }
-  void _sbb_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::SbbRMW>(DestSrc0, Src1);
-  }
-  void _setcc(Variable *Dest, BrCond Condition) {
-    Context.insert<typename Traits::Insts::Setcc>(Dest, Condition);
-  }
-  void _shl(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Shl>(Dest, Src0);
-  }
-  void _shld(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Shld>(Dest, Src0, Src1);
-  }
-  void _shr(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Shr>(Dest, Src0);
-  }
-  void _shrd(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Shrd>(Dest, Src0, Src1);
-  }
-  void _shufps(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Shufps>(Dest, Src0, Src1);
-  }
-  void _movmsk(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Movmsk>(Dest, Src0);
-  }
-  void _sqrt(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Sqrt>(Dest, Src0);
-  }
-  void _store(Operand *Value, X86Operand *Mem) {
-    Context.insert<typename Traits::Insts::Store>(Value, Mem);
-  }
-  void _storep(Variable *Value, X86OperandMem *Mem) {
-    Context.insert<typename Traits::Insts::StoreP>(Value, Mem);
-  }
-  void _storeq(Operand *Value, X86OperandMem *Mem) {
-    Context.insert<typename Traits::Insts::StoreQ>(Value, Mem);
-  }
-  void _stored(Operand *Value, X86OperandMem *Mem) {
-    Context.insert<typename Traits::Insts::StoreD>(Value, Mem);
-  }
-  void _sub(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Sub>(Dest, Src0);
-  }
-  void _sub_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::SubRMW>(DestSrc0, Src1);
-  }
-  void _sub_sp(Operand *Adjustment) {
-    dispatchToConcrete(&Traits::ConcreteTarget::_sub_sp, std::move(Adjustment));
-  }
-  void _subps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Subps>(Dest, Src0);
-  }
-  void _subss(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Subss>(Dest, Src0);
-  }
-  void _test(Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Test>(Src0, Src1);
-  }
-  void _ucomiss(Operand *Src0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::Ucomiss>(Src0, Src1);
-  }
-  void _ud2() { Context.insert<typename Traits::Insts::UD2>(); }
-  void _unlink_bp() { dispatchToConcrete(&Traits::ConcreteTarget::_unlink_bp); }
-  void _xadd(Operand *Dest, Variable *Src, bool Locked) {
-    Context.insert<typename Traits::Insts::Xadd>(Dest, Src, Locked);
-    // The xadd exchanges Dest and Src (modifying Src). Model that update with
-    // a FakeDef followed by a FakeUse.
-    Context.insert<InstFakeDef>(Src, llvm::dyn_cast<Variable>(Dest));
-    _set_dest_redefined();
-    Context.insert<InstFakeUse>(Src);
-  }
-  void _xchg(Operand *Dest, Variable *Src) {
-    Context.insert<typename Traits::Insts::Xchg>(Dest, Src);
-    // The xchg modifies Dest and Src -- model that update with a
-    // FakeDef/FakeUse.
-    Context.insert<InstFakeDef>(Src, llvm::dyn_cast<Variable>(Dest));
-    _set_dest_redefined();
-    Context.insert<InstFakeUse>(Src);
-  }
-  void _xor(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Xor>(Dest, Src0);
-  }
-  void _xorps(Variable *Dest, Operand *Src0) {
-    Context.insert<typename Traits::Insts::Xorps>(Dest, Src0);
-  }
-  void _xor_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
-    Context.insert<typename Traits::Insts::XorRMW>(DestSrc0, Src1);
-  }
-
-  void _iaca_start() {
-    if (!BuildDefs::minimal())
-      Context.insert<typename Traits::Insts::IacaStart>();
-  }
-  void _iaca_end() {
-    if (!BuildDefs::minimal())
-      Context.insert<typename Traits::Insts::IacaEnd>();
-  }
-
-  /// This class helps wrap IACA markers around the code generated by the
-  /// current scope. It means you don't need to put an end before each return.
-  class ScopedIacaMark {
-    ScopedIacaMark(const ScopedIacaMark &) = delete;
-    ScopedIacaMark &operator=(const ScopedIacaMark &) = delete;
-
-  public:
-    ScopedIacaMark(TargetX86Base *Lowering) : Lowering(Lowering) {
-      Lowering->_iaca_start();
-    }
-    ~ScopedIacaMark() { end(); }
-    void end() {
-      if (!Lowering)
-        return;
-      Lowering->_iaca_end();
-      Lowering = nullptr;
-    }
-
-  private:
-    TargetX86Base *Lowering;
-  };
-
-  bool optimizeScalarMul(Variable *Dest, Operand *Src0, int32_t Src1);
-  void findRMW();
-
-  InstructionSetEnum InstructionSet = Traits::InstructionSet::Begin;
-  bool IsEbpBasedFrame = false;
-  size_t RequiredStackAlignment = sizeof(Traits::WordType);
-  size_t SpillAreaSizeBytes = 0;
-  size_t FixedAllocaSizeBytes = 0;
-  size_t FixedAllocaAlignBytes = 0;
-  bool PrologEmitsFixedAllocas = false;
-  uint32_t MaxOutArgsSizeBytes = 0;
-  static std::array<SmallBitVector, RCX86_NUM> TypeToRegisterSet;
-  static std::array<SmallBitVector, RCX86_NUM> TypeToRegisterSetUnfiltered;
-  static std::array<SmallBitVector, Traits::RegisterSet::Reg_NUM>
-      RegisterAliases;
-  SmallBitVector RegsUsed;
-  std::array<VarList, IceType_NUM> PhysicalRegisters;
-
-private:
-  /// dispatchToConcrete is the template voodoo that allows TargetX86Base to
-  /// invoke methods in Machine (which inherits from TargetX86Base) without
-  /// having to rely on virtual method calls. There are two overloads, one
-  /// for non-void types, and one for void types. We need this because, for
-  /// non-void types, we need to return the method result, whereas for
-  /// void, we don't. While it is true that the code compiles without the
-  /// void "version", there used to be a time when compilers would reject
-  /// such code.
-  ///
-  /// This machinery is far from perfect. Note that, in particular, the
-  /// arguments provided to dispatchToConcrete() need to match the arguments
-  /// for Method **exactly** (i.e., no argument promotion is performed.)
-  template <typename Ret, typename... Args>
-  typename std::enable_if<!std::is_void<Ret>::value, Ret>::type
-  dispatchToConcrete(Ret (ConcreteTarget::*Method)(Args...), Args &&...args) {
-    return (static_cast<ConcreteTarget *>(this)->*Method)(
-        std::forward<Args>(args)...);
-  }
-
-  template <typename... Args>
-  void dispatchToConcrete(void (ConcreteTarget::*Method)(Args...),
-                          Args &&...args) {
-    (static_cast<ConcreteTarget *>(this)->*Method)(std::forward<Args>(args)...);
-  }
-
-  void lowerShift64(InstArithmetic::OpKind Op, Operand *Src0Lo, Operand *Src0Hi,
-                    Operand *Src1Lo, Variable *DestLo, Variable *DestHi);
-
-  /// Emit the code for a combined operation and consumer instruction, or set
-  /// the destination variable of the operation if Consumer == nullptr.
-  void lowerIcmpAndConsumer(const InstIcmp *Icmp, const Inst *Consumer);
-  void lowerFcmpAndConsumer(const InstFcmp *Fcmp, const Inst *Consumer);
-  void lowerArithAndConsumer(const InstArithmetic *Arith, const Inst *Consumer);
-
-  /// Emit a setcc instruction if Consumer == nullptr; otherwise emit a
-  /// specialized version of Consumer.
-  void setccOrConsumer(BrCond Condition, Variable *Dest, const Inst *Consumer);
-
-  /// Emit a mov [1|0] instruction if Consumer == nullptr; otherwise emit a
-  /// specialized version of Consumer.
-  void movOrConsumer(bool IcmpResult, Variable *Dest, const Inst *Consumer);
-
-  /// Emit the code for instructions with a vector type.
-  void lowerIcmpVector(const InstIcmp *Icmp);
-  void lowerFcmpVector(const InstFcmp *Icmp);
-  void lowerSelectVector(const InstSelect *Instr);
-
-  /// Helpers for select lowering.
-  void lowerSelectMove(Variable *Dest, BrCond Cond, Operand *SrcT,
-                       Operand *SrcF);
-  void lowerSelectIntMove(Variable *Dest, BrCond Cond, Operand *SrcT,
-                          Operand *SrcF);
-  /// Generic helper to move an arbitrary type from Src to Dest.
-  void lowerMove(Variable *Dest, Operand *Src, bool IsRedefinition);
-
-  /// Optimizations for idiom recognition.
-  bool lowerOptimizeFcmpSelect(const InstFcmp *Fcmp, const InstSelect *Select);
-
-  /// Complains loudly if invoked because the cpu can handle 64-bit types
-  /// natively.
-  template <typename T = Traits>
-  typename std::enable_if<T::Is64Bit, void>::type lowerIcmp64(const InstIcmp *,
-                                                              const Inst *) {
-    llvm::report_fatal_error(
-        "Hey, yo! This is x86-64. Watcha doin'? (lowerIcmp64)");
-  }
-  /// x86lowerIcmp64 handles 64-bit icmp lowering.
-  template <typename T = Traits>
-  typename std::enable_if<!T::Is64Bit, void>::type
-  lowerIcmp64(const InstIcmp *Icmp, const Inst *Consumer);
-
-  BoolFolding<Traits> FoldingInfo;
-
-  /// Helpers for lowering ShuffleVector
-  /// @{
-  Variable *lowerShuffleVector_AllFromSameSrc(Operand *Src, SizeT Index0,
-                                              SizeT Index1, SizeT Index2,
-                                              SizeT Index3);
-  static constexpr SizeT IGNORE_INDEX = 0x80000000u;
-  Variable *lowerShuffleVector_TwoFromSameSrc(Operand *Src0, SizeT Index0,
-                                              SizeT Index1, Operand *Src1,
-                                              SizeT Index2, SizeT Index3);
-  static constexpr SizeT UNIFIED_INDEX_0 = 0;
-  static constexpr SizeT UNIFIED_INDEX_1 = 2;
-  Variable *lowerShuffleVector_UnifyFromDifferentSrcs(Operand *Src0,
-                                                      SizeT Index0,
-                                                      Operand *Src1,
-                                                      SizeT Index1);
-  static constexpr SizeT CLEAR_ALL_BITS = 0x80;
-  SizeT PshufbMaskCount = 0;
-  GlobalString lowerShuffleVector_NewMaskName();
-  ConstantRelocatable *lowerShuffleVector_CreatePshufbMask(
-      int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
-      int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
-      int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
-      int8_t Idx15);
-  void lowerShuffleVector_UsingPshufb(Variable *Dest, Operand *Src0,
-                                      Operand *Src1, int8_t Idx0, int8_t Idx1,
-                                      int8_t Idx2, int8_t Idx3, int8_t Idx4,
-                                      int8_t Idx5, int8_t Idx6, int8_t Idx7,
-                                      int8_t Idx8, int8_t Idx9, int8_t Idx10,
-                                      int8_t Idx11, int8_t Idx12, int8_t Idx13,
-                                      int8_t Idx14, int8_t Idx15);
-  /// @}
-
-  static constexpr FixupKind PcRelFixup = Traits::FK_PcRel;
-  static constexpr FixupKind AbsFixup = Traits::FK_Abs;
-};
-
-template <typename TraitsType>
-class TargetDataX86 final : public TargetDataLowering {
-  using Traits = TraitsType;
-  TargetDataX86() = delete;
-  TargetDataX86(const TargetDataX86 &) = delete;
-  TargetDataX86 &operator=(const TargetDataX86 &) = delete;
-
-public:
-  ~TargetDataX86() override = default;
-
-  static std::unique_ptr<TargetDataLowering> create(GlobalContext *Ctx) {
-    return makeUnique<TargetDataX86>(Ctx);
-  }
-
-  void lowerGlobals(const VariableDeclarationList &Vars,
-                    const std::string &SectionSuffix) override;
-  void lowerConstants() override;
-  void lowerJumpTables() override;
-
-private:
-  ENABLE_MAKE_UNIQUE;
-
-  explicit TargetDataX86(GlobalContext *Ctx) : TargetDataLowering(Ctx) {}
-  template <typename T> static void emitConstantPool(GlobalContext *Ctx);
-};
-
-class TargetHeaderX86 : public TargetHeaderLowering {
-  TargetHeaderX86() = delete;
-  TargetHeaderX86(const TargetHeaderX86 &) = delete;
-  TargetHeaderX86 &operator=(const TargetHeaderX86 &) = delete;
-
-public:
-  ~TargetHeaderX86() = default;
-
-  static std::unique_ptr<TargetHeaderLowering> create(GlobalContext *Ctx) {
-    return makeUnique<TargetHeaderX86>(Ctx);
-  }
-
-private:
-  ENABLE_MAKE_UNIQUE;
-
-  explicit TargetHeaderX86(GlobalContext *Ctx) : TargetHeaderLowering(Ctx) {}
-};
-
-} // end of namespace X8664
-} // end of namespace Ice
-
-#include "IceTargetLoweringX8664BaseImpl.h"
-
-#endif // SUBZERO_SRC_ICETARGETLOWERINGX8664BASE_H
diff --git a/third_party/subzero/src/IceTargetLoweringX8664BaseImpl.h b/third_party/subzero/src/IceTargetLoweringX8664BaseImpl.h
deleted file mode 100644
index c5eba01..0000000
--- a/third_party/subzero/src/IceTargetLoweringX8664BaseImpl.h
+++ /dev/null
@@ -1,8136 +0,0 @@
-//===- subzero/src/IceTargetLoweringX8664BaseImpl.h - x86 lowering -*- C++
-//-*-==//
-//
-//                        The Subzero Code Generator
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// \brief Implements the TargetLoweringX86Base class, which consists almost
-/// entirely of the lowering sequence for each high-level instruction.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef SUBZERO_SRC_ICETARGETLOWERINGX8664BASEIMPL_H
-#define SUBZERO_SRC_ICETARGETLOWERINGX8664BASEIMPL_H
-
-#include "IceCfg.h"
-#include "IceCfgNode.h"
-#include "IceClFlags.h"
-#include "IceDefs.h"
-#include "IceELFObjectWriter.h"
-#include "IceGlobalInits.h"
-#include "IceInstVarIter.h"
-#include "IceInstX8664Base.h"
-#include "IceLiveness.h"
-#include "IceOperand.h"
-#include "IcePhiLoweringImpl.h"
-#include "IceTargetLoweringX86.h"
-#include "IceUtils.h"
-#include "IceVariableSplitting.h"
-
-#include "llvm/Support/MathExtras.h"
-
-#include <stack>
-
-namespace Ice {
-namespace X8664 {
-
-// The Microsoft x64 ABI requires the caller to allocate a 32-byte
-// "shadow store" (aka "home space") so that the callee may copy the 4
-// register args to it.
-constexpr SizeT getShadowStoreSize() {
-#if defined(_WIN64)
-  return 4 * sizeof(int64_t);
-#else
-  return 0;
-#endif
-}
-
-using Utils::BoolFlagSaver;
-
-template <typename Traits> class BoolFoldingEntry {
-  BoolFoldingEntry(const BoolFoldingEntry &) = delete;
-
-public:
-  BoolFoldingEntry() = default;
-  explicit BoolFoldingEntry(Inst *I);
-  BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;
-  /// Instr is the instruction producing the i1-type variable of interest.
-  Inst *Instr = nullptr;
-  /// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr).
-  bool IsComplex = false;
-  /// IsLiveOut is initialized conservatively to true, and is set to false when
-  /// we encounter an instruction that ends Var's live range. We disable the
-  /// folding optimization when Var is live beyond this basic block. Note that
-  /// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will
-  /// always be true and the folding optimization will never be performed.
-  bool IsLiveOut = true;
-  // NumUses counts the number of times Var is used as a source operand in the
-  // basic block. If IsComplex is true and there is more than one use of Var,
-  // then the folding optimization is disabled for Var.
-  uint32_t NumUses = 0;
-};
-
-template <typename Traits> class BoolFolding {
-public:
-  enum BoolFoldingProducerKind {
-    PK_None,
-    // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative.
-    PK_Icmp32,
-    PK_Icmp64,
-    PK_Fcmp,
-    PK_Trunc,
-    PK_Arith // A flag-setting arithmetic instruction.
-  };
-
-  /// Currently the actual enum values are not used (other than CK_None), but we
-  /// go ahead and produce them anyway for symmetry with the
-  /// BoolFoldingProducerKind.
-  enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };
-
-private:
-  BoolFolding(const BoolFolding &) = delete;
-  BoolFolding &operator=(const BoolFolding &) = delete;
-
-public:
-  BoolFolding() = default;
-  static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
-  static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
-  static bool hasComplexLowering(const Inst *Instr);
-  static bool isValidFolding(BoolFoldingProducerKind ProducerKind,
-                             BoolFoldingConsumerKind ConsumerKind);
-  void init(CfgNode *Node);
-  const Inst *getProducerFor(const Operand *Opnd) const;
-  void dump(const Cfg *Func) const;
-
-private:
-  /// Returns true if Producers contains a valid entry for the given VarNum.
-  bool containsValid(SizeT VarNum) const {
-    auto Element = Producers.find(VarNum);
-    return Element != Producers.end() && Element->second.Instr != nullptr;
-  }
-  void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
-  void invalidateProducersOnStore(const Inst *Instr);
-  /// Producers maps Variable::Number to a BoolFoldingEntry.
-  CfgUnorderedMap<SizeT, BoolFoldingEntry<Traits>> Producers;
-};
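-
-// Illustrative sketch of the folding this class enables: for a
-// producer/consumer pair such as
-//   %cond = icmp slt i32 %a, %b
-//   br i1 %cond, label %t, label %f
-// the compare and branch can be lowered together as "cmp a, b; jl t; jmp f"
-// instead of materializing %cond into a register with setcc and then
-// testing and branching on it.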
-
-template <typename Traits>
-BoolFoldingEntry<Traits>::BoolFoldingEntry(Inst *I)
-    : Instr(I), IsComplex(BoolFolding<Traits>::hasComplexLowering(I)) {}
-
-template <typename Traits>
-typename BoolFolding<Traits>::BoolFoldingProducerKind
-BoolFolding<Traits>::getProducerKind(const Inst *Instr) {
-  if (llvm::isa<InstIcmp>(Instr)) {
-    if (Traits::Is64Bit || Instr->getSrc(0)->getType() != IceType_i64)
-      return PK_Icmp32;
-    return PK_Icmp64;
-  }
-  if (llvm::isa<InstFcmp>(Instr))
-    return PK_Fcmp;
-  if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
-    if (Traits::Is64Bit || Arith->getSrc(0)->getType() != IceType_i64) {
-      switch (Arith->getOp()) {
-      default:
-        return PK_None;
-      case InstArithmetic::And:
-      case InstArithmetic::Or:
-        return PK_Arith;
-      }
-    }
-  }
-  // TODO(stichnot): remove this early return, which currently makes the
-  // Trunc handling below unreachable.
-  return PK_None;
-
-  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
-    switch (Cast->getCastKind()) {
-    default:
-      return PK_None;
-    case InstCast::Trunc:
-      return PK_Trunc;
-    }
-  }
-  return PK_None;
-}
-
-template <typename Traits>
-typename BoolFolding<Traits>::BoolFoldingConsumerKind
-BoolFolding<Traits>::getConsumerKind(const Inst *Instr) {
-  if (llvm::isa<InstBr>(Instr))
-    return CK_Br;
-  if (llvm::isa<InstSelect>(Instr))
-    return CK_Select;
-  // TODO(stichnot): remove this early return, which currently makes the
-  // cast handling below unreachable.
-  return CK_None;
-
-  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
-    switch (Cast->getCastKind()) {
-    default:
-      return CK_None;
-    case InstCast::Sext:
-      return CK_Sext;
-    case InstCast::Zext:
-      return CK_Zext;
-    }
-  }
-  return CK_None;
-}
-
-/// Returns true if the producing instruction has a "complex" lowering sequence.
-/// This generally means that its lowering sequence requires more than one
-/// conditional branch, namely 64-bit integer compares and some floating-point
-/// compares. When this is true, and there is more than one consumer, we prefer
-/// to disable the folding optimization, since disabling it minimizes branches.
-template <typename Traits>
-bool BoolFolding<Traits>::hasComplexLowering(const Inst *Instr) {
-  switch (getProducerKind(Instr)) {
-  default:
-    return false;
-  case PK_Icmp64:
-    return !Traits::Is64Bit;
-  case PK_Fcmp:
-    return Traits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()].C2 !=
-           CondX86::Br_None;
-  }
-}
-
-template <typename Traits>
-bool BoolFolding<Traits>::isValidFolding(
-    typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind,
-    typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind) {
-  switch (ProducerKind) {
-  default:
-    return false;
-  case PK_Icmp32:
-  case PK_Icmp64:
-  case PK_Fcmp:
-    return (ConsumerKind == CK_Br) || (ConsumerKind == CK_Select);
-  case PK_Arith:
-    return ConsumerKind == CK_Br;
-  }
-}
-
-template <typename Traits> void BoolFolding<Traits>::init(CfgNode *Node) {
-  Producers.clear();
-  for (Inst &Instr : Node->getInsts()) {
-    if (Instr.isDeleted())
-      continue;
-    invalidateProducersOnStore(&Instr);
-    // Check whether Instr is a valid producer.
-    Variable *Var = Instr.getDest();
-    if (Var) { // only consider instructions with an actual dest var
-      if (isBooleanType(Var->getType())) {        // only bool-type dest vars
-        if (getProducerKind(&Instr) != PK_None) { // white-listed instructions
-          Producers[Var->getIndex()] = BoolFoldingEntry<Traits>(&Instr);
-        }
-      }
-    }
-    // Check each src variable against the map.
-    FOREACH_VAR_IN_INST(Var, Instr) {
-      SizeT VarNum = Var->getIndex();
-      if (!containsValid(VarNum))
-        continue;
-      // All valid consumers use Var as the first source operand.
-      if (IndexOfVarOperandInInst(Var) != 0) {
-        setInvalid(VarNum);
-        continue;
-      }
-      // Consumer instructions must be white-listed.
-      typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind =
-          getConsumerKind(&Instr);
-      if (ConsumerKind == CK_None) {
-        setInvalid(VarNum);
-        continue;
-      }
-      typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind =
-          getProducerKind(Producers[VarNum].Instr);
-      if (!isValidFolding(ProducerKind, ConsumerKind)) {
-        setInvalid(VarNum);
-        continue;
-      }
-      // Avoid creating multiple copies of complex producer instructions.
-      if (Producers[VarNum].IsComplex && Producers[VarNum].NumUses > 0) {
-        setInvalid(VarNum);
-        continue;
-      }
-      ++Producers[VarNum].NumUses;
-      if (Instr.isLastUse(Var)) {
-        Producers[VarNum].IsLiveOut = false;
-      }
-    }
-  }
-  for (auto &I : Producers) {
-    // Ignore entries previously marked invalid.
-    if (I.second.Instr == nullptr)
-      continue;
-    // Disable the producer if its dest may be live beyond this block.
-    if (I.second.IsLiveOut) {
-      setInvalid(I.first);
-      continue;
-    }
-    // Mark as "dead" rather than outright deleting. This is so that other
-    // peephole-style optimizations during or before lowering have access to
-    // this instruction in undeleted form. See for example
-    // tryOptimizedCmpxchgCmpBr().
-    I.second.Instr->setDead();
-  }
-}
-
-template <typename Traits>
-const Inst *BoolFolding<Traits>::getProducerFor(const Operand *Opnd) const {
-  auto *Var = llvm::dyn_cast<const Variable>(Opnd);
-  if (Var == nullptr)
-    return nullptr;
-  SizeT VarNum = Var->getIndex();
-  auto Element = Producers.find(VarNum);
-  if (Element == Producers.end())
-    return nullptr;
-  return Element->second.Instr;
-}
-
-template <typename Traits>
-void BoolFolding<Traits>::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump() || !Func->isVerbose(IceV_Folding))
-    return;
-  OstreamLocker L(Func->getContext());
-  Ostream &Str = Func->getContext()->getStrDump();
-  for (auto &I : Producers) {
-    if (I.second.Instr == nullptr)
-      continue;
-    Str << "Found foldable producer:\n  ";
-    I.second.Instr->dump(Func);
-    Str << "\n";
-  }
-}
-
-/// If the given instruction has potential memory side effects (e.g. store, rmw,
-/// or a call instruction with potential memory side effects), then we must not
-/// allow a pre-store Producer instruction with memory operands to be folded
-/// into a post-store Consumer instruction.  If this is detected, the Producer
-/// is invalidated.
-///
-/// We use the Producer's IsLiveOut field to determine whether any potential
-/// Consumers come after this store instruction.  The IsLiveOut field is
-/// initialized to true, and BoolFolding::init() sets IsLiveOut to false when it
-/// sees the variable's definitive last use (indicating the variable is not in
-/// the node's live-out set).  Thus if we see here that IsLiveOut is false, we
-/// know that there can be no consumers after the store, and therefore we know
-/// the folding is safe despite the store instruction.
-template <typename Traits>
-void BoolFolding<Traits>::invalidateProducersOnStore(const Inst *Instr) {
-  if (!Instr->isMemoryWrite())
-    return;
-  for (auto &ProducerPair : Producers) {
-    if (!ProducerPair.second.IsLiveOut)
-      continue;
-    Inst *PInst = ProducerPair.second.Instr;
-    if (PInst == nullptr)
-      continue;
-    bool HasMemOperand = false;
-    const SizeT SrcSize = PInst->getSrcSize();
-    for (SizeT I = 0; I < SrcSize; ++I) {
-      if (llvm::isa<typename Traits::X86OperandMem>(PInst->getSrc(I))) {
-        HasMemOperand = true;
-        break;
-      }
-    }
-    if (!HasMemOperand)
-      continue;
-    setInvalid(ProducerPair.first);
-  }
-}
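-
-// Hypothetical example of the hazard guarded against above:
-//   %cond = icmp eq i32 [mem], 1   ; producer with a memory operand
-//   store i32 %x, [mem]            ; intervening memory write
-//   br i1 %cond, ...               ; consumer
-// Folding the icmp into the br would move the memory read below the store
-// and observe the wrong value, so the producer is invalidated instead.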
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::initNodeForLowering(CfgNode *Node) {
-  FoldingInfo.init(Node);
-  FoldingInfo.dump(Func);
-}
-
-template <typename TraitsType>
-TargetX86Base<TraitsType>::TargetX86Base(Cfg *Func) : TargetLowering(Func) {
-  static_assert(
-      (Traits::InstructionSet::End - Traits::InstructionSet::Begin) ==
-          (TargetInstructionSet::X86InstructionSet_End -
-           TargetInstructionSet::X86InstructionSet_Begin),
-      "Traits::InstructionSet range different from TargetInstructionSet");
-  if (getFlags().getTargetInstructionSet() !=
-      TargetInstructionSet::BaseInstructionSet) {
-    InstructionSet = static_cast<InstructionSetEnum>(
-        (getFlags().getTargetInstructionSet() -
-         TargetInstructionSet::X86InstructionSet_Begin) +
-        Traits::InstructionSet::Begin);
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::staticInit(GlobalContext *Ctx) {
-  RegNumT::setLimit(Traits::RegisterSet::Reg_NUM);
-  Traits::initRegisterSet(getFlags(), &TypeToRegisterSet, &RegisterAliases);
-  for (size_t i = 0; i < TypeToRegisterSet.size(); ++i)
-    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
-  filterTypeToRegisterSet(Ctx, Traits::RegisterSet::Reg_NUM,
-                          TypeToRegisterSet.data(), TypeToRegisterSet.size(),
-                          Traits::getRegName, getRegClassName);
-}
-
-template <typename TraitsType>
-bool TargetX86Base<TraitsType>::shouldBePooled(const Constant *C) {
-  if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(C)) {
-    return !Utils::isPositiveZero(ConstFloat->getValue());
-  }
-  if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(C)) {
-    return !Utils::isPositiveZero(ConstDouble->getValue());
-  }
-  return false;
-}
-
-template <typename TraitsType>
-::Ice::Type TargetX86Base<TraitsType>::getPointerType() {
-  return Traits::Is64Bit ? IceType_i64 : IceType_i32;
-}
-
-template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() {
-  TimerMarker T(TimerStack::TT_O2, Func);
-
-  genTargetHelperCalls();
-  Func->dump("After target helper call insertion");
-
-  // Merge Alloca instructions, and lay out the stack.
-  static constexpr bool SortAndCombineAllocas = true;
-  Func->processAllocas(SortAndCombineAllocas);
-  Func->dump("After Alloca processing");
-
-  // Run this early so it can be used to focus optimizations on potentially hot
-  // code.
-  // TODO(stichnot,ascull): currently this is only used for regalloc, not for
-  // the expensive high-level optimizations that could be focused on
-  // potentially hot code.
-  Func->generateLoopInfo();
-  Func->dump("After loop analysis");
-  if (getFlags().getLoopInvariantCodeMotion()) {
-    Func->loopInvariantCodeMotion();
-    Func->dump("After LICM");
-  }
-
-  if (getFlags().getLocalCSE() != Ice::LCSE_Disabled) {
-    Func->localCSE(getFlags().getLocalCSE() == Ice::LCSE_EnabledSSA);
-    Func->dump("After Local CSE");
-    Func->floatConstantCSE();
-  }
-  if (getFlags().getEnableShortCircuit()) {
-    Func->shortCircuitJumps();
-    Func->dump("After Short Circuiting");
-  }
-
-  if (!getFlags().getEnablePhiEdgeSplit()) {
-    // Lower Phi instructions.
-    Func->placePhiLoads();
-    if (Func->hasError())
-      return;
-    Func->placePhiStores();
-    if (Func->hasError())
-      return;
-    Func->deletePhis();
-    if (Func->hasError())
-      return;
-    Func->dump("After Phi lowering");
-  }
-
-  // Address mode optimization.
-  Func->getVMetadata()->init(VMK_SingleDefs);
-  Func->doAddressOpt();
-  Func->materializeVectorShuffles();
-
-  // Find read-modify-write opportunities. Do this after address mode
-  // optimization so that doAddressOpt() doesn't need to be applied to RMW
-  // instructions as well.
-  findRMW();
-  Func->dump("After RMW transform");
-
-  // Argument lowering
-  Func->doArgLowering();
-
-  // Target lowering. This requires liveness analysis for some parts of the
-  // lowering decisions, such as compare/branch fusing. If non-lightweight
-  // liveness analysis is used, the instructions need to be renumbered first.
-  // TODO: This renumbering should only be necessary if we're actually
-  // calculating live intervals, which we only do for register allocation.
-  Func->renumberInstructions();
-  if (Func->hasError())
-    return;
-
-  // TODO: It should be sufficient to use the fastest liveness calculation,
-  // i.e. livenessLightweight(). However, for some reason that slows down the
-  // rest of the translation. Investigate.
-  Func->liveness(Liveness_Basic);
-  if (Func->hasError())
-    return;
-  Func->dump("After x86 address mode opt");
-
-  doLoadOpt();
-
-  Func->genCode();
-  if (Func->hasError())
-    return;
-  Func->dump("After x86 codegen");
-  splitBlockLocalVariables(Func);
-
-  // Register allocation. This requires instruction renumbering and full
-  // liveness analysis. Loops must be identified before liveness so variable
-  // use weights are correct.
-  Func->renumberInstructions();
-  if (Func->hasError())
-    return;
-  Func->liveness(Liveness_Intervals);
-  if (Func->hasError())
-    return;
-  // The post-codegen dump is done here, after liveness analysis and associated
-  // cleanup, to make the dump cleaner and more useful.
-  Func->dump("After initial x86 codegen");
-  // Validate the live range computations. The expensive validation call is
-  // deliberately only made when assertions are enabled.
-  assert(Func->validateLiveness());
-  Func->getVMetadata()->init(VMK_All);
-  regAlloc(RAK_Global);
-  if (Func->hasError())
-    return;
-  Func->dump("After linear scan regalloc");
-
-  if (getFlags().getEnablePhiEdgeSplit()) {
-    Func->advancedPhiLowering();
-    Func->dump("After advanced Phi lowering");
-  }
-
-  // Stack frame mapping.
-  Func->genFrame();
-  if (Func->hasError())
-    return;
-  Func->dump("After stack frame mapping");
-
-  Func->contractEmptyNodes();
-  Func->reorderNodes();
-
-  // Branch optimization.  This needs to be done just before code emission. In
-  // particular, no transformations that insert or reorder CfgNodes should be
-  // done after branch optimization. We go ahead and do it before nop insertion
-  // to reduce the amount of work needed when searching for opportunities.
-  Func->doBranchOpt();
-  Func->dump("After branch optimization");
-}
-
-template <typename TraitsType> void TargetX86Base<TraitsType>::translateOm1() {
-  TimerMarker T(TimerStack::TT_Om1, Func);
-
-  genTargetHelperCalls();
-
-  // Om1 would normally not merge Alloca instructions when laying out the
-  // stack (SortAndCombineAllocas = false), but merging is temporarily forced
-  // on.
-  static constexpr bool SortAndCombineAllocas =
-      true; // TODO(b/171222930): Fix Win32 bug when this is false
-  Func->processAllocas(SortAndCombineAllocas);
-  Func->dump("After Alloca processing");
-
-  Func->placePhiLoads();
-  if (Func->hasError())
-    return;
-  Func->placePhiStores();
-  if (Func->hasError())
-    return;
-  Func->deletePhis();
-  if (Func->hasError())
-    return;
-  Func->dump("After Phi lowering");
-
-  Func->doArgLowering();
-  Func->genCode();
-  if (Func->hasError())
-    return;
-  Func->dump("After initial x86 codegen");
-
-  regAlloc(RAK_InfOnly);
-  if (Func->hasError())
-    return;
-  Func->dump("After regalloc of infinite-weight variables");
-
-  Func->genFrame();
-  if (Func->hasError())
-    return;
-  Func->dump("After stack frame mapping");
-}
-
-inline bool canRMW(const InstArithmetic *Arith) {
-  Type Ty = Arith->getDest()->getType();
-  // X86 vector instructions write to a register and have no RMW option.
-  if (isVectorType(Ty))
-    return false;
-  bool isI64 = Ty == IceType_i64;
-
-  switch (Arith->getOp()) {
-  // Not handled for lack of simple lowering:
-  //   shift on i64
-  //   mul, udiv, urem, sdiv, srem, frem
-  // Not handled for lack of RMW instructions:
-  //   fadd, fsub, fmul, fdiv (also vector types)
-  default:
-    return false;
-  case InstArithmetic::Add:
-  case InstArithmetic::Sub:
-  case InstArithmetic::And:
-  case InstArithmetic::Or:
-  case InstArithmetic::Xor:
-    return true;
-  case InstArithmetic::Shl:
-  case InstArithmetic::Lshr:
-  case InstArithmetic::Ashr:
-    return false; // TODO(stichnot): implement
-    return !isI64;
-  }
-}
-
-template <typename TraitsType>
-bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
-  if (A == B)
-    return true;
-  if (auto *MemA =
-          llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>(
-              A)) {
-    if (auto *MemB =
-            llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>(
-                B)) {
-      return MemA->getBase() == MemB->getBase() &&
-             MemA->getOffset() == MemB->getOffset() &&
-             MemA->getIndex() == MemB->getIndex() &&
-             MemA->getShift() == MemB->getShift() &&
-             MemA->getSegmentRegister() == MemB->getSegmentRegister();
-    }
-  }
-  return false;
-}
-
-template <typename TraitsType> void TargetX86Base<TraitsType>::findRMW() {
-  TimerMarker _(TimerStack::TT_findRMW, Func);
-  Func->dump("Before RMW");
-  if (Func->isVerbose(IceV_RMW))
-    Func->getContext()->lockStr();
-  for (CfgNode *Node : Func->getNodes()) {
-    // Walk through the instructions, considering each sequence of 3
-    // instructions, and look for the particular RMW pattern. Note that this
-    // search can be "broken" (false negatives) if there are intervening
-    // deleted instructions, or intervening instructions that could be safely
-    // moved out of the way to reveal an RMW pattern.
-    auto E = Node->getInsts().end();
-    auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
-    for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
-      // Make I3 skip over deleted instructions.
-      while (I3 != E && I3->isDeleted())
-        ++I3;
-      if (I1 == E || I2 == E || I3 == E)
-        continue;
-      assert(!I1->isDeleted());
-      assert(!I2->isDeleted());
-      assert(!I3->isDeleted());
-      auto *Load = llvm::dyn_cast<InstLoad>(I1);
-      auto *Arith = llvm::dyn_cast<InstArithmetic>(I2);
-      auto *Store = llvm::dyn_cast<InstStore>(I3);
-      if (!Load || !Arith || !Store)
-        continue;
-      // Look for:
-      //   a = Load addr
-      //   b = <op> a, other
-      //   Store b, addr
-      // Change to:
-      //   a = Load addr
-      //   b = <op> a, other
-      //   x = FakeDef
-      //   RMW <op>, addr, other, x
-      //   b = Store b, addr, x
-      // Note that inferTwoAddress() makes sure setDestRedefined() gets called
-      // on the updated Store instruction, to avoid liveness problems later.
-      //
-      // With this transformation, the Store instruction acquires a Dest
-      // variable and is now subject to dead code elimination if there are no
-      // more uses of "b".  Variable "x" is a beacon for determining whether the
-      // Store instruction gets dead-code eliminated.  If the Store instruction
-      // is eliminated, then it must be the case that the RMW instruction ends
-      // x's live range, and therefore the RMW instruction will be retained and
-      // later lowered.  On the other hand, if the RMW instruction does not end
-      // x's live range, then the Store instruction must still be present, and
-      // therefore the RMW instruction is ignored during lowering because it is
-      // redundant with the Store instruction.
-      //
-      // Note that if "a" has further uses, the RMW transformation may still
-      // trigger, resulting in two loads and one store, which is worse than the
-      // original one load and one store.  However, this is probably rare, and
-      // caching probably keeps it just as fast.
-      if (!isSameMemAddressOperand<TraitsType>(Load->getLoadAddress(),
-                                               Store->getStoreAddress()))
-        continue;
-      Operand *ArithSrcFromLoad = Arith->getSrc(0);
-      Operand *ArithSrcOther = Arith->getSrc(1);
-      if (ArithSrcFromLoad != Load->getDest()) {
-        if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
-          continue;
-        std::swap(ArithSrcFromLoad, ArithSrcOther);
-      }
-      if (Arith->getDest() != Store->getData())
-        continue;
-      if (!canRMW(Arith))
-        continue;
-      if (Func->isVerbose(IceV_RMW)) {
-        Ostream &Str = Func->getContext()->getStrDump();
-        Str << "Found RMW in " << Func->getFunctionName() << ":\n  ";
-        Load->dump(Func);
-        Str << "\n  ";
-        Arith->dump(Func);
-        Str << "\n  ";
-        Store->dump(Func);
-        Str << "\n";
-      }
-      Variable *Beacon = Func->makeVariable(IceType_i32);
-      Beacon->setMustNotHaveReg();
-      Store->setRmwBeacon(Beacon);
-      auto *BeaconDef = InstFakeDef::create(Func, Beacon);
-      Node->getInsts().insert(I3, BeaconDef);
-      auto *RMW =
-          InstX86FakeRMW::create(Func, ArithSrcOther, Store->getStoreAddress(),
-                                 Beacon, Arith->getOp());
-      Node->getInsts().insert(I3, RMW);
-    }
-  }
-  if (Func->isVerbose(IceV_RMW))
-    Func->getContext()->unlockStr();
-}
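-
-// Illustrative before/after for the pattern above (hypothetical operands):
-//   before:  a = load addr;  b = add a, other;  store b, addr
-//   after:   a single read-modify-write instruction, e.g. "add [addr], other"
-// assuming "a" and "b" have no other uses, so the load and store fold away.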
-
-// Converts a ConstantInteger32 operand into its constant value, or
-// MemoryOrderInvalid if the operand is not a ConstantInteger32.
-inline uint64_t getConstantMemoryOrder(Operand *Opnd) {
-  if (auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
-    return Integer->getValue();
-  return Intrinsics::MemoryOrderInvalid;
-}
-
-/// Determines whether the dest of a Load instruction can be folded into one of
-/// the src operands of a 2-operand instruction. This is true as long as the
-/// load dest matches exactly one of the binary instruction's src operands.
-/// Replaces Src0 or Src1 with LoadSrc if the answer is true.
-inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
-                                      Operand *&Src0, Operand *&Src1) {
-  if (Src0 == LoadDest && Src1 != LoadDest) {
-    Src0 = LoadSrc;
-    return true;
-  }
-  if (Src0 != LoadDest && Src1 == LoadDest) {
-    Src1 = LoadSrc;
-    return true;
-  }
-  return false;
-}
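-
-// Usage sketch (hypothetical operands): given
-//   a = load addr
-//   c = add b, a
-// this replaces the matching source so the pair can lower to
-//   c = add b, [addr]
-// and the load's dest "a" no longer needs its own register.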
-
-template <typename TraitsType> void TargetX86Base<TraitsType>::doLoadOpt() {
-  TimerMarker _(TimerStack::TT_loadOpt, Func);
-  for (CfgNode *Node : Func->getNodes()) {
-    Context.init(Node);
-    while (!Context.atEnd()) {
-      Variable *LoadDest = nullptr;
-      Operand *LoadSrc = nullptr;
-      Inst *CurInst = iteratorToInst(Context.getCur());
-      Inst *Next = Context.getNextInst();
-      // Determine whether the current instruction is a Load instruction or
-      // equivalent.
-      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
-        // An InstLoad qualifies unless it uses a 64-bit absolute address,
-        // which requires legalization to insert a copy to a register.
-        // TODO(b/148272103): Fold these after legalization.
-        if (!Traits::Is64Bit || !llvm::isa<Constant>(Load->getLoadAddress())) {
-          LoadDest = Load->getDest();
-          constexpr bool DoLegalize = false;
-          LoadSrc = formMemoryOperand(Load->getLoadAddress(),
-                                      LoadDest->getType(), DoLegalize);
-        }
-      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsic>(CurInst)) {
-        // An AtomicLoad intrinsic qualifies as long as it has a valid memory
-        // ordering, and can be implemented in a single instruction (i.e., not
-        // i64 on x86-32).
-        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicID();
-        if (ID == Intrinsics::AtomicLoad &&
-            (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) &&
-            Intrinsics::isMemoryOrderValid(
-                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
-          LoadDest = Intrin->getDest();
-          constexpr bool DoLegalize = false;
-          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
-                                      DoLegalize);
-        }
-      }
-      // A Load instruction can be folded into the following instruction only
-      // if the following instruction ends the Load's Dest variable's live
-      // range.
-      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
-        assert(LoadSrc);
-        Inst *NewInst = nullptr;
-        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
-          Operand *Src0 = Arith->getSrc(0);
-          Operand *Src1 = Arith->getSrc(1);
-          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
-            NewInst = InstArithmetic::create(Func, Arith->getOp(),
-                                             Arith->getDest(), Src0, Src1);
-          }
-        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
-          Operand *Src0 = Icmp->getSrc(0);
-          Operand *Src1 = Icmp->getSrc(1);
-          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
-            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
-                                       Icmp->getDest(), Src0, Src1);
-          }
-        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
-          Operand *Src0 = Fcmp->getSrc(0);
-          Operand *Src1 = Fcmp->getSrc(1);
-          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
-            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
-                                       Fcmp->getDest(), Src0, Src1);
-          }
-        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
-          Operand *Src0 = Select->getTrueOperand();
-          Operand *Src1 = Select->getFalseOperand();
-          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
-            NewInst = InstSelect::create(Func, Select->getDest(),
-                                         Select->getCondition(), Src0, Src1);
-          }
-        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
-          // The load dest can always be folded into a Cast instruction.
-          auto *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
-          if (Src0 == LoadDest) {
-            NewInst = InstCast::create(Func, Cast->getCastKind(),
-                                       Cast->getDest(), LoadSrc);
-          }
-        }
-        if (NewInst) {
-          CurInst->setDeleted();
-          Next->setDeleted();
-          Context.insert(NewInst);
-          // Update NewInst->LiveRangesEnded so that target lowering may
-          // benefit. Also update NewInst->HasSideEffects.
-          NewInst->spliceLivenessInfo(Next, CurInst);
-        }
-      }
-      Context.advanceCur();
-      Context.advanceNext();
-    }
-  }
-  Func->dump("After load optimization");
-}
-
-template <typename TraitsType>
-bool TargetX86Base<TraitsType>::doBranchOpt(Inst *I, const CfgNode *NextNode) {
-  if (auto *Br = llvm::dyn_cast<InstX86Br>(I)) {
-    return Br->optimizeBranch(NextNode);
-  }
-  return false;
-}
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::getPhysicalRegister(RegNumT RegNum,
-                                                         Type Ty) {
-  if (Ty == IceType_void)
-    Ty = IceType_i32;
-  if (PhysicalRegisters[Ty].empty())
-    PhysicalRegisters[Ty].resize(Traits::RegisterSet::Reg_NUM);
-  assert(unsigned(RegNum) < PhysicalRegisters[Ty].size());
-  Variable *Reg = PhysicalRegisters[Ty][RegNum];
-  if (Reg == nullptr) {
-    Reg = Func->makeVariable(Ty);
-    Reg->setRegNum(RegNum);
-    PhysicalRegisters[Ty][RegNum] = Reg;
-    // Specially mark a named physical register as an "argument" so that it is
-    // considered live upon function entry.  Otherwise it's possible to get
-    // liveness validation errors for saving callee-save registers.
-    Func->addImplicitArg(Reg);
-    // Don't bother tracking the live range of a named physical register.
-    Reg->setIgnoreLiveness();
-  }
-  assert(Traits::getGprForType(Ty, RegNum) == RegNum);
-  return Reg;
-}
-
-template <typename TraitsType>
-const char *TargetX86Base<TraitsType>::getRegName(RegNumT RegNum,
-                                                  Type Ty) const {
-  return Traits::getRegName(Traits::getGprForType(Ty, RegNum));
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::emitVariable(const Variable *Var) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  if (Var->hasReg()) {
-    Str << "%" << getRegName(Var->getRegNum(), Var->getType());
-    return;
-  }
-  if (Var->mustHaveReg()) {
-    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
-                             ") has no register assigned - function " +
-                             Func->getFunctionName());
-  }
-  const int32_t Offset = Var->getStackOffset();
-  auto BaseRegNum = Var->getBaseRegNum();
-  if (BaseRegNum.hasNoValue())
-    BaseRegNum = getFrameOrStackReg();
-
-  // Print in the form "Offset(%reg)", omitting Offset when it is 0.
-  if (getFlags().getDecorateAsm()) {
-    Str << Var->getSymbolicStackOffset();
-  } else if (Offset != 0) {
-    Str << Offset;
-  }
-  const Type FrameSPTy = Traits::WordType;
-  Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
-}
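-
-// Example emissions (hypothetical operands): a register-allocated variable
-// prints as e.g. "%rax"; a stack slot at offset 12 from the base register
-// prints as "12(%rbp)"; a zero offset prints as just "(%rsp)".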
-
-template <typename TraitsType>
-typename TargetX86Base<TraitsType>::X86Address
-TargetX86Base<TraitsType>::stackVarToAsmOperand(const Variable *Var) const {
-  if (Var->hasReg())
-    llvm::report_fatal_error("Stack Variable has a register assigned");
-  if (Var->mustHaveReg()) {
-    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
-                             ") has no register assigned - function " +
-                             Func->getFunctionName());
-  }
-  int32_t Offset = Var->getStackOffset();
-  auto BaseRegNum = Var->getBaseRegNum();
-  if (BaseRegNum.hasNoValue()) {
-    // If the stack pointer needs alignment, we must use the frame pointer for
-    // arguments. For locals, getFrameOrStackReg will return the stack pointer
-    // in this case.
-    if (needsStackPointerAlignment() && Var->getIsArg()) {
-      assert(hasFramePointer());
-      BaseRegNum = getFrameReg();
-    } else {
-      BaseRegNum = getFrameOrStackReg();
-    }
-  }
-  return X86Address(Traits::getEncodedGPR(BaseRegNum), Offset,
-                    AssemblerFixup::NoFixup);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
-  // Stack frame layout:
-  //
-  // +------------------------+  ^ +
-  // | 1. return address      |  |
-  // +------------------------+  v -
-  // | 2. preserved registers |
-  // +------------------------+ <--- BasePointer (if used)
-  // | 3. padding             |
-  // +------------------------+
-  // | 4. global spill area   |
-  // +------------------------+
-  // | 5. padding             |
-  // +------------------------+
-  // | 6. local spill area    |
-  // +------------------------+
-  // | 7. padding             |
-  // +------------------------+
-  // | 7.5 shadow (WinX64)    |
-  // +------------------------+
-  // | 8. allocas             |
-  // +------------------------+
-  // | 9. padding             |
-  // +------------------------+
-  // | 10. out args           |
-  // +------------------------+ <--- StackPointer
-  //
-  // The following variables record the size in bytes of the given areas:
-  //  * X86_RET_IP_SIZE_BYTES:   area 1
-  //  * PreservedRegsSizeBytes:  area 2
-  //  * SpillAreaPaddingBytes:   area 3
-  //  * GlobalsSize:             area 4
-  //  * LocalsSlotsPaddingBytes: area 5
-  //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
-  //  * LocalsSpillAreaSize:     area 6
-  //  * FixedAllocaSizeBytes:    areas 7 - 8
-  //  * SpillAreaSizeBytes:      areas 3 - 10
-  //  * maxOutArgsSizeBytes():   areas 9 - 10
-
-  // Determine stack frame offsets for each Variable without a register
-  // assignment. This can be done as one variable per stack slot, or by
-  // coalescing: run the register allocator again with an infinite set of
-  // registers (as a side effect, this gives variables a second chance at
-  // physical register assignment).
-  //
-  // A middle ground approach is to leverage sparsity and allocate one block of
-  // space on the frame for globals (variables with multi-block lifetime), and
-  // one block to share for locals (single-block lifetime).
-
-  const SizeT ShadowStoreSize = getShadowStoreSize();
-
-  // StackPointer: points just past the return address of the calling function
-
-  Context.init(Node);
-  Context.setInsertPoint(Context.getCur());
-
-  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
-  RegsUsed = SmallBitVector(CalleeSaves.size());
-  VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
-  size_t GlobalsSize = 0;
-  // If there is a separate locals area, this represents that area. Otherwise
-  // it counts any variable not counted by GlobalsSize.
-  SpillAreaSizeBytes = 0;
-  // If there is a separate locals area, this specifies the alignment for it.
-  uint32_t LocalsSlotsAlignmentBytes = 0;
-  // The entire spill locations area gets aligned to largest natural alignment
-  // of the variables that have a spill slot.
-  uint32_t SpillAreaAlignmentBytes = 0;
-  // A spill slot linked to a variable with a stack slot should reuse that
-  // stack slot.
-  std::function<bool(Variable *)> TargetVarHook =
-      [&VariablesLinkedToSpillSlots](Variable *Var) {
-        // TODO(stichnot): Refactor this into the base class.
-        Variable *Root = Var->getLinkedToStackRoot();
-        if (Root != nullptr) {
-          assert(!Root->hasReg());
-          if (!Root->hasReg()) {
-            VariablesLinkedToSpillSlots.push_back(Var);
-            return true;
-          }
-        }
-        return false;
-      };
-
-  // Compute the list of spilled variables and bounds for GlobalsSize, etc.
-  getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
-                        &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
-                        &LocalsSlotsAlignmentBytes, TargetVarHook);
-  uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
-  SpillAreaSizeBytes += GlobalsSize;
-
-  // Add push instructions for preserved registers.
-  uint32_t NumCallee = 0;
-  size_t PreservedRegsSizeBytes = 0;
-  SmallBitVector Pushed(CalleeSaves.size());
-  for (RegNumT i : RegNumBVIter(CalleeSaves)) {
-    const auto Canonical = Traits::getBaseReg(i);
-    assert(Canonical == Traits::getBaseReg(Canonical));
-    if (RegsUsed[i]) {
-      Pushed[Canonical] = true;
-    }
-  }
-  for (RegNumT RegNum : RegNumBVIter(Pushed)) {
-    assert(RegNum == Traits::getBaseReg(RegNum));
-    ++NumCallee;
-    if (Traits::isXmm(RegNum)) {
-      PreservedRegsSizeBytes += 16;
-    } else {
-      PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
-    }
-    _push_reg(RegNum);
-  }
-  Ctx->statsUpdateRegistersSaved(NumCallee);
-
-  // StackPointer: points past preserved registers at start of spill area
-
-  // Generate "push frameptr; mov frameptr, stackptr"
-  if (IsEbpBasedFrame) {
-    assert(
-        (RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None)).count() ==
-        0);
-    PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
-    _link_bp();
-  }
-
-  // Align the variables area. SpillAreaPaddingBytes is the size of the region
-  // after the preserved registers and before the spill areas.
-  // LocalsSlotsPaddingBytes is the amount of padding between the globals and
-  // locals area if they are separate.
-  assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
-  uint32_t SpillAreaPaddingBytes = 0;
-  uint32_t LocalsSlotsPaddingBytes = 0;
-  alignStackSpillAreas(Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
-                       SpillAreaAlignmentBytes, GlobalsSize,
-                       LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
-                       &LocalsSlotsPaddingBytes);
-  SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
-  uint32_t GlobalsAndSubsequentPaddingSize =
-      GlobalsSize + LocalsSlotsPaddingBytes;
-
-  RequiredStackAlignment =
-      std::max<size_t>(RequiredStackAlignment, SpillAreaAlignmentBytes);
-
-  if (PrologEmitsFixedAllocas) {
-    RequiredStackAlignment =
-        std::max(RequiredStackAlignment, FixedAllocaAlignBytes);
-  }
-
-  // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
-  // fixed allocations in the prolog.
-  if (PrologEmitsFixedAllocas)
-    SpillAreaSizeBytes += FixedAllocaSizeBytes;
-
-  // Win64 ABI: add space for shadow store (aka home space)
-  SpillAreaSizeBytes += ShadowStoreSize;
-
-  // Entering the function has made the stack pointer unaligned. Re-align it by
-  // adjusting the stack size.
-  // Note that StackOffset does not include spill area. It's the offset from the
-  // base stack pointer (ebp), whether we set it or not, to the first stack
-  // arg (if any). StackSize, on the other hand, does include the spill area.
-  const uint32_t StackOffset =
-      ShadowStoreSize + Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
-  uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
-                                             RequiredStackAlignment);
-  StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
-                                    RequiredStackAlignment);
-  SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any
-
-  if (SpillAreaSizeBytes) {
-    auto *Func = Node->getCfg();
-    if (SpillAreaSizeBytes > Func->getStackSizeLimit()) {
-      Func->setError("Stack size limit exceeded");
-    }
-
-    emitStackProbe(SpillAreaSizeBytes);
-
-    // Generate "sub stackptr, SpillAreaSizeBytes"
-    _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
-  }
-
-  // StackPointer: points just past the spill area (end of stack frame)
-
-  // If the required alignment is greater than the stack pointer's guaranteed
-  // alignment, align the stack pointer accordingly.
-  if (RequiredStackAlignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
-    assert(IsEbpBasedFrame);
-    _and(getPhysicalRegister(getStackReg(), Traits::WordType),
-         Ctx->getConstantInt32(-RequiredStackAlignment));
-  }
-
-  // StackPointer: may have just been offset for alignment
-
-  // Account for known-frame-offset alloca instructions that were not already
-  // combined into the prolog.
-  if (!PrologEmitsFixedAllocas)
-    SpillAreaSizeBytes += FixedAllocaSizeBytes;
-
-  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
-
-  // Fill in stack offsets for stack args, and copy args into registers for
-  // those that were register-allocated. Args are pushed right to left, so
-  // Arg[0] is closest to the stack/frame pointer.
-  RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
-  Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, Traits::WordType);
-  size_t BasicFrameOffset = StackOffset;
-  if (!IsEbpBasedFrame)
-    BasicFrameOffset += SpillAreaSizeBytes;
-
-  const VarList &Args = Func->getArgs();
-  size_t InArgsSizeBytes = 0;
-  unsigned NumXmmArgs = 0;
-  unsigned NumGPRArgs = 0;
-  for (SizeT i = 0, NumArgs = Args.size(); i < NumArgs; ++i) {
-    Variable *Arg = Args[i];
-    // Skip arguments passed in registers.
-    if (isVectorType(Arg->getType())) {
-      if (Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
-              .hasValue()) {
-        ++NumXmmArgs;
-        continue;
-      }
-    } else if (isScalarFloatingType(Arg->getType())) {
-      if (Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
-              .hasValue()) {
-        ++NumXmmArgs;
-        continue;
-      }
-    } else {
-      assert(isScalarIntegerType(Arg->getType()));
-      if (Traits::getRegisterForGprArgNum(Traits::WordType,
-                                          Traits::getArgIndex(i, NumGPRArgs))
-              .hasValue()) {
-        ++NumGPRArgs;
-        continue;
-      }
-    }
-    // For esp-based frames where the allocas are done outside the prolog, the
-    // esp value may not stabilize to its home value until after all the
-    // fixed-size alloca instructions have executed.  In this case, a stack
-    // adjustment is needed when accessing in-args in order to copy them into
-    // registers.
-    size_t StackAdjBytes = 0;
-    if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas)
-      StackAdjBytes -= FixedAllocaSizeBytes;
-    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
-                           InArgsSizeBytes);
-  }
-
-  // Fill in stack offsets for locals.
-  assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
-                      SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
-                      IsEbpBasedFrame && !needsStackPointerAlignment());
-  // Assign stack offsets to variables that have been linked to spilled
-  // variables.
-  for (Variable *Var : VariablesLinkedToSpillSlots) {
-    const Variable *Root = Var->getLinkedToStackRoot();
-    assert(Root != nullptr);
-    Var->setStackOffset(Root->getStackOffset());
-
-    // If the stack root variable is an arg, make this variable an arg too so
-    // that stackVarToAsmOperand uses the correct base pointer (e.g. ebp on
-    // x86).
-    Var->setIsArg(Root->getIsArg());
-  }
-  this->HasComputedFrame = true;
-
-  if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
-    OstreamLocker L(Func->getContext());
-    Ostream &Str = Func->getContext()->getStrDump();
-
-    Str << "Stack layout:\n";
-    uint32_t EspAdjustmentPaddingSize =
-        SpillAreaSizeBytes - LocalsSpillAreaSize -
-        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
-        maxOutArgsSizeBytes();
-    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
-        << " return address = " << Traits::X86_RET_IP_SIZE_BYTES << " bytes\n"
-        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
-        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
-        << " globals spill area = " << GlobalsSize << " bytes\n"
-        << " globals-locals spill areas intermediate padding = "
-        << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
-        << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
-        << " esp alignment padding = " << EspAdjustmentPaddingSize
-        << " bytes\n";
-
-    Str << "Stack details:\n"
-        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
-        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
-        << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n"
-        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
-        << " bytes\n"
-        << " is ebp based = " << IsEbpBasedFrame << "\n";
-  }
-}
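-
-// Worked example with hypothetical numbers (x86-64, rbp-based frame, no
-// shadow store): return address = 8 bytes, pushed rbp plus one callee-save
-// register = 16 bytes, so StackOffset = 0 + 8 + 16 = 24. With a 20-byte
-// spill area, 16 bytes of out args, and 16-byte required alignment:
-//   StackSize = align(24 + 20, 16) = 48
-//   StackSize = align(48 + 16, 16) = 64
-//   SpillAreaSizeBytes = 64 - 24 = 40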
-
-/// Helper function for addProlog().
-///
-/// This assumes Arg is an argument passed on the stack. This sets the frame
-/// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
-/// I64 arg that has been split into Lo and Hi components, it calls itself
-/// recursively on the components, taking care to handle Lo first because of the
-/// little-endian architecture. Lastly, this function generates an instruction
-/// to copy Arg into its assigned register if applicable.
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::finishArgumentLowering(
-    Variable *Arg, Variable *FramePtr, size_t BasicFrameOffset,
-    size_t StackAdjBytes, size_t &InArgsSizeBytes) {
-  if (!Traits::Is64Bit) {
-    if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
-      Variable *Lo = Arg64On32->getLo();
-      Variable *Hi = Arg64On32->getHi();
-      finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, StackAdjBytes,
-                             InArgsSizeBytes);
-      finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, StackAdjBytes,
-                             InArgsSizeBytes);
-      return;
-    }
-  }
-  Type Ty = Arg->getType();
-  if (isVectorType(Ty)) {
-    InArgsSizeBytes = Traits::applyStackAlignment(InArgsSizeBytes);
-  }
-  Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
-  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
-  if (Arg->hasReg()) {
-    assert(Ty != IceType_i64 || Traits::Is64Bit);
-    auto *Mem = X86OperandMem::create(
-        Func, Ty, FramePtr,
-        Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes));
-    if (isVectorType(Arg->getType())) {
-      _movp(Arg, Mem);
-    } else {
-      _mov(Arg, Mem);
-    }
-    // This argument-copying instruction uses an explicit X86OperandMem
-    // operand instead of a Variable, so its fill-from-stack operation has to
-    // be tracked separately for statistics.
-    Ctx->statsUpdateFills();
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::addEpilog(CfgNode *Node) {
-  InstList &Insts = Node->getInsts();
-  InstList::reverse_iterator RI, E;
-  for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
-    if (llvm::isa<typename Traits::Insts::Ret>(*RI))
-      break;
-  }
-  if (RI == E)
-    return;
-
-  // Convert the reverse_iterator position into its corresponding (forward)
-  // iterator position.
-  InstList::iterator InsertPoint = reverseToForwardIterator(RI);
-  --InsertPoint;
-  Context.init(Node);
-  Context.setInsertPoint(InsertPoint);
-
-  if (IsEbpBasedFrame) {
-    _unlink_bp();
-  } else {
-    // add stackptr, SpillAreaSizeBytes
-    if (SpillAreaSizeBytes != 0) {
-      _add_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
-    }
-  }
-
-  // Add pop instructions for preserved registers.
-  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
-  SmallBitVector Popped(CalleeSaves.size());
-  for (int32_t i = CalleeSaves.size() - 1; i >= 0; --i) {
-    const auto RegNum = RegNumT::fromInt(i);
-    if (RegNum == getFrameReg() && IsEbpBasedFrame)
-      continue;
-    const RegNumT Canonical = Traits::getBaseReg(RegNum);
-    if (CalleeSaves[i] && RegsUsed[i]) {
-      Popped[Canonical] = true;
-    }
-  }
-  for (int32_t i = Popped.size() - 1; i >= 0; --i) {
-    if (!Popped[i])
-      continue;
-    const auto RegNum = RegNumT::fromInt(i);
-    assert(RegNum == Traits::getBaseReg(RegNum));
-    _pop_reg(RegNum);
-  }
-}
-
-template <typename TraitsType> Type TargetX86Base<TraitsType>::stackSlotType() {
-  return Traits::WordType;
-}
-
-template <typename TraitsType>
-template <typename T>
-typename std::enable_if<!T::Is64Bit, Operand>::type *
-TargetX86Base<TraitsType>::loOperand(Operand *Operand) {
-  assert(Operand->getType() == IceType_i64 ||
-         Operand->getType() == IceType_f64);
-  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
-    return Operand;
-  if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
-    return Var64On32->getLo();
-  if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
-    auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
-        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue())));
-    // Check if we need to blind/pool the constant.
-    return legalize(ConstInt);
-  }
-  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
-    auto *MemOperand = X86OperandMem::create(
-        Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getIndex(),
-        Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
-    // Test whether we should randomize or pool the offset; if so, create the
-    // mem operand with the blinded/pooled constant. Otherwise, return it as
-    // an ordinary mem operand.
-    return legalize(MemOperand);
-  }
-  llvm_unreachable("Unsupported operand type");
-  return nullptr;
-}
-
-template <typename TraitsType>
-template <typename T>
-typename std::enable_if<!T::Is64Bit, Operand>::type *
-TargetX86Base<TraitsType>::hiOperand(Operand *Operand) {
-  assert(Operand->getType() == IceType_i64 ||
-         Operand->getType() == IceType_f64);
-  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
-    return Operand;
-  if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
-    return Var64On32->getHi();
-  if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
-    auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
-        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue() >> 32)));
-    // Check if we need to blind/pool the constant.
-    return legalize(ConstInt);
-  }
-  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
-    Constant *Offset = Mem->getOffset();
-    if (Offset == nullptr) {
-      Offset = Ctx->getConstantInt32(4);
-    } else if (auto *IntOffset = llvm::dyn_cast<ConstantInteger32>(Offset)) {
-      Offset = Ctx->getConstantInt32(4 + IntOffset->getValue());
-    } else if (auto *SymOffset = llvm::dyn_cast<ConstantRelocatable>(Offset)) {
-      assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4));
-      Offset =
-          Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName());
-    }
-    auto *MemOperand = X86OperandMem::create(
-        Func, IceType_i32, Mem->getBase(), Offset, Mem->getIndex(),
-        Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
-    // Test whether the Offset is an i32 constant eligible for randomization
-    // and pooling. Blind/pool it if it is; otherwise return it as an ordinary
-    // mem operand.
-    return legalize(MemOperand);
-  }
-  llvm_unreachable("Unsupported operand type");
-  return nullptr;
-}
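-
-// Worked example (32-bit target): splitting the i64 constant
-// 0x1122334455667788 yields loOperand = 0x55667788 and
-// hiOperand = 0x11223344; for a memory operand, the hi half is the same
-// address with 4 added to the lo half's offset.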
-
-template <typename TraitsType>
-SmallBitVector
-TargetX86Base<TraitsType>::getRegisterSet(RegSetMask Include,
-                                          RegSetMask Exclude) const {
-  return Traits::getRegisterSet(getFlags(), Include, Exclude);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerAlloca(const InstAlloca *Instr) {
-  // Conservatively require the stack to be aligned. Some stack adjustment
-  // operations implemented below assume that the stack is aligned before the
-  // alloca. All the alloca code ensures that the stack alignment is preserved
-  // after the alloca. The stack alignment restriction can be relaxed in some
-  // cases.
-  RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
-                                            Traits::X86_STACK_ALIGNMENT_BYTES);
-
-  // For default align=0, set it to the real value 1, to avoid any
-  // bit-manipulation problems below.
-  const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes());
-
-  // LLVM enforces power of 2 alignment.
-  assert(llvm::isPowerOf2_32(AlignmentParam));
-  assert(llvm::isPowerOf2_32(Traits::X86_STACK_ALIGNMENT_BYTES));
-
-  const uint32_t Alignment =
-      std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES);
-  const bool OverAligned = Alignment > Traits::X86_STACK_ALIGNMENT_BYTES;
-  const bool OptM1 = Func->getOptLevel() == Opt_m1;
-  const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset();
-  const bool UseFramePointer =
-      hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;
-
-  if (UseFramePointer)
-    setHasFramePointer();
-
-  Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
-  if (OverAligned) {
-    _and(esp, Ctx->getConstantInt32(-Alignment));
-  }
-
-  Variable *Dest = Instr->getDest();
-  Operand *TotalSize = legalize(Instr->getSizeInBytes());
-
-  if (const auto *ConstantTotalSize =
-          llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
-    const uint32_t Value =
-        Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
-    if (UseFramePointer) {
-      _sub_sp(Ctx->getConstantInt32(Value));
-    } else {
-      // If we don't need a Frame Pointer, this alloca has a known offset to the
-      // stack pointer. We don't need to adjust the stack pointer, nor to
-      // assign any value to Dest, as Dest is rematerializable.
-      assert(Dest->isRematerializable());
-      FixedAllocaSizeBytes += Value;
-      Context.insert<InstFakeDef>(Dest);
-    }
-  } else {
-    // Non-constant sizes need to be adjusted to the next highest multiple of
-    // the required alignment at runtime.
-    Variable *T = nullptr;
-    if (Traits::Is64Bit && TotalSize->getType() != IceType_i64) {
-      T = makeReg(IceType_i64);
-      _movzx(T, TotalSize);
-    } else {
-      T = makeReg(IceType_i32);
-      _mov(T, TotalSize);
-    }
-    _add(T, Ctx->getConstantInt32(Alignment - 1));
-    _and(T, Ctx->getConstantInt32(-Alignment));
-    _sub_sp(T);
-  }
-  // Add enough to the returned address to account for the out args area.
-  uint32_t OutArgsSize = maxOutArgsSizeBytes();
-  if (OutArgsSize > 0) {
-    Variable *T = makeReg(Dest->getType());
-    auto *CalculateOperand = X86OperandMem::create(
-        Func, IceType_void, esp, Ctx->getConstantInt(IceType_i32, OutArgsSize));
-    _lea(T, CalculateOperand);
-    _mov(Dest, T);
-  } else {
-    _mov(Dest, esp);
-  }
-}
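-
-// Sketch of the emitted sequence for an over-aligned, variable-size alloca
-// (hypothetical: size n, 64-byte alignment, x86-64):
-//   and rsp, -64                      ; over-align the stack pointer
-//   mov T, n                          ; T = requested size
-//   add T, 63
-//   and T, -64                        ; round size up to the alignment
-//   sub rsp, T
-//   lea dest, [rsp + OutArgsSize]     ; skip past the out args area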
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerArguments() {
-  const bool OptM1 = Func->getOptLevel() == Opt_m1;
-  VarList &Args = Func->getArgs();
-  unsigned NumXmmArgs = 0;
-  bool XmmSlotsRemain = true;
-  unsigned NumGprArgs = 0;
-  bool GprSlotsRemain = true;
-
-  Context.init(Func->getEntryNode());
-  Context.setInsertPoint(Context.getCur());
-
-  for (SizeT i = 0, End = Args.size();
-       i < End && (XmmSlotsRemain || GprSlotsRemain); ++i) {
-    Variable *Arg = Args[i];
-    Type Ty = Arg->getType();
-    Variable *RegisterArg = nullptr;
-    RegNumT RegNum;
-    if (isVectorType(Ty)) {
-      RegNum =
-          Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
-      if (RegNum.hasNoValue()) {
-        XmmSlotsRemain = false;
-        continue;
-      }
-      ++NumXmmArgs;
-      RegisterArg = Func->makeVariable(Ty);
-    } else if (isScalarFloatingType(Ty)) {
-      RegNum =
-          Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
-      if (RegNum.hasNoValue()) {
-        XmmSlotsRemain = false;
-        continue;
-      }
-      ++NumXmmArgs;
-      RegisterArg = Func->makeVariable(Ty);
-    } else if (isScalarIntegerType(Ty)) {
-      RegNum = Traits::getRegisterForGprArgNum(
-          Ty, Traits::getArgIndex(i, NumGprArgs));
-      if (RegNum.hasNoValue()) {
-        GprSlotsRemain = false;
-        continue;
-      }
-      ++NumGprArgs;
-      RegisterArg = Func->makeVariable(Ty);
-    }
-    assert(RegNum.hasValue());
-    assert(RegisterArg != nullptr);
-    // Replace Arg in the argument list with the home register. Then generate
-    // an instruction in the prolog to copy the home register to the assigned
-    // location of Arg.
-    if (BuildDefs::dump())
-      RegisterArg->setName(Func, "home_reg:" + Arg->getName());
-    RegisterArg->setRegNum(RegNum);
-    RegisterArg->setIsArg();
-    Arg->setIsArg(false);
-
-    Args[i] = RegisterArg;
-    // When not Om1, do the assignment through a temporary, instead of directly
-    // from the pre-colored variable, so that a subsequent availabilityGet()
-    // call has a chance to work.  (In Om1, don't bother creating extra
-    // instructions with extra variables to register-allocate.)
-    if (OptM1) {
-      Context.insert<InstAssign>(Arg, RegisterArg);
-    } else {
-      Variable *Tmp = makeReg(RegisterArg->getType());
-      Context.insert<InstAssign>(Tmp, RegisterArg);
-      Context.insert<InstAssign>(Arg, Tmp);
-    }
-  }
-  if (!OptM1)
-    Context.availabilityUpdate();
-}
-
-/// Strength-reduce scalar integer multiplication by a constant (for i32 or
-/// narrower) for certain constants. The lea instruction can be used to multiply
-/// by 3, 5, or 9, and the shl instruction can be used to multiply by powers of
-/// 2. These can be combined such that e.g. multiplying by 100 can be done as 2
-/// lea-based multiplies by 5, combined with left-shifting by 2.
-template <typename TraitsType>
-bool TargetX86Base<TraitsType>::optimizeScalarMul(Variable *Dest, Operand *Src0,
-                                                  int32_t Src1) {
-  // Disable this optimization for Om1 and O0, just to keep things simple
-  // there.
-  if (Func->getOptLevel() < Opt_1)
-    return false;
-  Type Ty = Dest->getType();
-  if (Src1 == -1) {
-    Variable *T = nullptr;
-    _mov(T, Src0);
-    _neg(T);
-    _mov(Dest, T);
-    return true;
-  }
-  if (Src1 == 0) {
-    _mov(Dest, Ctx->getConstantZero(Ty));
-    return true;
-  }
-  if (Src1 == 1) {
-    Variable *T = nullptr;
-    _mov(T, Src0);
-    _mov(Dest, T);
-    return true;
-  }
-  // Don't bother with the edge case where Src1 == MININT.
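-  // (In two's complement -MININT == MININT, so the Src1 == -Src1 check below
-  // catches exactly that value.)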
-  if (Src1 == -Src1)
-    return false;
-  const bool Src1IsNegative = Src1 < 0;
-  if (Src1IsNegative)
-    Src1 = -Src1;
-  uint32_t Count9 = 0;
-  uint32_t Count5 = 0;
-  uint32_t Count3 = 0;
-  uint32_t Count2 = 0;
-  uint32_t CountOps = 0;
-  while (Src1 > 1) {
-    if (Src1 % 9 == 0) {
-      ++CountOps;
-      ++Count9;
-      Src1 /= 9;
-    } else if (Src1 % 5 == 0) {
-      ++CountOps;
-      ++Count5;
-      Src1 /= 5;
-    } else if (Src1 % 3 == 0) {
-      ++CountOps;
-      ++Count3;
-      Src1 /= 3;
-    } else if (Src1 % 2 == 0) {
-      if (Count2 == 0)
-        ++CountOps;
-      ++Count2;
-      Src1 /= 2;
-    } else {
-      return false;
-    }
-  }
-  // The lea-based factors only work for i32 (and for i64 on x86-64), so bail
-  // out on narrower types whenever a factor of 3, 5, or 9 was found.
-  if (Ty != IceType_i32 && !(Traits::Is64Bit && Ty == IceType_i64) &&
-      (Count3 || Count5 || Count9))
-    return false;
-  // Limit the number of lea/shl operations for a single multiply to a
-  // somewhat arbitrary maximum of 3.
-  constexpr uint32_t MaxOpsForOptimizedMul = 3;
-  if (CountOps > MaxOpsForOptimizedMul)
-    return false;
-  Variable *T = makeReg(Traits::WordType);
-  if (typeWidthInBytes(Src0->getType()) < typeWidthInBytes(T->getType())) {
-    Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    _movzx(T, Src0RM);
-  } else {
-    _mov(T, Src0);
-  }
-  Constant *Zero = Ctx->getConstantZero(IceType_i32);
-  for (uint32_t i = 0; i < Count9; ++i) {
-    constexpr uint16_t Shift = 3; // log2(9-1)
-    _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
-  }
-  for (uint32_t i = 0; i < Count5; ++i) {
-    constexpr uint16_t Shift = 2; // log2(5-1)
-    _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
-  }
-  for (uint32_t i = 0; i < Count3; ++i) {
-    constexpr uint16_t Shift = 1; // log2(3-1)
-    _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
-  }
-  if (Count2) {
-    _shl(T, Ctx->getConstantInt(Ty, Count2));
-  }
-  if (Src1IsNegative)
-    _neg(T);
-  _mov(Dest, T);
-  return true;
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerShift64(InstArithmetic::OpKind Op,
-                                             Operand *Src0Lo, Operand *Src0Hi,
-                                             Operand *Src1Lo, Variable *DestLo,
-                                             Variable *DestHi) {
-  // TODO: Refactor the similarities between Shl, Lshr, and Ashr.
-  Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
-  Constant *Zero = Ctx->getConstantZero(IceType_i32);
-  Constant *SignExtend = Ctx->getConstantInt32(0x1f);
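-  // (An arithmetic shift right by 0x1f = 31 replicates the sign bit across
-  // the whole register, yielding 0 or -1; the Ashr cases below use this to
-  // sign-fill the high word.)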
-  if (auto *ConstantShiftAmount = llvm::dyn_cast<ConstantInteger32>(Src1Lo)) {
-    uint32_t ShiftAmount = ConstantShiftAmount->getValue();
-    if (ShiftAmount > 32) {
-      Constant *ReducedShift = Ctx->getConstantInt32(ShiftAmount - 32);
-      switch (Op) {
-      default:
-        assert(0 && "non-shift op");
-        break;
-      case InstArithmetic::Shl: {
-        // a=b<<c ==>
-        //   t2 = b.lo
-        //   t2 = shl t2, ShiftAmount-32
-        //   t3 = t2
-        //   t2 = 0
-        _mov(T_2, Src0Lo);
-        _shl(T_2, ReducedShift);
-        _mov(DestHi, T_2);
-        _mov(DestLo, Zero);
-      } break;
-      case InstArithmetic::Lshr: {
-        // a=b>>c (unsigned) ==>
-        //   t2 = b.hi
-        //   t2 = shr t2, ShiftAmount-32
-        //   a.lo = t2
-        //   a.hi = 0
-        _mov(T_2, Src0Hi);
-        _shr(T_2, ReducedShift);
-        _mov(DestLo, T_2);
-        _mov(DestHi, Zero);
-      } break;
-      case InstArithmetic::Ashr: {
-        // a=b>>c (signed) ==>
-        //   t3 = b.hi
-        //   t3 = sar t3, 0x1f
-        //   t2 = b.hi
-        //   t2 = shrd t2, t3, ShiftAmount-32
-        //   a.lo = t2
-        //   a.hi = t3
-        _mov(T_3, Src0Hi);
-        _sar(T_3, SignExtend);
-        _mov(T_2, Src0Hi);
-        _shrd(T_2, T_3, ReducedShift);
-        _mov(DestLo, T_2);
-        _mov(DestHi, T_3);
-      } break;
-      }
-    } else if (ShiftAmount == 32) {
-      switch (Op) {
-      default:
-        assert(0 && "non-shift op");
-        break;
-      case InstArithmetic::Shl: {
-        // a=b<<c ==>
-        //   t2 = b.lo
-        //   a.hi = t2
-        //   a.lo = 0
-        _mov(T_2, Src0Lo);
-        _mov(DestHi, T_2);
-        _mov(DestLo, Zero);
-      } break;
-      case InstArithmetic::Lshr: {
-        // a=b>>c (unsigned) ==>
-        //   t2 = b.hi
-        //   a.lo = t2
-        //   a.hi = 0
-        _mov(T_2, Src0Hi);
-        _mov(DestLo, T_2);
-        _mov(DestHi, Zero);
-      } break;
-      case InstArithmetic::Ashr: {
-        // a=b>>c (signed) ==>
-        //   t2 = b.hi
-        //   a.lo = t2
-        //   t3 = b.hi
-        //   t3 = sar t3, 0x1f
-        //   a.hi = t3
-        _mov(T_2, Src0Hi);
-        _mov(DestLo, T_2);
-        _mov(T_3, Src0Hi);
-        _sar(T_3, SignExtend);
-        _mov(DestHi, T_3);
-      } break;
-      }
-    } else {
-      // COMMON PREFIX OF: a=b SHIFT_OP c ==>
-      //   t2 = b.lo
-      //   t3 = b.hi
-      _mov(T_2, Src0Lo);
-      _mov(T_3, Src0Hi);
-      switch (Op) {
-      default:
-        assert(0 && "non-shift op");
-        break;
-      case InstArithmetic::Shl: {
-        // a=b<<c ==>
-        //   t3 = shld t3, t2, ShiftAmount
-        //   t2 = shl t2, ShiftAmount
-        _shld(T_3, T_2, ConstantShiftAmount);
-        _shl(T_2, ConstantShiftAmount);
-      } break;
-      case InstArithmetic::Lshr: {
-        // a=b>>c (unsigned) ==>
-        //   t2 = shrd t2, t3, ShiftAmount
-        //   t3 = shr t3, ShiftAmount
-        _shrd(T_2, T_3, ConstantShiftAmount);
-        _shr(T_3, ConstantShiftAmount);
-      } break;
-      case InstArithmetic::Ashr: {
-        // a=b>>c (signed) ==>
-        //   t2 = shrd t2, t3, ShiftAmount
-        //   t3 = sar t3, ShiftAmount
-        _shrd(T_2, T_3, ConstantShiftAmount);
-        _sar(T_3, ConstantShiftAmount);
-      } break;
-      }
-      // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
-      //   a.lo = t2
-      //   a.hi = t3
-      _mov(DestLo, T_2);
-      _mov(DestHi, T_3);
-    }
-  } else {
-    // NON-CONSTANT CASES.
-    Constant *BitTest = Ctx->getConstantInt32(0x20);
-    InstX86Label *Label = InstX86Label::create(Func, this);
-    // COMMON PREFIX OF: a=b SHIFT_OP c ==>
-    //   t1:ecx = c.lo & 0xff
-    //   t2 = b.lo
-    //   t3 = b.hi
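-    // (On x86, a variable shift count must live in %cl. The hardware uses
-    // only the low 5 bits of the count for 32-bit operands, which is why bit
-    // 5 is tested explicitly (via 0x20) to handle counts of 32..63.)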
-    T_1 = copyToReg8(Src1Lo, Traits::RegisterSet::Reg_cl);
-    _mov(T_2, Src0Lo);
-    _mov(T_3, Src0Hi);
-    switch (Op) {
-    default:
-      assert(0 && "non-shift op");
-      break;
-    case InstArithmetic::Shl: {
-      // a=b<<c ==>
-      //   t3 = shld t3, t2, t1
-      //   t2 = shl t2, t1
-      //   test t1, 0x20
-      //   je L1
-      //   use(t3)
-      //   t3 = t2
-      //   t2 = 0
-      _shld(T_3, T_2, T_1);
-      _shl(T_2, T_1);
-      _test(T_1, BitTest);
-      _br(CondX86::Br_e, Label);
-      // T_2 and T_3 are being assigned again because of the intra-block control
-      // flow, so we need to use _redefined to avoid liveness problems.
-      _redefined(_mov(T_3, T_2));
-      _redefined(_mov(T_2, Zero));
-    } break;
-    case InstArithmetic::Lshr: {
-      // a=b>>c (unsigned) ==>
-      //   t2 = shrd t2, t3, t1
-      //   t3 = shr t3, t1
-      //   test t1, 0x20
-      //   je L1
-      //   use(t2)
-      //   t2 = t3
-      //   t3 = 0
-      _shrd(T_2, T_3, T_1);
-      _shr(T_3, T_1);
-      _test(T_1, BitTest);
-      _br(CondX86::Br_e, Label);
-      // T_2 and T_3 are being assigned again because of the intra-block control
-      // flow, so we need to use _redefined to avoid liveness problems.
-      _redefined(_mov(T_2, T_3));
-      _redefined(_mov(T_3, Zero));
-    } break;
-    case InstArithmetic::Ashr: {
-      // a=b>>c (signed) ==>
-      //   t2 = shrd t2, t3, t1
-      //   t3 = sar t3, t1
-      //   test t1, 0x20
-      //   je L1
-      //   use(t2)
-      //   t2 = t3
-      //   t3 = sar t3, 0x1f
-      Constant *SignExtend = Ctx->getConstantInt32(0x1f);
-      _shrd(T_2, T_3, T_1);
-      _sar(T_3, T_1);
-      _test(T_1, BitTest);
-      _br(CondX86::Br_e, Label);
-      // T_2 and T_3 are being assigned again because of the intra-block control
-      // flow, so T_2 needs to use _redefined to avoid liveness problems. T_3
-      // doesn't need special treatment because it is reassigned via _sar
-      // instead of _mov.
-      _redefined(_mov(T_2, T_3));
-      _sar(T_3, SignExtend);
-    } break;
-    }
-    // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
-    // L1:
-    //   a.lo = t2
-    //   a.hi = t3
-    Context.insert(Label);
-    _mov(DestLo, T_2);
-    _mov(DestHi, T_3);
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerArithmetic(const InstArithmetic *Instr) {
-  Variable *Dest = Instr->getDest();
-  if (Dest->isRematerializable()) {
-    Context.insert<InstFakeDef>(Dest);
-    return;
-  }
-  Type Ty = Dest->getType();
-  Operand *Src0 = legalize(Instr->getSrc(0));
-  Operand *Src1 = legalize(Instr->getSrc(1));
-  if (Instr->isCommutative()) {
-    uint32_t SwapCount = 0;
-    if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) {
-      std::swap(Src0, Src1);
-      ++SwapCount;
-    }
-    if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) {
-      std::swap(Src0, Src1);
-      ++SwapCount;
-    }
-    // Improve two-address code patterns by avoiding a copy to the dest
-    // register when one of the source operands ends its lifetime here.
-    if (!Instr->isLastUse(Src0) && Instr->isLastUse(Src1)) {
-      std::swap(Src0, Src1);
-      ++SwapCount;
-    }
-    assert(SwapCount <= 1);
-    (void)SwapCount;
-  }
-  if (!Traits::Is64Bit && Ty == IceType_i64) {
-    // These x86-32 helper-call-involved instructions are lowered in this
-    // separate switch because loOperand() and hiOperand() may insert redundant
-    // instructions for constant blinding and pooling, and such redundant
-    // instructions would fail liveness analysis under -Om1. Moreover, these
-    // arguments do not actually need to be processed with loOperand() and
-    // hiOperand() in order to be used.
-    switch (Instr->getOp()) {
-    case InstArithmetic::Udiv:
-    case InstArithmetic::Sdiv:
-    case InstArithmetic::Urem:
-    case InstArithmetic::Srem:
-      llvm::report_fatal_error("Helper call was expected");
-      return;
-    default:
-      break;
-    }
-
-    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-    Operand *Src0Lo = loOperand(Src0);
-    Operand *Src0Hi = hiOperand(Src0);
-    Operand *Src1Lo = loOperand(Src1);
-    Operand *Src1Hi = hiOperand(Src1);
-    Variable *T_Lo = nullptr, *T_Hi = nullptr;
-    switch (Instr->getOp()) {
-    case InstArithmetic::_num:
-      llvm_unreachable("Unknown arithmetic operator");
-      break;
-    case InstArithmetic::Add:
-      _mov(T_Lo, Src0Lo);
-      _add(T_Lo, Src1Lo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, Src0Hi);
-      _adc(T_Hi, Src1Hi);
-      _mov(DestHi, T_Hi);
-      break;
-    case InstArithmetic::And:
-      _mov(T_Lo, Src0Lo);
-      _and(T_Lo, Src1Lo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, Src0Hi);
-      _and(T_Hi, Src1Hi);
-      _mov(DestHi, T_Hi);
-      break;
-    case InstArithmetic::Or:
-      _mov(T_Lo, Src0Lo);
-      _or(T_Lo, Src1Lo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, Src0Hi);
-      _or(T_Hi, Src1Hi);
-      _mov(DestHi, T_Hi);
-      break;
-    case InstArithmetic::Xor:
-      _mov(T_Lo, Src0Lo);
-      _xor(T_Lo, Src1Lo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, Src0Hi);
-      _xor(T_Hi, Src1Hi);
-      _mov(DestHi, T_Hi);
-      break;
-    case InstArithmetic::Sub:
-      _mov(T_Lo, Src0Lo);
-      _sub(T_Lo, Src1Lo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, Src0Hi);
-      _sbb(T_Hi, Src1Hi);
-      _mov(DestHi, T_Hi);
-      break;
-    case InstArithmetic::Mul: {
-      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
-      Variable *T_4Lo = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
-      Variable *T_4Hi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
-      // gcc does the following:
-      // a=b*c ==>
-      //   t1 = b.hi; t1 *=(imul) c.lo
-      //   t2 = c.hi; t2 *=(imul) b.lo
-      //   t3:eax = b.lo
-      //   t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo
-      //   a.lo = t4.lo
-      //   t4.hi += t1
-      //   t4.hi += t2
-      //   a.hi = t4.hi
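-      // (This is correct because (b.hi*2^32 + b.lo) * (c.hi*2^32 + c.lo)
-      // mod 2^64 reduces to b.lo*c.lo + 2^32*(b.hi*c.lo + c.hi*b.lo); the
-      // b.hi*c.hi term falls entirely outside the 64-bit result.)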
-      // The mul instruction cannot take an immediate operand.
-      Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
-      _mov(T_1, Src0Hi);
-      _imul(T_1, Src1Lo);
-      _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax);
-      _mul(T_4Lo, T_3, Src1Lo);
-      // The mul instruction produces two dest variables, edx:eax. We create a
-      // fake definition of edx to account for this.
-      Context.insert<InstFakeDef>(T_4Hi, T_4Lo);
-      Context.insert<InstFakeUse>(T_4Hi);
-      _mov(DestLo, T_4Lo);
-      _add(T_4Hi, T_1);
-      _mov(T_2, Src1Hi);
-      Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
-      _imul(T_2, Src0Lo);
-      _add(T_4Hi, T_2);
-      _mov(DestHi, T_4Hi);
-    } break;
-    case InstArithmetic::Shl:
-    case InstArithmetic::Lshr:
-    case InstArithmetic::Ashr:
-      lowerShift64(Instr->getOp(), Src0Lo, Src0Hi, Src1Lo, DestLo, DestHi);
-      break;
-    case InstArithmetic::Fadd:
-    case InstArithmetic::Fsub:
-    case InstArithmetic::Fmul:
-    case InstArithmetic::Fdiv:
-    case InstArithmetic::Frem:
-      llvm_unreachable("FP instruction with i64 type");
-      break;
-    case InstArithmetic::Udiv:
-    case InstArithmetic::Sdiv:
-    case InstArithmetic::Urem:
-    case InstArithmetic::Srem:
-      llvm_unreachable("Call-helper-involved instruction for i64 type \
-                       should have already been handled before");
-      break;
-    }
-    return;
-  }
-  if (isVectorType(Ty)) {
-    // TODO: Trap on integer divide and integer modulo by zero. See:
-    // https://code.google.com/p/nativeclient/issues/detail?id=3899
-    if (llvm::isa<X86OperandMem>(Src1))
-      Src1 = legalizeToReg(Src1);
-    switch (Instr->getOp()) {
-    case InstArithmetic::_num:
-      llvm_unreachable("Unknown arithmetic operator");
-      break;
-    case InstArithmetic::Add: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _padd(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::And: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _pand(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Or: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _por(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Xor: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _pxor(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Sub: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _psub(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Mul: {
-      bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16;
-      bool InstructionSetIsValidForPmull =
-          Ty == IceType_v8i16 || InstructionSet >= Traits::SSE4_1;
-      if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
-        Variable *T = makeReg(Ty);
-        _movp(T, Src0);
-        _pmull(T, Src0 == Src1 ? T : Src1);
-        _movp(Dest, T);
-      } else if (Ty == IceType_v4i32) {
-        // Lowering sequence:
-        // Note: The mask arguments have index 0 on the left.
-        //
-        // movups  T1, Src0
-        // pshufd  T2, Src0, {1,0,3,0}
-        // pshufd  T3, Src1, {1,0,3,0}
-        // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
-        // pmuludq T1, Src1
-        // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
-        // pmuludq T2, T3
-        // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
-        // shufps  T1, T2, {0,2,0,2}
-        // pshufd  T4, T1, {0,2,1,3}
-        // movups  Dest, T4
-
-        // Mask that directs pshufd to create a vector with entries
-        // Src[1, 0, 3, 0]
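-        // (Encoded two bits per destination lane, lowest lane first:
-        // 0b00'11'00'01 = 0x31.)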
-        constexpr unsigned Constant1030 = 0x31;
-        Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
-        // Mask that directs shufps to create a vector with entries
-        // Dest[0, 2], Src[0, 2]
-        constexpr unsigned Mask0202 = 0x88;
-        // Mask that directs pshufd to create a vector with entries
-        // Src[0, 2, 1, 3]
-        constexpr unsigned Mask0213 = 0xd8;
-        Variable *T1 = makeReg(IceType_v4i32);
-        Variable *T2 = makeReg(IceType_v4i32);
-        Variable *T3 = makeReg(IceType_v4i32);
-        Variable *T4 = makeReg(IceType_v4i32);
-        _movp(T1, Src0);
-        _pshufd(T2, Src0, Mask1030);
-        _pshufd(T3, Src1, Mask1030);
-        _pmuludq(T1, Src1);
-        _pmuludq(T2, T3);
-        _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
-        _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
-        _movp(Dest, T4);
-      } else if (Ty == IceType_v16i8) {
-        llvm::report_fatal_error("Scalarized operation was expected");
-      } else {
-        llvm::report_fatal_error("Invalid vector multiply type");
-      }
-    } break;
-    case InstArithmetic::Shl: {
-      assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _psll(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Lshr: {
-      assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _psrl(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Ashr: {
-      assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _psra(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Udiv:
-    case InstArithmetic::Urem:
-    case InstArithmetic::Sdiv:
-    case InstArithmetic::Srem:
-      llvm::report_fatal_error("Scalarized operation was expected");
-      break;
-    case InstArithmetic::Fadd: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _addps(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Fsub: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _subps(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Fmul: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _mulps(T, Src0 == Src1 ? T : Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Fdiv: {
-      Variable *T = makeReg(Ty);
-      _movp(T, Src0);
-      _divps(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Frem:
-      llvm::report_fatal_error("Scalarized operation was expected");
-      break;
-    }
-    return;
-  }
-  Variable *T_edx = nullptr;
-  Variable *T = nullptr;
-  switch (Instr->getOp()) {
-  case InstArithmetic::_num:
-    llvm_unreachable("Unknown arithmetic operator");
-    break;
-  case InstArithmetic::Add: {
-    const bool ValidType =
-        Ty == IceType_i32 || (Ty == IceType_i64 && Traits::Is64Bit);
-    auto *Const = llvm::dyn_cast<Constant>(Instr->getSrc(1));
-    const bool ValidKind =
-        Const != nullptr && (llvm::isa<ConstantInteger32>(Const) ||
-                             llvm::isa<ConstantRelocatable>(Const));
-    if (getFlags().getAggressiveLea() && ValidType && ValidKind) {
-      auto *Var = legalizeToReg(Src0);
-      auto *Mem = Traits::X86OperandMem::create(Func, IceType_void, Var, Const);
-      T = makeReg(Ty);
-      _lea(T, Mem);
-      _mov(Dest, T);
-      break;
-    }
-    _mov(T, Src0);
-    _add(T, Src1);
-    _mov(Dest, T);
-  } break;
-  case InstArithmetic::And:
-    _mov(T, Src0);
-    _and(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Or:
-    _mov(T, Src0);
-    _or(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Xor:
-    _mov(T, Src0);
-    _xor(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Sub:
-    _mov(T, Src0);
-    _sub(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Mul:
-    if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
-      if (optimizeScalarMul(Dest, Src0, C->getValue()))
-        return;
-    }
-    // The 8-bit version of imul only allows the form "imul r/m8" where T must
-    // be in al.
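-    // (imul r/m8 computes ax = al * r/m8; the low byte of ax, i.e. al,
-    // carries the result.)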
-    if (isByteSizedArithType(Ty)) {
-      _mov(T, Src0, Traits::RegisterSet::Reg_al);
-      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-      _imul(T, Src0 == Src1 ? T : Src1);
-      _mov(Dest, T);
-    } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
-      T = makeReg(Ty);
-      Src0 = legalize(Src0, Legal_Reg | Legal_Mem);
-      _imul_imm(T, Src0, ImmConst);
-      _mov(Dest, T);
-    } else {
-      _mov(T, Src0);
-      // No need to legalize Src1 to Reg | Mem because the Imm case is handled
-      // already by the ConstantInteger32 case above.
-      _imul(T, Src0 == Src1 ? T : Src1);
-      _mov(Dest, T);
-    }
-    break;
-  case InstArithmetic::Shl:
-    _mov(T, Src0);
-    if (!llvm::isa<ConstantInteger32>(Src1) &&
-        !llvm::isa<ConstantInteger64>(Src1))
-      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
-    _shl(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Lshr:
-    _mov(T, Src0);
-    if (!llvm::isa<ConstantInteger32>(Src1) &&
-        !llvm::isa<ConstantInteger64>(Src1))
-      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
-    _shr(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Ashr:
-    _mov(T, Src0);
-    if (!llvm::isa<ConstantInteger32>(Src1) &&
-        !llvm::isa<ConstantInteger64>(Src1))
-      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
-    _sar(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Udiv: {
-    // div and idiv are among the few arithmetic instructions that do not
-    // allow an immediate operand.
-    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    RegNumT Eax;
-    RegNumT Edx;
-    switch (Ty) {
-    default:
-      llvm::report_fatal_error("Bad type for udiv");
-    case IceType_i64:
-      Eax = Traits::getRaxOrDie();
-      Edx = Traits::getRdxOrDie();
-      break;
-    case IceType_i32:
-      Eax = Traits::RegisterSet::Reg_eax;
-      Edx = Traits::RegisterSet::Reg_edx;
-      break;
-    case IceType_i16:
-      Eax = Traits::RegisterSet::Reg_ax;
-      Edx = Traits::RegisterSet::Reg_dx;
-      break;
-    case IceType_i8:
-      Eax = Traits::RegisterSet::Reg_al;
-      Edx = Traits::RegisterSet::Reg_ah;
-      break;
-    }
-    T_edx = makeReg(Ty, Edx);
-    _mov(T, Src0, Eax);
-    _mov(T_edx, Ctx->getConstantZero(Ty));
-    _div(T_edx, Src1, T);
-    _redefined(Context.insert<InstFakeDef>(T, T_edx));
-    _mov(Dest, T);
-  } break;
-  case InstArithmetic::Sdiv:
-    // TODO(stichnot): Enable this after doing better performance and cross
-    // testing.
-    if (false && Func->getOptLevel() >= Opt_1) {
-      // Optimize division by constant power of 2, but not for Om1 or O0, just
-      // to keep things simple there.
-      if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
-        const int32_t Divisor = C->getValue();
-        const uint32_t UDivisor = Divisor;
-        if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
-          uint32_t LogDiv = llvm::Log2_32(UDivisor);
-          // LLVM does the following for dest=src/(1<<log):
-          //   t=src
-          //   sar t,typewidth-1 // -1 if src is negative, 0 if not
-          //   shr t,typewidth-log
-          //   add t,src
-          //   sar t,log
-          //   dest=t
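-          // Illustrative example for i32 with log = 3 (dividing by 8):
-          //   sar t,31   // t = (src < 0) ? -1 : 0
-          //   shr t,29   // t = (src < 0) ? 7 : 0, the rounding bias
-          //   add t,src
-          //   sar t,3    // shifting the biased value rounds toward zero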
-          uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
-          _mov(T, Src0);
-          // If for some reason we are dividing by 1, just treat it like an
-          // assignment.
-          if (LogDiv > 0) {
-            // The initial sar is unnecessary when dividing by 2.
-            if (LogDiv > 1)
-              _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
-            _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
-            _add(T, Src0);
-            _sar(T, Ctx->getConstantInt(Ty, LogDiv));
-          }
-          _mov(Dest, T);
-          return;
-        }
-      }
-    }
-    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    switch (Ty) {
-    default:
-      llvm::report_fatal_error("Bad type for sdiv");
-    case IceType_i64:
-      T_edx = makeReg(Ty, Traits::getRdxOrDie());
-      _mov(T, Src0, Traits::getRaxOrDie());
-      break;
-    case IceType_i32:
-      T_edx = makeReg(Ty, Traits::RegisterSet::Reg_edx);
-      _mov(T, Src0, Traits::RegisterSet::Reg_eax);
-      break;
-    case IceType_i16:
-      T_edx = makeReg(Ty, Traits::RegisterSet::Reg_dx);
-      _mov(T, Src0, Traits::RegisterSet::Reg_ax);
-      break;
-    case IceType_i8:
-      T_edx = makeReg(IceType_i16, Traits::RegisterSet::Reg_ax);
-      _mov(T, Src0, Traits::RegisterSet::Reg_al);
-      break;
-    }
-    _cbwdq(T_edx, T);
-    _idiv(T_edx, Src1, T);
-    _redefined(Context.insert<InstFakeDef>(T, T_edx));
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Urem: {
-    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    RegNumT Eax;
-    RegNumT Edx;
-    switch (Ty) {
-    default:
-      llvm::report_fatal_error("Bad type for urem");
-    case IceType_i64:
-      Eax = Traits::getRaxOrDie();
-      Edx = Traits::getRdxOrDie();
-      break;
-    case IceType_i32:
-      Eax = Traits::RegisterSet::Reg_eax;
-      Edx = Traits::RegisterSet::Reg_edx;
-      break;
-    case IceType_i16:
-      Eax = Traits::RegisterSet::Reg_ax;
-      Edx = Traits::RegisterSet::Reg_dx;
-      break;
-    case IceType_i8:
-      Eax = Traits::RegisterSet::Reg_al;
-      Edx = Traits::RegisterSet::Reg_ah;
-      break;
-    }
-    T_edx = makeReg(Ty, Edx);
-    _mov(T_edx, Ctx->getConstantZero(Ty));
-    _mov(T, Src0, Eax);
-    _div(T, Src1, T_edx);
-    _redefined(Context.insert<InstFakeDef>(T_edx, T));
-    if (Ty == IceType_i8) {
-      // Register ah must be moved into one of {al,bl,cl,dl} before it can be
-      // moved into a general 8-bit register.
-      auto *T_AhRcvr = makeReg(Ty);
-      T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
-      _mov(T_AhRcvr, T_edx);
-      T_edx = T_AhRcvr;
-    }
-    _mov(Dest, T_edx);
-  } break;
-  case InstArithmetic::Srem: {
-    // TODO(stichnot): Enable this after doing better performance and cross
-    // testing.
-    if (false && Func->getOptLevel() >= Opt_1) {
-      // Optimize mod by constant power of 2, but not for Om1 or O0, just to
-      // keep things simple there.
-      if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
-        const int32_t Divisor = C->getValue();
-        const uint32_t UDivisor = Divisor;
-        if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
-          uint32_t LogDiv = llvm::Log2_32(UDivisor);
-          // LLVM does the following for dest=src%(1<<log):
-          //   t=src
-          //   sar t,typewidth-1 // -1 if src is negative, 0 if not
-          //   shr t,typewidth-log
-          //   add t,src
-          //   and t, -(1<<log)
-          //   sub t,src
-          //   neg t
-          //   dest=t
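-          // Illustrative example for i32 with log = 3 (src % 8):
-          //   t = src; sar t,31; shr t,29; add t,src  // bias negative src
-          //   and t,-8                                // t = src - (src % 8)
-          //   sub t,src; neg t                        // t = src % 8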
-          uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
-          // If for some reason we are dividing by 1, just assign 0.
-          if (LogDiv == 0) {
-            _mov(Dest, Ctx->getConstantZero(Ty));
-            return;
-          }
-          _mov(T, Src0);
-          // The initial sar is unnecessary when dividing by 2.
-          if (LogDiv > 1)
-            _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
-          _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
-          _add(T, Src0);
-          _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
-          _sub(T, Src0);
-          _neg(T);
-          _mov(Dest, T);
-          return;
-        }
-      }
-    }
-    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    RegNumT Eax;
-    RegNumT Edx;
-    switch (Ty) {
-    default:
-      llvm::report_fatal_error("Bad type for srem");
-    case IceType_i64:
-      Eax = Traits::getRaxOrDie();
-      Edx = Traits::getRdxOrDie();
-      break;
-    case IceType_i32:
-      Eax = Traits::RegisterSet::Reg_eax;
-      Edx = Traits::RegisterSet::Reg_edx;
-      break;
-    case IceType_i16:
-      Eax = Traits::RegisterSet::Reg_ax;
-      Edx = Traits::RegisterSet::Reg_dx;
-      break;
-    case IceType_i8:
-      Eax = Traits::RegisterSet::Reg_al;
-      Edx = Traits::RegisterSet::Reg_ah;
-      break;
-    }
-    T_edx = makeReg(Ty, Edx);
-    _mov(T, Src0, Eax);
-    _cbwdq(T_edx, T);
-    _idiv(T, Src1, T_edx);
-    _redefined(Context.insert<InstFakeDef>(T_edx, T));
-    if (Ty == IceType_i8) {
-      // Register ah must be moved into one of {al,bl,cl,dl} before it can be
-      // moved into a general 8-bit register.
-      auto *T_AhRcvr = makeReg(Ty);
-      T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
-      _mov(T_AhRcvr, T_edx);
-      T_edx = T_AhRcvr;
-    }
-    _mov(Dest, T_edx);
-  } break;
-  case InstArithmetic::Fadd:
-    _mov(T, Src0);
-    _addss(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Fsub:
-    _mov(T, Src0);
-    _subss(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Fmul:
-    _mov(T, Src0);
-    _mulss(T, Src0 == Src1 ? T : Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Fdiv:
-    _mov(T, Src0);
-    _divss(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Frem:
-    llvm::report_fatal_error("Helper call was expected");
-    break;
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerAssign(const InstAssign *Instr) {
-  Variable *Dest = Instr->getDest();
-  if (Dest->isRematerializable()) {
-    Context.insert<InstFakeDef>(Dest);
-    return;
-  }
-  Operand *Src = Instr->getSrc(0);
-  assert(Dest->getType() == Src->getType());
-  lowerMove(Dest, Src, false);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerBr(const InstBr *Br) {
-  if (Br->isUnconditional()) {
-    _br(Br->getTargetUnconditional());
-    return;
-  }
-  Operand *Cond = Br->getCondition();
-
-  // Handle folding opportunities.
-  if (const Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
-    assert(Producer->isDeleted());
-    switch (BoolFolding<Traits>::getProducerKind(Producer)) {
-    default:
-      break;
-    case BoolFolding<Traits>::PK_Icmp32:
-    case BoolFolding<Traits>::PK_Icmp64: {
-      lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Br);
-      return;
-    }
-    case BoolFolding<Traits>::PK_Fcmp: {
-      lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Br);
-      return;
-    }
-    case BoolFolding<Traits>::PK_Arith: {
-      lowerArithAndConsumer(llvm::cast<InstArithmetic>(Producer), Br);
-      return;
-    }
-    }
-  }
-  Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
-  Constant *Zero = Ctx->getConstantZero(IceType_i32);
-  _cmp(Src0, Zero);
-  _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
-}
-
-// constexprMax returns a (constexpr) max(S0, S1), and it is used for defining
-// OperandList in lowerCall. std::max() is not constexpr until C++14, so it
-// cannot be used here.
-inline constexpr SizeT constexprMax(SizeT S0, SizeT S1) {
-  return S0 < S1 ? S1 : S0;
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerCall(const InstCall *Instr) {
-  // Common x86 calling convention lowering:
-  //
-  // * At the point before the call, the stack must be aligned to 16 bytes.
-  //
-  // * Non-register arguments are pushed onto the stack in right-to-left order,
-  // such that the left-most argument ends up on the top of the stack at the
-  // lowest memory address.
-  //
-  // * Stack arguments of vector type are aligned to start at the next highest
-  // multiple of 16 bytes. Other stack arguments are aligned to the next word
-  // size boundary (4 bytes on x86-32, 8 bytes on x86-64).
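-  //
-  // As an illustrative example (assuming no shadow store area), a call
-  // f(i32 a, <4 x i32> b) with both arguments passed on the stack places a at
-  // [esp+0] and, after padding to the next 16-byte boundary, b at [esp+16].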
-  RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
-                                            Traits::X86_STACK_ALIGNMENT_BYTES);
-
-  constexpr SizeT MaxOperands =
-      constexprMax(Traits::X86_MAX_XMM_ARGS, Traits::X86_MAX_GPR_ARGS);
-  using OperandList = llvm::SmallVector<Operand *, MaxOperands>;
-
-  OperandList XmmArgs;
-  llvm::SmallVector<SizeT, MaxOperands> XmmArgIndices;
-  CfgVector<std::pair<const Type, Operand *>> GprArgs;
-  CfgVector<SizeT> GprArgIndices;
-  OperandList StackArgs, StackArgLocations;
-  uint32_t ParameterAreaSizeBytes = 0;
-
-  ParameterAreaSizeBytes += getShadowStoreSize();
-
-  // Classify each argument operand according to the location where the argument
-  // is passed.
-  for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
-    Operand *Arg = Instr->getArg(i);
-    const Type Ty = Arg->getType();
-    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
-    assert(typeWidthInBytes(Ty) >= 4);
-    if (isVectorType(Ty) &&
-        Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgs.size()))
-            .hasValue()) {
-      XmmArgs.push_back(Arg);
-      XmmArgIndices.push_back(i);
-    } else if (isScalarFloatingType(Ty) &&
-               Traits::getRegisterForXmmArgNum(
-                   Traits::getArgIndex(i, XmmArgs.size()))
-                   .hasValue()) {
-      XmmArgs.push_back(Arg);
-      XmmArgIndices.push_back(i);
-    } else if (isScalarIntegerType(Ty) &&
-               Traits::getRegisterForGprArgNum(
-                   Ty, Traits::getArgIndex(i, GprArgs.size()))
-                   .hasValue()) {
-      GprArgs.emplace_back(Ty, Arg);
-      GprArgIndices.push_back(i);
-    } else {
-      // Place on stack.
-      StackArgs.push_back(Arg);
-      if (isVectorType(Arg->getType())) {
-        ParameterAreaSizeBytes =
-            Traits::applyStackAlignment(ParameterAreaSizeBytes);
-      }
-      Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
-      Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
-      StackArgLocations.push_back(
-          Traits::X86OperandMem::create(Func, Ty, esp, Loc));
-      ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
-    }
-  }
-  // Ensure there is enough space for the fstp/movs for floating returns.
-  Variable *Dest = Instr->getDest();
-  const Type DestTy = Dest ? Dest->getType() : IceType_void;
-  // Adjust the parameter area so that the stack is aligned. It is assumed that
-  // the stack is already aligned at the start of the calling sequence.
-  ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes);
-  assert(ParameterAreaSizeBytes <= maxOutArgsSizeBytes());
-  // Copy arguments that are passed on the stack to the appropriate stack
-  // locations.  We make sure legalize() is called on each argument at this
-  // point, to allow availabilityGet() to work.
-  for (SizeT i = 0, NumStackArgs = StackArgs.size(); i < NumStackArgs; ++i) {
-    lowerStore(
-        InstStore::create(Func, legalize(StackArgs[i]), StackArgLocations[i]));
-  }
-  // Copy arguments to be passed in registers to the appropriate registers.
-  for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
-    XmmArgs[i] = legalizeToReg(legalize(XmmArgs[i]),
-                               Traits::getRegisterForXmmArgNum(
-                                   Traits::getArgIndex(XmmArgIndices[i], i)));
-  }
-  // Materialize moves for arguments passed in GPRs.
-  for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
-    const Type SignatureTy = GprArgs[i].first;
-    Operand *Arg =
-        legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable);
-    GprArgs[i].second = legalizeToReg(
-        Arg, Traits::getRegisterForGprArgNum(
-                 Arg->getType(), Traits::getArgIndex(GprArgIndices[i], i)));
-    assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32);
-    assert(SignatureTy == Arg->getType());
-    (void)SignatureTy;
-  }
-  // Generate a FakeUse of register arguments so that they do not get dead code
-  // eliminated as a result of the FakeKill of scratch registers after the call.
-  // These need to be right before the call instruction.
-  for (auto *Arg : XmmArgs) {
-    Context.insert<InstFakeUse>(llvm::cast<Variable>(Arg));
-  }
-  for (auto &ArgPair : GprArgs) {
-    Context.insert<InstFakeUse>(llvm::cast<Variable>(ArgPair.second));
-  }
-  // Generate the call instruction. Assign its result to a temporary with high
-  // register allocation weight.
-  // ReturnReg doubles as ReturnRegLo as necessary.
-  Variable *ReturnReg = nullptr;
-  Variable *ReturnRegHi = nullptr;
-  if (Dest) {
-    switch (DestTy) {
-    case IceType_NUM:
-    case IceType_void:
-    case IceType_i1:
-    case IceType_i8:
-    case IceType_i16:
-      llvm::report_fatal_error("Invalid Call dest type");
-      break;
-    case IceType_i32:
-      ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_eax);
-      break;
-    case IceType_i64:
-      if (Traits::Is64Bit) {
-        ReturnReg = makeReg(IceType_i64, Traits::getRaxOrDie());
-      } else {
-        ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
-        ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
-      }
-      break;
-    case IceType_f32:
-    case IceType_f64:
-    // Fallthrough intended.
-    case IceType_v4i1:
-    case IceType_v8i1:
-    case IceType_v16i1:
-    case IceType_v16i8:
-    case IceType_v8i16:
-    case IceType_v4i32:
-    case IceType_v4f32:
-      ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_xmm0);
-      break;
-    }
-  }
-  // Emit the call to the function.
-  Operand *CallTarget =
-      legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm | Legal_AddrAbs);
-  size_t NumVariadicFpArgs = Instr->isVariadic() ? XmmArgs.size() : 0;
-  Inst *NewCall = emitCallToTarget(CallTarget, ReturnReg, NumVariadicFpArgs);
-  // Keep the upper return register live on 32-bit platforms.
-  if (ReturnRegHi)
-    Context.insert<InstFakeDef>(ReturnRegHi);
-  // Mark the call as killing all the caller-save registers.
-  Context.insert<InstFakeKill>(NewCall);
-  // Generate a FakeUse to keep the call live if necessary.
-  if (Instr->hasSideEffects() && ReturnReg) {
-    Context.insert<InstFakeUse>(ReturnReg);
-  }
-  // Process the return value, if any.
-  if (Dest == nullptr)
-    return;
-  // Assign the result of the call to Dest.  Route it through a temporary so
-  // that the local register availability peephole can be subsequently used.
-  Variable *Tmp = nullptr;
-  if (isVectorType(DestTy)) {
-    assert(ReturnReg && "Vector type requires a return register");
-    Tmp = makeReg(DestTy);
-    _movp(Tmp, ReturnReg);
-    _movp(Dest, Tmp);
-  } else if (isScalarFloatingType(DestTy)) {
-    assert(ReturnReg && "FP type requires a return register");
-    _mov(Tmp, ReturnReg);
-    _mov(Dest, Tmp);
-  } else {
-    assert(isScalarIntegerType(DestTy));
-    assert(ReturnReg && "Integer type requires a return register");
-    if (DestTy == IceType_i64 && !Traits::Is64Bit) {
-      assert(ReturnRegHi && "64-bit type requires two return registers");
-      auto *Dest64On32 = llvm::cast<Variable64On32>(Dest);
-      Variable *DestLo = Dest64On32->getLo();
-      Variable *DestHi = Dest64On32->getHi();
-      _mov(Tmp, ReturnReg);
-      _mov(DestLo, Tmp);
-      Variable *TmpHi = nullptr;
-      _mov(TmpHi, ReturnRegHi);
-      _mov(DestHi, TmpHi);
-    } else {
-      _mov(Tmp, ReturnReg);
-      _mov(Dest, Tmp);
-    }
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerCast(const InstCast *Instr) {
-  // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
-  InstCast::OpKind CastKind = Instr->getCastKind();
-  Variable *Dest = Instr->getDest();
-  Type DestTy = Dest->getType();
-  switch (CastKind) {
-  default:
-    Func->setError("Cast type not supported");
-    return;
-  case InstCast::Sext: {
-    // Src0RM is the source operand legalized to physical register or memory,
-    // but not immediate, since the relevant x86 native instructions don't
-    // allow an immediate operand. If the operand is an immediate, we could
-    // consider computing the strength-reduced result at translation time, but
-    // we're unlikely to see something like that in the bitcode that the
-    // optimizer wouldn't have already taken care of.
-    Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
-    if (isVectorType(DestTy)) {
-      if (DestTy == IceType_v16i8) {
-        // onemask = materialize(1,1,...); dst = (src & onemask) > 0
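-        // (SSE2 has no byte-granularity shifts, so instead of the shl/sar
-        // pair used for wider elements, the bool bit is isolated with pand
-        // and then sign-filled by comparing greater-than against zero.)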
-        Variable *OneMask = makeVectorOfOnes(DestTy);
-        Variable *T = makeReg(DestTy);
-        _movp(T, Src0RM);
-        _pand(T, OneMask);
-        Variable *Zeros = makeVectorOfZeros(DestTy);
-        _pcmpgt(T, Zeros);
-        _movp(Dest, T);
-      } else {
-        // width = width(elty) - 1; dest = (src << width) >> width
-        SizeT ShiftAmount =
-            Traits::X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) -
-            1;
-        Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
-        Variable *T = makeReg(DestTy);
-        _movp(T, Src0RM);
-        _psll(T, ShiftConstant);
-        _psra(T, ShiftConstant);
-        _movp(Dest, T);
-      }
-    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
-      // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
-      Constant *Shift = Ctx->getConstantInt32(31);
-      auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-      auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-      Variable *T_Lo = makeReg(DestLo->getType());
-      if (Src0RM->getType() == IceType_i32) {
-        _mov(T_Lo, Src0RM);
-      } else if (Src0RM->getType() == IceType_i1) {
-        _movzx(T_Lo, Src0RM);
-        _shl(T_Lo, Shift);
-        _sar(T_Lo, Shift);
-      } else {
-        _movsx(T_Lo, Src0RM);
-      }
-      _mov(DestLo, T_Lo);
-      Variable *T_Hi = nullptr;
-      _mov(T_Hi, T_Lo);
-      if (Src0RM->getType() != IceType_i1)
-        // For i1, the sar instruction is already done above.
-        _sar(T_Hi, Shift);
-      _mov(DestHi, T_Hi);
-    } else if (Src0RM->getType() == IceType_i1) {
-      // t1 = src
-      // shl t1, dst_bitwidth - 1
-      // sar t1, dst_bitwidth - 1
-      // dst = t1
-      size_t DestBits = Traits::X86_CHAR_BIT * typeWidthInBytes(DestTy);
-      Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
-      Variable *T = makeReg(DestTy);
-      if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) {
-        _mov(T, Src0RM);
-      } else {
-        // Widen the source using movsx or movzx. (It doesn't matter which one,
-        // since the following shl/sar overwrite the bits.)
-        _movzx(T, Src0RM);
-      }
-      _shl(T, ShiftAmount);
-      _sar(T, ShiftAmount);
-      _mov(Dest, T);
-    } else {
-      // t1 = movsx src; dst = t1
-      Variable *T = makeReg(DestTy);
-      _movsx(T, Src0RM);
-      _mov(Dest, T);
-    }
-    break;
-  }
-  case InstCast::Zext: {
-    Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
-    if (isVectorType(DestTy)) {
-      // onemask = materialize(1,1,...); dest = onemask & src
-      Variable *OneMask = makeVectorOfOnes(DestTy);
-      Variable *T = makeReg(DestTy);
-      _movp(T, Src0RM);
-      _pand(T, OneMask);
-      _movp(Dest, T);
-    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
-      // t1=movzx src; dst.lo=t1; dst.hi=0
-      Constant *Zero = Ctx->getConstantZero(IceType_i32);
-      auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-      auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-      Variable *Tmp = makeReg(DestLo->getType());
-      if (Src0RM->getType() == IceType_i32) {
-        _mov(Tmp, Src0RM);
-      } else {
-        _movzx(Tmp, Src0RM);
-      }
-      _mov(DestLo, Tmp);
-      _mov(DestHi, Zero);
-    } else if (Src0RM->getType() == IceType_i1) {
-      // t = Src0RM; Dest = t
-      Variable *T = nullptr;
-      if (DestTy == IceType_i8) {
-        _mov(T, Src0RM);
-      } else {
-        assert(DestTy != IceType_i1);
-        assert(Traits::Is64Bit || DestTy != IceType_i64);
-        // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
-        // On x86-64, T needs to be widened to 64 bits so that T, if written to
-        // the stack (i.e., under -Om1), is fully zero-extended.
-        T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32);
-        _movzx(T, Src0RM);
-      }
-      _mov(Dest, T);
-    } else {
-      // t1 = movzx src; dst = t1
-      Variable *T = makeReg(DestTy);
-      _movzx(T, Src0RM);
-      _mov(Dest, T);
-    }
-    break;
-  }
-  case InstCast::Trunc: {
-    if (isVectorType(DestTy)) {
-      // onemask = materialize(1,1,...); dst = src & onemask
-      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
-      Type Src0Ty = Src0RM->getType();
-      Variable *OneMask = makeVectorOfOnes(Src0Ty);
-      Variable *T = makeReg(DestTy);
-      _movp(T, Src0RM);
-      _pand(T, OneMask);
-      _movp(Dest, T);
-    } else if (DestTy == IceType_i1 || DestTy == IceType_i8) {
-      // Make sure we truncate from and into valid registers.
-      Operand *Src0 = legalizeUndef(Instr->getSrc(0));
-      if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
-        Src0 = loOperand(Src0);
-      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      Variable *T = copyToReg8(Src0RM);
-      if (DestTy == IceType_i1)
-        _and(T, Ctx->getConstantInt1(1));
-      _mov(Dest, T);
-    } else {
-      Operand *Src0 = legalizeUndef(Instr->getSrc(0));
-      if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
-        Src0 = loOperand(Src0);
-      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      // t1 = trunc Src0RM; Dest = t1
-      Variable *T = makeReg(DestTy);
-      _mov(T, Src0RM);
-      _mov(Dest, T);
-    }
-    break;
-  }
-  case InstCast::Fptrunc:
-  case InstCast::Fpext: {
-    Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
-    // t1 = cvt Src0RM; Dest = t1
-    Variable *T = makeReg(DestTy);
-    _cvt(T, Src0RM, Traits::Insts::Cvt::Float2float);
-    _mov(Dest, T);
-    break;
-  }
-  case InstCast::Fptosi:
-    if (isVectorType(DestTy)) {
-      assert(DestTy == IceType_v4i32);
-      assert(Instr->getSrc(0)->getType() == IceType_v4f32);
-      Operand *Src0R = legalizeToReg(Instr->getSrc(0));
-      Variable *T = makeReg(DestTy);
-      _cvt(T, Src0R, Traits::Insts::Cvt::Tps2dq);
-      _movp(Dest, T);
-    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
-      llvm::report_fatal_error("Helper call was expected");
-    } else {
-      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
-      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
-      Variable *T_1 = nullptr;
-      if (Traits::Is64Bit && DestTy == IceType_i64) {
-        T_1 = makeReg(IceType_i64);
-      } else {
-        assert(DestTy != IceType_i64);
-        T_1 = makeReg(IceType_i32);
-      }
-      // cvt() requires its integer argument to be a GPR.
-      Variable *T_2 = makeReg(DestTy);
-      if (isByteSizedType(DestTy)) {
-        assert(T_1->getType() == IceType_i32);
-        T_1->setRegClass(RCX86_Is32To8);
-        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
-      }
-      _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
-      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
-      if (DestTy == IceType_i1)
-        _and(T_2, Ctx->getConstantInt1(1));
-      _mov(Dest, T_2);
-    }
-    break;
-  case InstCast::Fptoui:
-    if (isVectorType(DestTy)) {
-      llvm::report_fatal_error("Helper call was expected");
-    } else if (DestTy == IceType_i64 ||
-               (!Traits::Is64Bit && DestTy == IceType_i32)) {
-      llvm::report_fatal_error("Helper call was expected");
-    } else {
-      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
-      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
-      assert(DestTy != IceType_i64);
-      Variable *T_1 = nullptr;
-      if (Traits::Is64Bit && DestTy == IceType_i32) {
-        T_1 = makeReg(IceType_i64);
-      } else {
-        assert(DestTy != IceType_i32);
-        T_1 = makeReg(IceType_i32);
-      }
-      Variable *T_2 = makeReg(DestTy);
-      if (isByteSizedType(DestTy)) {
-        assert(T_1->getType() == IceType_i32);
-        T_1->setRegClass(RCX86_Is32To8);
-        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
-      }
-      _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
-      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
-      if (DestTy == IceType_i1)
-        _and(T_2, Ctx->getConstantInt1(1));
-      _mov(Dest, T_2);
-    }
-    break;
-  case InstCast::Sitofp:
-    if (isVectorType(DestTy)) {
-      assert(DestTy == IceType_v4f32);
-      assert(Instr->getSrc(0)->getType() == IceType_v4i32);
-      Operand *Src0R = legalizeToReg(Instr->getSrc(0));
-      Variable *T = makeReg(DestTy);
-      _cvt(T, Src0R, Traits::Insts::Cvt::Dq2ps);
-      _movp(Dest, T);
-    } else if (!Traits::Is64Bit && Instr->getSrc(0)->getType() == IceType_i64) {
-      llvm::report_fatal_error("Helper call was expected");
-    } else {
-      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
-      // Sign-extend the operand.
-      // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
-      Variable *T_1 = nullptr;
-      if (Traits::Is64Bit && Src0RM->getType() == IceType_i64) {
-        T_1 = makeReg(IceType_i64);
-      } else {
-        assert(Src0RM->getType() != IceType_i64);
-        T_1 = makeReg(IceType_i32);
-      }
-      Variable *T_2 = makeReg(DestTy);
-      if (Src0RM->getType() == T_1->getType())
-        _mov(T_1, Src0RM);
-      else
-        _movsx(T_1, Src0RM);
-      _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss);
-      _mov(Dest, T_2);
-    }
-    break;
-  case InstCast::Uitofp: {
-    Operand *Src0 = Instr->getSrc(0);
-    if (isVectorType(Src0->getType())) {
-      llvm::report_fatal_error("Helper call was expected");
-    } else if (Src0->getType() == IceType_i64 ||
-               (!Traits::Is64Bit && Src0->getType() == IceType_i32)) {
-      llvm::report_fatal_error("Helper call was expected");
-    } else {
-      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      // Zero-extend the operand.
-      // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
-      Variable *T_1 = nullptr;
-      if (Traits::Is64Bit && Src0RM->getType() == IceType_i32) {
-        T_1 = makeReg(IceType_i64);
-      } else {
-        assert(Src0RM->getType() != IceType_i64);
-        assert(Traits::Is64Bit || Src0RM->getType() != IceType_i32);
-        T_1 = makeReg(IceType_i32);
-      }
-      Variable *T_2 = makeReg(DestTy);
-      if (Src0RM->getType() == T_1->getType())
-        _mov(T_1, Src0RM);
-      else
-        _movzx(T_1, Src0RM)->setMustKeep();
-      _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss);
-      _mov(Dest, T_2);
-    }
-    break;
-  }
-  case InstCast::Bitcast: {
-    Operand *Src0 = Instr->getSrc(0);
-    if (DestTy == Src0->getType()) {
-      auto *Assign = InstAssign::create(Func, Dest, Src0);
-      lowerAssign(Assign);
-      return;
-    }
-    switch (DestTy) {
-    default:
-      llvm_unreachable("Unexpected Bitcast dest type");
-    case IceType_i8: {
-      llvm::report_fatal_error("Helper call was expected");
-    } break;
-    case IceType_i16: {
-      llvm::report_fatal_error("Helper call was expected");
-    } break;
-    case IceType_i32:
-    case IceType_f32: {
-      Variable *Src0R = legalizeToReg(Src0);
-      Variable *T = makeReg(DestTy);
-      _movd(T, Src0R);
-      _mov(Dest, T);
-    } break;
-    case IceType_i64: {
-      assert(Src0->getType() == IceType_f64);
-      if (Traits::Is64Bit) {
-        Variable *Src0R = legalizeToReg(Src0);
-        Variable *T = makeReg(IceType_i64);
-        _movd(T, Src0R);
-        _mov(Dest, T);
-      } else {
-        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-        // a.i64 = bitcast b.f64 ==>
-        //   s.f64 = spill b.f64
-        //   t_lo.i32 = lo(s.f64)
-        //   a_lo.i32 = t_lo.i32
-        //   t_hi.i32 = hi(s.f64)
-        //   a_hi.i32 = t_hi.i32
-        Operand *SpillLo, *SpillHi;
-        if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
-          Variable *Spill = Func->makeVariable(IceType_f64);
-          Spill->setLinkedTo(Src0Var);
-          Spill->setMustNotHaveReg();
-          _movq(Spill, Src0RM);
-          SpillLo = Traits::VariableSplit::create(Func, Spill,
-                                                  Traits::VariableSplit::Low);
-          SpillHi = Traits::VariableSplit::create(Func, Spill,
-                                                  Traits::VariableSplit::High);
-        } else {
-          SpillLo = loOperand(Src0RM);
-          SpillHi = hiOperand(Src0RM);
-        }
-
-        auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-        auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-        Variable *T_Lo = makeReg(IceType_i32);
-        Variable *T_Hi = makeReg(IceType_i32);
-
-        _mov(T_Lo, SpillLo);
-        _mov(DestLo, T_Lo);
-        _mov(T_Hi, SpillHi);
-        _mov(DestHi, T_Hi);
-      }
-    } break;
-    case IceType_f64: {
-      assert(Src0->getType() == IceType_i64);
-      if (Traits::Is64Bit) {
-        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-        Variable *T = makeReg(IceType_f64);
-        _movd(T, Src0RM);
-        _mov(Dest, T);
-      } else {
-        Src0 = legalize(Src0);
-        if (llvm::isa<X86OperandMem>(Src0)) {
-          Variable *T = makeReg(DestTy);
-          _movq(T, Src0);
-          _movq(Dest, T);
-          break;
-        }
-        // a.f64 = bitcast b.i64 ==>
-        //   t_lo.i32 = b_lo.i32
-        //   FakeDef(s.f64)
-        //   lo(s.f64) = t_lo.i32
-        //   t_hi.i32 = b_hi.i32
-        //   hi(s.f64) = t_hi.i32
-        //   a.f64 = s.f64
-        Variable *Spill = Func->makeVariable(IceType_f64);
-        Spill->setLinkedTo(Dest);
-        Spill->setMustNotHaveReg();
-
-        Variable *T_Lo = nullptr, *T_Hi = nullptr;
-        auto *SpillLo = Traits::VariableSplit::create(
-            Func, Spill, Traits::VariableSplit::Low);
-        auto *SpillHi = Traits::VariableSplit::create(
-            Func, Spill, Traits::VariableSplit::High);
-        _mov(T_Lo, loOperand(Src0));
-        // Technically, the Spill is defined after the _store happens, but
-        // SpillLo is considered a "use" of Spill, so define Spill before it
-        // is used.
-        Context.insert<InstFakeDef>(Spill);
-        _store(T_Lo, SpillLo);
-        _mov(T_Hi, hiOperand(Src0));
-        _store(T_Hi, SpillHi);
-        _movq(Dest, Spill);
-      }
-    } break;
-    case IceType_v8i1: {
-      llvm::report_fatal_error("Helper call was expected");
-    } break;
-    case IceType_v16i1: {
-      llvm::report_fatal_error("Helper call was expected");
-    } break;
-    case IceType_v8i16:
-    case IceType_v16i8:
-    case IceType_v4i32:
-    case IceType_v4f32: {
-      if (Src0->getType() == IceType_i32) {
-        // Bitcast requires equal type sizes, which isn't strictly the case
-        // between scalars and vectors here; however, emulating v4i8 vectors
-        // requires using v16i8 vectors, so the i32-to-vector case is allowed.
-        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-        Variable *T = makeReg(DestTy);
-        _movd(T, Src0RM);
-        _mov(Dest, T);
-      } else {
-        _movp(Dest, legalizeToReg(Src0));
-      }
-    } break;
-    }
-    break;
-  }
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerExtractElement(
-    const InstExtractElement *Instr) {
-  Operand *SourceVectNotLegalized = Instr->getSrc(0);
-  auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(1));
-  // Only constant indices are allowed in PNaCl IR.
-  assert(ElementIndex);
-
-  unsigned Index = ElementIndex->getValue();
-  Type Ty = SourceVectNotLegalized->getType();
-  Type ElementTy = typeElementType(Ty);
-  Type InVectorElementTy = Traits::getInVectorElementType(Ty);
-
-  // TODO(wala): Determine the best lowering sequences for each type.
-  bool CanUsePextr = Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
-                     (InstructionSet >= Traits::SSE4_1 && Ty != IceType_v4f32);
-  Variable *ExtractedElementR =
-      makeReg(CanUsePextr ? IceType_i32 : InVectorElementTy);
-  if (CanUsePextr) {
-    // Use pextrb, pextrw, or pextrd.  The "b" and "w" versions clear the upper
-    // bits of the destination register, so we represent this by always
-    // extracting into an i32 register.  The _mov into Dest below will do
-    // truncation as necessary.
-    Constant *Mask = Ctx->getConstantInt32(Index);
-    Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized);
-    _pextr(ExtractedElementR, SourceVectR, Mask);
-  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
-    // Use pshufd and movd/movss.
-    Variable *T = nullptr;
-    if (Index) {
-      // The shuffle only needs to occur if the element to be extracted is not
-      // at the lowest index.
-      Constant *Mask = Ctx->getConstantInt32(Index);
-      T = makeReg(Ty);
-      _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
-    } else {
-      T = legalizeToReg(SourceVectNotLegalized);
-    }
-
-    if (InVectorElementTy == IceType_i32) {
-      _movd(ExtractedElementR, T);
-    } else { // Ty == IceType_f32
-      // TODO(wala): _movss is only used here because _mov does not allow a
-      // vector source and a scalar destination.  _mov should be able to be
-      // used here.
-      // _movss is a binary instruction, so the FakeDef is needed to keep the
-      // live range analysis consistent.
-      Context.insert<InstFakeDef>(ExtractedElementR);
-      _movss(ExtractedElementR, T);
-    }
-  } else {
-    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
-    // Spill the value to a stack slot and do the extraction in memory.
-    //
-    // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
-    // for legalizing to mem is implemented.
-    Variable *Slot = Func->makeVariable(Ty);
-    Slot->setMustNotHaveReg();
-    _movp(Slot, legalizeToReg(SourceVectNotLegalized));
-
-    // Compute the location of the element in memory.
-    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
-    X86OperandMem *Loc =
-        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
-    _mov(ExtractedElementR, Loc);
-  }
-
-  if (ElementTy == IceType_i1) {
-    // Truncate extracted integers to i1s if necessary.
-    Variable *T = makeReg(IceType_i1);
-    InstCast *Cast =
-        InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
-    lowerCast(Cast);
-    ExtractedElementR = T;
-  }
-
-  // Copy the element to the destination.
-  Variable *Dest = Instr->getDest();
-  _mov(Dest, ExtractedElementR);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerFcmp(const InstFcmp *Fcmp) {
-  Variable *Dest = Fcmp->getDest();
-
-  if (isVectorType(Dest->getType())) {
-    lowerFcmpVector(Fcmp);
-  } else {
-    constexpr Inst *Consumer = nullptr;
-    lowerFcmpAndConsumer(Fcmp, Consumer);
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerFcmpAndConsumer(const InstFcmp *Fcmp,
-                                                     const Inst *Consumer) {
-  Operand *Src0 = Fcmp->getSrc(0);
-  Operand *Src1 = Fcmp->getSrc(1);
-  Variable *Dest = Fcmp->getDest();
-
-  if (Consumer != nullptr) {
-    if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
-      if (lowerOptimizeFcmpSelect(Fcmp, Select))
-        return;
-    }
-  }
-
-  if (isVectorType(Dest->getType())) {
-    lowerFcmp(Fcmp);
-    if (Consumer != nullptr)
-      lowerSelectVector(llvm::cast<InstSelect>(Consumer));
-    return;
-  }
-
-  // Lowering a = fcmp cond, b, c
-  //   ucomiss b, c       /* only if C1 != Br_None */
-  //                      /* but swap b,c order if SwapOperands==true */
-  //   mov a, <default>
-  //   j<C1> label        /* only if C1 != Br_None */
-  //   j<C2> label        /* only if C2 != Br_None */
-  //   FakeUse(a)         /* only if C1 != Br_None */
-  //   mov a, !<default>  /* only if C1 != Br_None */
-  //   label:             /* only if C1 != Br_None */
-  //
-  // setcc lowering when C1 != Br_None && C2 == Br_None:
-  //   ucomiss b, c       /* but swap b,c order if SwapOperands==true */
-  //   setcc a, C1
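-  // For example, "a = fcmp ogt b, c" can become "ucomiss b, c; seta a":
-  // "above" (CF=0 and ZF=0) is false for both the not-greater and the
-  // unordered outcomes, which is exactly ogt.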
-  InstFcmp::FCond Condition = Fcmp->getCondition();
-  assert(static_cast<size_t>(Condition) < Traits::TableFcmpSize);
-  if (Traits::TableFcmp[Condition].SwapScalarOperands)
-    std::swap(Src0, Src1);
-  const bool HasC1 = (Traits::TableFcmp[Condition].C1 != CondX86::Br_None);
-  const bool HasC2 = (Traits::TableFcmp[Condition].C2 != CondX86::Br_None);
-  if (HasC1) {
-    Src0 = legalize(Src0);
-    Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    Variable *T = nullptr;
-    _mov(T, Src0);
-    _ucomiss(T, Src1RM);
-    if (!HasC2) {
-      assert(Traits::TableFcmp[Condition].Default);
-      setccOrConsumer(Traits::TableFcmp[Condition].C1, Dest, Consumer);
-      return;
-    }
-  }
-  int32_t IntDefault = Traits::TableFcmp[Condition].Default;
-  if (Consumer == nullptr) {
-    Constant *Default = Ctx->getConstantInt(Dest->getType(), IntDefault);
-    _mov(Dest, Default);
-    if (HasC1) {
-      InstX86Label *Label = InstX86Label::create(Func, this);
-      _br(Traits::TableFcmp[Condition].C1, Label);
-      if (HasC2) {
-        _br(Traits::TableFcmp[Condition].C2, Label);
-      }
-      Constant *NonDefault = Ctx->getConstantInt(Dest->getType(), !IntDefault);
-      _redefined(_mov(Dest, NonDefault));
-      Context.insert(Label);
-    }
-    return;
-  }
-  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
-    CfgNode *TrueSucc = Br->getTargetTrue();
-    CfgNode *FalseSucc = Br->getTargetFalse();
-    if (IntDefault != 0)
-      std::swap(TrueSucc, FalseSucc);
-    if (HasC1) {
-      _br(Traits::TableFcmp[Condition].C1, FalseSucc);
-      if (HasC2) {
-        _br(Traits::TableFcmp[Condition].C2, FalseSucc);
-      }
-      _br(TrueSucc);
-      return;
-    }
-    _br(FalseSucc);
-    return;
-  }
-  if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
-    Operand *SrcT = Select->getTrueOperand();
-    Operand *SrcF = Select->getFalseOperand();
-    Variable *SelectDest = Select->getDest();
-    if (IntDefault != 0)
-      std::swap(SrcT, SrcF);
-    lowerMove(SelectDest, SrcF, false);
-    if (HasC1) {
-      InstX86Label *Label = InstX86Label::create(Func, this);
-      _br(Traits::TableFcmp[Condition].C1, Label);
-      if (HasC2) {
-        _br(Traits::TableFcmp[Condition].C2, Label);
-      }
-      static constexpr bool IsRedefinition = true;
-      lowerMove(SelectDest, SrcT, IsRedefinition);
-      Context.insert(Label);
-    }
-    return;
-  }
-  llvm::report_fatal_error("Unexpected consumer type");
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerFcmpVector(const InstFcmp *Fcmp) {
-  Operand *Src0 = Fcmp->getSrc(0);
-  Operand *Src1 = Fcmp->getSrc(1);
-  Variable *Dest = Fcmp->getDest();
-
-  if (!isVectorType(Dest->getType()))
-    llvm::report_fatal_error("Expected vector compare");
-
-  InstFcmp::FCond Condition = Fcmp->getCondition();
-  assert(static_cast<size_t>(Condition) < Traits::TableFcmpSize);
-
-  if (Traits::TableFcmp[Condition].SwapVectorOperands)
-    std::swap(Src0, Src1);
-
-  Variable *T = nullptr;
-
-  if (Condition == InstFcmp::True) {
-    // makeVectorOfOnes() requires an integer vector type.
-    T = makeVectorOfMinusOnes(IceType_v4i32);
-  } else if (Condition == InstFcmp::False) {
-    T = makeVectorOfZeros(Dest->getType());
-  } else {
-    Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    if (llvm::isa<X86OperandMem>(Src1RM))
-      Src1RM = legalizeToReg(Src1RM);
-
-    switch (Condition) {
-    default: {
-      const CmppsCond Predicate = Traits::TableFcmp[Condition].Predicate;
-      assert(Predicate != CondX86::Cmpps_Invalid);
-      T = makeReg(Src0RM->getType());
-      _movp(T, Src0RM);
-      _cmpps(T, Src1RM, Predicate);
-    } break;
-    case InstFcmp::One: {
-      // Check both unequal and ordered.
-      T = makeReg(Src0RM->getType());
-      Variable *T2 = makeReg(Src0RM->getType());
-      _movp(T, Src0RM);
-      _cmpps(T, Src1RM, CondX86::Cmpps_neq);
-      _movp(T2, Src0RM);
-      _cmpps(T2, Src1RM, CondX86::Cmpps_ord);
-      _pand(T, T2);
-    } break;
-    case InstFcmp::Ueq: {
-      // Check both equal or unordered.
-      T = makeReg(Src0RM->getType());
-      Variable *T2 = makeReg(Src0RM->getType());
-      _movp(T, Src0RM);
-      _cmpps(T, Src1RM, CondX86::Cmpps_eq);
-      _movp(T2, Src0RM);
-      _cmpps(T2, Src1RM, CondX86::Cmpps_unord);
-      _por(T, T2);
-    } break;
-    }
-  }
-
-  assert(T != nullptr);
-  _movp(Dest, T);
-  eliminateNextVectorSextInstruction(Dest);
-}
-
-inline bool isZero(const Operand *Opnd) {
-  if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Opnd))
-    return C64->getValue() == 0;
-  if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(Opnd))
-    return C32->getValue() == 0;
-  return false;
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerIcmpAndConsumer(const InstIcmp *Icmp,
-                                                     const Inst *Consumer) {
-  Operand *Src0 = legalize(Icmp->getSrc(0));
-  Operand *Src1 = legalize(Icmp->getSrc(1));
-  Variable *Dest = Icmp->getDest();
-
-  if (isVectorType(Dest->getType())) {
-    lowerIcmp(Icmp);
-    if (Consumer != nullptr)
-      lowerSelectVector(llvm::cast<InstSelect>(Consumer));
-    return;
-  }
-
-  if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
-    lowerIcmp64(Icmp, Consumer);
-    return;
-  }
-
-  // cmp b, c
-  if (isZero(Src1)) {
-    switch (Icmp->getCondition()) {
-    default:
-      break;
-    case InstIcmp::Uge:
-      movOrConsumer(true, Dest, Consumer);
-      return;
-    case InstIcmp::Ult:
-      movOrConsumer(false, Dest, Consumer);
-      return;
-    }
-  }
-  Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
-  _cmp(Src0RM, Src1);
-  setccOrConsumer(Traits::getIcmp32Mapping(Icmp->getCondition()), Dest,
-                  Consumer);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerIcmpVector(const InstIcmp *Icmp) {
-  Operand *Src0 = legalize(Icmp->getSrc(0));
-  Operand *Src1 = legalize(Icmp->getSrc(1));
-  Variable *Dest = Icmp->getDest();
-
-  if (!isVectorType(Dest->getType()))
-    llvm::report_fatal_error("Expected a vector compare");
-
-  Type Ty = Src0->getType();
-  // Promote i1 vectors to 128-bit integer vector types.
-  if (typeElementType(Ty) == IceType_i1) {
-    Type NewTy = IceType_NUM;
-    switch (Ty) {
-    default:
-      llvm::report_fatal_error("unexpected type");
-      break;
-    case IceType_v4i1:
-      NewTy = IceType_v4i32;
-      break;
-    case IceType_v8i1:
-      NewTy = IceType_v8i16;
-      break;
-    case IceType_v16i1:
-      NewTy = IceType_v16i8;
-      break;
-    }
-    Variable *NewSrc0 = Func->makeVariable(NewTy);
-    Variable *NewSrc1 = Func->makeVariable(NewTy);
-    lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
-    lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
-    Src0 = NewSrc0;
-    Src1 = NewSrc1;
-    Ty = NewTy;
-  }
-
-  InstIcmp::ICond Condition = Icmp->getCondition();
-
-  Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-  Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-
-  // SSE2 only has signed comparison operations. Transform unsigned inputs in
-  // a manner that allows for the use of signed comparison operations by
-  // flipping the high order bits.
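-  // For example, for i32 lanes, ugt(a, b) == sgt(a ^ 0x80000000,
-  // b ^ 0x80000000): XORing the sign bit maps the unsigned range onto the
-  // signed range while preserving order.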
-  if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
-      Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
-    Variable *T0 = makeReg(Ty);
-    Variable *T1 = makeReg(Ty);
-    Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
-    _movp(T0, Src0RM);
-    _pxor(T0, HighOrderBits);
-    _movp(T1, Src1RM);
-    _pxor(T1, HighOrderBits);
-    Src0RM = T0;
-    Src1RM = T1;
-  }
-
-  Variable *T = makeReg(Ty);
-  switch (Condition) {
-  default:
-    llvm_unreachable("unexpected condition");
-    break;
-  case InstIcmp::Eq: {
-    if (llvm::isa<X86OperandMem>(Src1RM))
-      Src1RM = legalizeToReg(Src1RM);
-    _movp(T, Src0RM);
-    _pcmpeq(T, Src1RM);
-  } break;
-  case InstIcmp::Ne: {
-    if (llvm::isa<X86OperandMem>(Src1RM))
-      Src1RM = legalizeToReg(Src1RM);
-    _movp(T, Src0RM);
-    _pcmpeq(T, Src1RM);
-    Variable *MinusOne = makeVectorOfMinusOnes(Ty);
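-    // XORing with all-ones inverts the pcmpeq mask, turning eq into ne.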
-    _pxor(T, MinusOne);
-  } break;
-  case InstIcmp::Ugt:
-  case InstIcmp::Sgt: {
-    if (llvm::isa<X86OperandMem>(Src1RM))
-      Src1RM = legalizeToReg(Src1RM);
-    _movp(T, Src0RM);
-    _pcmpgt(T, Src1RM);
-  } break;
-  case InstIcmp::Uge:
-  case InstIcmp::Sge: {
-    // !(Src1RM > Src0RM)
-    if (llvm::isa<X86OperandMem>(Src0RM))
-      Src0RM = legalizeToReg(Src0RM);
-    _movp(T, Src1RM);
-    _pcmpgt(T, Src0RM);
-    Variable *MinusOne = makeVectorOfMinusOnes(Ty);
-    _pxor(T, MinusOne);
-  } break;
-  case InstIcmp::Ult:
-  case InstIcmp::Slt: {
-    if (llvm::isa<X86OperandMem>(Src0RM))
-      Src0RM = legalizeToReg(Src0RM);
-    _movp(T, Src1RM);
-    _pcmpgt(T, Src0RM);
-  } break;
-  case InstIcmp::Ule:
-  case InstIcmp::Sle: {
-    // !(Src0RM > Src1RM)
-    if (llvm::isa<X86OperandMem>(Src1RM))
-      Src1RM = legalizeToReg(Src1RM);
-    _movp(T, Src0RM);
-    _pcmpgt(T, Src1RM);
-    Variable *MinusOne = makeVectorOfMinusOnes(Ty);
-    _pxor(T, MinusOne);
-  } break;
-  }
-
-  _movp(Dest, T);
-  eliminateNextVectorSextInstruction(Dest);
-}
-
-template <typename TraitsType>
-template <typename T>
-typename std::enable_if<!T::Is64Bit, void>::type
-TargetX86Base<TraitsType>::lowerIcmp64(const InstIcmp *Icmp,
-                                       const Inst *Consumer) {
-  // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
-  Operand *Src0 = legalize(Icmp->getSrc(0));
-  Operand *Src1 = legalize(Icmp->getSrc(1));
-  Variable *Dest = Icmp->getDest();
-  InstIcmp::ICond Condition = Icmp->getCondition();
-  assert(static_cast<size_t>(Condition) < Traits::TableIcmp64Size);
-  Operand *Src0LoRM = nullptr;
-  Operand *Src0HiRM = nullptr;
-  // Legalize the portions of Src0 that are going to be needed.
-  if (isZero(Src1)) {
-    switch (Condition) {
-    default:
-      llvm_unreachable("unexpected condition");
-      break;
-    // These two are not optimized, so we fall through to the general case,
-    // which needs the upper and lower halves legalized.
-    case InstIcmp::Sgt:
-    case InstIcmp::Sle:
-    // These four compare after performing an "or" of the high and low half, so
-    // they need the upper and lower halves legalized.
-    case InstIcmp::Eq:
-    case InstIcmp::Ule:
-    case InstIcmp::Ne:
-    case InstIcmp::Ugt:
-      Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
-    // These two test only the high half's sign bit, so they need only
-    // the upper half legalized.
-    case InstIcmp::Sge:
-    case InstIcmp::Slt:
-      Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
-      break;
-
-    // These two move constants and hence need no legalization.
-    case InstIcmp::Uge:
-    case InstIcmp::Ult:
-      break;
-    }
-  } else {
-    Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
-    Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
-  }
-  // Optimize comparisons with zero.
-  if (isZero(Src1)) {
-    Constant *SignMask = Ctx->getConstantInt32(0x80000000);
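-    // 0x80000000 isolates the sign bit of the upper half; for an i64, the
-    // sign lives entirely in the high 32 bits.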
-    Variable *Temp = nullptr;
-    switch (Condition) {
-    default:
-      llvm_unreachable("unexpected condition");
-      break;
-    case InstIcmp::Eq:
-    case InstIcmp::Ule:
-      // Mov Src0HiRM first, because it was legalized most recently, and will
-      // sometimes avoid a move before the OR.
-      _mov(Temp, Src0HiRM);
-      _or(Temp, Src0LoRM);
-      Context.insert<InstFakeUse>(Temp);
-      setccOrConsumer(CondX86::Br_e, Dest, Consumer);
-      return;
-    case InstIcmp::Ne:
-    case InstIcmp::Ugt:
-      // Mov Src0HiRM first, because it was legalized most recently, and will
-      // sometimes avoid a move before the OR.
-      _mov(Temp, Src0HiRM);
-      _or(Temp, Src0LoRM);
-      Context.insert<InstFakeUse>(Temp);
-      setccOrConsumer(CondX86::Br_ne, Dest, Consumer);
-      return;
-    case InstIcmp::Uge:
-      movOrConsumer(true, Dest, Consumer);
-      return;
-    case InstIcmp::Ult:
-      movOrConsumer(false, Dest, Consumer);
-      return;
-    case InstIcmp::Sgt:
-      break;
-    case InstIcmp::Sge:
-      _test(Src0HiRM, SignMask);
-      setccOrConsumer(CondX86::Br_e, Dest, Consumer);
-      return;
-    case InstIcmp::Slt:
-      _test(Src0HiRM, SignMask);
-      setccOrConsumer(CondX86::Br_ne, Dest, Consumer);
-      return;
-    case InstIcmp::Sle:
-      break;
-    }
-  }
-  // Handle general compares.
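-  // The high halves are compared first: C1 branches straight to the true
-  // case and C2 to the false case. Only when the high halves are equal does
-  // the C3 comparison of the low halves decide (an unsigned condition, since
-  // the low half carries no sign).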
-  Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
-  Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
-  if (Consumer == nullptr) {
-    Constant *Zero = Ctx->getConstantInt(Dest->getType(), 0);
-    Constant *One = Ctx->getConstantInt(Dest->getType(), 1);
-    InstX86Label *LabelFalse = InstX86Label::create(Func, this);
-    InstX86Label *LabelTrue = InstX86Label::create(Func, this);
-    _mov(Dest, One);
-    _cmp(Src0HiRM, Src1HiRI);
-    if (Traits::TableIcmp64[Condition].C1 != CondX86::Br_None)
-      _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
-    if (Traits::TableIcmp64[Condition].C2 != CondX86::Br_None)
-      _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
-    _cmp(Src0LoRM, Src1LoRI);
-    _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
-    Context.insert(LabelFalse);
-    _redefined(_mov(Dest, Zero));
-    Context.insert(LabelTrue);
-    return;
-  }
-  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
-    _cmp(Src0HiRM, Src1HiRI);
-    if (Traits::TableIcmp64[Condition].C1 != CondX86::Br_None)
-      _br(Traits::TableIcmp64[Condition].C1, Br->getTargetTrue());
-    if (Traits::TableIcmp64[Condition].C2 != CondX86::Br_None)
-      _br(Traits::TableIcmp64[Condition].C2, Br->getTargetFalse());
-    _cmp(Src0LoRM, Src1LoRI);
-    _br(Traits::TableIcmp64[Condition].C3, Br->getTargetTrue(),
-        Br->getTargetFalse());
-    return;
-  }
-  if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
-    Operand *SrcT = Select->getTrueOperand();
-    Operand *SrcF = Select->getFalseOperand();
-    Variable *SelectDest = Select->getDest();
-    InstX86Label *LabelFalse = InstX86Label::create(Func, this);
-    InstX86Label *LabelTrue = InstX86Label::create(Func, this);
-    lowerMove(SelectDest, SrcT, false);
-    _cmp(Src0HiRM, Src1HiRI);
-    if (Traits::TableIcmp64[Condition].C1 != CondX86::Br_None)
-      _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
-    if (Traits::TableIcmp64[Condition].C2 != CondX86::Br_None)
-      _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
-    _cmp(Src0LoRM, Src1LoRI);
-    _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
-    Context.insert(LabelFalse);
-    static constexpr bool IsRedefinition = true;
-    lowerMove(SelectDest, SrcF, IsRedefinition);
-    Context.insert(LabelTrue);
-    return;
-  }
-  llvm::report_fatal_error("Unexpected consumer type");
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::setccOrConsumer(BrCond Condition,
-                                                Variable *Dest,
-                                                const Inst *Consumer) {
-  if (Consumer == nullptr) {
-    _setcc(Dest, Condition);
-    return;
-  }
-  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
-    _br(Condition, Br->getTargetTrue(), Br->getTargetFalse());
-    return;
-  }
-  if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
-    Operand *SrcT = Select->getTrueOperand();
-    Operand *SrcF = Select->getFalseOperand();
-    Variable *SelectDest = Select->getDest();
-    lowerSelectMove(SelectDest, Condition, SrcT, SrcF);
-    return;
-  }
-  llvm::report_fatal_error("Unexpected consumer type");
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::movOrConsumer(bool IcmpResult, Variable *Dest,
-                                              const Inst *Consumer) {
-  if (Consumer == nullptr) {
-    _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
-    return;
-  }
-  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
-    // TODO(sehr,stichnot): This could be done with a single unconditional
-    // branch instruction, but subzero doesn't currently know how to handle
-    // the resulting control flow graph changes. Make it do so to eliminate
-    // the mov and cmp.
-    _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
-    _cmp(Dest, Ctx->getConstantInt(Dest->getType(), 0));
-    _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
-    return;
-  }
-  if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
-    Operand *Src = nullptr;
-    if (IcmpResult) {
-      Src = legalize(Select->getTrueOperand(), Legal_Reg | Legal_Imm);
-    } else {
-      Src = legalize(Select->getFalseOperand(), Legal_Reg | Legal_Imm);
-    }
-    Variable *SelectDest = Select->getDest();
-    lowerMove(SelectDest, Src, false);
-    return;
-  }
-  llvm::report_fatal_error("Unexpected consumer type");
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerArithAndConsumer(
-    const InstArithmetic *Arith, const Inst *Consumer) {
-  Variable *T = nullptr;
-  Operand *Src0 = legalize(Arith->getSrc(0));
-  Operand *Src1 = legalize(Arith->getSrc(1));
-  Variable *Dest = Arith->getDest();
-  switch (Arith->getOp()) {
-  default:
-    llvm_unreachable("arithmetic operator not AND or OR");
-    break;
-  case InstArithmetic::And:
-    _mov(T, Src0);
-    // Test cannot have an address in the second position.  Since T is
-    // guaranteed to be a register and Src1 could be a memory load, ensure
-    // that the second argument is a register.
-    if (llvm::isa<Constant>(Src1))
-      _test(T, Src1);
-    else
-      _test(Src1, T);
-    break;
-  case InstArithmetic::Or:
-    _mov(T, Src0);
-    _or(T, Src1);
-    break;
-  }
-
-  if (Consumer == nullptr) {
-    llvm::report_fatal_error("Expected a consumer instruction");
-  }
-  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
-    Context.insert<InstFakeUse>(T);
-    Context.insert<InstFakeDef>(Dest);
-    _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
-    return;
-  }
-  llvm::report_fatal_error("Unexpected consumer type");
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerInsertElement(
-    const InstInsertElement *Instr) {
-  Operand *SourceVectNotLegalized = Instr->getSrc(0);
-  Operand *ElementToInsertNotLegalized = Instr->getSrc(1);
-  auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(2));
-  // Only constant indices are allowed in PNaCl IR.
-  assert(ElementIndex);
-  unsigned Index = ElementIndex->getValue();
-  assert(Index < typeNumElements(SourceVectNotLegalized->getType()));
-
-  Type Ty = SourceVectNotLegalized->getType();
-  Type ElementTy = typeElementType(Ty);
-  Type InVectorElementTy = Traits::getInVectorElementType(Ty);
-
-  if (ElementTy == IceType_i1) {
-    // Expand the element to the appropriate size for it to be inserted in the
-    // vector.
-    Variable *Expanded = Func->makeVariable(InVectorElementTy);
-    auto *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
-                                  ElementToInsertNotLegalized);
-    lowerCast(Cast);
-    ElementToInsertNotLegalized = Expanded;
-  }
-
-  if (Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
-      InstructionSet >= Traits::SSE4_1) {
-    // Use insertps, pinsrb, pinsrw, or pinsrd.
-    Operand *ElementRM =
-        legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
-    Operand *SourceVectRM =
-        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
-    Variable *T = makeReg(Ty);
-    _movp(T, SourceVectRM);
-    if (Ty == IceType_v4f32) {
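-      // The insertps immediate encodes the destination lane in bits [5:4]
-      // (the source lane, bits [7:6], is zero here), hence Index << 4.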
-      _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
-    } else {
-      // For the pinsrb and pinsrw instructions, when the source operand is a
-      // register, it must be a full r32 register like eax, and not ax/al/ah.
-      // For filetype=asm, InstX86Pinsr<TraitsType>::emit() compensates for
-      // the use of r16 and r8 by converting them through getBaseReg(), while
-      // emitIAS() validates that the original and base register encodings are
-      // the same.
-      if (ElementRM->getType() == IceType_i8 &&
-          llvm::isa<Variable>(ElementRM)) {
-        // Don't use ah/bh/ch/dh for pinsrb.
-        ElementRM = copyToReg8(ElementRM);
-      }
-      _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
-    }
-    _movp(Instr->getDest(), T);
-  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
-    // Use shufps or movss.
-    Variable *ElementR = nullptr;
-    Operand *SourceVectRM =
-        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
-
-    if (InVectorElementTy == IceType_f32) {
-      // ElementR will be in an XMM register since it is floating point.
-      ElementR = legalizeToReg(ElementToInsertNotLegalized);
-    } else {
-      // Copy an integer to an XMM register.
-      Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
-      ElementR = makeReg(Ty);
-      _movd(ElementR, T);
-    }
-
-    if (Index == 0) {
-      Variable *T = makeReg(Ty);
-      _movp(T, SourceVectRM);
-      _movss(T, ElementR);
-      _movp(Instr->getDest(), T);
-      return;
-    }
-
-    // shufps treats the source and destination operands as vectors of four
-    // doublewords. The destination's two high doublewords are selected from
-    // the source operand and the two low doublewords are selected from (the
-    // original value of) the destination operand. An insertelement operation
-    // can be effected with a sequence of two shufps operations with
-    // appropriate masks. In all cases below, Element[0] is being inserted into
-    // SourceVectOperand. Indices are ordered from left to right.
-    //
-    // insertelement into index 1 (result is stored in ElementR):
-    //   ElementR := ElementR[0, 0] SourceVectRM[0, 0]
-    //   ElementR := ElementR[3, 0] SourceVectRM[2, 3]
-    //
-    // insertelement into index 2 (result is stored in T):
-    //   T := SourceVectRM
-    //   ElementR := ElementR[0, 0] T[0, 3]
-    //   T := T[0, 1] ElementR[0, 3]
-    //
-    // insertelement into index 3 (result is stored in T):
-    //   T := SourceVectRM
-    //   ElementR := ElementR[0, 0] T[0, 2]
-    //   T := T[0, 1] ElementR[3, 0]
-    const unsigned char Mask1[3] = {0, 192, 128};
-    const unsigned char Mask2[3] = {227, 196, 52};
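-    // Each shufps immediate packs four 2-bit lane selectors, low lane first:
-    // bits [3:0] pick from the first operand and bits [7:4] from the second.
-    // E.g. Mask2[0] = 227 = 0b11100011 yields ElementR[3, 0]
-    // SourceVectRM[2, 3], the second step for index 1 above.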
-
-    Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
-    Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);
-
-    if (Index == 1) {
-      _shufps(ElementR, SourceVectRM, Mask1Constant);
-      _shufps(ElementR, SourceVectRM, Mask2Constant);
-      _movp(Instr->getDest(), ElementR);
-    } else {
-      Variable *T = makeReg(Ty);
-      _movp(T, SourceVectRM);
-      _shufps(ElementR, T, Mask1Constant);
-      _shufps(T, ElementR, Mask2Constant);
-      _movp(Instr->getDest(), T);
-    }
-  } else {
-    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
-    // Spill the value to a stack slot and perform the insertion in memory.
-    //
-    // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
-    // for legalizing to mem is implemented.
-    Variable *Slot = Func->makeVariable(Ty);
-    Slot->setMustNotHaveReg();
-    _movp(Slot, legalizeToReg(SourceVectNotLegalized));
-
-    // Compute the location of the position to insert in memory.
-    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
-    X86OperandMem *Loc =
-        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
-    _store(legalizeToReg(ElementToInsertNotLegalized), Loc);
-
-    Variable *T = makeReg(Ty);
-    _movp(T, Slot);
-    _movp(Instr->getDest(), T);
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerIntrinsic(const InstIntrinsic *Instr) {
-  switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicID()) {
-  case Intrinsics::AtomicCmpxchg: {
-    if (!Intrinsics::isMemoryOrderValid(
-            ID, getConstantMemoryOrder(Instr->getArg(3)),
-            getConstantMemoryOrder(Instr->getArg(4)))) {
-      Func->setError("Unexpected memory ordering for AtomicCmpxchg");
-      return;
-    }
-    Variable *DestPrev = Instr->getDest();
-    Operand *PtrToMem = legalize(Instr->getArg(0));
-    Operand *Expected = legalize(Instr->getArg(1));
-    Operand *Desired = legalize(Instr->getArg(2));
-    if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
-      return;
-    lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
-    return;
-  }
-  case Intrinsics::AtomicFence:
-    if (!Intrinsics::isMemoryOrderValid(
-            ID, getConstantMemoryOrder(Instr->getArg(0)))) {
-      Func->setError("Unexpected memory ordering for AtomicFence");
-      return;
-    }
-    _mfence();
-    return;
-  case Intrinsics::AtomicFenceAll:
-    // NOTE: FenceAll should prevent any load/store from being moved across the
-    // fence (both atomic and non-atomic). The InstX86Mfence instruction is
-    // currently marked coarsely as "HasSideEffects".
-    _mfence();
-    return;
-  case Intrinsics::AtomicIsLockFree: {
-    // X86 is always lock free for 8/16/32/64 bit accesses.
-    // TODO(jvoung): Since the result is constant when given a constant byte
-    // size, this opens up DCE opportunities.
-    Operand *ByteSize = Instr->getArg(0);
-    Variable *Dest = Instr->getDest();
-    if (auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
-      Constant *Result;
-      switch (CI->getValue()) {
-      default:
-        // Some x86-64 processors support the cmpxchg16b instruction, which can
-        // make 16-byte operations lock free (when used with the LOCK prefix).
-        // However, that's not supported in 32-bit mode, so just return 0 even
-        // for large sizes.
-        Result = Ctx->getConstantZero(IceType_i32);
-        break;
-      case 1:
-      case 2:
-      case 4:
-      case 8:
-        Result = Ctx->getConstantInt32(1);
-        break;
-      }
-      _mov(Dest, Result);
-      return;
-    }
-    // The PNaCl ABI requires the byte size to be a compile-time constant.
-    Func->setError("AtomicIsLockFree byte size should be compile-time const");
-    return;
-  }
-  case Intrinsics::AtomicLoad: {
-    // We require the memory address to be naturally aligned. Given that,
-    // normal loads are atomic.
-    if (!Intrinsics::isMemoryOrderValid(
-            ID, getConstantMemoryOrder(Instr->getArg(1)))) {
-      Func->setError("Unexpected memory ordering for AtomicLoad");
-      return;
-    }
-    Variable *Dest = Instr->getDest();
-    if (!Traits::Is64Bit) {
-      if (auto *Dest64On32 = llvm::dyn_cast<Variable64On32>(Dest)) {
-        // Follow what GCC does and use a movq instead of what lowerLoad()
-        // normally does (split the load into two). Thus, this skips
-        // load/arithmetic op folding. Load/arithmetic folding can't happen
-        // anyway, since this is x86-32 and integer arithmetic only happens on
-        // 32-bit quantities.
-        Variable *T = makeReg(IceType_f64);
-        X86OperandMem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64);
-        _movq(T, Addr);
-        // Then cast the bits back out of the XMM register to the i64 Dest.
-        auto *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
-        lowerCast(Cast);
-        // Make sure that the atomic load isn't elided when unused.
-        Context.insert<InstFakeUse>(Dest64On32->getLo());
-        Context.insert<InstFakeUse>(Dest64On32->getHi());
-        return;
-      }
-    }
-    auto *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
-    lowerLoad(Load);
-    // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
-    // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert
-    // the FakeUse on the last-inserted instruction's dest.
-    Context.insert<InstFakeUse>(Context.getLastInserted()->getDest());
-    return;
-  }
-  case Intrinsics::AtomicRMW:
-    if (!Intrinsics::isMemoryOrderValid(
-            ID, getConstantMemoryOrder(Instr->getArg(3)))) {
-      Func->setError("Unexpected memory ordering for AtomicRMW");
-      return;
-    }
-    lowerAtomicRMW(
-        Instr->getDest(),
-        static_cast<uint32_t>(
-            llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
-        Instr->getArg(1), Instr->getArg(2));
-    return;
-  case Intrinsics::AtomicStore: {
-    if (!Intrinsics::isMemoryOrderValid(
-            ID, getConstantMemoryOrder(Instr->getArg(2)))) {
-      Func->setError("Unexpected memory ordering for AtomicStore");
-      return;
-    }
-    // We require the memory address to be naturally aligned. Given that,
-    // normal stores are atomic. Add a fence after the store to make it
-    // visible.
-    Operand *Value = Instr->getArg(0);
-    Operand *Ptr = Instr->getArg(1);
-    if (!Traits::Is64Bit && Value->getType() == IceType_i64) {
-      // Use a movq instead of what lowerStore() normally does (split the store
-      // into two), following what GCC does. Cast the bits from the i64 into
-      // an xmm register first.
-      Variable *T = makeReg(IceType_f64);
-      auto *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value);
-      lowerCast(Cast);
-      // Then store XMM w/ a movq.
-      X86OperandMem *Addr = formMemoryOperand(Ptr, IceType_f64);
-      _storeq(T, Addr);
-      _mfence();
-      return;
-    }
-    auto *Store = InstStore::create(Func, Value, Ptr);
-    lowerStore(Store);
-    _mfence();
-    return;
-  }
-  case Intrinsics::Bswap: {
-    Variable *Dest = Instr->getDest();
-    Operand *Val = Instr->getArg(0);
-    // In 32-bit mode, bswap only works on 32-bit arguments, and the argument
-    // must be a register. Use rotate left for 16-bit bswap.
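-    // For i16, rotating left by 8 swaps the two bytes, e.g. 0x1234 -> 0x3412.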
-    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
-      Val = legalizeUndef(Val);
-      Variable *T_Lo = legalizeToReg(loOperand(Val));
-      Variable *T_Hi = legalizeToReg(hiOperand(Val));
-      auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-      auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-      _bswap(T_Lo);
-      _bswap(T_Hi);
-      _mov(DestLo, T_Hi);
-      _mov(DestHi, T_Lo);
-    } else if ((Traits::Is64Bit && Val->getType() == IceType_i64) ||
-               Val->getType() == IceType_i32) {
-      Variable *T = legalizeToReg(Val);
-      _bswap(T);
-      _mov(Dest, T);
-    } else {
-      assert(Val->getType() == IceType_i16);
-      Constant *Eight = Ctx->getConstantInt16(8);
-      Variable *T = nullptr;
-      Val = legalize(Val);
-      _mov(T, Val);
-      _rol(T, Eight);
-      _mov(Dest, T);
-    }
-    return;
-  }
-  case Intrinsics::Ctpop: {
-    Variable *Dest = Instr->getDest();
-    Variable *T = nullptr;
-    Operand *Val = Instr->getArg(0);
-    Type ValTy = Val->getType();
-    assert(ValTy == IceType_i32 || ValTy == IceType_i64);
-
-    if (!Traits::Is64Bit) {
-      T = Dest;
-    } else {
-      T = makeReg(IceType_i64);
-      if (ValTy == IceType_i32) {
-      // In x86-64, __popcountsi2 is not defined, so we cheat a bit by
-        // converting it to a 64-bit value, and using ctpop_i64. _movzx should
-        // ensure we will not have any bits set on Val's upper 32 bits.
-        Variable *V = makeReg(IceType_i64);
-        Operand *ValRM = legalize(Val, Legal_Reg | Legal_Mem);
-        _movzx(V, ValRM);
-        Val = V;
-      }
-      ValTy = IceType_i64;
-    }
-
-    InstCall *Call =
-        makeHelperCall(ValTy == IceType_i32 ? RuntimeHelper::H_call_ctpop_i32
-                                            : RuntimeHelper::H_call_ctpop_i64,
-                       T, 1);
-    Call->addArg(Val);
-    lowerCall(Call);
-    // The popcount helpers always return 32-bit values, while the intrinsic's
-    // signature matches the native POPCNT instruction and fills a 64-bit reg
-    // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
-    // the user doesn't do that in the IR. If the user does that in the IR,
-      // then this zeroing instruction is dead and gets optimized out.
-    if (!Traits::Is64Bit) {
-      assert(T == Dest);
-      if (Val->getType() == IceType_i64) {
-        auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-        Constant *Zero = Ctx->getConstantZero(IceType_i32);
-        _mov(DestHi, Zero);
-      }
-    } else {
-      assert(Val->getType() == IceType_i64);
-      // T is 64 bit. It needs to be copied to dest. We need to:
-      //
-      // T_1.32 = trunc T.64 to i32
-      // T_2.64 = zext T_1.32 to i64
-      // Dest.<<right_size>> = T_2.<<right_size>>
-      //
-      // which ensures the upper 32 bits will always be cleared. Just doing a
-      //
-      // mov Dest.32 = trunc T.64 to i32
-      //
-      // is dangerous because there's a chance the compiler will optimize this
-      // copy out. To use _movzx we need two new registers (one 32-bit and
-      // another 64-bit wide).
-      Variable *T_1 = makeReg(IceType_i32);
-      _mov(T_1, T);
-      Variable *T_2 = makeReg(IceType_i64);
-      _movzx(T_2, T_1);
-      _mov(Dest, T_2);
-    }
-    return;
-  }
-  case Intrinsics::Ctlz: {
-    // The "is zero undef" parameter is ignored and we always return a
-    // well-defined value.
-    Operand *Val = legalize(Instr->getArg(0));
-    Operand *FirstVal;
-    Operand *SecondVal = nullptr;
-    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
-      FirstVal = loOperand(Val);
-      SecondVal = hiOperand(Val);
-    } else {
-      FirstVal = Val;
-    }
-    constexpr bool IsCttz = false;
-    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
-                    SecondVal);
-    return;
-  }
-  case Intrinsics::Cttz: {
-    // The "is zero undef" parameter is ignored and we always return a
-    // well-defined value.
-    Operand *Val = legalize(Instr->getArg(0));
-    Operand *FirstVal;
-    Operand *SecondVal = nullptr;
-    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
-      FirstVal = hiOperand(Val);
-      SecondVal = loOperand(Val);
-    } else {
-      FirstVal = Val;
-    }
-    constexpr bool IsCttz = true;
-    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
-                    SecondVal);
-    return;
-  }
-  case Intrinsics::Fabs: {
-    Operand *Src = legalize(Instr->getArg(0));
-    Type Ty = Src->getType();
-    Variable *Dest = Instr->getDest();
-    Variable *T = makeVectorOfFabsMask(Ty);
-    // The pand instruction operates on an m128 memory operand, so if Src is an
-    // f32 or f64, we need to make sure it's in a register.
-    if (isVectorType(Ty)) {
-      if (llvm::isa<X86OperandMem>(Src))
-        Src = legalizeToReg(Src);
-    } else {
-      Src = legalizeToReg(Src);
-    }
-    _pand(T, Src);
-    if (isVectorType(Ty))
-      _movp(Dest, T);
-    else
-      _mov(Dest, T);
-    return;
-  }
-  case Intrinsics::Longjmp: {
-    InstCall *Call = makeHelperCall(RuntimeHelper::H_call_longjmp, nullptr, 2);
-    Call->addArg(Instr->getArg(0));
-    Call->addArg(Instr->getArg(1));
-    lowerCall(Call);
-    return;
-  }
-  case Intrinsics::Memcpy: {
-    lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
-    return;
-  }
-  case Intrinsics::Memmove: {
-    lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
-    return;
-  }
-  case Intrinsics::Memset: {
-    lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
-    return;
-  }
-  case Intrinsics::Setjmp: {
-    InstCall *Call =
-        makeHelperCall(RuntimeHelper::H_call_setjmp, Instr->getDest(), 1);
-    Call->addArg(Instr->getArg(0));
-    lowerCall(Call);
-    return;
-  }
-  case Intrinsics::Sqrt: {
-    Operand *Src = legalize(Instr->getArg(0));
-    Variable *Dest = Instr->getDest();
-    Variable *T = makeReg(Dest->getType());
-    _sqrt(T, Src);
-    if (isVectorType(Dest->getType())) {
-      _movp(Dest, T);
-    } else {
-      _mov(Dest, T);
-    }
-    return;
-  }
-  case Intrinsics::Stacksave: {
-    Variable *esp =
-        Func->getTarget()->getPhysicalRegister(getStackReg(), Traits::WordType);
-    Variable *Dest = Instr->getDest();
-    _mov(Dest, esp);
-    return;
-  }
-  case Intrinsics::Stackrestore: {
-    Operand *Src = Instr->getArg(0);
-    _mov_sp(Src);
-    return;
-  }
-  case Intrinsics::Trap:
-    _ud2();
-    return;
-  case Intrinsics::LoadSubVector: {
-    assert(llvm::isa<ConstantInteger32>(Instr->getArg(1)) &&
-           "LoadSubVector second argument must be a constant");
-    Variable *Dest = Instr->getDest();
-    Type Ty = Dest->getType();
-    auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(1));
-    Operand *Addr = Instr->getArg(0);
-    X86OperandMem *Src = formMemoryOperand(Addr, Ty);
-    doMockBoundsCheck(Src);
-
-    if (Dest->isRematerializable()) {
-      Context.insert<InstFakeDef>(Dest);
-      return;
-    }
-
-    auto *T = makeReg(Ty);
-    switch (SubVectorSize->getValue()) {
-    case 4:
-      _movd(T, Src);
-      break;
-    case 8:
-      _movq(T, Src);
-      break;
-    default:
-      Func->setError("Unexpected size for LoadSubVector");
-      return;
-    }
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::StoreSubVector: {
-    assert(llvm::isa<ConstantInteger32>(Instr->getArg(2)) &&
-           "StoreSubVector third argument must be a constant");
-    auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(2));
-    Operand *Value = Instr->getArg(0);
-    Operand *Addr = Instr->getArg(1);
-    X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
-    doMockBoundsCheck(NewAddr);
-
-    Value = legalizeToReg(Value);
-
-    switch (SubVectorSize->getValue()) {
-    case 4:
-      _stored(Value, NewAddr);
-      break;
-    case 8:
-      _storeq(Value, NewAddr);
-      break;
-    default:
-      Func->setError("Unexpected size for StoreSubVector");
-      return;
-    }
-    return;
-  }
-  case Intrinsics::VectorPackSigned: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Src0->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _packss(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::VectorPackUnsigned: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Src0->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _packus(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::SignMask: {
-    Operand *SrcReg = legalizeToReg(Instr->getArg(0));
-    Variable *Dest = Instr->getDest();
-    Variable *T = makeReg(IceType_i32);
-    if (SrcReg->getType() == IceType_v4f32 ||
-        SrcReg->getType() == IceType_v4i32 ||
-        SrcReg->getType() == IceType_v16i8) {
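-      // The movmsk forms copy each lane's sign (top) bit into the low bits
-      // of a GPR.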
-      _movmsk(T, SrcReg);
-    } else {
-      // TODO(capn): We could implement v8i16 sign mask using packsswb/pmovmskb
-      llvm::report_fatal_error("Invalid type for SignMask intrinsic");
-    }
-    _mov(Dest, T);
-    return;
-  }
-  case Intrinsics::MultiplyHighSigned: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Dest->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _pmulhw(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::MultiplyHighUnsigned: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Dest->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _pmulhuw(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::MultiplyAddPairs: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Dest->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _pmaddwd(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::AddSaturateSigned: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Dest->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _padds(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::SubtractSaturateSigned: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Dest->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _psubs(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::AddSaturateUnsigned: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Dest->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _paddus(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::SubtractSaturateUnsigned: {
-    Operand *Src0 = Instr->getArg(0);
-    Operand *Src1 = Instr->getArg(1);
-    Variable *Dest = Instr->getDest();
-    auto *T = makeReg(Dest->getType());
-    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T, Src0RM);
-    _psubus(T, Src1RM);
-    _movp(Dest, T);
-    return;
-  }
-  case Intrinsics::Nearbyint: {
-    Operand *Src = Instr->getArg(0);
-    Variable *Dest = Instr->getDest();
-    Type DestTy = Dest->getType();
-    if (isVectorType(DestTy)) {
-      assert(DestTy == IceType_v4i32);
-      assert(Src->getType() == IceType_v4f32);
-      Operand *Src0R = legalizeToReg(Src);
-      Variable *T = makeReg(DestTy);
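-      // cvtps2dq rounds according to the current MXCSR rounding mode
-      // (round-to-nearest-even by default), matching nearbyint semantics.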
-      _cvt(T, Src0R, Traits::Insts::Cvt::Ps2dq);
-      _movp(Dest, T);
-    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
-      llvm::report_fatal_error("Helper call was expected");
-    } else {
-      Operand *Src0RM = legalize(Src, Legal_Reg | Legal_Mem);
-      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
-      Variable *T_1 = nullptr;
-      if (Traits::Is64Bit && DestTy == IceType_i64) {
-        T_1 = makeReg(IceType_i64);
-      } else {
-        assert(DestTy != IceType_i64);
-        T_1 = makeReg(IceType_i32);
-      }
-      // cvt() requires its integer argument to be a GPR.
-      Variable *T_2 = makeReg(DestTy);
-      if (isByteSizedType(DestTy)) {
-        assert(T_1->getType() == IceType_i32);
-        T_1->setRegClass(RCX86_Is32To8);
-        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
-      }
-      _cvt(T_1, Src0RM, Traits::Insts::Cvt::Ss2si);
-      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
-      if (DestTy == IceType_i1)
-        _and(T_2, Ctx->getConstantInt1(1));
-      _mov(Dest, T_2);
-    }
-    return;
-  }
-  case Intrinsics::Round: {
-    assert(InstructionSet >= Traits::SSE4_1);
-    Variable *Dest = Instr->getDest();
-    Operand *Src = Instr->getArg(0);
-    Operand *Mode = Instr->getArg(1);
-    assert(llvm::isa<ConstantInteger32>(Mode) &&
-           "Round last argument must be a constant");
-    auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
-    int32_t Imm = llvm::cast<ConstantInteger32>(Mode)->getValue();
-    (void)Imm;
-    assert(Imm >= 0 && Imm < 4 && "Invalid rounding mode");
-    auto *T = makeReg(Dest->getType());
-    _round(T, SrcRM, Mode);
-    _movp(Dest, T);
-    return;
-  }
-  default: // UnknownIntrinsic
-    Func->setError("Unexpected intrinsic");
-    return;
-  }
-  return;
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerAtomicCmpxchg(Variable *DestPrev,
-                                                   Operand *Ptr,
-                                                   Operand *Expected,
-                                                   Operand *Desired) {
-  Type Ty = Expected->getType();
-  if (!Traits::Is64Bit && Ty == IceType_i64) {
-    // Reserve the pre-colored registers first, before adding any more
-    // infinite-weight variables from formMemoryOperand's legalization.
-    Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
-    Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
-    Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
-    Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
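-    // cmpxchg8b compares edx:eax against the 64-bit memory operand and, on a
-    // match, stores ecx:ebx into it; either way the old memory value is left
-    // in edx:eax, which is what DestPrev receives below.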
-    _mov(T_eax, loOperand(Expected));
-    _mov(T_edx, hiOperand(Expected));
-    _mov(T_ebx, loOperand(Desired));
-    _mov(T_ecx, hiOperand(Desired));
-    X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
-    constexpr bool Locked = true;
-    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
-    auto *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
-    auto *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
-    _mov(DestLo, T_eax);
-    _mov(DestHi, T_edx);
-    return;
-  }
-  RegNumT Eax;
-  switch (Ty) {
-  default:
-    llvm::report_fatal_error("Bad type for cmpxchg");
-  case IceType_i64:
-    Eax = Traits::getRaxOrDie();
-    break;
-  case IceType_i32:
-    Eax = Traits::RegisterSet::Reg_eax;
-    break;
-  case IceType_i16:
-    Eax = Traits::RegisterSet::Reg_ax;
-    break;
-  case IceType_i8:
-    Eax = Traits::RegisterSet::Reg_al;
-    break;
-  }
-  Variable *T_eax = makeReg(Ty, Eax);
-  _mov(T_eax, Expected);
-  X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
-  Variable *DesiredReg = legalizeToReg(Desired);
-  constexpr bool Locked = true;
-  _cmpxchg(Addr, T_eax, DesiredReg, Locked);
-  _mov(DestPrev, T_eax);
-}
-
-template <typename TraitsType>
-bool TargetX86Base<TraitsType>::tryOptimizedCmpxchgCmpBr(Variable *Dest,
-                                                         Operand *PtrToMem,
-                                                         Operand *Expected,
-                                                         Operand *Desired) {
-  if (Func->getOptLevel() == Opt_m1)
-    return false;
-  // Peek ahead a few instructions and see how Dest is used.
-  // It's very common to have:
-  //
-  // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
-  // [%y_phi = ...] // list of phi stores
-  // %p = icmp eq i32 %x, %expected
-  // br i1 %p, label %l1, label %l2
-  //
-  // which we can optimize into:
-  //
-  // %x = <cmpxchg code>
-  // [%y_phi = ...] // list of phi stores
-  // br eq, %l1, %l2
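-  // This works because cmpxchg leaves ZF set exactly when the loaded value
-  // equals %expected, making the separate icmp redundant.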
-  InstList::iterator I = Context.getCur();
-  // I is currently the InstIntrinsic. Peek past that.
-  // This assumes that the atomic cmpxchg has not been lowered yet,
-  // so that the instructions seen in the scan from "Cur" are simple.
-  assert(llvm::isa<InstIntrinsic>(*I));
-  Inst *NextInst = Context.getNextInst(I);
-  if (!NextInst)
-    return false;
-  // There might be phi assignments right before the compare+branch, since this
-  // could be a backward branch for a loop. This placement of assignments is
-  // determined by placePhiStores().
-  CfgVector<InstAssign *> PhiAssigns;
-  while (auto *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
-    if (PhiAssign->getDest() == Dest)
-      return false;
-    PhiAssigns.push_back(PhiAssign);
-    NextInst = Context.getNextInst(I);
-    if (!NextInst)
-      return false;
-  }
-  if (auto *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
-    if (!(NextCmp->getCondition() == InstIcmp::Eq &&
-          ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
-           (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
-      return false;
-    }
-    NextInst = Context.getNextInst(I);
-    if (!NextInst)
-      return false;
-    if (auto *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
-      if (!NextBr->isUnconditional() &&
-          NextCmp->getDest() == NextBr->getCondition() &&
-          NextBr->isLastUse(NextCmp->getDest())) {
-        lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
-        for (size_t i = 0; i < PhiAssigns.size(); ++i) {
-          // Lower the phi assignments now, before the branch (same placement
-          // as before).
-          InstAssign *PhiAssign = PhiAssigns[i];
-          PhiAssign->setDeleted();
-          lowerAssign(PhiAssign);
-          Context.advanceNext();
-        }
-        _br(CondX86::Br_e, NextBr->getTargetTrue(), NextBr->getTargetFalse());
-        // Skip over the old compare and branch, by deleting them.
-        NextCmp->setDeleted();
-        NextBr->setDeleted();
-        Context.advanceNext();
-        Context.advanceNext();
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerAtomicRMW(Variable *Dest,
-                                               uint32_t Operation, Operand *Ptr,
-                                               Operand *Val) {
-  bool NeedsCmpxchg = false;
-  LowerBinOp Op_Lo = nullptr;
-  LowerBinOp Op_Hi = nullptr;
-  switch (Operation) {
-  default:
-    Func->setError("Unknown AtomicRMW operation");
-    return;
-  case Intrinsics::AtomicAdd: {
-    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
-      // All the fall-through paths must set this to true; it is only used
-      // for asserting.
-      NeedsCmpxchg = true;
-      Op_Lo = &TargetX86Base<TraitsType>::_add;
-      Op_Hi = &TargetX86Base<TraitsType>::_adc;
-      break;
-    }
-    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
-    constexpr bool Locked = true;
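-    // lock xadd atomically adds T into [Addr] and leaves the previous memory
-    // value in T, which is exactly the RMW result to return.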
-    Variable *T = nullptr;
-    _mov(T, Val);
-    _xadd(Addr, T, Locked);
-    _mov(Dest, T);
-    return;
-  }
-  case Intrinsics::AtomicSub: {
-    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
-      NeedsCmpxchg = true;
-      Op_Lo = &TargetX86Base<TraitsType>::_sub;
-      Op_Hi = &TargetX86Base<TraitsType>::_sbb;
-      break;
-    }
-    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
-    constexpr bool Locked = true;
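-    // There is no locked "xsub", so negate the value and reuse lock xadd;
-    // T still comes back holding the original memory contents.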
-    Variable *T = nullptr;
-    _mov(T, Val);
-    _neg(T);
-    _xadd(Addr, T, Locked);
-    _mov(Dest, T);
-    return;
-  }
-  case Intrinsics::AtomicOr:
-    // TODO(jvoung): If Dest is null or dead, then some of these
-    // operations do not need an "exchange", but just a locked op.
-    // That appears to be "worth" it for sub, or, and, and xor.
-    // xadd is probably fine vs lock add for add, and xchg is fine
-    // vs an atomic store.
-    NeedsCmpxchg = true;
-    Op_Lo = &TargetX86Base<TraitsType>::_or;
-    Op_Hi = &TargetX86Base<TraitsType>::_or;
-    break;
-  case Intrinsics::AtomicAnd:
-    NeedsCmpxchg = true;
-    Op_Lo = &TargetX86Base<TraitsType>::_and;
-    Op_Hi = &TargetX86Base<TraitsType>::_and;
-    break;
-  case Intrinsics::AtomicXor:
-    NeedsCmpxchg = true;
-    Op_Lo = &TargetX86Base<TraitsType>::_xor;
-    Op_Hi = &TargetX86Base<TraitsType>::_xor;
-    break;
-  case Intrinsics::AtomicExchange:
-    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
-      NeedsCmpxchg = true;
-      // NeedsCmpxchg is set, but no real Op_Lo/Op_Hi needs to be performed.
-      // The values just need to be moved to the ecx and ebx registers.
-      Op_Lo = nullptr;
-      Op_Hi = nullptr;
-      break;
-    }
-    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
-    Variable *T = nullptr;
-    _mov(T, Val);
-    _xchg(Addr, T);
-    _mov(Dest, T);
-    return;
-  }
-  // Otherwise, we need a cmpxchg loop.
-  (void)NeedsCmpxchg;
-  assert(NeedsCmpxchg);
-  expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
-}
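-
-// Illustrative sketch (not from the original source): for a 32-bit
-// Intrinsics::AtomicAdd the fast path above emits roughly
-//   mov       T, Val
-//   lock xadd [Addr], T   ; T receives the old memory value
-//   mov       Dest, T
-// so Dest observes the value from *before* the addition, as the intrinsic
-// requires.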
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo,
-                                                         LowerBinOp Op_Hi,
-                                                         Variable *Dest,
-                                                         Operand *Ptr,
-                                                         Operand *Val) {
-  // Expand a more complex RMW operation as a cmpxchg loop:
-  // For 64-bit:
-  //   mov     eax, [ptr]
-  //   mov     edx, [ptr + 4]
-  // .LABEL:
-  //   mov     ebx, eax
-  //   <Op_Lo> ebx, <desired_adj_lo>
-  //   mov     ecx, edx
-  //   <Op_Hi> ecx, <desired_adj_hi>
-  //   lock cmpxchg8b [ptr]
-  //   jne     .LABEL
-  //   mov     <dest_lo>, eax
-  //   mov     <dest_hi>, edx
-  //
-  // For 32-bit:
-  //   mov     eax, [ptr]
-  // .LABEL:
-  //   mov     <reg>, eax
-  //   op      <reg>, [desired_adj]
-  //   lock cmpxchg [ptr], <reg>
-  //   jne     .LABEL
-  //   mov     <dest>, eax
-  //
-  // If Op_{Lo,Hi} are nullptr, then just copy the value.
-  Val = legalize(Val);
-  Type Ty = Val->getType();
-  if (!Traits::Is64Bit && Ty == IceType_i64) {
-    Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
-    Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
-    X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
-    _mov(T_eax, loOperand(Addr));
-    _mov(T_edx, hiOperand(Addr));
-    Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
-    Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
-    InstX86Label *Label = InstX86Label::create(Func, this);
-    const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr;
-    if (!IsXchg8b) {
-      Context.insert(Label);
-      _mov(T_ebx, T_eax);
-      (this->*Op_Lo)(T_ebx, loOperand(Val));
-      _mov(T_ecx, T_edx);
-      (this->*Op_Hi)(T_ecx, hiOperand(Val));
-    } else {
-      // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
-      // It just needs the Val loaded into ebx and ecx.
-      // That can also be done before the loop.
-      _mov(T_ebx, loOperand(Val));
-      _mov(T_ecx, hiOperand(Val));
-      Context.insert(Label);
-    }
-    constexpr bool Locked = true;
-    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
-    _br(CondX86::Br_ne, Label);
-    if (!IsXchg8b) {
-      // If Val is a variable, model the extended live range of Val through
-      // the end of the loop, since it will be re-used by the loop.
-      if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
-        auto *ValLo = llvm::cast<Variable>(loOperand(ValVar));
-        auto *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
-        Context.insert<InstFakeUse>(ValLo);
-        Context.insert<InstFakeUse>(ValHi);
-      }
-    } else {
-      // For xchg, the loop is slightly smaller and ebx/ecx are used.
-      Context.insert<InstFakeUse>(T_ebx);
-      Context.insert<InstFakeUse>(T_ecx);
-    }
-    // The address base (if any) is also reused in the loop.
-    if (Variable *Base = Addr->getBase())
-      Context.insert<InstFakeUse>(Base);
-    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-    _mov(DestLo, T_eax);
-    _mov(DestHi, T_edx);
-    return;
-  }
-  X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
-  RegNumT Eax;
-  switch (Ty) {
-  default:
-    llvm::report_fatal_error("Bad type for atomicRMW");
-  case IceType_i64:
-    Eax = Traits::getRaxOrDie();
-    break;
-  case IceType_i32:
-    Eax = Traits::RegisterSet::Reg_eax;
-    break;
-  case IceType_i16:
-    Eax = Traits::RegisterSet::Reg_ax;
-    break;
-  case IceType_i8:
-    Eax = Traits::RegisterSet::Reg_al;
-    break;
-  }
-  Variable *T_eax = makeReg(Ty, Eax);
-  _mov(T_eax, Addr);
-  auto *Label = Context.insert<InstX86Label>(this);
-  // We want T to be assigned a register other than Eax, so don't use
-  // _mov(T == nullptr, T_eax).
-  Variable *T = makeReg(Ty);
-  _mov(T, T_eax);
-  (this->*Op_Lo)(T, Val);
-  constexpr bool Locked = true;
-  _cmpxchg(Addr, T_eax, T, Locked);
-  _br(CondX86::Br_ne, Label);
-  // If Val is a variable, model the extended live range of Val through
-  // the end of the loop, since it will be re-used by the loop.
-  if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
-    Context.insert<InstFakeUse>(ValVar);
-  }
-  // The address base (if any) is also reused in the loop.
-  if (Variable *Base = Addr->getBase())
-    Context.insert<InstFakeUse>(Base);
-  _mov(Dest, T_eax);
-}
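-
-// A concrete instance of the loop above (added for illustration), for a
-// 32-bit Intrinsics::AtomicAnd with value V:
-//   mov          eax, [ptr]
-// .L:
-//   mov          T, eax
-//   and          T, V
-//   lock cmpxchg [ptr], T  ; succeeds only if [ptr] still equals eax
-//   jne          .L        ; on failure eax is reloaded with [ptr]
-//   mov          Dest, eax ; the old value, i.e. the intrinsic's result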
-
-/// Lowers count {trailing, leading} zeros intrinsic.
-///
-/// We could do constant folding here, but that should have
-/// been done by the front-end/middle-end optimizations.
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerCountZeros(bool Cttz, Type Ty,
-                                                Variable *Dest,
-                                                Operand *FirstVal,
-                                                Operand *SecondVal) {
-  // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
-  // Then the instructions will handle the Val == 0 case much more simply
-  // and won't require conversion from bit position to number of zeros.
-  //
-  // Otherwise:
-  //   bsr IF_NOT_ZERO, Val
-  //   mov T_DEST, ((Ty == i32) ? 63 : 127)
-  //   cmovne T_DEST, IF_NOT_ZERO
-  //   xor T_DEST, ((Ty == i32) ? 31 : 63)
-  //   mov DEST, T_DEST
-  //
-  // NOTE: T_DEST must be a register because cmov requires its dest to be a
-  // register. Also, bsf and bsr require their dest to be a register.
-  //
-  // The xor DEST, C(31|63) converts a bit position to # of leading zeroes.
-  // E.g., for 000... 00001100, bsr will say that the most significant bit
-  // set is at position 3, while the number of leading zeros is 28. Xor is
-  // like (M - N) for N <= M, and converts 63 to 32, and 127 to 64 (for the
-  // all-zeros case).
-  //
-  // X8632 only: Similar for 64-bit, but start w/ speculating that the upper 32
-  // bits are all zero, and compute the result for that case (checking the
-  // lower 32 bits). Then actually compute the result for the upper bits and
-  // cmov in the result from the lower computation if the earlier speculation
-  // was correct.
-  //
-  // Cttz is similar, but uses bsf instead, doesn't require the xor
-  // bit position conversion, and the speculation is reversed.
-
-  // TODO(jpp): refactor this method.
-  assert(Ty == IceType_i32 || Ty == IceType_i64);
-  const Type DestTy = Traits::Is64Bit ? Dest->getType() : IceType_i32;
-  Variable *T = makeReg(DestTy);
-  Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
-  if (Cttz) {
-    _bsf(T, FirstValRM);
-  } else {
-    _bsr(T, FirstValRM);
-  }
-  Variable *T_Dest = makeReg(DestTy);
-  Constant *_31 = Ctx->getConstantInt32(31);
-  Constant *_32 = Ctx->getConstantInt(DestTy, 32);
-  Constant *_63 = Ctx->getConstantInt(DestTy, 63);
-  Constant *_64 = Ctx->getConstantInt(DestTy, 64);
-  if (Cttz) {
-    if (DestTy == IceType_i64) {
-      _mov(T_Dest, _64);
-    } else {
-      _mov(T_Dest, _32);
-    }
-  } else {
-    Constant *_127 = Ctx->getConstantInt(DestTy, 127);
-    if (DestTy == IceType_i64) {
-      _mov(T_Dest, _127);
-    } else {
-      _mov(T_Dest, _63);
-    }
-  }
-  _cmov(T_Dest, T, CondX86::Br_ne);
-  if (!Cttz) {
-    if (DestTy == IceType_i64) {
-      // Even though there's a _63 available at this point, that constant might
-      // not be an i32, which will cause the xor emission to fail.
-      Constant *_63 = Ctx->getConstantInt32(63);
-      _xor(T_Dest, _63);
-    } else {
-      _xor(T_Dest, _31);
-    }
-  }
-  if (Traits::Is64Bit || Ty == IceType_i32) {
-    _mov(Dest, T_Dest);
-    return;
-  }
-  _add(T_Dest, _32);
-  auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-  auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-  // Will be using "test" on this, so we need a registerized variable.
-  Variable *SecondVar = legalizeToReg(SecondVal);
-  Variable *T_Dest2 = makeReg(IceType_i32);
-  if (Cttz) {
-    _bsf(T_Dest2, SecondVar);
-  } else {
-    _bsr(T_Dest2, SecondVar);
-    _xor(T_Dest2, _31);
-  }
-  _test(SecondVar, SecondVar);
-  _cmov(T_Dest2, T_Dest, CondX86::Br_e);
-  _mov(DestLo, T_Dest2);
-  _mov(DestHi, Ctx->getConstantZero(IceType_i32));
-}
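-
-// Worked example (added for illustration): ctlz of the 32-bit value
-// 0x0000000C. bsr reports bit position 3; cmovne selects 3 over the
-// speculative 63; and 3 ^ 31 == 28, the correct number of leading zeros.
-// For input 0, ZF is set, cmovne leaves the 63 in place, and 63 ^ 31 == 32
-// as required.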
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::typedLoad(Type Ty, Variable *Dest,
-                                          Variable *Base, Constant *Offset) {
-  // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
-  // legalize Mem properly.
-  if (Offset)
-    assert(!llvm::isa<ConstantRelocatable>(Offset));
-
-  auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
-
-  if (isVectorType(Ty))
-    _movp(Dest, Mem);
-  else if (Ty == IceType_f64)
-    _movq(Dest, Mem);
-  else
-    _mov(Dest, Mem);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::typedStore(Type Ty, Variable *Value,
-                                           Variable *Base, Constant *Offset) {
-  // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
-  // legalize Mem properly.
-  if (Offset)
-    assert(!llvm::isa<ConstantRelocatable>(Offset));
-
-  auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
-
-  if (isVectorType(Ty))
-    _storep(Value, Mem);
-  else if (Ty == IceType_f64)
-    _storeq(Value, Mem);
-  else
-    _store(Value, Mem);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::copyMemory(Type Ty, Variable *Dest,
-                                           Variable *Src, int32_t OffsetAmt) {
-  Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
-  // TODO(ascull): this or add nullptr test to _movp, _movq
-  Variable *Data = makeReg(Ty);
-
-  typedLoad(Ty, Data, Src, Offset);
-  typedStore(Ty, Data, Dest, Offset);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerMemcpy(Operand *Dest, Operand *Src,
-                                            Operand *Count) {
-  // There is a load and store for each chunk in the unroll
-  constexpr uint32_t BytesPerStorep = 16;
-
-  // Check if the operands are constants
-  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
-  const bool IsCountConst = CountConst != nullptr;
-  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
-
-  if (shouldOptimizeMemIntrins() && IsCountConst &&
-      CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) {
-    // Unlikely, but nothing to do if it does happen
-    if (CountValue == 0)
-      return;
-
-    Variable *SrcBase = legalizeToReg(Src);
-    Variable *DestBase = legalizeToReg(Dest);
-
-    // Find the largest type that can be used and use it as much as possible in
-    // reverse order. Then handle any remainder with overlapping copies. Since
-    // the remainder will be at the end, there will be reduced pressure on the
-    // memory unit as the accesses to the same memory are far apart.
-    Type Ty = largestTypeInSize(CountValue);
-    uint32_t TyWidth = typeWidthInBytes(Ty);
-
-    uint32_t RemainingBytes = CountValue;
-    int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
-    while (RemainingBytes >= TyWidth) {
-      copyMemory(Ty, DestBase, SrcBase, Offset);
-      RemainingBytes -= TyWidth;
-      Offset -= TyWidth;
-    }
-
-    if (RemainingBytes == 0)
-      return;
-
-    // Lower the remaining bytes. Adjust to larger types in order to make use
-    // of overlaps in the copies.
-    Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
-    Offset = CountValue - typeWidthInBytes(LeftOverTy);
-    copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
-    return;
-  }
-
-  // Fall back on a function call
-  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memcpy, nullptr, 3);
-  Call->addArg(Dest);
-  Call->addArg(Src);
-  Call->addArg(Count);
-  lowerCall(Call);
-}
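-
-// Illustrative sketch (assuming largestTypeInSize(30) and
-// firstTypeThatFitsSize(14) both resolve to the 16-byte vector type): a
-// constant 30-byte memcpy unrolls into two overlapping 16-byte copies,
-//   copyMemory(v16i8, DestBase, SrcBase, /*OffsetAmt=*/0);  // bytes [0,16)
-//   copyMemory(v16i8, DestBase, SrcBase, /*OffsetAmt=*/14); // bytes [14,30)
-// where the overlap in [14,16) is harmless because both copies write the
-// same source bytes there.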
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerMemmove(Operand *Dest, Operand *Src,
-                                             Operand *Count) {
-  // There is a load and store for each chunk in the unroll
-  constexpr uint32_t BytesPerStorep = 16;
-
-  // Check if the operands are constants
-  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
-  const bool IsCountConst = CountConst != nullptr;
-  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
-
-  if (shouldOptimizeMemIntrins() && IsCountConst &&
-      CountValue <= BytesPerStorep * Traits::MEMMOVE_UNROLL_LIMIT) {
-    // Unlikely, but nothing to do if it does happen
-    if (CountValue == 0)
-      return;
-
-    Variable *SrcBase = legalizeToReg(Src);
-    Variable *DestBase = legalizeToReg(Dest);
-
-    std::tuple<Type, Constant *, Variable *>
-        Moves[Traits::MEMMOVE_UNROLL_LIMIT];
-    Constant *Offset;
-    Variable *Reg;
-
-    // Copy the data into registers first, since the source and destination
-    // could overlap, and we must not clobber the memory. This also means
-    // overlapping moves can be used, as we are taking a safe snapshot of the
-    // memory.
-    Type Ty = largestTypeInSize(CountValue);
-    uint32_t TyWidth = typeWidthInBytes(Ty);
-
-    uint32_t RemainingBytes = CountValue;
-    int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
-    size_t N = 0;
-    while (RemainingBytes >= TyWidth) {
-      assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
-      Offset = Ctx->getConstantInt32(OffsetAmt);
-      Reg = makeReg(Ty);
-      typedLoad(Ty, Reg, SrcBase, Offset);
-      RemainingBytes -= TyWidth;
-      OffsetAmt -= TyWidth;
-      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
-    }
-
-    if (RemainingBytes != 0) {
-      // Lower the remaining bytes. Adjust to larger types in order to make use
-      // of overlaps in the copies.
-      assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
-      Ty = firstTypeThatFitsSize(RemainingBytes);
-      Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
-      Reg = makeReg(Ty);
-      typedLoad(Ty, Reg, SrcBase, Offset);
-      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
-    }
-
-    // Copy the data out into the destination memory
-    for (size_t i = 0; i < N; ++i) {
-      std::tie(Ty, Offset, Reg) = Moves[i];
-      typedStore(Ty, Reg, DestBase, Offset);
-    }
-
-    return;
-  }
-
-  // Fall back on a function call
-  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memmove, nullptr, 3);
-  Call->addArg(Dest);
-  Call->addArg(Src);
-  Call->addArg(Count);
-  lowerCall(Call);
-}
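-
-// For example (illustration only, same 16-byte type assumptions as for
-// memcpy above): a constant 30-byte memmove first loads bytes [0,16) and
-// [14,30) into two registers, and only afterwards stores both to the
-// destination. Because every load is issued before any store, an
-// overlapping source and destination cannot clobber the snapshot.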
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerMemset(Operand *Dest, Operand *Val,
-                                            Operand *Count) {
-  constexpr uint32_t BytesPerStorep = 16;
-  constexpr uint32_t BytesPerStoreq = 8;
-  constexpr uint32_t BytesPerStorei32 = 4;
-  assert(Val->getType() == IceType_i8);
-
-  // Check if the operands are constants
-  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
-  const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
-  const bool IsCountConst = CountConst != nullptr;
-  const bool IsValConst = ValConst != nullptr;
-  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
-  const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
-
-  // Unlikely, but nothing to do if it does happen
-  if (IsCountConst && CountValue == 0)
-    return;
-
-  // TODO(ascull): if the count is constant but val is not it would be possible
-  // to inline by spreading the value across 4 bytes and accessing subregs e.g.
-  // eax, ax and al.
-  if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
-    Variable *Base = nullptr;
-    Variable *VecReg = nullptr;
-    const uint32_t MaskValue = (ValValue & 0xff);
-    const uint32_t SpreadValue =
-        (MaskValue << 24) | (MaskValue << 16) | (MaskValue << 8) | MaskValue;
-
-    auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
-                                                        uint32_t OffsetAmt) {
-      assert(Base != nullptr);
-      Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
-
-      // TODO(ascull): is 64-bit better with vector or scalar movq?
-      auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
-      if (isVectorType(Ty)) {
-        assert(VecReg != nullptr);
-        _storep(VecReg, Mem);
-      } else if (Ty == IceType_f64) {
-        assert(VecReg != nullptr);
-        _storeq(VecReg, Mem);
-      } else {
-        assert(Ty != IceType_i64);
-        _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
-      }
-    };
-
-    // Find the largest type that can be used and use it as much as possible in
-    // reverse order. Then handle any remainder with overlapping copies. Since
-    // the remainder will be at the end, there will be reduced pressure on the
-    // memory unit as the accesses to the same memory are far apart.
-    Type Ty = IceType_void;
-    if (ValValue == 0 && CountValue >= BytesPerStoreq &&
-        CountValue <= BytesPerStorep * Traits::MEMSET_UNROLL_LIMIT) {
-      // When the value is zero it can be loaded into a vector register cheaply
-      // using the xor trick.
-      Base = legalizeToReg(Dest);
-      VecReg = makeVectorOfZeros(IceType_v16i8);
-      Ty = largestTypeInSize(CountValue);
-    } else if (CountValue <= BytesPerStorei32 * Traits::MEMSET_UNROLL_LIMIT) {
-      // When the value is non-zero or the count is small we can't use vector
-      // instructions so are limited to 32-bit stores.
-      Base = legalizeToReg(Dest);
-      constexpr uint32_t MaxSize = 4;
-      Ty = largestTypeInSize(CountValue, MaxSize);
-    }
-
-    if (Base) {
-      uint32_t TyWidth = typeWidthInBytes(Ty);
-
-      uint32_t RemainingBytes = CountValue;
-      uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
-      while (RemainingBytes >= TyWidth) {
-        lowerSet(Ty, Offset);
-        RemainingBytes -= TyWidth;
-        Offset -= TyWidth;
-      }
-
-      if (RemainingBytes == 0)
-        return;
-
-      // Lower the remaining bytes. Adjust to larger types in order to make use
-      // of overlaps in the copies.
-      Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
-      Offset = CountValue - typeWidthInBytes(LeftOverTy);
-      lowerSet(LeftOverTy, Offset);
-      return;
-    }
-  }
-
-  // Fall back on calling the memset function. The value operand needs to be
-  // extended to a stack slot size because the PNaCl ABI requires arguments to
-  // be at least 32 bits wide.
-  Operand *ValExt;
-  if (IsValConst) {
-    ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
-  } else {
-    Variable *ValExtVar = Func->makeVariable(stackSlotType());
-    lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val));
-    ValExt = ValExtVar;
-  }
-  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memset, nullptr, 3);
-  Call->addArg(Dest);
-  Call->addArg(ValExt);
-  Call->addArg(Count);
-  lowerCall(Call);
-}
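-
-// Illustrative sketch (assuming MEMSET_UNROLL_LIMIT covers two stores): a
-// memset of 8 bytes with constant value 0xAB spreads the byte into
-// SpreadValue == 0xABABABAB and emits two dword stores, at offsets 4 and
-// then 0:
-//   mov dword ptr [Base + 4], 0xABABABAB
-//   mov dword ptr [Base],     0xABABABAB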
-
-class AddressOptimizer {
-  AddressOptimizer() = delete;
-  AddressOptimizer(const AddressOptimizer &) = delete;
-  AddressOptimizer &operator=(const AddressOptimizer &) = delete;
-
-public:
-  explicit AddressOptimizer(const Cfg *Func)
-      : Func(Func), VMetadata(Func->getVMetadata()) {}
-
-  inline void dumpAddressOpt(const ConstantRelocatable *const Relocatable,
-                             int32_t Offset, const Variable *Base,
-                             const Variable *Index, uint16_t Shift,
-                             const Inst *Reason) const;
-
-  inline const Inst *matchAssign(Variable **Var,
-                                 ConstantRelocatable **Relocatable,
-                                 int32_t *Offset);
-
-  inline const Inst *matchCombinedBaseIndex(Variable **Base, Variable **Index,
-                                            uint16_t *Shift);
-
-  inline const Inst *matchShiftedIndex(Variable **Index, uint16_t *Shift);
-
-  inline const Inst *matchOffsetIndexOrBase(Variable **IndexOrBase,
-                                            const uint16_t Shift,
-                                            ConstantRelocatable **Relocatable,
-                                            int32_t *Offset);
-
-private:
-  const Cfg *const Func;
-  const VariablesMetadata *const VMetadata;
-
-  static bool isAdd(const Inst *Instr) {
-    if (auto *Arith = llvm::dyn_cast_or_null<const InstArithmetic>(Instr)) {
-      return (Arith->getOp() == InstArithmetic::Add);
-    }
-    return false;
-  }
-};
-
-void AddressOptimizer::dumpAddressOpt(
-    const ConstantRelocatable *const Relocatable, int32_t Offset,
-    const Variable *Base, const Variable *Index, uint16_t Shift,
-    const Inst *Reason) const {
-  if (!BuildDefs::dump())
-    return;
-  if (!Func->isVerbose(IceV_AddrOpt))
-    return;
-  OstreamLocker L(Func->getContext());
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "Instruction: ";
-  Reason->dumpDecorated(Func);
-  Str << "  results in Base=";
-  if (Base)
-    Base->dump(Func);
-  else
-    Str << "<null>";
-  Str << ", Index=";
-  if (Index)
-    Index->dump(Func);
-  else
-    Str << "<null>";
-  Str << ", Shift=" << Shift << ", Offset=" << Offset
-      << ", Relocatable=" << Relocatable << "\n";
-}
-
-const Inst *AddressOptimizer::matchAssign(Variable **Var,
-                                          ConstantRelocatable **Relocatable,
-                                          int32_t *Offset) {
-  // Var originates from Var=SrcVar ==> set Var:=SrcVar
-  if (*Var == nullptr)
-    return nullptr;
-  if (const Inst *VarAssign = VMetadata->getSingleDefinition(*Var)) {
-    assert(!VMetadata->isMultiDef(*Var));
-    if (llvm::isa<InstAssign>(VarAssign)) {
-      Operand *SrcOp = VarAssign->getSrc(0);
-      assert(SrcOp);
-      if (auto *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
-        if (!VMetadata->isMultiDef(SrcVar) &&
-            // TODO: ensure SrcVar stays single-BB
-            true) {
-          *Var = SrcVar;
-          return VarAssign;
-        }
-      } else if (auto *Const = llvm::dyn_cast<ConstantInteger32>(SrcOp)) {
-        int32_t MoreOffset = Const->getValue();
-        if (Utils::WouldOverflowAdd(*Offset, MoreOffset))
-          return nullptr;
-        *Var = nullptr;
-        *Offset += MoreOffset;
-        return VarAssign;
-      } else if (auto *AddReloc = llvm::dyn_cast<ConstantRelocatable>(SrcOp)) {
-        if (*Relocatable == nullptr) {
-          // It is always safe to fold a relocatable through assignment -- the
-          // assignment frees a slot in the address operand that can be used to
-          // hold the Sandbox Pointer -- if any.
-          *Var = nullptr;
-          *Relocatable = AddReloc;
-          return VarAssign;
-        }
-      }
-    }
-  }
-  return nullptr;
-}
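-
-// Example (added for illustration): with Base = %b where %b = %a, the match
-// substitutes %a for %b; where %b = 16 (an assignment from a constant),
-// Base becomes nullptr and 16 is folded into Offset instead.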
-
-const Inst *AddressOptimizer::matchCombinedBaseIndex(Variable **Base,
-                                                     Variable **Index,
-                                                     uint16_t *Shift) {
-  // Index==nullptr && Base is Base=Var1+Var2 ==>
-  //   set Base=Var1, Index=Var2, Shift=0
-  if (*Base == nullptr)
-    return nullptr;
-  if (*Index != nullptr)
-    return nullptr;
-  auto *BaseInst = VMetadata->getSingleDefinition(*Base);
-  if (BaseInst == nullptr)
-    return nullptr;
-  assert(!VMetadata->isMultiDef(*Base));
-  if (BaseInst->getSrcSize() < 2)
-    return nullptr;
-  if (auto *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
-    if (VMetadata->isMultiDef(Var1))
-      return nullptr;
-    if (auto *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
-      if (VMetadata->isMultiDef(Var2))
-        return nullptr;
-      if (isAdd(BaseInst) &&
-          // TODO: ensure Var1 and Var2 stay single-BB
-          true) {
-        *Base = Var1;
-        *Index = Var2;
-        *Shift = 0; // should already have been 0
-        return BaseInst;
-      }
-    }
-  }
-  return nullptr;
-}
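-
-// Example (added for illustration): with Index still empty and Base = %p
-// where %p = add i32 %a, %b, the match splits the add into Base = %a,
-// Index = %b, Shift = 0, folding the add into the address mode.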
-
-const Inst *AddressOptimizer::matchShiftedIndex(Variable **Index,
-                                                uint16_t *Shift) {
-  // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
-  //   Index=Var, Shift+=log2(Const)
-  if (*Index == nullptr)
-    return nullptr;
-  auto *IndexInst = VMetadata->getSingleDefinition(*Index);
-  if (IndexInst == nullptr)
-    return nullptr;
-  assert(!VMetadata->isMultiDef(*Index));
-
-  // When using an unsigned 32-bit array index on x64, it gets zero-extended
-  // before the shift & add. The explicit zero extension can be eliminated
-  // because x86 32-bit operations automatically get zero-extended into the
-  // corresponding 64-bit register.
-  if (auto *CastInst = llvm::dyn_cast<InstCast>(IndexInst)) {
-    if (CastInst->getCastKind() == InstCast::Zext) {
-      if (auto *Var = llvm::dyn_cast<Variable>(CastInst->getSrc(0))) {
-        if (Var->getType() == IceType_i32 &&
-            CastInst->getDest()->getType() == IceType_i64) {
-          IndexInst = VMetadata->getSingleDefinition(Var);
-        }
-      }
-    }
-  }
-
-  if (IndexInst->getSrcSize() < 2)
-    return nullptr;
-  if (auto *ArithInst = llvm::dyn_cast<InstArithmetic>(IndexInst)) {
-    if (auto *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
-      if (auto *Const =
-              llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
-        if (VMetadata->isMultiDef(Var) || Const->getType() != IceType_i32)
-          return nullptr;
-        switch (ArithInst->getOp()) {
-        default:
-          return nullptr;
-        case InstArithmetic::Mul: {
-          uint32_t Mult = Const->getValue();
-          uint32_t LogMult;
-          switch (Mult) {
-          case 1:
-            LogMult = 0;
-            break;
-          case 2:
-            LogMult = 1;
-            break;
-          case 4:
-            LogMult = 2;
-            break;
-          case 8:
-            LogMult = 3;
-            break;
-          default:
-            return nullptr;
-          }
-          if (*Shift + LogMult <= 3) {
-            *Index = Var;
-            *Shift += LogMult;
-            return IndexInst;
-          }
-        }
-        case InstArithmetic::Shl: {
-          uint32_t ShiftAmount = Const->getValue();
-          switch (ShiftAmount) {
-          case 0:
-          case 1:
-          case 2:
-          case 3:
-            break;
-          default:
-            return nullptr;
-          }
-          if (*Shift + ShiftAmount <= 3) {
-            *Index = Var;
-            *Shift += ShiftAmount;
-            return IndexInst;
-          }
-        }
-        }
-      }
-    }
-  }
-  return nullptr;
-}
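-
-// Example (added for illustration): given
-//   %i = mul i32 %j, 4
-// with Index = %i and Shift == 0, the match rewrites Index to %j and bumps
-// Shift to 2, so the final operand addresses [Base + %j*4] with no separate
-// multiply.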
-
-const Inst *AddressOptimizer::matchOffsetIndexOrBase(
-    Variable **IndexOrBase, const uint16_t Shift,
-    ConstantRelocatable **Relocatable, int32_t *Offset) {
-  // Base is Base=Var+Const || Base is Base=Const+Var ==>
-  //   set Base=Var, Offset+=Const
-  // Base is Base=Var-Const ==>
-  //   set Base=Var, Offset-=Const
-  // Index is Index=Var+Const ==>
-  //   set Index=Var, Offset+=(Const<<Shift)
-  // Index is Index=Const+Var ==>
-  //   set Index=Var, Offset+=(Const<<Shift)
-  // Index is Index=Var-Const ==>
-  //   set Index=Var, Offset-=(Const<<Shift)
-  // Treat Index=Var Or Const as Index=Var + Const
-  //    when Var = Var' << N and log2(Const) <= N
-  // or when Var = (2^M) * (2^N) and log2(Const) <= (M+N)
-
-  if (*IndexOrBase == nullptr) {
-    return nullptr;
-  }
-  const Inst *Definition = VMetadata->getSingleDefinition(*IndexOrBase);
-  if (Definition == nullptr) {
-    return nullptr;
-  }
-  assert(!VMetadata->isMultiDef(*IndexOrBase));
-  if (auto *ArithInst = llvm::dyn_cast<const InstArithmetic>(Definition)) {
-    switch (ArithInst->getOp()) {
-    case InstArithmetic::Add:
-    case InstArithmetic::Sub:
-    case InstArithmetic::Or:
-      break;
-    default:
-      return nullptr;
-    }
-
-    Operand *Src0 = ArithInst->getSrc(0);
-    Operand *Src1 = ArithInst->getSrc(1);
-    auto *Var0 = llvm::dyn_cast<Variable>(Src0);
-    auto *Var1 = llvm::dyn_cast<Variable>(Src1);
-    auto *Const0 = llvm::dyn_cast<ConstantInteger32>(Src0);
-    auto *Const1 = llvm::dyn_cast<ConstantInteger32>(Src1);
-    auto *Reloc0 = llvm::dyn_cast<ConstantRelocatable>(Src0);
-    auto *Reloc1 = llvm::dyn_cast<ConstantRelocatable>(Src1);
-
-    bool IsAdd = false;
-    if (ArithInst->getOp() == InstArithmetic::Or) {
-      Variable *Var = nullptr;
-      ConstantInteger32 *Const = nullptr;
-      if (Var0 && Const1) {
-        Var = Var0;
-        Const = Const1;
-      } else if (Const0 && Var1) {
-        Var = Var1;
-        Const = Const0;
-      } else {
-        return nullptr;
-      }
-      // getSingleDefinition may return nullptr, so dyn_cast_or_null is
-      // required here.
-      auto *VarDef = llvm::dyn_cast_or_null<InstArithmetic>(
-          VMetadata->getSingleDefinition(Var));
-      if (VarDef == nullptr)
-        return nullptr;
-
-      SizeT ZeroesAvailable = 0;
-      if (VarDef->getOp() == InstArithmetic::Shl) {
-        if (auto *ConstInt =
-                llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
-          ZeroesAvailable = ConstInt->getValue();
-        }
-      } else if (VarDef->getOp() == InstArithmetic::Mul) {
-        SizeT PowerOfTwo = 0;
-        if (auto *MultConst =
-                llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(0))) {
-          if (llvm::isPowerOf2_32(MultConst->getValue())) {
-            PowerOfTwo += MultConst->getValue();
-          }
-        }
-        if (auto *MultConst =
-                llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
-          if (llvm::isPowerOf2_32(MultConst->getValue())) {
-            PowerOfTwo += MultConst->getValue();
-          }
-        }
-        ZeroesAvailable = llvm::Log2_32(PowerOfTwo) + 1;
-      }
-      SizeT ZeroesNeeded = llvm::Log2_32(Const->getValue()) + 1;
-      if (ZeroesNeeded == 0 || ZeroesNeeded > ZeroesAvailable)
-        return nullptr;
-      IsAdd = true; // treat it as an add if the above conditions hold
-    } else {
-      IsAdd = ArithInst->getOp() == InstArithmetic::Add;
-    }
-
-    Variable *NewIndexOrBase = nullptr;
-    int32_t NewOffset = 0;
-    ConstantRelocatable *NewRelocatable = *Relocatable;
-    if (Var0 && Var1)
-      // TODO(sehr): merge base/index splitting into here.
-      return nullptr;
-    if (!IsAdd && Var1)
-      return nullptr;
-    if (Var0)
-      NewIndexOrBase = Var0;
-    else if (Var1)
-      NewIndexOrBase = Var1;
-    // Don't know how to add/subtract two relocatables.
-    if ((*Relocatable && (Reloc0 || Reloc1)) || (Reloc0 && Reloc1))
-      return nullptr;
-    // Don't know how to subtract a relocatable.
-    if (!IsAdd && Reloc1)
-      return nullptr;
-    // Incorporate ConstantRelocatables.
-    if (Reloc0)
-      NewRelocatable = Reloc0;
-    else if (Reloc1)
-      NewRelocatable = Reloc1;
-    // Compute the updated constant offset.
-    if (Const0) {
-      const int32_t MoreOffset =
-          IsAdd ? Const0->getValue() : -Const0->getValue();
-      if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
-        return nullptr;
-      NewOffset += MoreOffset;
-    }
-    if (Const1) {
-      const int32_t MoreOffset =
-          IsAdd ? Const1->getValue() : -Const1->getValue();
-      if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
-        return nullptr;
-      NewOffset += MoreOffset;
-    }
-    if (Utils::WouldOverflowAdd(*Offset, NewOffset << Shift))
-      return nullptr;
-    *IndexOrBase = NewIndexOrBase;
-    *Offset += (NewOffset << Shift);
-    // Shift is always zero if this is called with the base
-    *Relocatable = NewRelocatable;
-    return Definition;
-  }
-  return nullptr;
-}
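-
-// Example (illustration only): with Index = %t where %t = add i32 %a, 3 and
-// Shift == 1, the match sets Index = %a and folds 3 << 1 == 6 into Offset,
-// turning [Base + %t*2] into [Base + %a*2 + 6].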
-
-template <typename TraitsType>
-typename TargetX86Base<TraitsType>::X86OperandMem *
-TargetX86Base<TraitsType>::computeAddressOpt(const Inst *Instr, Type MemType,
-                                             Operand *Addr) {
-  Func->resetCurrentNode();
-  if (Func->isVerbose(IceV_AddrOpt)) {
-    OstreamLocker L(Func->getContext());
-    Ostream &Str = Func->getContext()->getStrDump();
-    Str << "\nStarting computeAddressOpt for instruction:\n  ";
-    Instr->dumpDecorated(Func);
-  }
-
-  OptAddr NewAddr;
-  NewAddr.Base = llvm::dyn_cast<Variable>(Addr);
-  if (NewAddr.Base == nullptr)
-    return nullptr;
-
-  // If the Base has more than one use or is live across multiple blocks, then
-  // don't go further. Alternatively (?), never consider a transformation that
-  // would change a variable that is currently *not* live across basic block
-  // boundaries into one that *is*.
-  if (!getFlags().getLoopInvariantCodeMotion()) {
-    // Need multi block address opt when licm is enabled.
-    // Might make sense to restrict to current node and loop header.
-    if (Func->getVMetadata()->isMultiBlock(
-            NewAddr.Base) /* || Base->getUseCount() > 1*/)
-      return nullptr;
-  }
-  AddressOptimizer AddrOpt(Func);
-  const bool MockBounds = getFlags().getMockBoundsCheck();
-  const Inst *Reason = nullptr;
-  bool AddressWasOptimized = false;
-  // The following unnamed struct identifies the address mode formation steps
-  // that could potentially create an invalid memory operand (i.e., no free
-  // slots for RebasePtr.) We add all those variables to this struct so that we
-  // can use memset() to reset all members to false.
-  struct {
-    bool AssignBase = false;
-    bool AssignIndex = false;
-    bool OffsetFromBase = false;
-    bool OffsetFromIndex = false;
-    bool CombinedBaseIndex = false;
-  } Skip;
-  // NewAddrCheckpoint is used to rollback the address being formed in case an
-  // invalid address is formed.
-  OptAddr NewAddrCheckpoint;
-  Reason = Instr;
-  do {
-    if (Reason) {
-      AddrOpt.dumpAddressOpt(NewAddr.Relocatable, NewAddr.Offset, NewAddr.Base,
-                             NewAddr.Index, NewAddr.Shift, Reason);
-      AddressWasOptimized = true;
-      Reason = nullptr;
-      memset(reinterpret_cast<void *>(&Skip), 0, sizeof(Skip));
-    }
-
-    NewAddrCheckpoint = NewAddr;
-
-    // Update Base and Index to follow through assignments to definitions.
-    if (!Skip.AssignBase &&
-        (Reason = AddrOpt.matchAssign(&NewAddr.Base, &NewAddr.Relocatable,
-                                      &NewAddr.Offset))) {
-      // Assignments of Base from a Relocatable or ConstantInt32 can result
-      // in Base becoming nullptr.  To avoid code duplication in this loop we
-      // prefer that Base be non-nullptr if possible.
-      if ((NewAddr.Base == nullptr) && (NewAddr.Index != nullptr) &&
-          NewAddr.Shift == 0) {
-        std::swap(NewAddr.Base, NewAddr.Index);
-      }
-      continue;
-    }
-    if (!Skip.AssignIndex &&
-        (Reason = AddrOpt.matchAssign(&NewAddr.Index, &NewAddr.Relocatable,
-                                      &NewAddr.Offset))) {
-      continue;
-    }
-
-    if (!MockBounds) {
-      // Transition from:
-      //   <Relocatable + Offset>(Base) to
-      //   <Relocatable + Offset>(Base, Index)
-      if (!Skip.CombinedBaseIndex &&
-          (Reason = AddrOpt.matchCombinedBaseIndex(
-               &NewAddr.Base, &NewAddr.Index, &NewAddr.Shift))) {
-        continue;
-      }
-
-      // Recognize multiply/shift and update Shift amount.
-      // Index becomes Index=Var<<Const && Const+Shift<=3 ==>
-      //   Index=Var, Shift+=Const
-      // Index becomes Index=Const*Var && log2(Const)+Shift<=3 ==>
-      //   Index=Var, Shift+=log2(Const)
-      if ((Reason =
-               AddrOpt.matchShiftedIndex(&NewAddr.Index, &NewAddr.Shift))) {
-        continue;
-      }
-
-      // If Shift is zero, the choice of Base and Index was purely arbitrary.
-      // Recognize multiply/shift and set Shift amount.
-      // Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
-      //   swap(Index,Base)
-      // Similar for Base=Const*Var and Base=Var<<Const
-      if (NewAddr.Shift == 0 &&
-          (Reason = AddrOpt.matchShiftedIndex(&NewAddr.Base, &NewAddr.Shift))) {
-        std::swap(NewAddr.Base, NewAddr.Index);
-        continue;
-      }
-    }
-
-    // Update Offset to reflect additions/subtractions with constants and
-    // relocatables.
-    // TODO: consider overflow issues with respect to Offset.
-    if (!Skip.OffsetFromBase && (Reason = AddrOpt.matchOffsetIndexOrBase(
-                                     &NewAddr.Base, /*Shift =*/0,
-                                     &NewAddr.Relocatable, &NewAddr.Offset))) {
-      continue;
-    }
-    if (!Skip.OffsetFromIndex && (Reason = AddrOpt.matchOffsetIndexOrBase(
-                                      &NewAddr.Index, NewAddr.Shift,
-                                      &NewAddr.Relocatable, &NewAddr.Offset))) {
-      continue;
-    }
-
-    break;
-  } while (Reason);
-
-  if (!AddressWasOptimized) {
-    return nullptr;
-  }
-
-  Constant *OffsetOp = nullptr;
-  if (NewAddr.Relocatable == nullptr) {
-    OffsetOp = Ctx->getConstantInt32(NewAddr.Offset);
-  } else {
-    OffsetOp =
-        Ctx->getConstantSym(NewAddr.Relocatable->getOffset() + NewAddr.Offset,
-                            NewAddr.Relocatable->getName());
-  }
-  // Vanilla ICE load instructions should not use the segment registers, and
-  // computeAddressOpt only works at the level of Variables and Constants, not
-  // other X86OperandMem, so there should be no mention of segment
-  // registers there either.
-  static constexpr auto SegmentReg =
-      X86OperandMem::SegmentRegisters::DefaultSegment;
-
-  return X86OperandMem::create(Func, MemType, NewAddr.Base, OffsetOp,
-                               NewAddr.Index, NewAddr.Shift, SegmentReg);
-}
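-
-// Putting the matchers together on a hypothetical input (all variables
-// single-def and single-block):
-//   %o = shl i32 %i, 2
-//   %p = add i32 %arr, %o
-//   load [%p]
-// the loop first splits %p into Base = %arr, Index = %o, then folds the
-// shift to leave Base = %arr, Index = %i, Shift = 2, i.e. the single
-// operand [%arr + %i*4].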
-
-/// Add a mock bounds check on the memory address before using it as a load or
-/// store operand.  The basic idea is that given a memory operand [reg], we
-/// would first add bounds-check code something like:
-///
-///   cmp reg, <lb>
-///   jl out_of_line_error
-///   cmp reg, <ub>
-///   jg out_of_line_error
-///
-/// In reality, the specific code will depend on how <lb> and <ub> are
-/// represented, e.g. an immediate, a global, or a function argument.
-///
-/// As such, we need to enforce that the memory operand does not have the form
-/// [reg1+reg2], because then there is no simple cmp instruction that would
-/// suffice.  However, we consider [reg+offset] to be OK because the offset is
-/// usually small, and so <ub> could have a safety buffer built in and then we
-/// could instead branch to a custom out_of_line_error that does the precise
-/// check and jumps back if it turns out OK.
-///
-/// For the purpose of mocking the bounds check, we'll do something like this:
-///
-///   cmp reg, 0
-///   je label
-///   cmp reg, 1
-///   je label
-///   label:
-///
-/// Also note that we don't need to add a bounds check to a dereference of a
-/// simple global variable address.
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::doMockBoundsCheck(Operand *Opnd) {
-  if (!getFlags().getMockBoundsCheck())
-    return;
-  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd)) {
-    if (Mem->getIndex()) {
-      llvm::report_fatal_error("doMockBoundsCheck: Opnd contains index reg");
-    }
-    Opnd = Mem->getBase();
-  }
-  // At this point Opnd could be nullptr, or Variable, or Constant, or perhaps
-  // something else.  We only care if it is Variable.
-  auto *Var = llvm::dyn_cast_or_null<Variable>(Opnd);
-  if (Var == nullptr)
-    return;
-  // We use lowerStore() to copy out-args onto the stack.  This creates a memory
-  // operand with the stack pointer as the base register.  Don't do bounds
-  // checks on that.
-  if (Var->getRegNum() == getStackReg())
-    return;
-
-  auto *Label = InstX86Label::create(Func, this);
-  _cmp(Opnd, Ctx->getConstantZero(IceType_i32));
-  _br(CondX86::Br_e, Label);
-  _cmp(Opnd, Ctx->getConstantInt32(1));
-  _br(CondX86::Br_e, Label);
-  Context.insert(Label);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerLoad(const InstLoad *Load) {
-  // A Load instruction can be treated the same as an Assign instruction, after
-  // the source operand is transformed into an X86OperandMem operand.  Note that
-  // the address mode optimization already creates an X86OperandMem operand, so
-  // it doesn't need another level of transformation.
-  Variable *DestLoad = Load->getDest();
-  Type Ty = DestLoad->getType();
-  Operand *Src0 = formMemoryOperand(Load->getLoadAddress(), Ty);
-  doMockBoundsCheck(Src0);
-  auto *Assign = InstAssign::create(Func, DestLoad, Src0);
-  lowerAssign(Assign);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::doAddressOptOther() {
-  // Inverts some Icmp instructions which helps doAddressOptLoad later.
-  // TODO(manasijm): Refactor to unify the conditions for Var0 and Var1
-  Inst *Instr = iteratorToInst(Context.getCur());
-  auto *VMetadata = Func->getVMetadata();
-  if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Instr)) {
-    if (llvm::isa<Constant>(Icmp->getSrc(0)) ||
-        llvm::isa<Constant>(Icmp->getSrc(1)))
-      return;
-    auto *Var0 = llvm::dyn_cast<Variable>(Icmp->getSrc(0));
-    if (Var0 == nullptr)
-      return;
-    if (!VMetadata->isTracked(Var0))
-      return;
-    auto *Op0Def = VMetadata->getFirstDefinitionSingleBlock(Var0);
-    if (Op0Def == nullptr || !llvm::isa<InstLoad>(Op0Def))
-      return;
-    if (VMetadata->getLocalUseNode(Var0) != Context.getNode())
-      return;
-
-    auto *Var1 = llvm::dyn_cast<Variable>(Icmp->getSrc(1));
-    if (Var1 != nullptr && VMetadata->isTracked(Var1)) {
-      auto *Op1Def = VMetadata->getFirstDefinitionSingleBlock(Var1);
-      if (Op1Def != nullptr && !VMetadata->isMultiBlock(Var1) &&
-          llvm::isa<InstLoad>(Op1Def)) {
-        return; // Both are loads
-      }
-    }
-    Icmp->reverseConditionAndOperands();
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::doAddressOptLoad() {
-  Inst *Instr = iteratorToInst(Context.getCur());
-  Operand *Addr = Instr->getSrc(0);
-  Variable *Dest = Instr->getDest();
-  if (auto *OptAddr = computeAddressOpt(Instr, Dest->getType(), Addr)) {
-    Instr->setDeleted();
-    Context.insert<InstLoad>(Dest, OptAddr);
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::doAddressOptLoadSubVector() {
-  auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
-  Operand *Addr = Intrinsic->getArg(0);
-  Variable *Dest = Intrinsic->getDest();
-  if (auto *OptAddr = computeAddressOpt(Intrinsic, Dest->getType(), Addr)) {
-    Intrinsic->setDeleted();
-    const Ice::Intrinsics::IntrinsicInfo Info = {
-        Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F,
-        Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-    auto *NewLoad = Context.insert<InstIntrinsic>(2, Dest, Info);
-    NewLoad->addArg(OptAddr);
-    NewLoad->addArg(Intrinsic->getArg(1));
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerPhi(const InstPhi * /*Instr*/) {
-  Func->setError("Phi found in regular instruction list");
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerRet(const InstRet *Instr) {
-  Variable *Reg = nullptr;
-  if (Instr->hasRetValue()) {
-    Operand *RetValue = legalize(Instr->getRetValue());
-    const Type ReturnType = RetValue->getType();
-    assert(isVectorType(ReturnType) || isScalarFloatingType(ReturnType) ||
-           (ReturnType == IceType_i32) || (ReturnType == IceType_i64));
-    Reg = moveReturnValueToRegister(RetValue, ReturnType);
-  }
-  // Add a ret instruction even if sandboxing is enabled, because addEpilog
-  // explicitly looks for a ret instruction as a marker for where to insert the
-  // frame removal instructions.
-  _ret(Reg);
-  // Add a fake use of esp to make sure esp stays alive for the entire
-  // function. Otherwise post-call esp adjustments get dead-code eliminated.
-  keepEspLiveAtExit();
-}
-
-inline uint32_t makePshufdMask(SizeT Index0, SizeT Index1, SizeT Index2,
-                               SizeT Index3) {
-  const SizeT Mask = (Index0 & 0x3) | ((Index1 & 0x3) << 2) |
-                     ((Index2 & 0x3) << 4) | ((Index3 & 0x3) << 6);
-  assert(Mask < 256);
-  return Mask;
-}
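-
-// For instance, makePshufdMask(3, 2, 1, 0) packs the four 2-bit selectors
-// into 0b00011011 == 0x1B, the immediate that makes pshufd reverse the four
-// 32-bit lanes.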
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::lowerShuffleVector_AllFromSameSrc(
-    Operand *Src, SizeT Index0, SizeT Index1, SizeT Index2, SizeT Index3) {
-  constexpr SizeT SrcBit = 1 << 2;
-  assert((Index0 & SrcBit) == (Index1 & SrcBit));
-  assert((Index0 & SrcBit) == (Index2 & SrcBit));
-  assert((Index0 & SrcBit) == (Index3 & SrcBit));
-  (void)SrcBit;
-
-  const Type SrcTy = Src->getType();
-  auto *T = makeReg(SrcTy);
-  auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
-  auto *Mask =
-      Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
-  _pshufd(T, SrcRM, Mask);
-  return T;
-}
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::lowerShuffleVector_TwoFromSameSrc(
-    Operand *Src0, SizeT Index0, SizeT Index1, Operand *Src1, SizeT Index2,
-    SizeT Index3) {
-  constexpr SizeT SrcBit = 1 << 2;
-  assert((Index0 & SrcBit) == (Index1 & SrcBit) || (Index1 == IGNORE_INDEX));
-  assert((Index2 & SrcBit) == (Index3 & SrcBit) || (Index3 == IGNORE_INDEX));
-  (void)SrcBit;
-
-  const Type SrcTy = Src0->getType();
-  assert(Src1->getType() == SrcTy);
-  auto *T = makeReg(SrcTy);
-  auto *Src0R = legalizeToReg(Src0);
-  auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-  auto *Mask =
-      Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
-  _movp(T, Src0R);
-  _shufps(T, Src1RM, Mask);
-  return T;
-}
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::lowerShuffleVector_UnifyFromDifferentSrcs(
-    Operand *Src0, SizeT Index0, Operand *Src1, SizeT Index1) {
-  return lowerShuffleVector_TwoFromSameSrc(Src0, Index0, IGNORE_INDEX, Src1,
-                                           Index1, IGNORE_INDEX);
-}
-
-inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2,
-                               SizeT Index3) {
-  constexpr SizeT SrcBit = 1 << 2;
-  const SizeT Index0Bits = ((Index0 & SrcBit) == 0) ? 0 : (1 << 0);
-  const SizeT Index1Bits = ((Index1 & SrcBit) == 0) ? 0 : (1 << 1);
-  const SizeT Index2Bits = ((Index2 & SrcBit) == 0) ? 0 : (1 << 2);
-  const SizeT Index3Bits = ((Index3 & SrcBit) == 0) ? 0 : (1 << 3);
-  return Index0Bits | Index1Bits | Index2Bits | Index3Bits;
-}
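-
-// For instance, a shuffle selecting indexes (0, 1, 4, 5) from two v4
-// sources has SrcBit clear for the first two indexes and set for the last
-// two, so makeSrcSwitchMask returns 0b1100, dispatching to
-// CASE_SRCS_IN(0, 0, 1, 1) below.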
-
-template <typename TraitsType>
-GlobalString TargetX86Base<TraitsType>::lowerShuffleVector_NewMaskName() {
-  GlobalString FuncName = Func->getFunctionName();
-  const SizeT Id = PshufbMaskCount++;
-  if (!BuildDefs::dump() || !FuncName.hasStdString()) {
-    return GlobalString::createWithString(
-        Ctx,
-        "$PS" + std::to_string(FuncName.getID()) + "_" + std::to_string(Id));
-  }
-  return GlobalString::createWithString(
-      Ctx, "Pshufb$" + Func->getFunctionName() + "$" + std::to_string(Id));
-}
-
-template <typename TraitsType>
-ConstantRelocatable *
-TargetX86Base<TraitsType>::lowerShuffleVector_CreatePshufbMask(
-    int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
-    int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
-    int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
-    int8_t Idx15) {
-  static constexpr uint8_t NumElements = 16;
-  const char Initializer[NumElements] = {
-      Idx0, Idx1, Idx2,  Idx3,  Idx4,  Idx5,  Idx6,  Idx7,
-      Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15,
-  };
-
-  static constexpr Type V4VectorType = IceType_v4i32;
-  const uint32_t MaskAlignment = typeWidthInBytesOnStack(V4VectorType);
-  auto *Mask = VariableDeclaration::create(Func->getGlobalPool());
-  GlobalString MaskName = lowerShuffleVector_NewMaskName();
-  Mask->setIsConstant(true);
-  Mask->addInitializer(VariableDeclaration::DataInitializer::create(
-      Func->getGlobalPool(), Initializer, NumElements));
-  Mask->setName(MaskName);
-  // Mask needs to be 16-byte aligned, or pshufb will seg fault.
-  Mask->setAlignment(MaskAlignment);
-  Func->addGlobal(Mask);
-
-  constexpr RelocOffsetT Offset = 0;
-  return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName));
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerShuffleVector_UsingPshufb(
-    Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1,
-    int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6,
-    int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11,
-    int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) {
-  const Type DestTy = Dest->getType();
-  static constexpr bool NotRebased = false;
-  static constexpr Variable *NoBase = nullptr;
-  // We use void for the memory operand instead of DestTy because using the
-  // latter causes a validation failure: the X86 Inst layer complains that
-  // vector mem operands could be under-aligned. Thus, by using void we avoid
-  // the validation error. Note that the mask global declaration is aligned,
-  // so it can be used as an XMM mem operand.
-  static constexpr Type MaskType = IceType_void;
-#define IDX_IN_SRC(N, S)                                                       \
-  ((((N) & (1 << 4)) == (S << 4)) ? ((N)&0xf) : CLEAR_ALL_BITS)
-  auto *Mask0M = X86OperandMem::create(
-      Func, MaskType, NoBase,
-      lowerShuffleVector_CreatePshufbMask(
-          IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0),
-          IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0),
-          IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0),
-          IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0),
-          IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0),
-          IDX_IN_SRC(Idx15, 0)),
-      NotRebased);
-
-  auto *T0 = makeReg(DestTy);
-  auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-  _movp(T0, Src0RM);
-
-  _pshufb(T0, Mask0M);
-
-  if (Idx0 >= 16 || Idx1 >= 16 || Idx2 >= 16 || Idx3 >= 16 || Idx4 >= 16 ||
-      Idx5 >= 16 || Idx6 >= 16 || Idx7 >= 16 || Idx8 >= 16 || Idx9 >= 16 ||
-      Idx10 >= 16 || Idx11 >= 16 || Idx12 >= 16 || Idx13 >= 16 || Idx14 >= 16 ||
-      Idx15 >= 16) {
-    auto *Mask1M = X86OperandMem::create(
-        Func, MaskType, NoBase,
-        lowerShuffleVector_CreatePshufbMask(
-            IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1),
-            IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1),
-            IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1),
-            IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1),
-            IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1),
-            IDX_IN_SRC(Idx15, 1)),
-        NotRebased);
-#undef IDX_IN_SRC
-    auto *T1 = makeReg(DestTy);
-    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    _movp(T1, Src1RM);
-    _pshufb(T1, Mask1M);
-    _por(T0, T1);
-  }
-
-  _movp(Dest, T0);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerShuffleVector(
-    const InstShuffleVector *Instr) {
-  auto *Dest = Instr->getDest();
-  const Type DestTy = Dest->getType();
-  auto *Src0 = Instr->getSrc(0);
-  auto *Src1 = Instr->getSrc(1);
-  const SizeT NumElements = typeNumElements(DestTy);
-
-  auto *T = makeReg(DestTy);
-
-  switch (DestTy) {
-  default:
-    llvm::report_fatal_error("Unexpected vector type.");
-  case IceType_v16i1:
-  case IceType_v16i8: {
-    static constexpr SizeT ExpectedNumElements = 16;
-    assert(ExpectedNumElements == Instr->getNumIndexes());
-    (void)ExpectedNumElements;
-
-    if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckl(T, Src0RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
-                          23)) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckl(T, Src1RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (Instr->indexesAre(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
-                          15, 15)) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckh(T, Src0RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (Instr->indexesAre(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30,
-                          15, 31)) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckh(T, Src1RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (InstructionSet < Traits::SSE4_1) {
-      // TODO(jpp): figure out how to lower with sse2.
-      break;
-    }
-
-    const SizeT Index0 = Instr->getIndexValue(0);
-    const SizeT Index1 = Instr->getIndexValue(1);
-    const SizeT Index2 = Instr->getIndexValue(2);
-    const SizeT Index3 = Instr->getIndexValue(3);
-    const SizeT Index4 = Instr->getIndexValue(4);
-    const SizeT Index5 = Instr->getIndexValue(5);
-    const SizeT Index6 = Instr->getIndexValue(6);
-    const SizeT Index7 = Instr->getIndexValue(7);
-    const SizeT Index8 = Instr->getIndexValue(8);
-    const SizeT Index9 = Instr->getIndexValue(9);
-    const SizeT Index10 = Instr->getIndexValue(10);
-    const SizeT Index11 = Instr->getIndexValue(11);
-    const SizeT Index12 = Instr->getIndexValue(12);
-    const SizeT Index13 = Instr->getIndexValue(13);
-    const SizeT Index14 = Instr->getIndexValue(14);
-    const SizeT Index15 = Instr->getIndexValue(15);
-
-    lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
-                                   Index3, Index4, Index5, Index6, Index7,
-                                   Index8, Index9, Index10, Index11, Index12,
-                                   Index13, Index14, Index15);
-    return;
-  }
-  case IceType_v8i1:
-  case IceType_v8i16: {
-    static constexpr SizeT ExpectedNumElements = 8;
-    assert(ExpectedNumElements == Instr->getNumIndexes());
-    (void)ExpectedNumElements;
-
-    if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckl(T, Src0RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckl(T, Src1RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (Instr->indexesAre(4, 4, 5, 5, 6, 6, 7, 7)) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckh(T, Src0RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (Instr->indexesAre(4, 12, 5, 13, 6, 14, 7, 15)) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckh(T, Src1RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (InstructionSet < Traits::SSE4_1) {
-      // TODO(jpp): figure out how to lower with sse2.
-      break;
-    }
-
-    const SizeT Index0 = Instr->getIndexValue(0);
-    const SizeT Index1 = Instr->getIndexValue(1);
-    const SizeT Index2 = Instr->getIndexValue(2);
-    const SizeT Index3 = Instr->getIndexValue(3);
-    const SizeT Index4 = Instr->getIndexValue(4);
-    const SizeT Index5 = Instr->getIndexValue(5);
-    const SizeT Index6 = Instr->getIndexValue(6);
-    const SizeT Index7 = Instr->getIndexValue(7);
-
-#define TO_BYTE_INDEX(I) ((I) << 1)
-    lowerShuffleVector_UsingPshufb(
-        Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
-        TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2),
-        TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3),
-        TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4),
-        TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5),
-        TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6),
-        TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7),
-        TO_BYTE_INDEX(Index7) + 1);
-#undef TO_BYTE_INDEX
-    return;
-  }
-  case IceType_v4i1:
-  case IceType_v4i32:
-  case IceType_v4f32: {
-    static constexpr SizeT ExpectedNumElements = 4;
-    assert(ExpectedNumElements == Instr->getNumIndexes());
-    const SizeT Index0 = Instr->getIndexValue(0);
-    const SizeT Index1 = Instr->getIndexValue(1);
-    const SizeT Index2 = Instr->getIndexValue(2);
-    const SizeT Index3 = Instr->getIndexValue(3);
-    Variable *T = nullptr;
-    switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
-#define CASE_SRCS_IN(S0, S1, S2, S3)                                           \
-  case (((S0) << 0) | ((S1) << 1) | ((S2) << 2) | ((S3) << 3))
-      CASE_SRCS_IN(0, 0, 0, 0) : {
-        T = lowerShuffleVector_AllFromSameSrc(Src0, Index0, Index1, Index2,
-                                              Index3);
-      }
-      break;
-      CASE_SRCS_IN(0, 0, 0, 1) : {
-        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
-                                                                  Src1, Index3);
-        T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
-                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
-      }
-      break;
-      CASE_SRCS_IN(0, 0, 1, 0) : {
-        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
-                                                                  Src0, Index3);
-        T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
-                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
-      }
-      break;
-      CASE_SRCS_IN(0, 0, 1, 1) : {
-        T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Src1,
-                                              Index2, Index3);
-      }
-      break;
-      CASE_SRCS_IN(0, 1, 0, 0) : {
-        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
-                                                                  Src1, Index1);
-        T = lowerShuffleVector_TwoFromSameSrc(
-            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
-      }
-      break;
-      CASE_SRCS_IN(0, 1, 0, 1) : {
-        if (Index0 == 0 && (Index1 - ExpectedNumElements) == 0 && Index2 == 1 &&
-            (Index3 - ExpectedNumElements) == 1) {
-          auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-          auto *Src0R = legalizeToReg(Src0);
-          T = makeReg(DestTy);
-          _movp(T, Src0R);
-          _punpckl(T, Src1RM);
-        } else if (Index0 == Index2 && Index1 == Index3) {
-          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src0, Index0, Src1, Index1);
-          T = lowerShuffleVector_AllFromSameSrc(
-              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
-              UNIFIED_INDEX_1);
-        } else {
-          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src0, Index0, Src1, Index1);
-          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src0, Index2, Src1, Index3);
-          T = lowerShuffleVector_TwoFromSameSrc(
-              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
-              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
-        }
-      }
-      break;
-      CASE_SRCS_IN(0, 1, 1, 0) : {
-        if (Index0 == Index3 && Index1 == Index2) {
-          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src0, Index0, Src1, Index1);
-          T = lowerShuffleVector_AllFromSameSrc(
-              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
-              UNIFIED_INDEX_0);
-        } else {
-          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src0, Index0, Src1, Index1);
-          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src1, Index2, Src0, Index3);
-          T = lowerShuffleVector_TwoFromSameSrc(
-              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
-              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
-        }
-      }
-      break;
-      CASE_SRCS_IN(0, 1, 1, 1) : {
-        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
-                                                                  Src1, Index1);
-        T = lowerShuffleVector_TwoFromSameSrc(
-            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
-      }
-      break;
-      CASE_SRCS_IN(1, 0, 0, 0) : {
-        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
-                                                                  Src0, Index1);
-        T = lowerShuffleVector_TwoFromSameSrc(
-            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
-      }
-      break;
-      CASE_SRCS_IN(1, 0, 0, 1) : {
-        if (Index0 == Index3 && Index1 == Index2) {
-          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src1, Index0, Src0, Index1);
-          T = lowerShuffleVector_AllFromSameSrc(
-              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
-              UNIFIED_INDEX_0);
-        } else {
-          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src1, Index0, Src0, Index1);
-          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src0, Index2, Src1, Index3);
-          T = lowerShuffleVector_TwoFromSameSrc(
-              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
-              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
-        }
-      }
-      break;
-      CASE_SRCS_IN(1, 0, 1, 0) : {
-        if ((Index0 - ExpectedNumElements) == 0 && Index1 == 0 &&
-            (Index2 - ExpectedNumElements) == 1 && Index3 == 1) {
-          // Same punpckl pattern as CASE_SRCS_IN(0, 1, 0, 1), but with the
-          // two sources swapped.
-          auto *Src1RM = legalize(Src0, Legal_Reg | Legal_Mem);
-          auto *Src0R = legalizeToReg(Src1);
-          T = makeReg(DestTy);
-          _movp(T, Src0R);
-          _punpckl(T, Src1RM);
-        } else if (Index0 == Index2 && Index1 == Index3) {
-          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src1, Index0, Src0, Index1);
-          T = lowerShuffleVector_AllFromSameSrc(
-              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
-              UNIFIED_INDEX_1);
-        } else {
-          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src1, Index0, Src0, Index1);
-          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
-              Src1, Index2, Src0, Index3);
-          T = lowerShuffleVector_TwoFromSameSrc(
-              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
-              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
-        }
-      }
-      break;
-      CASE_SRCS_IN(1, 0, 1, 1) : {
-        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
-                                                                  Src0, Index1);
-        T = lowerShuffleVector_TwoFromSameSrc(
-            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
-      }
-      break;
-      CASE_SRCS_IN(1, 1, 0, 0) : {
-        T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Src0,
-                                              Index2, Index3);
-      }
-      break;
-      CASE_SRCS_IN(1, 1, 0, 1) : {
-        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
-                                                                  Src1, Index3);
-        T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
-                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
-      }
-      break;
-      CASE_SRCS_IN(1, 1, 1, 0) : {
-        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
-                                                                  Src0, Index3);
-        T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
-                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
-      }
-      break;
-      CASE_SRCS_IN(1, 1, 1, 1) : {
-        T = lowerShuffleVector_AllFromSameSrc(Src1, Index0, Index1, Index2,
-                                              Index3);
-      }
-      break;
-#undef CASE_SRCS_IN
-    }
-
-    assert(T != nullptr);
-    assert(T->getType() == DestTy);
-    _movp(Dest, T);
-    return;
-  } break;
-  }
-
-  // Unoptimized shuffle. Perform a series of inserts and extracts.
-  Context.insert<InstFakeDef>(T);
-  const Type ElementType = typeElementType(DestTy);
-  for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
-    auto *Index = Instr->getIndex(I);
-    const SizeT Elem = Index->getValue();
-    auto *ExtElmt = makeReg(ElementType);
-    if (Elem < NumElements) {
-      lowerExtractElement(
-          InstExtractElement::create(Func, ExtElmt, Src0, Index));
-    } else {
-      lowerExtractElement(InstExtractElement::create(
-          Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements)));
-    }
-    auto *NewT = makeReg(DestTy);
-    lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
-                                                 Ctx->getConstantInt32(I)));
-    T = NewT;
-  }
-  _movp(Dest, T);
-}
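Aside: a minimal standalone sketch (plain C++, not Subzero code) of the lane-selection rule this insert/extract fallback implements: indices below the element count pick from Src0, the rest pick from Src1 after rebasing.

    #include <array>
    #include <cstdio>

    int main() {
      const std::array<int, 4> Src0 = {10, 11, 12, 13};
      const std::array<int, 4> Src1 = {20, 21, 22, 23};
      const std::array<unsigned, 4> Index = {0, 5, 2, 7}; // >= 4 selects Src1
      std::array<int, 4> Dest;
      for (unsigned I = 0; I < 4; ++I)
        Dest[I] = Index[I] < 4 ? Src0[Index[I]] : Src1[Index[I] - 4];
      for (int V : Dest)
        std::printf("%d ", V); // 10 21 12 23
      std::printf("\n");
      return 0;
    }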
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerSelect(const InstSelect *Select) {
-  Variable *Dest = Select->getDest();
-
-  Operand *Condition = Select->getCondition();
-  // Handle folding opportunities.
-  if (const Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
-    assert(Producer->isDeleted());
-    switch (BoolFolding<Traits>::getProducerKind(Producer)) {
-    default:
-      break;
-    case BoolFolding<Traits>::PK_Icmp32:
-    case BoolFolding<Traits>::PK_Icmp64: {
-      lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Select);
-      return;
-    }
-    case BoolFolding<Traits>::PK_Fcmp: {
-      lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Select);
-      return;
-    }
-    }
-  }
-
-  if (isVectorType(Dest->getType())) {
-    lowerSelectVector(Select);
-    return;
-  }
-
-  Operand *CmpResult = legalize(Condition, Legal_Reg | Legal_Mem);
-  Operand *Zero = Ctx->getConstantZero(IceType_i32);
-  _cmp(CmpResult, Zero);
-  Operand *SrcT = Select->getTrueOperand();
-  Operand *SrcF = Select->getFalseOperand();
-  const BrCond Cond = CondX86::Br_ne;
-  lowerSelectMove(Dest, Cond, SrcT, SrcF);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerSelectMove(Variable *Dest, BrCond Cond,
-                                                Operand *SrcT, Operand *SrcF) {
-  Type DestTy = Dest->getType();
-  if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
-    // The cmov instruction doesn't allow 8-bit or FP operands, so we need
-    // explicit control flow.
-    // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
-    auto *Label = InstX86Label::create(Func, this);
-    SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
-    _mov(Dest, SrcT);
-    _br(Cond, Label);
-    SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
-    _redefined(_mov(Dest, SrcF));
-    Context.insert(Label);
-    return;
-  }
-  // mov t, SrcF; cmov_cond t, SrcT; mov dest, t
-  // But if SrcT is immediate, we might be able to do better, as the cmov
-  // instruction doesn't allow an immediate operand:
-  // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
-  if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
-    std::swap(SrcT, SrcF);
-    Cond = InstImpl<TraitsType>::InstX86Base::getOppositeCondition(Cond);
-  }
-  if (!Traits::Is64Bit && DestTy == IceType_i64) {
-    SrcT = legalizeUndef(SrcT);
-    SrcF = legalizeUndef(SrcF);
-    // Set the low portion.
-    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-    lowerSelectIntMove(DestLo, Cond, loOperand(SrcT), loOperand(SrcF));
-    // Set the high portion.
-    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-    lowerSelectIntMove(DestHi, Cond, hiOperand(SrcT), hiOperand(SrcF));
-    return;
-  }
-
-  assert(DestTy == IceType_i16 || DestTy == IceType_i32 ||
-         (Traits::Is64Bit && DestTy == IceType_i64));
-  lowerSelectIntMove(Dest, Cond, SrcT, SrcF);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerSelectIntMove(Variable *Dest, BrCond Cond,
-                                                   Operand *SrcT,
-                                                   Operand *SrcF) {
-  Variable *T = nullptr;
-  SrcF = legalize(SrcF);
-  _mov(T, SrcF);
-  SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
-  _cmov(T, SrcT, Cond);
-  _mov(Dest, T);
-}
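Aside: a minimal standalone sketch (plain C++, not Subzero code) of the branchless select that the mov/cmov/mov sequence implements, plus the operand swap used above when the true operand is an immediate, since cmov cannot encode one.

    #include <cassert>

    int selectViaCmov(bool Cond, int SrcT, int SrcF) {
      int T = SrcF; // mov t, SrcF
      if (Cond)     // cmov_cond t, SrcT
        T = SrcT;
      return T;     // mov dest, t
    }

    int main() {
      // Swapping the operands and inverting the condition is equivalent.
      assert(selectViaCmov(true, 1, 2) == selectViaCmov(false, 2, 1));
      return 0;
    }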
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerMove(Variable *Dest, Operand *Src,
-                                          bool IsRedefinition) {
-  assert(Dest->getType() == Src->getType());
-  assert(!Dest->isRematerializable());
-  if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
-    Src = legalize(Src);
-    Operand *SrcLo = loOperand(Src);
-    Operand *SrcHi = hiOperand(Src);
-    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
-    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-    Variable *T_Lo = nullptr, *T_Hi = nullptr;
-    _mov(T_Lo, SrcLo);
-    _redefined(_mov(DestLo, T_Lo), IsRedefinition);
-    _mov(T_Hi, SrcHi);
-    _redefined(_mov(DestHi, T_Hi), IsRedefinition);
-  } else {
-    Operand *SrcLegal;
-    if (Dest->hasReg()) {
-      // If Dest already has a physical register, then only basic legalization
-      // is needed, as the source operand can be a register, immediate, or
-      // memory.
-      SrcLegal = legalize(Src, Legal_Reg, Dest->getRegNum());
-    } else {
-      // If Dest could be a stack operand, then the source must be a physical
-      // register or a scalar integer immediate.
-      SrcLegal = legalize(Src, Legal_Reg | Legal_Imm);
-    }
-    if (isVectorType(Dest->getType())) {
-      _redefined(_movp(Dest, SrcLegal), IsRedefinition);
-    } else {
-      _redefined(_mov(Dest, SrcLegal), IsRedefinition);
-    }
-  }
-}
-
-template <typename TraitsType>
-bool TargetX86Base<TraitsType>::lowerOptimizeFcmpSelect(
-    const InstFcmp *Fcmp, const InstSelect *Select) {
-  Operand *CmpSrc0 = Fcmp->getSrc(0);
-  Operand *CmpSrc1 = Fcmp->getSrc(1);
-  Operand *SelectSrcT = Select->getTrueOperand();
-  Operand *SelectSrcF = Select->getFalseOperand();
-  Variable *SelectDest = Select->getDest();
-
-  // TODO(capn): also handle swapped compare/select operand order.
-  if (CmpSrc0 != SelectSrcT || CmpSrc1 != SelectSrcF)
-    return false;
-
-  // TODO(sehr, stichnot): fcmp/select patterns (e.g., minsd/maxss) go here.
-  InstFcmp::FCond Condition = Fcmp->getCondition();
-  switch (Condition) {
-  default:
-    return false;
-  case InstFcmp::True:
-    break;
-  case InstFcmp::False:
-    break;
-  case InstFcmp::Ogt: {
-    Variable *T = makeReg(SelectDest->getType());
-    if (isScalarFloatingType(SelectSrcT->getType())) {
-      _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
-      _maxss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
-      _mov(SelectDest, T);
-    } else {
-      _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
-      _maxps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
-      _movp(SelectDest, T);
-    }
-    return true;
-  } break;
-  case InstFcmp::Olt: {
-    Variable *T = makeReg(SelectSrcT->getType());
-    if (isScalarFloatingType(SelectSrcT->getType())) {
-      _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
-      _minss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
-      _mov(SelectDest, T);
-    } else {
-      _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
-      _minps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
-      _movp(SelectDest, T);
-    }
-    return true;
-  } break;
-  }
-  return false;
-}
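Aside: a minimal standalone sketch (plain C++, not Subzero code) of the scalar pattern this optimization matches. select(fcmp ogt a, b; a, b) agrees with SSE maxss even for NaN inputs: ogt is false on NaN, so the false (second) operand is chosen, and maxss likewise returns its source operand when either input is NaN.

    #include <cmath>
    #include <cstdio>

    float selectOgt(float A, float B) {
      return (A > B) ? A : B; // foldable to: movss t, a; maxss t, b
    }

    int main() {
      std::printf("%g\n", selectOgt(1.0f, 2.0f));          // 2
      std::printf("%g\n", selectOgt(std::nanf(""), 2.0f)); // 2: NaN > x is false
      return 0;
    }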
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerIcmp(const InstIcmp *Icmp) {
-  Variable *Dest = Icmp->getDest();
-  if (isVectorType(Dest->getType())) {
-    lowerIcmpVector(Icmp);
-  } else {
-    constexpr Inst *Consumer = nullptr;
-    lowerIcmpAndConsumer(Icmp, Consumer);
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerSelectVector(const InstSelect *Instr) {
-  Variable *Dest = Instr->getDest();
-  Type DestTy = Dest->getType();
-  Operand *SrcT = Instr->getTrueOperand();
-  Operand *SrcF = Instr->getFalseOperand();
-  Operand *Condition = Instr->getCondition();
-
-  if (!isVectorType(DestTy))
-    llvm::report_fatal_error("Expected a vector select");
-
-  Type SrcTy = SrcT->getType();
-  Variable *T = makeReg(SrcTy);
-  Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
-  Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
-
-  if (InstructionSet >= Traits::SSE4_1) {
-    // TODO(wala): If the condition operand is a constant, use blendps or
-    // pblendw.
-    //
-    // Use blendvps or pblendvb to implement select.
-    if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
-        SrcTy == IceType_v4f32) {
-      Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
-      Variable *xmm0 = makeReg(IceType_v4i32, Traits::RegisterSet::Reg_xmm0);
-      _movp(xmm0, ConditionRM);
-      _psll(xmm0, Ctx->getConstantInt8(31));
-      _movp(T, SrcFRM);
-      _blendvps(T, SrcTRM, xmm0);
-      _movp(Dest, T);
-    } else {
-      assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
-      Type SignExtTy =
-          Condition->getType() == IceType_v8i1 ? IceType_v8i16 : IceType_v16i8;
-      Variable *xmm0 = makeReg(SignExtTy, Traits::RegisterSet::Reg_xmm0);
-      lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
-      _movp(T, SrcFRM);
-      _pblendvb(T, SrcTRM, xmm0);
-      _movp(Dest, T);
-    }
-    return;
-  }
-  // Lower select without Traits::SSE4.1:
-  // a=d?b:c ==>
-  //   if elementtype(d) != i1:
-  //      d=sext(d);
-  //   a=(b&d)|(c&~d);
-  Variable *T2 = makeReg(SrcTy);
-  // Sign extend the condition operand if applicable.
-  if (SrcTy == IceType_v4f32) {
-    // The sext operation takes only integer arguments.
-    Variable *T3 = Func->makeVariable(IceType_v4i32);
-    lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
-    _movp(T, T3);
-  } else if (typeElementType(SrcTy) != IceType_i1) {
-    lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
-  } else {
-    Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
-    _movp(T, ConditionRM);
-  }
-  _movp(T2, T);
-  _pand(T, SrcTRM);
-  _pandn(T2, SrcFRM);
-  _por(T, T2);
-  _movp(Dest, T);
-
-  return;
-}
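Aside: a minimal standalone sketch (plain C++, not Subzero code) of the pre-SSE4.1 identity a = (b & d) | (c & ~d), where each lane of the sign-extended condition d is all-ones (select b) or all-zeros (select c).

    #include <cstdint>
    #include <cstdio>

    uint32_t bitSelect(uint32_t Mask, uint32_t T, uint32_t F) {
      return (T & Mask) | (F & ~Mask); // pand, pandn, por on the real vectors
    }

    int main() {
      std::printf("%u %u\n", bitSelect(0xFFFFFFFFu, 7, 9),
                  bitSelect(0x00000000u, 7, 9)); // 7 9
      return 0;
    }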
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerStore(const InstStore *Instr) {
-  Operand *Value = Instr->getData();
-  Operand *Addr = Instr->getStoreAddress();
-  X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
-  doMockBoundsCheck(NewAddr);
-  Type Ty = NewAddr->getType();
-
-  if (!Traits::Is64Bit && Ty == IceType_i64) {
-    Value = legalizeUndef(Value);
-    Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
-    _store(ValueHi, llvm::cast<X86OperandMem>(hiOperand(NewAddr)));
-    Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
-    _store(ValueLo, llvm::cast<X86OperandMem>(loOperand(NewAddr)));
-  } else if (isVectorType(Ty)) {
-    _storep(legalizeToReg(Value), NewAddr);
-  } else {
-    Value = legalize(Value, Legal_Reg | Legal_Imm);
-    _store(Value, NewAddr);
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::doAddressOptStore() {
-  auto *Instr = llvm::cast<InstStore>(Context.getCur());
-  Operand *Addr = Instr->getStoreAddress();
-  Operand *Data = Instr->getData();
-  if (auto *OptAddr = computeAddressOpt(Instr, Data->getType(), Addr)) {
-    Instr->setDeleted();
-    auto *NewStore = Context.insert<InstStore>(Data, OptAddr);
-    if (Instr->getDest())
-      NewStore->setRmwBeacon(Instr->getRmwBeacon());
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::doAddressOptStoreSubVector() {
-  auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
-  Operand *Addr = Intrinsic->getArg(1);
-  Operand *Data = Intrinsic->getArg(0);
-  if (auto *OptAddr = computeAddressOpt(Intrinsic, Data->getType(), Addr)) {
-    Intrinsic->setDeleted();
-    const Ice::Intrinsics::IntrinsicInfo Info = {
-        Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T,
-        Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
-    auto *NewStore = Context.insert<InstIntrinsic>(3, nullptr, Info);
-    NewStore->addArg(Data);
-    NewStore->addArg(OptAddr);
-    NewStore->addArg(Intrinsic->getArg(2));
-  }
-}
-
-template <typename TraitsType>
-Operand *TargetX86Base<TraitsType>::lowerCmpRange(Operand *Comparison,
-                                                  uint64_t Min, uint64_t Max) {
-  // TODO(ascull): 64-bit operands should not reach here, but only because
-  // handling them is not implemented yet. This should be able to handle the
-  // 64-bit case.
-  assert(Traits::Is64Bit || Comparison->getType() != IceType_i64);
-  // Subtracting 0 is a nop, so don't do it.
-  if (Min != 0) {
-    // Avoid clobbering the comparison by copying it.
-    Variable *T = nullptr;
-    _mov(T, Comparison);
-    _sub(T, Ctx->getConstantInt32(Min));
-    Comparison = T;
-  }
-
-  _cmp(Comparison, Ctx->getConstantInt32(Max - Min));
-
-  return Comparison;
-}
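Aside: a minimal standalone sketch (plain C++, not Subzero code) of the single unsigned comparison lowerCmpRange sets up. After subtracting Min, any value below Min wraps around to a large unsigned number, so one cmp against (Max - Min) tests Min <= X <= Max.

    #include <cstdint>
    #include <cstdio>

    bool inRange(uint32_t X, uint32_t Min, uint32_t Max) {
      return (X - Min) <= (Max - Min); // sub t, Min; cmp t, Max - Min; jbe
    }

    int main() {
      std::printf("%d %d %d\n", inRange(5, 3, 9), inRange(2, 3, 9),
                  inRange(10, 3, 9)); // 1 0 0
      return 0;
    }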
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerCaseCluster(const CaseCluster &Case,
-                                                 Operand *Comparison,
-                                                 bool DoneCmp,
-                                                 CfgNode *DefaultTarget) {
-  switch (Case.getKind()) {
-  case CaseCluster::JumpTable: {
-    InstX86Label *SkipJumpTable;
-
-    Operand *RangeIndex =
-        lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
-    if (DefaultTarget == nullptr) {
-      // Skip over jump table logic if comparison not in range and no default
-      SkipJumpTable = InstX86Label::create(Func, this);
-      _br(CondX86::Br_a, SkipJumpTable);
-    } else {
-      _br(CondX86::Br_a, DefaultTarget);
-    }
-
-    InstJumpTable *JumpTable = Case.getJumpTable();
-    Context.insert(JumpTable);
-
-    // Make sure the index is a register of the same width as the base
-    Variable *Index;
-    const Type PointerType = getPointerType();
-    if (RangeIndex->getType() != PointerType) {
-      Index = makeReg(PointerType);
-      if (RangeIndex->getType() == IceType_i64) {
-        assert(Traits::Is64Bit);
-        _mov(Index, RangeIndex); // trunc
-      } else {
-        Operand *RangeIndexRM = legalize(RangeIndex, Legal_Reg | Legal_Mem);
-        _movzx(Index, RangeIndexRM);
-      }
-    } else {
-      Index = legalizeToReg(RangeIndex);
-    }
-
-    constexpr RelocOffsetT RelocOffset = 0;
-    constexpr Variable *NoBase = nullptr;
-    constexpr Constant *NoOffset = nullptr;
-    auto JTName = GlobalString::createWithString(Ctx, JumpTable->getName());
-    Constant *Offset = Ctx->getConstantSym(RelocOffset, JTName);
-    uint16_t Shift = typeWidthInBytesLog2(PointerType);
-    constexpr auto Segment = X86OperandMem::SegmentRegisters::DefaultSegment;
-
-    Variable *Target = nullptr;
-    if (PointerType == IceType_i32) {
-      _mov(Target, X86OperandMem::create(Func, PointerType, NoBase, Offset,
-                                         Index, Shift, Segment));
-    } else {
-      auto *Base = makeReg(IceType_i64);
-      _lea(Base, X86OperandMem::create(Func, IceType_void, NoBase, Offset));
-      _mov(Target, X86OperandMem::create(Func, PointerType, Base, NoOffset,
-                                         Index, Shift, Segment));
-    }
-
-    lowerIndirectJump(Target);
-
-    if (DefaultTarget == nullptr)
-      Context.insert(SkipJumpTable);
-    return;
-  }
-  case CaseCluster::Range: {
-    if (Case.isUnitRange()) {
-      // Single item
-      if (!DoneCmp) {
-        Constant *Value = Ctx->getConstantInt32(Case.getLow());
-        _cmp(Comparison, Value);
-      }
-      _br(CondX86::Br_e, Case.getTarget());
-    } else if (DoneCmp && Case.isPairRange()) {
-      // Range of two items with the first item already compared against
-      _br(CondX86::Br_e, Case.getTarget());
-      Constant *Value = Ctx->getConstantInt32(Case.getHigh());
-      _cmp(Comparison, Value);
-      _br(CondX86::Br_e, Case.getTarget());
-    } else {
-      // Range
-      lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
-      _br(CondX86::Br_be, Case.getTarget());
-    }
-    if (DefaultTarget != nullptr)
-      _br(DefaultTarget);
-    return;
-  }
-  }
-}
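Aside: a minimal standalone sketch (plain C++, not Subzero code) of the scaled-index indirect jump behind the JumpTable case: an out-of-line table of targets addressed as base + index << log2(pointer size), with the index already range-checked.

    #include <cstdio>

    static int caseA() { return 1; }
    static int caseB() { return 2; }

    int main() {
      using Handler = int (*)();
      static const Handler Table[] = {caseA, caseB}; // the emitted jump table
      unsigned Index = 1; // already checked against the table bounds
      std::printf("%d\n", Table[Index]()); // indirect call stands in for the jump
      return 0;
    }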
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerSwitch(const InstSwitch *Instr) {
-  // Group cases together and navigate through them with a binary search
-  CaseClusterArray CaseClusters = CaseCluster::clusterizeSwitch(Func, Instr);
-  Operand *Src0 = Instr->getComparison();
-  CfgNode *DefaultTarget = Instr->getLabelDefault();
-
-  assert(CaseClusters.size() != 0); // Should always be at least one
-
-  if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
-    Src0 = legalize(Src0); // get Base/Index into physical registers
-    Operand *Src0Lo = loOperand(Src0);
-    Operand *Src0Hi = hiOperand(Src0);
-    if (CaseClusters.back().getHigh() > UINT32_MAX) {
-      // TODO(ascull): handle 64-bit case properly (currently naive version)
-      // This might be handled by a higher level lowering of switches.
-      SizeT NumCases = Instr->getNumCases();
-      if (NumCases >= 2) {
-        Src0Lo = legalizeToReg(Src0Lo);
-        Src0Hi = legalizeToReg(Src0Hi);
-      } else {
-        Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
-        Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
-      }
-      for (SizeT I = 0; I < NumCases; ++I) {
-        Constant *ValueLo = Ctx->getConstantInt32(Instr->getValue(I));
-        Constant *ValueHi = Ctx->getConstantInt32(Instr->getValue(I) >> 32);
-        InstX86Label *Label = InstX86Label::create(Func, this);
-        _cmp(Src0Lo, ValueLo);
-        _br(CondX86::Br_ne, Label);
-        _cmp(Src0Hi, ValueHi);
-        _br(CondX86::Br_e, Instr->getLabel(I));
-        Context.insert(Label);
-      }
-      _br(Instr->getLabelDefault());
-      return;
-    } else {
-      // All the values are 32-bit so just check the operand is too and then
-      // fall through to the 32-bit implementation. This is a common case.
-      Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
-      Constant *Zero = Ctx->getConstantInt32(0);
-      _cmp(Src0Hi, Zero);
-      _br(CondX86::Br_ne, DefaultTarget);
-      Src0 = Src0Lo;
-    }
-  }
-
-  // 32-bit lowering
-
-  if (CaseClusters.size() == 1) {
-    // Jump straight to default if needed. Currently a common case as jump
-    // tables occur on their own.
-    constexpr bool DoneCmp = false;
-    lowerCaseCluster(CaseClusters.front(), Src0, DoneCmp, DefaultTarget);
-    return;
-  }
-
-  // Going to be using multiple times so get it in a register early
-  Variable *Comparison = legalizeToReg(Src0);
-
-  // A span describes a contiguous range of case clusters still to be lowered
-  struct SearchSpan {
-    SearchSpan(SizeT Begin, SizeT Size, InstX86Label *Label)
-        : Begin(Begin), Size(Size), Label(Label) {}
-
-    SizeT Begin;
-    SizeT Size;
-    InstX86Label *Label;
-  };
-  // The stack will only grow to the height of the tree so 12 should be plenty
-  std::stack<SearchSpan, llvm::SmallVector<SearchSpan, 12>> SearchSpanStack;
-  SearchSpanStack.emplace(0, CaseClusters.size(), nullptr);
-  bool DoneCmp = false;
-
-  while (!SearchSpanStack.empty()) {
-    SearchSpan Span = SearchSpanStack.top();
-    SearchSpanStack.pop();
-
-    if (Span.Label != nullptr)
-      Context.insert(Span.Label);
-
-    switch (Span.Size) {
-    case 0:
-      llvm::report_fatal_error("Invalid SearchSpan size");
-      break;
-
-    case 1:
-      lowerCaseCluster(CaseClusters[Span.Begin], Comparison, DoneCmp,
-                       SearchSpanStack.empty() ? nullptr : DefaultTarget);
-      DoneCmp = false;
-      break;
-
-    case 2: {
-      const CaseCluster *CaseA = &CaseClusters[Span.Begin];
-      const CaseCluster *CaseB = &CaseClusters[Span.Begin + 1];
-
-      // Placing a range last may allow register clobbering during the range
-      // test. That means there is no need to clone the register. If it is a
-      // unit range the comparison may have already been done in the binary
-      // search (DoneCmp) and so it should be placed first. If this is a range
-      // of two items and the comparison with the low value has already been
-      // done, comparing with the other element is cheaper than a range test.
-      // If the low end of the range is zero then there is no subtraction and
-      // nothing to be gained.
-      if (!CaseA->isUnitRange() &&
-          !(CaseA->getLow() == 0 || (DoneCmp && CaseA->isPairRange()))) {
-        std::swap(CaseA, CaseB);
-        DoneCmp = false;
-      }
-
-      lowerCaseCluster(*CaseA, Comparison, DoneCmp);
-      DoneCmp = false;
-      lowerCaseCluster(*CaseB, Comparison, DoneCmp,
-                       SearchSpanStack.empty() ? nullptr : DefaultTarget);
-    } break;
-
-    default:
-      // Pick the middle item and branch b or ae
-      SizeT PivotIndex = Span.Begin + (Span.Size / 2);
-      const CaseCluster &Pivot = CaseClusters[PivotIndex];
-      Constant *Value = Ctx->getConstantInt32(Pivot.getLow());
-      InstX86Label *Label = InstX86Label::create(Func, this);
-      _cmp(Comparison, Value);
-      // TODO(ascull): does it always have to be far?
-      _br(CondX86::Br_b, Label, InstX86Br::Far);
-      // Lower the left and (pivot+right) sides, falling through to the right
-      SearchSpanStack.emplace(Span.Begin, Span.Size / 2, Label);
-      SearchSpanStack.emplace(PivotIndex, Span.Size - (Span.Size / 2), nullptr);
-      DoneCmp = true;
-      break;
-    }
-  }
-
-  _br(DefaultTarget);
-}
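Aside: a minimal standalone sketch (plain C++, not Subzero code) of the explicit-stack binary search shape used above: each span [Begin, Begin + Size) splits at a pivot, the left half becomes a branch target, and the pivot-plus-right half is reached by falling through (it is pushed last, so it is popped first).

    #include <cstdio>
    #include <stack>
    #include <utility>

    int main() {
      std::stack<std::pair<unsigned, unsigned>> Spans; // (Begin, Size)
      Spans.emplace(0u, 7u);
      while (!Spans.empty()) {
        const auto [Begin, Size] = Spans.top();
        Spans.pop();
        if (Size <= 2) { // small spans lower their clusters directly
          std::printf("lower clusters [%u, %u)\n", Begin, Begin + Size);
          continue;
        }
        const unsigned Half = Size / 2;
        Spans.emplace(Begin, Half);               // left side, branch target
        Spans.emplace(Begin + Half, Size - Half); // pivot + right, fall through
      }
      return 0;
    }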
-
-/// The following pattern occurs often in lowered C and C++ code:
-///
-///   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
-///   %cmp.ext = sext <n x i1> %cmp to <n x ty>
-///
-/// We can eliminate the sext operation by copying the result of pcmpeqd,
-/// pcmpgtd, or cmpps (which produce sign extended results) to the result of the
-/// sext operation.
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::eliminateNextVectorSextInstruction(
-    Variable *SignExtendedResult) {
-  if (auto *NextCast =
-          llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
-    if (NextCast->getCastKind() == InstCast::Sext &&
-        NextCast->getSrc(0) == SignExtendedResult) {
-      NextCast->setDeleted();
-      _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult));
-      // Skip over the instruction.
-      Context.advanceNext();
-    }
-  }
-}
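Aside: a minimal standalone sketch (plain C++, not Subzero code) of why the sext is redundant. A vector compare such as pcmpgtd already writes 0 or all-ones into each lane, which is exactly the sign extension of the i1 compare result.

    #include <cstdint>
    #include <cstdio>

    int32_t laneGreaterThan(int32_t A, int32_t B) {
      return (A > B) ? -1 : 0; // one pcmpgtd lane: 0xFFFFFFFF or 0x00000000
    }

    int main() {
      const int32_t Cmp = laneGreaterThan(4, 3);
      const int32_t Sext = Cmp; // sext of the i1 result has the same bits
      std::printf("%08X %08X\n", (unsigned)Cmp, (unsigned)Sext);
      return 0;
    }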
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerUnreachable(
-    const InstUnreachable * /*Instr*/) {
-  _ud2();
-  // Add a fake use of esp to make sure esp adjustments after the unreachable
-  // do not get dead-code eliminated.
-  keepEspLiveAtExit();
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerBreakpoint(
-    const InstBreakpoint * /*Instr*/) {
-  _int3();
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerRMW(const InstX86FakeRMW *RMW) {
-  // If the beacon variable's live range does not end in this instruction, then
-  // it must end in the modified Store instruction that follows. This means
-  // that the original Store instruction is still there, either because the
-  // value being stored is used beyond the Store instruction, or because dead
-  // code elimination did not happen. In either case, we cancel RMW lowering
-  // (and the caller deletes the RMW instruction).
-  if (!RMW->isLastUse(RMW->getBeacon()))
-    return;
-  Operand *Src = RMW->getData();
-  Type Ty = Src->getType();
-  X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
-  doMockBoundsCheck(Addr);
-  if (!Traits::Is64Bit && Ty == IceType_i64) {
-    Src = legalizeUndef(Src);
-    Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm);
-    Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm);
-    auto *AddrLo = llvm::cast<X86OperandMem>(loOperand(Addr));
-    auto *AddrHi = llvm::cast<X86OperandMem>(hiOperand(Addr));
-    switch (RMW->getOp()) {
-    default:
-      // TODO(stichnot): Implement other arithmetic operators.
-      break;
-    case InstArithmetic::Add:
-      _add_rmw(AddrLo, SrcLo);
-      _adc_rmw(AddrHi, SrcHi);
-      return;
-    case InstArithmetic::Sub:
-      _sub_rmw(AddrLo, SrcLo);
-      _sbb_rmw(AddrHi, SrcHi);
-      return;
-    case InstArithmetic::And:
-      _and_rmw(AddrLo, SrcLo);
-      _and_rmw(AddrHi, SrcHi);
-      return;
-    case InstArithmetic::Or:
-      _or_rmw(AddrLo, SrcLo);
-      _or_rmw(AddrHi, SrcHi);
-      return;
-    case InstArithmetic::Xor:
-      _xor_rmw(AddrLo, SrcLo);
-      _xor_rmw(AddrHi, SrcHi);
-      return;
-    }
-  } else {
-    // x86-32: i8, i16, i32
-    // x86-64: i8, i16, i32, i64
-    switch (RMW->getOp()) {
-    default:
-      // TODO(stichnot): Implement other arithmetic operators.
-      break;
-    case InstArithmetic::Add:
-      Src = legalize(Src, Legal_Reg | Legal_Imm);
-      _add_rmw(Addr, Src);
-      return;
-    case InstArithmetic::Sub:
-      Src = legalize(Src, Legal_Reg | Legal_Imm);
-      _sub_rmw(Addr, Src);
-      return;
-    case InstArithmetic::And:
-      Src = legalize(Src, Legal_Reg | Legal_Imm);
-      _and_rmw(Addr, Src);
-      return;
-    case InstArithmetic::Or:
-      Src = legalize(Src, Legal_Reg | Legal_Imm);
-      _or_rmw(Addr, Src);
-      return;
-    case InstArithmetic::Xor:
-      Src = legalize(Src, Legal_Reg | Legal_Imm);
-      _xor_rmw(Addr, Src);
-      return;
-    }
-  }
-  llvm::report_fatal_error("Couldn't lower RMW instruction");
-}
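Aside: a minimal standalone sketch (plain C++, not Subzero code) of the 32-bit split emitted for an i64 add-RMW, with adc carrying overflow from the low half into the high half.

    #include <cstdint>
    #include <cstdio>

    void addRmw64(uint32_t *AddrLo, uint32_t *AddrHi, uint64_t Src) {
      const uint32_t OldLo = *AddrLo;
      *AddrLo += (uint32_t)Src;                             // add [lo], srclo
      *AddrHi += (uint32_t)(Src >> 32) + (*AddrLo < OldLo); // adc [hi], srchi
    }

    int main() {
      uint32_t Lo = 0xFFFFFFFFu, Hi = 0x00000000u;
      addRmw64(&Lo, &Hi, 1);
      std::printf("%08X %08X\n", Hi, Lo); // 00000001 00000000
      return 0;
    }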
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::lowerOther(const Inst *Instr) {
-  if (const auto *RMW = llvm::dyn_cast<InstX86FakeRMW>(Instr)) {
-    lowerRMW(RMW);
-  } else {
-    TargetLowering::lowerOther(Instr);
-  }
-}
-
-/// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to
-/// preserve the integrity of liveness analysis. Undef values are also turned
-/// into zeroes,
-/// since loOperand() and hiOperand() don't expect Undef input.  Also, in
-/// Non-SFI mode, add a FakeUse(RebasePtr) for every pooled constant operand.
-template <typename TraitsType> void TargetX86Base<TraitsType>::prelowerPhis() {
-  if (Traits::Is64Bit) {
-    // On x86-64 we don't need to prelower phis -- the architecture can handle
-    // 64-bit integers natively.
-    return;
-  }
-
-  PhiLowering::prelowerPhis32Bit<TargetX86Base<TraitsType>>(
-      this, Context.getNode(), Func);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::genTargetHelperCallFor(Inst *Instr) {
-  uint32_t StackArgumentsSize = 0;
-  if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
-    RuntimeHelper HelperID = RuntimeHelper::H_Num;
-    Variable *Dest = Arith->getDest();
-    Type DestTy = Dest->getType();
-    if (!Traits::Is64Bit && DestTy == IceType_i64) {
-      switch (Arith->getOp()) {
-      default:
-        return;
-      case InstArithmetic::Udiv:
-        HelperID = RuntimeHelper::H_udiv_i64;
-        break;
-      case InstArithmetic::Sdiv:
-        HelperID = RuntimeHelper::H_sdiv_i64;
-        break;
-      case InstArithmetic::Urem:
-        HelperID = RuntimeHelper::H_urem_i64;
-        break;
-      case InstArithmetic::Srem:
-        HelperID = RuntimeHelper::H_srem_i64;
-        break;
-      }
-    } else if (isVectorType(DestTy)) {
-      Variable *Dest = Arith->getDest();
-      Operand *Src0 = Arith->getSrc(0);
-      Operand *Src1 = Arith->getSrc(1);
-      switch (Arith->getOp()) {
-      default:
-        return;
-      case InstArithmetic::Mul:
-        if (DestTy == IceType_v16i8) {
-          scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
-          Arith->setDeleted();
-        }
-        return;
-      case InstArithmetic::Shl:
-      case InstArithmetic::Lshr:
-      case InstArithmetic::Ashr:
-        if (llvm::isa<Constant>(Src1)) {
-          return;
-        }
-        // Non-constant shift amounts fall through to be scalarized below.
-      case InstArithmetic::Udiv:
-      case InstArithmetic::Urem:
-      case InstArithmetic::Sdiv:
-      case InstArithmetic::Srem:
-      case InstArithmetic::Frem:
-        scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
-        Arith->setDeleted();
-        return;
-      }
-    } else {
-      switch (Arith->getOp()) {
-      default:
-        return;
-      case InstArithmetic::Frem:
-        if (isFloat32Asserting32Or64(DestTy))
-          HelperID = RuntimeHelper::H_frem_f32;
-        else
-          HelperID = RuntimeHelper::H_frem_f64;
-      }
-    }
-    constexpr SizeT MaxSrcs = 2;
-    InstCall *Call = makeHelperCall(HelperID, Dest, MaxSrcs);
-    Call->addArg(Arith->getSrc(0));
-    Call->addArg(Arith->getSrc(1));
-    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
-    Context.insert(Call);
-    Arith->setDeleted();
-  } else if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
-    InstCast::OpKind CastKind = Cast->getCastKind();
-    Operand *Src0 = Cast->getSrc(0);
-    const Type SrcType = Src0->getType();
-    Variable *Dest = Cast->getDest();
-    const Type DestTy = Dest->getType();
-    RuntimeHelper HelperID = RuntimeHelper::H_Num;
-    Variable *CallDest = Dest;
-    switch (CastKind) {
-    default:
-      return;
-    case InstCast::Fptosi:
-      if (!Traits::Is64Bit && DestTy == IceType_i64) {
-        HelperID = isFloat32Asserting32Or64(SrcType)
-                       ? RuntimeHelper::H_fptosi_f32_i64
-                       : RuntimeHelper::H_fptosi_f64_i64;
-      } else {
-        return;
-      }
-      break;
-    case InstCast::Fptoui:
-      if (isVectorType(DestTy)) {
-        assert(DestTy == IceType_v4i32);
-        assert(SrcType == IceType_v4f32);
-        HelperID = RuntimeHelper::H_fptoui_4xi32_f32;
-      } else if (DestTy == IceType_i64 ||
-                 (!Traits::Is64Bit && DestTy == IceType_i32)) {
-        if (Traits::Is64Bit) {
-          HelperID = isFloat32Asserting32Or64(SrcType)
-                         ? RuntimeHelper::H_fptoui_f32_i64
-                         : RuntimeHelper::H_fptoui_f64_i64;
-        } else if (isInt32Asserting32Or64(DestTy)) {
-          HelperID = isFloat32Asserting32Or64(SrcType)
-                         ? RuntimeHelper::H_fptoui_f32_i32
-                         : RuntimeHelper::H_fptoui_f64_i32;
-        } else {
-          HelperID = isFloat32Asserting32Or64(SrcType)
-                         ? RuntimeHelper::H_fptoui_f32_i64
-                         : RuntimeHelper::H_fptoui_f64_i64;
-        }
-      } else {
-        return;
-      }
-      break;
-    case InstCast::Sitofp:
-      if (!Traits::Is64Bit && SrcType == IceType_i64) {
-        HelperID = isFloat32Asserting32Or64(DestTy)
-                       ? RuntimeHelper::H_sitofp_i64_f32
-                       : RuntimeHelper::H_sitofp_i64_f64;
-      } else {
-        return;
-      }
-      break;
-    case InstCast::Uitofp:
-      if (isVectorType(SrcType)) {
-        assert(DestTy == IceType_v4f32);
-        assert(SrcType == IceType_v4i32);
-        HelperID = RuntimeHelper::H_uitofp_4xi32_4xf32;
-      } else if (SrcType == IceType_i64 ||
-                 (!Traits::Is64Bit && SrcType == IceType_i32)) {
-        if (isInt32Asserting32Or64(SrcType)) {
-          HelperID = isFloat32Asserting32Or64(DestTy)
-                         ? RuntimeHelper::H_uitofp_i32_f32
-                         : RuntimeHelper::H_uitofp_i32_f64;
-        } else {
-          HelperID = isFloat32Asserting32Or64(DestTy)
-                         ? RuntimeHelper::H_uitofp_i64_f32
-                         : RuntimeHelper::H_uitofp_i64_f64;
-        }
-      } else {
-        return;
-      }
-      break;
-    case InstCast::Bitcast: {
-      if (DestTy == Src0->getType())
-        return;
-      switch (DestTy) {
-      default:
-        return;
-      case IceType_i8:
-        assert(Src0->getType() == IceType_v8i1);
-        HelperID = RuntimeHelper::H_bitcast_8xi1_i8;
-        CallDest = Func->makeVariable(IceType_i32);
-        break;
-      case IceType_i16:
-        assert(Src0->getType() == IceType_v16i1);
-        HelperID = RuntimeHelper::H_bitcast_16xi1_i16;
-        CallDest = Func->makeVariable(IceType_i32);
-        break;
-      case IceType_v8i1: {
-        assert(Src0->getType() == IceType_i8);
-        HelperID = RuntimeHelper::H_bitcast_i8_8xi1;
-        Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
-        // Arguments to functions are required to be at least 32 bits wide.
-        Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
-        Src0 = Src0AsI32;
-      } break;
-      case IceType_v16i1: {
-        assert(Src0->getType() == IceType_i16);
-        HelperID = RuntimeHelper::H_bitcast_i16_16xi1;
-        Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
-        // Arguments to functions are required to be at least 32 bits wide.
-        Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
-        Src0 = Src0AsI32;
-      } break;
-      }
-    } break;
-    }
-    constexpr SizeT MaxSrcs = 1;
-    InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs);
-    Call->addArg(Src0);
-    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
-    Context.insert(Call);
-    // The PNaCl ABI disallows i8/i16 return types, so truncate the helper call
-    // result to the appropriate type as necessary.
-    if (CallDest->getType() != Dest->getType())
-      Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest);
-    Cast->setDeleted();
-  } else if (auto *Intrinsic = llvm::dyn_cast<InstIntrinsic>(Instr)) {
-    CfgVector<Type> ArgTypes;
-    Type ReturnType = IceType_void;
-    switch (Intrinsics::IntrinsicID ID = Intrinsic->getIntrinsicID()) {
-    default:
-      return;
-    case Intrinsics::Ctpop: {
-      Operand *Val = Intrinsic->getArg(0);
-      Type ValTy = Val->getType();
-      if (ValTy == IceType_i64)
-        ArgTypes = {IceType_i64};
-      else
-        ArgTypes = {IceType_i32};
-      ReturnType = IceType_i32;
-    } break;
-    case Intrinsics::Longjmp:
-      ArgTypes = {IceType_i32, IceType_i32};
-      ReturnType = IceType_void;
-      break;
-    case Intrinsics::Memcpy:
-      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
-      ReturnType = IceType_void;
-      break;
-    case Intrinsics::Memmove:
-      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
-      ReturnType = IceType_void;
-      break;
-    case Intrinsics::Memset:
-      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
-      ReturnType = IceType_void;
-      break;
-    case Intrinsics::Setjmp:
-      ArgTypes = {IceType_i32};
-      ReturnType = IceType_i32;
-      break;
-    }
-    StackArgumentsSize = getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
-  } else if (auto *Call = llvm::dyn_cast<InstCall>(Instr)) {
-    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
-  } else if (auto *Ret = llvm::dyn_cast<InstRet>(Instr)) {
-    if (!Ret->hasRetValue())
-      return;
-    Operand *RetValue = Ret->getRetValue();
-    Type ReturnType = RetValue->getType();
-    if (!isScalarFloatingType(ReturnType))
-      return;
-    StackArgumentsSize = typeWidthInBytes(ReturnType);
-  } else {
-    return;
-  }
-  StackArgumentsSize = Traits::applyStackAlignment(StackArgumentsSize);
-  updateMaxOutArgsSizeBytes(StackArgumentsSize);
-}
-
-template <typename TraitsType>
-uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes(
-    const CfgVector<Type> &ArgTypes, Type ReturnType) {
-  uint32_t OutArgumentsSizeBytes = 0;
-  uint32_t XmmArgCount = 0;
-  uint32_t GprArgCount = 0;
-  for (SizeT i = 0, NumArgTypes = ArgTypes.size(); i < NumArgTypes; ++i) {
-    Type Ty = ArgTypes[i];
-    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
-    assert(typeWidthInBytes(Ty) >= 4);
-    if (isVectorType(Ty) &&
-        Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgCount))
-            .hasValue()) {
-      ++XmmArgCount;
-    } else if (isScalarFloatingType(Ty) &&
-               Traits::getRegisterForXmmArgNum(
-                   Traits::getArgIndex(i, XmmArgCount))
-                   .hasValue()) {
-      ++XmmArgCount;
-    } else if (isScalarIntegerType(Ty) &&
-               Traits::getRegisterForGprArgNum(
-                   Ty, Traits::getArgIndex(i, GprArgCount))
-                   .hasValue()) {
-      // The 64 bit ABI allows some integers to be passed in GPRs.
-      ++GprArgCount;
-    } else {
-      if (isVectorType(Ty)) {
-        OutArgumentsSizeBytes =
-            Traits::applyStackAlignment(OutArgumentsSizeBytes);
-      }
-      OutArgumentsSizeBytes += typeWidthInBytesOnStack(Ty);
-    }
-  }
-  if (Traits::Is64Bit)
-    return OutArgumentsSizeBytes;
-  // The 32 bit ABI requires floating point values to be returned on the x87 FP
-  // stack. Ensure there is enough space for the fstp/movs for floating returns.
-  if (isScalarFloatingType(ReturnType)) {
-    OutArgumentsSizeBytes =
-        std::max(OutArgumentsSizeBytes,
-                 static_cast<uint32_t>(typeWidthInBytesOnStack(ReturnType)));
-  }
-  return OutArgumentsSizeBytes;
-}
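Aside: a minimal standalone sketch (plain C++, not Subzero code) of the round-up that Traits::applyStackAlignment performs on the accumulated out-args size; the 16-byte alignment is an assumption for illustration, the real value comes from Traits.

    #include <cstdint>
    #include <cstdio>

    uint32_t applyStackAlignment(uint32_t SizeBytes, uint32_t Align = 16) {
      return (SizeBytes + Align - 1) & ~(Align - 1); // next multiple of Align
    }

    int main() {
      std::printf("%u %u %u\n", applyStackAlignment(0), applyStackAlignment(20),
                  applyStackAlignment(32)); // 0 32 32
      return 0;
    }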
-
-template <typename TraitsType>
-uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes(
-    const InstCall *Instr) {
-  // Build a vector of the arguments' types.
-  const SizeT NumArgs = Instr->getNumArgs();
-  CfgVector<Type> ArgTypes;
-  ArgTypes.reserve(NumArgs);
-  for (SizeT i = 0; i < NumArgs; ++i) {
-    Operand *Arg = Instr->getArg(i);
-    ArgTypes.emplace_back(Arg->getType());
-  }
-  // Compute the return type (if any).
-  Type ReturnType = IceType_void;
-  Variable *Dest = Instr->getDest();
-  if (Dest != nullptr)
-    ReturnType = Dest->getType();
-  return getShadowStoreSize() +
-         getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
-}
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::makeZeroedRegister(Type Ty,
-                                                        RegNumT RegNum) {
-  Variable *Reg = makeReg(Ty, RegNum);
-  switch (Ty) {
-  case IceType_i1:
-  case IceType_i8:
-  case IceType_i16:
-  case IceType_i32:
-  case IceType_i64:
-    // Conservatively do "mov reg, 0" to avoid modifying FLAGS.
-    _mov(Reg, Ctx->getConstantZero(Ty));
-    break;
-  case IceType_f32:
-  case IceType_f64:
-    Context.insert<InstFakeDef>(Reg);
-    _xorps(Reg, Reg);
-    break;
-  default:
-    // All vector types use the same pxor instruction.
-    assert(isVectorType(Ty));
-    Context.insert<InstFakeDef>(Reg);
-    _pxor(Reg, Reg);
-    break;
-  }
-  return Reg;
-}
-
-// There is no support for loading or emitting vector constants, so the vector
-// values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are
-// initialized with register operations.
-//
-// TODO(wala): Add limited support for vector constants so that complex
-// initialization in registers is unnecessary.
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::makeVectorOfZeros(Type Ty,
-                                                       RegNumT RegNum) {
-  return makeZeroedRegister(Ty, RegNum);
-}
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::makeVectorOfMinusOnes(Type Ty,
-                                                           RegNumT RegNum) {
-  Variable *MinusOnes = makeReg(Ty, RegNum);
-  // Insert a FakeDef so the live range of MinusOnes is not overestimated.
-  Context.insert<InstFakeDef>(MinusOnes);
-  if (Ty == IceType_f64)
-    // Making a vector of minus ones of type f64 is currently only used for the
-    // fabs intrinsic.  To use the f64 type to create this mask with pcmpeqq
-    // requires SSE 4.1.  Since we're just creating a mask, pcmpeqd does the
-    // same job and only requires SSE2.
-    _pcmpeq(MinusOnes, MinusOnes, IceType_f32);
-  else
-    _pcmpeq(MinusOnes, MinusOnes);
-  return MinusOnes;
-}
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::makeVectorOfOnes(Type Ty, RegNumT RegNum) {
-  Variable *Dest = makeVectorOfZeros(Ty, RegNum);
-  Variable *MinusOne = makeVectorOfMinusOnes(Ty);
-  _psub(Dest, MinusOne);
-  return Dest;
-}
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::makeVectorOfHighOrderBits(Type Ty,
-                                                               RegNumT RegNum) {
-  assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
-         Ty == IceType_v16i8);
-  if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
-    Variable *Reg = makeVectorOfOnes(Ty, RegNum);
-    SizeT Shift =
-        typeWidthInBytes(typeElementType(Ty)) * Traits::X86_CHAR_BIT - 1;
-    _psll(Reg, Ctx->getConstantInt8(Shift));
-    return Reg;
-  } else {
-    // SSE has no left shift operation for vectors of 8 bit integers.
-    constexpr uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
-    Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
-    Variable *Reg = makeReg(Ty, RegNum);
-    _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
-    _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
-    return Reg;
-  }
-}
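Aside: a minimal standalone sketch (plain C++, not Subzero code) of the shift arithmetic behind makeVectorOfHighOrderBits for the 16- and 32-bit lanes (the i8 path broadcasts 0x80808080 instead): all-ones shifted left by (lane bits - 1) leaves only the sign bit set in each lane.

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint16_t Lane16 = (uint16_t)(0xFFFFu << 15); // psllw ones, 15
      const uint32_t Lane32 = 0xFFFFFFFFu << 31;         // pslld ones, 31
      std::printf("%04X %08X\n", Lane16, Lane32); // 8000 80000000
      return 0;
    }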
-
-/// Construct a mask in a register that can be and'ed with a floating-point
-/// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32
-/// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as a vector
-/// of ones logically right shifted one bit.
-// TODO(stichnot): Fix the wala TODO above, to represent vector constants in
-// memory.
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::makeVectorOfFabsMask(Type Ty,
-                                                          RegNumT RegNum) {
-  Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
-  _psrl(Reg, Ctx->getConstantInt8(1));
-  return Reg;
-}
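Aside: a minimal standalone sketch (plain C++, not Subzero code) of the mask: all-ones logically shifted right by one clears only the sign bit, so and'ing it with the raw float bits computes fabs.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      const uint32_t Mask = 0xFFFFFFFFu >> 1; // psrld ones, 1 -> 0x7FFFFFFF
      float X = -3.5f;
      uint32_t Bits;
      std::memcpy(&Bits, &X, sizeof(Bits));
      Bits &= Mask; // andps x, mask
      std::memcpy(&X, &Bits, sizeof(X));
      std::printf("%g\n", X); // 3.5
      return 0;
    }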
-
-template <typename TraitsType>
-typename TargetX86Base<TraitsType>::X86OperandMem *
-TargetX86Base<TraitsType>::getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
-                                                        uint32_t Offset) {
-  // Ensure that Slot is a stack slot.
-  assert(Slot->mustNotHaveReg());
-  assert(Slot->getRegNum().hasNoValue());
-  // Compute the location of Slot in memory.
-  // TODO(wala,stichnot): lea should not be required. The address of the
-  // stack slot is known at compile time (although not until after
-  // addProlog()).
-  const Type PointerType = getPointerType();
-  Variable *Loc = makeReg(PointerType);
-  _lea(Loc, Slot);
-  Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
-  return X86OperandMem::create(Func, Ty, Loc, ConstantOffset);
-}
-
-/// Lowering helper to copy a scalar integer source operand into some 8-bit GPR.
-/// Src is assumed to already be legalized.  If the source operand is known to
-/// be a memory or immediate operand, a simple mov will suffice.  But if the
-/// source operand can be a physical register, then it must first be copied into
-/// a physical register that is truncable to 8-bit, then truncated into a
-/// physical register that can receive a truncation, and finally copied into the
-/// result 8-bit register (which in general can be any 8-bit register).  For
-/// example, moving %ebp into %ah may be accomplished as:
-///   movl %ebp, %edx
-///   mov_trunc %edx, %dl  // this redundant assignment is ultimately elided
-///   movb %dl, %ah
-/// On the other hand, moving a memory or immediate operand into ah:
-///   movb 4(%ebp), %ah
-///   movb $my_imm, %ah
-///
-/// Note #1.  On a 64-bit target, the "movb 4(%ebp), %ah" is likely not
-/// encodable, so RegNum=Reg_ah should NOT be given as an argument.  Instead,
-/// use RegNum=RegNumT() and then let the caller do a separate copy into
-/// Reg_ah.
-///
-/// Note #2.  ConstantRelocatable operands are also put through this process
-/// (not truncated directly) because our ELF emitter does R_386_32 relocations
-/// but not R_386_8 relocations.
-///
-/// Note #3.  If Src is a Variable, the result will be an infinite-weight i8
-/// Variable with the RCX86_IsTrunc8Rcvr register class.  As such, this helper
-/// is a convenient way to prevent ah/bh/ch/dh from being an (invalid) argument
-/// to the pinsrb instruction.
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::copyToReg8(Operand *Src, RegNumT RegNum) {
-  Type Ty = Src->getType();
-  assert(isScalarIntegerType(Ty));
-  assert(Ty != IceType_i1);
-  Variable *Reg = makeReg(IceType_i8, RegNum);
-  Reg->setRegClass(RCX86_IsTrunc8Rcvr);
-  if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) {
-    Variable *SrcTruncable = makeReg(Ty);
-    switch (Ty) {
-    case IceType_i64:
-      SrcTruncable->setRegClass(RCX86_Is64To8);
-      break;
-    case IceType_i32:
-      SrcTruncable->setRegClass(RCX86_Is32To8);
-      break;
-    case IceType_i16:
-      SrcTruncable->setRegClass(RCX86_Is16To8);
-      break;
-    default:
-      // i8 - just use default register class
-      break;
-    }
-    Variable *SrcRcvr = makeReg(IceType_i8);
-    SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr);
-    _mov(SrcTruncable, Src);
-    _mov(SrcRcvr, SrcTruncable);
-    Src = SrcRcvr;
-  }
-  _mov(Reg, Src);
-  return Reg;
-}
-
-/// Helper for legalize() to emit the right code to lower an operand to a
-/// register of the appropriate type.
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::copyToReg(Operand *Src, RegNumT RegNum) {
-  Type Ty = Src->getType();
-  Variable *Reg = makeReg(Ty, RegNum);
-  if (isVectorType(Ty)) {
-    _movp(Reg, Src);
-  } else {
-    _mov(Reg, Src);
-  }
-  return Reg;
-}
-
-template <typename TraitsType>
-Operand *TargetX86Base<TraitsType>::legalize(Operand *From, LegalMask Allowed,
-                                             RegNumT RegNum) {
-  const Type Ty = From->getType();
-  // Assert that a physical register is allowed. To date, all calls to
-  // legalize() allow a physical register. If a physical register needs to be
-  // explicitly disallowed, then new code will need to be written to force a
-  // spill.
-  assert(Allowed & Legal_Reg);
-  // If we're asking for a specific physical register, make sure we're not
-  // allowing any other operand kinds. (This could be future work, e.g. allow
-  // the shl shift amount to be either an immediate or in ecx.)
-  assert(RegNum.hasNoValue() || Allowed == Legal_Reg);
-
-  // Substitute with an available infinite-weight variable if possible.  Only do
-  // this when we are not asking for a specific register, and when the
-  // substitution is not locked to a specific register, and when the types
-  // match, in order to capture the vast majority of opportunities and avoid
-  // corner cases in the lowering.
-  if (RegNum.hasNoValue()) {
-    if (Variable *Subst = getContext().availabilityGet(From)) {
-      // At this point we know there is a potential substitution available.
-      if (Subst->mustHaveReg() && !Subst->hasReg()) {
-        // At this point we know the substitution will have a register.
-        if (From->getType() == Subst->getType()) {
-          // At this point we know the substitution's register is compatible.
-          return Subst;
-        }
-      }
-    }
-  }
-
-  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(From)) {
-    // Before doing anything with a Mem operand, we need to ensure that the
-    // Base and Index components are in physical registers.
-    Variable *Base = Mem->getBase();
-    Variable *Index = Mem->getIndex();
-    Constant *Offset = Mem->getOffset();
-    Variable *RegBase = nullptr;
-    Variable *RegIndex = nullptr;
-    uint16_t Shift = Mem->getShift();
-    if (Base) {
-      RegBase = llvm::cast<Variable>(
-          legalize(Base, Legal_Reg | Legal_Rematerializable));
-    }
-    if (Index) {
-      // TODO(jpp): perhaps we should only allow Legal_Reg if
-      // Base->isRematerializable.
-      RegIndex = llvm::cast<Variable>(
-          legalize(Index, Legal_Reg | Legal_Rematerializable));
-    }
-
-    if (Base != RegBase || Index != RegIndex) {
-      Mem = X86OperandMem::create(Func, Ty, RegBase, Offset, RegIndex, Shift,
-                                  Mem->getSegmentRegister());
-    }
-
-    From = Mem;
-
-    if (!(Allowed & Legal_Mem)) {
-      From = copyToReg(From, RegNum);
-    }
-    return From;
-  }
-
-  if (auto *Const = llvm::dyn_cast<Constant>(From)) {
-    if (llvm::isa<ConstantUndef>(Const)) {
-      From = legalizeUndef(Const, RegNum);
-      if (isVectorType(Ty))
-        return From;
-      Const = llvm::cast<Constant>(From);
-    }
-    // There should be no constants of vector type (other than undef).
-    assert(!isVectorType(Ty));
-
-    // If the operand is a 64 bit constant integer we need to legalize it to a
-    // register in x86-64.
-    if (Traits::Is64Bit) {
-      if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Const)) {
-        if (!Utils::IsInt(32, C64->getValue())) {
-          if (RegNum.hasValue()) {
-            assert(Traits::getGprForType(IceType_i64, RegNum) == RegNum);
-          }
-          return copyToReg(Const, RegNum);
-        }
-      }
-    }
-
-    if (!llvm::dyn_cast<ConstantRelocatable>(Const)) {
-      if (isScalarFloatingType(Ty)) {
-        // Convert a scalar floating point constant into an explicit memory
-        // operand.
-        if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(Const)) {
-          if (Utils::isPositiveZero(ConstFloat->getValue()))
-            return makeZeroedRegister(Ty, RegNum);
-        } else if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(Const)) {
-          if (Utils::isPositiveZero(ConstDouble->getValue()))
-            return makeZeroedRegister(Ty, RegNum);
-        }
-
-        auto *CFrom = llvm::cast<Constant>(From);
-        assert(CFrom->getShouldBePooled());
-        Constant *Offset = Ctx->getConstantSym(0, CFrom->getLabelName());
-        auto *Mem = X86OperandMem::create(Func, Ty, nullptr, Offset);
-        From = Mem;
-      }
-    }
-
-    bool NeedsReg = false;
-    if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
-      // Immediate specifically not allowed.
-      NeedsReg = true;
-    if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
-      // On x86, FP constants are lowered to mem operands.
-      NeedsReg = true;
-    if (NeedsReg) {
-      From = copyToReg(From, RegNum);
-    }
-    return From;
-  }
-
-  if (auto *Var = llvm::dyn_cast<Variable>(From)) {
-    // Check if the variable is guaranteed a physical register. This can happen
-    // either when the variable is pre-colored or when it is assigned infinite
-    // weight.
-    bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
-    bool MustRematerialize =
-        (Var->isRematerializable() && !(Allowed & Legal_Rematerializable));
-    // We need a new physical register for the operand if:
-    // - Mem is not allowed and Var isn't guaranteed a physical register, or
-    // - RegNum is required and Var->getRegNum() doesn't match, or
-    // - Var is a rematerializable variable and rematerializable pass-through is
-    //   not allowed (in which case we need a lea instruction).
-    if (MustRematerialize) {
-      Variable *NewVar = makeReg(Ty, RegNum);
-      // Since Var is rematerializable, the offset will be added when the lea is
-      // emitted.
-      constexpr Constant *NoOffset = nullptr;
-      auto *Mem = X86OperandMem::create(Func, Ty, Var, NoOffset);
-      _lea(NewVar, Mem);
-      From = NewVar;
-    } else if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
-               (RegNum.hasValue() && RegNum != Var->getRegNum())) {
-      From = copyToReg(From, RegNum);
-    }
-    return From;
-  }
-
-  llvm::report_fatal_error("Unhandled operand kind in legalize()");
-  return From;
-}
-
-/// Provide a trivial wrapper to legalize() for this common usage.
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::legalizeToReg(Operand *From,
-                                                   RegNumT RegNum) {
-  return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
-}
-
-/// Legalize undef values to concrete values.
-template <typename TraitsType>
-Operand *TargetX86Base<TraitsType>::legalizeUndef(Operand *From,
-                                                  RegNumT RegNum) {
-  Type Ty = From->getType();
-  if (llvm::isa<ConstantUndef>(From)) {
-    // Lower undefs to zero.  Another option is to lower undefs to an
-    // uninitialized register; however, using an uninitialized register results
-    // in less predictable code.
-    //
-    // If in the future the implementation is changed to lower undef values to
-    // uninitialized registers, a FakeDef will be needed:
-    //     Context.insert<InstFakeDef>(Reg);
-    // This is in order to ensure that the live range of Reg is not
-    // overestimated.  If the constant being lowered is a 64 bit value, then
-    // the result should be split and the lo and hi components will need to go
-    // in uninitialized registers.
-    if (isVectorType(Ty))
-      return makeVectorOfZeros(Ty, RegNum);
-    return Ctx->getConstantZero(Ty);
-  }
-  return From;
-}
-
-/// For the cmp instruction, if Src1 is an immediate, or known to be a physical
-/// register, we can allow Src0 to be a memory operand. Otherwise, Src0 must be
-/// copied into a physical register. (Actually, either Src0 or Src1 can be
-/// chosen for the physical register, but unfortunately we have to commit to one
-/// or the other before register allocation.)
-template <typename TraitsType>
-Operand *TargetX86Base<TraitsType>::legalizeSrc0ForCmp(Operand *Src0,
-                                                       Operand *Src1) {
-  bool IsSrc1ImmOrReg = false;
-  if (llvm::isa<Constant>(Src1)) {
-    IsSrc1ImmOrReg = true;
-  } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) {
-    if (Var->hasReg())
-      IsSrc1ImmOrReg = true;
-  }
-  return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
-}
-
-template <typename TraitsType>
-typename TargetX86Base<TraitsType>::X86OperandMem *
-TargetX86Base<TraitsType>::formMemoryOperand(Operand *Opnd, Type Ty,
-                                             bool DoLegalize) {
-  auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd);
-  // It may be the case that address mode optimization already creates an
-  // X86OperandMem, so in that case it wouldn't need another level of
-  // transformation.
-  if (!Mem) {
-    auto *Base = llvm::dyn_cast<Variable>(Opnd);
-    auto *Offset = llvm::dyn_cast<Constant>(Opnd);
-    assert(Base || Offset);
-    if (Offset) {
-      if (!llvm::isa<ConstantRelocatable>(Offset)) {
-        if (llvm::isa<ConstantInteger64>(Offset)) {
-          // Memory operands cannot have 64-bit immediates, so they must be
-          // legalized into a register only.
-          Base = llvm::cast<Variable>(legalize(Offset, Legal_Reg));
-          Offset = nullptr;
-        } else {
-          Offset = llvm::cast<Constant>(legalize(Offset));
-
-          assert(llvm::isa<ConstantInteger32>(Offset) ||
-                 llvm::isa<ConstantRelocatable>(Offset));
-        }
-      }
-    }
-    Mem = X86OperandMem::create(Func, Ty, Base, Offset);
-  }
-  return llvm::cast<X86OperandMem>(DoLegalize ? legalize(Mem) : Mem);
-}
-
-template <typename TraitsType>
-Variable *TargetX86Base<TraitsType>::makeReg(Type Type, RegNumT RegNum) {
-  // There aren't any 64-bit integer registers for x86-32.
-  assert(Traits::Is64Bit || Type != IceType_i64);
-  Variable *Reg = Func->makeVariable(Type);
-  if (RegNum.hasValue())
-    Reg->setRegNum(RegNum);
-  else
-    Reg->setMustHaveReg();
-  return Reg;
-}
-
-const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32, IceType_f64,
-                            IceType_v16i8};
-
-template <typename TraitsType>
-Type TargetX86Base<TraitsType>::largestTypeInSize(uint32_t Size,
-                                                  uint32_t MaxSize) {
-  assert(Size != 0);
-  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
-  uint32_t MaxIndex = MaxSize == NoSizeLimit
-                          ? llvm::array_lengthof(TypeForSize) - 1
-                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
-  return TypeForSize[std::min(TyIndex, MaxIndex)];
-}
-
-template <typename TraitsType>
-Type TargetX86Base<TraitsType>::firstTypeThatFitsSize(uint32_t Size,
-                                                      uint32_t MaxSize) {
-  assert(Size != 0);
-  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
-  if (!llvm::isPowerOf2_32(Size))
-    ++TyIndex;
-  uint32_t MaxIndex = MaxSize == NoSizeLimit
-                          ? llvm::array_lengthof(TypeForSize) - 1
-                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
-  return TypeForSize[std::min(TyIndex, MaxIndex)];
-}
-
-template <typename TraitsType> void TargetX86Base<TraitsType>::postLower() {
-  if (Func->getOptLevel() == Opt_m1)
-    return;
-  markRedefinitions();
-  Context.availabilityUpdate();
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::emit(const ConstantInteger32 *C) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  Str << "$" << C->getValue();
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::emit(const ConstantInteger64 *C) const {
-  if (!Traits::Is64Bit) {
-    llvm::report_fatal_error("Not expecting to emit 64-bit integers");
-  } else {
-    if (!BuildDefs::dump())
-      return;
-    Ostream &Str = Ctx->getStrEmit();
-    Str << "$" << C->getValue();
-  }
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::emit(const ConstantFloat *C) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  Str << C->getLabelName();
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::emit(const ConstantDouble *C) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  Str << C->getLabelName();
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::emit(const ConstantUndef *) const {
-  llvm::report_fatal_error("undef value encountered by emitter.");
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::emit(const ConstantRelocatable *C) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  Str << "$";
-  emitWithoutPrefix(C);
-}
-
-template <typename TraitsType>
-void TargetX86Base<TraitsType>::emitJumpTable(
-    const Cfg *, const InstJumpTable *JumpTable) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  Str << "\t.section\t.rodata." << JumpTable->getSectionName()
-      << ",\"a\",@progbits\n"
-         "\t.align\t"
-      << typeWidthInBytes(getPointerType()) << "\n"
-      << JumpTable->getName() << ":";
-
-  for (SizeT I = 0; I < JumpTable->getNumTargets(); ++I)
-    Str << "\n\t.val\t" << JumpTable->getTarget(I)->getAsmName();
-  Str << "\n";
-}
-
-template <typename TraitsType>
-template <typename T>
-void TargetDataX86<TraitsType>::emitConstantPool(GlobalContext *Ctx) {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  Type Ty = T::Ty;
-  SizeT Align = typeAlignInBytes(Ty);
-  ConstantList Pool = Ctx->getConstantPool(Ty);
-
-  Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
-      << "\n";
-  Str << "\t.align\t" << Align << "\n";
-
-  for (Constant *C : Pool) {
-    if (!C->getShouldBePooled())
-      continue;
-    auto *Const = llvm::cast<typename T::IceType>(C);
-    typename T::IceType::PrimType Value = Const->getValue();
-    // Use memcpy() to copy bits from Value into RawValue in a way that avoids
-    // breaking strict-aliasing rules.
-    typename T::PrimitiveIntType RawValue;
-    memcpy(&RawValue, &Value, sizeof(Value));
-    char buf[30];
-    int CharsPrinted =
-        snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
-    assert(CharsPrinted >= 0);
-    assert((size_t)CharsPrinted < llvm::array_lengthof(buf));
-    (void)CharsPrinted; // avoid warnings if asserts are disabled
-    Str << Const->getLabelName();
-    Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t/* " << T::TypeName << " "
-        << Value << " */\n";
-  }
-}
-
-template <typename TraitsType>
-void TargetDataX86<TraitsType>::lowerConstants() {
-  if (getFlags().getDisableTranslation())
-    return;
-  switch (getFlags().getOutFileType()) {
-  case FT_Elf: {
-    ELFObjectWriter *Writer = Ctx->getObjectWriter();
-
-    Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
-    Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
-    Writer->writeConstantPool<ConstantInteger32>(IceType_i32);
-
-    Writer->writeConstantPool<ConstantFloat>(IceType_f32);
-    Writer->writeConstantPool<ConstantDouble>(IceType_f64);
-  } break;
-  case FT_Asm:
-  case FT_Iasm: {
-    OstreamLocker L(Ctx);
-
-    emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
-    emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
-    emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);
-
-    emitConstantPool<PoolTypeConverter<float>>(Ctx);
-    emitConstantPool<PoolTypeConverter<double>>(Ctx);
-  } break;
-  }
-}
-
-template <typename TraitsType>
-void TargetDataX86<TraitsType>::lowerJumpTables() {
-  const bool IsPIC = false;
-  switch (getFlags().getOutFileType()) {
-  case FT_Elf: {
-    ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    constexpr FixupKind FK_Abs64 = llvm::ELF::R_X86_64_64;
-    const FixupKind RelocationKind =
-        (getPointerType() == IceType_i32) ? Traits::FK_Abs : FK_Abs64;
-    for (const JumpTableData &JT : Ctx->getJumpTables())
-      Writer->writeJumpTable(JT, RelocationKind, IsPIC);
-  } break;
-  case FT_Asm:
-    // Already emitted from Cfg
-    break;
-  case FT_Iasm: {
-    if (!BuildDefs::dump())
-      return;
-    Ostream &Str = Ctx->getStrEmit();
-    const char *Prefix = IsPIC ? ".data.rel.ro." : ".rodata.";
-    for (const JumpTableData &JT : Ctx->getJumpTables()) {
-      Str << "\t.section\t" << Prefix << JT.getSectionName()
-          << ",\"a\",@progbits\n"
-             "\t.align\t"
-          << typeWidthInBytes(getPointerType()) << "\n"
-          << JT.getName().toString() << ":";
-
-      for (intptr_t TargetOffset : JT.getTargetOffsets())
-        Str << "\n\t.val\t" << JT.getFunctionName() << "+" << TargetOffset;
-      Str << "\n";
-    }
-  } break;
-  }
-}
-
-template <typename TraitsType>
-void TargetDataX86<TraitsType>::lowerGlobals(
-    const VariableDeclarationList &Vars, const std::string &SectionSuffix) {
-  const bool IsPIC = false;
-  switch (getFlags().getOutFileType()) {
-  case FT_Elf: {
-    ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    Writer->writeDataSection(Vars, Traits::FK_Abs, SectionSuffix, IsPIC);
-  } break;
-  case FT_Asm:
-  case FT_Iasm: {
-    OstreamLocker L(Ctx);
-    for (const VariableDeclaration *Var : Vars) {
-      if (getFlags().matchTranslateOnly(Var->getName(), 0)) {
-        emitGlobal(*Var, SectionSuffix);
-      }
-    }
-  } break;
-  }
-}
-} // end of namespace X8664
-} // end of namespace Ice
-
-#endif // SUBZERO_SRC_ICETARGETLOWERINGX8664BASEIMPL_H
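
The bulk of the deletion above is the operand legalization machinery,
driven by a small bitmask of permitted operand kinds (Legal_Reg,
Legal_Imm, Legal_Mem, ...). A minimal, self-contained sketch of that
contract follows; every identifier with a "Demo" suffix is an
illustrative stand-in, not the real Subzero API:

    #include <cassert>
    #include <cstdint>

    // Illustrative stand-ins for the Legal_* flags in the deleted code.
    enum LegalMaskDemo : uint32_t {
      Legal_Reg = 1 << 0, // operand may be a physical register
      Legal_Imm = 1 << 1, // operand may be an immediate
      Legal_Mem = 1 << 2, // operand may be a memory reference
    };

    enum class OpKind { Reg, Imm, Mem };

    struct OperandDemo {
      OpKind K;
    };

    // Coerce Op into an allowed form. As in the deleted legalize(), a
    // register must always be permitted so a universal fallback exists.
    OperandDemo legalizeDemo(OperandDemo Op, uint32_t Allowed) {
      assert(Allowed & Legal_Reg);
      const uint32_t Have = Op.K == OpKind::Reg   ? Legal_Reg
                            : Op.K == OpKind::Imm ? Legal_Imm
                                                  : Legal_Mem;
      if (Allowed & Have)
        return Op;                     // already acceptable as-is
      return OperandDemo{OpKind::Reg}; // stand-in for copyToReg()
    }

    int main() {
      // cmp-style constraint (see legalizeSrc0ForCmp): no immediates.
      OperandDemo Imm{OpKind::Imm};
      assert(legalizeDemo(Imm, Legal_Reg | Legal_Mem).K == OpKind::Reg);
      return 0;
    }

The assert mirrors the deleted code's invariant that every call site
allows a register, so legalize() can always fall back to a copy.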
diff --git a/third_party/subzero/src/IceTargetLoweringX8664Traits.h b/third_party/subzero/src/IceTargetLoweringX8664Traits.h
index 0df0244..d650b48 100644
--- a/third_party/subzero/src/IceTargetLoweringX8664Traits.h
+++ b/third_party/subzero/src/IceTargetLoweringX8664Traits.h
@@ -35,8 +35,8 @@
 using namespace ::Ice::X86;
 
 class AssemblerX8664;
-template <class TraitsType> struct Insts;
-template <class TraitsType> class TargetX86Base;
+struct Insts;
+class TargetX8664;
 
 class TargetX8664;
 
@@ -49,8 +49,6 @@
   //      \/_/\/_/\/_____/\/_/  \/_/
   //
   //----------------------------------------------------------------------------
-  static constexpr ::Ice::Assembler::AssemblerKind AsmKind =
-      ::Ice::Assembler::Asm_X8664;
 
   static constexpr bool Is64Bit = true;
   static constexpr ::Ice::RegX8664::GPRRegister Last8BitGPR =
@@ -284,14 +282,6 @@
   //      \/_____/\/_____/\/_/   \/_/\/_____/\/_/ /_/\/_/\/_/ \/_/\/_____/
   //
   //----------------------------------------------------------------------------
-  enum InstructionSet {
-    Begin,
-    // SSE2 is the PNaCl baseline instruction set.
-    SSE2 = Begin,
-    SSE4_1,
-    End
-  };
-
   static const char *TargetName;
   static constexpr Type WordType = IceType_i64;
 
@@ -706,6 +696,8 @@
   }
 #endif
 
+  /// Whether scalar floating point arguments are passed in XMM registers.
+  static constexpr bool X86_PASS_SCALAR_FP_IN_XMM = true;
   /// Get the register for a given argument slot in the XMM registers.
   static RegNumT getRegisterForXmmArgNum(uint32_t ArgNum) {
     // TODO(sehr): Change to use the CCArg technique used in ARM32.
@@ -821,10 +813,8 @@
   //
   //----------------------------------------------------------------------------
   using Traits = TargetX8664Traits;
-  using Insts = ::Ice::X8664::Insts<Traits>;
 
-  using TargetLowering = ::Ice::X8664::TargetX86Base<Traits>;
-  using ConcreteTarget = ::Ice::X8664::TargetX8664;
+  using TargetLowering = ::Ice::X8664::TargetX8664;
   using Assembler = ::Ice::X8664::AssemblerX8664;
 
   /// X86Operand extends the Operand hierarchy. Its subclasses are X86OperandMem
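
The two hunks above are the crux of the CL: Insts and the target lose
their TraitsType parameter, and TargetLowering becomes an alias for the
concrete TargetX8664. A self-contained analog of the before/after shape
(the "Demo" names are illustrative, not the real declarations):

    #include <cstdio>

    // After the change, the traits are a free-standing class...
    struct TraitsDemo {
      static constexpr bool Is64Bit = true;
      static constexpr const char *TargetName = "x86-64";
    };

    // ...and the target names them directly, where previously it was
    // 'template <typename TraitsType> class TargetX86Base'.
    class TargetDemo {
    public:
      using Traits = TraitsDemo;
      void banner() const {
        std::printf("%s (64-bit: %d)\n", Traits::TargetName,
                    int(Traits::Is64Bit));
      }
    };

    int main() {
      TargetDemo().banner();
      return 0;
    }

Call sites that previously spelled out TargetX86Base<TargetX8664Traits>
can now simply name TargetX8664, which is what the updated
Traits::TargetLowering alias reflects.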
diff --git a/third_party/subzero/unittest/AssemblerX8632/TestUtil.h b/third_party/subzero/unittest/AssemblerX8632/TestUtil.h
index fc9abae..c0fdf98 100644
--- a/third_party/subzero/unittest/AssemblerX8632/TestUtil.h
+++ b/third_party/subzero/unittest/AssemblerX8632/TestUtil.h
@@ -37,7 +37,7 @@
 class AssemblerX8632TestBase : public ::testing::Test {
 protected:
   using Address = AssemblerX8632::Traits::Address;
-  using Cond = AssemblerX8632::Traits::Cond;
+  using Cond = AssemblerX8632::CondX86;
   using GPRRegister = AssemblerX8632::Traits::GPRRegister;
   using ByteRegister = AssemblerX8632::Traits::ByteRegister;
   using Label = ::Ice::X8632::Label;
diff --git a/third_party/subzero/unittest/AssemblerX8664/TestUtil.h b/third_party/subzero/unittest/AssemblerX8664/TestUtil.h
index 459385c..c93c061 100644
--- a/third_party/subzero/unittest/AssemblerX8664/TestUtil.h
+++ b/third_party/subzero/unittest/AssemblerX8664/TestUtil.h
@@ -37,7 +37,7 @@
 class AssemblerX8664TestBase : public ::testing::Test {
 protected:
   using Address = AssemblerX8664::Traits::Address;
-  using Cond = AssemblerX8664::Traits::Cond;
+  using Cond = AssemblerX8664::CondX86;
   using GPRRegister = AssemblerX8664::Traits::GPRRegister;
   using ByteRegister = AssemblerX8664::Traits::ByteRegister;
   using Traits = AssemblerX8664::Traits;
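
The unit-test fixtures pick up the same de-templatization: condition
codes are reached through the assembler's own CondX86 name rather than
through Traits. A self-contained analog of hoisting a nested alias out
of the traits indirection ("Demo" names and encodings are illustrative
only):

    // Shared condition-code definitions, in the spirit of
    // IceConditionCodesX86.h.
    struct CondX86Demo {
      enum BrCond { Br_e = 4, Br_ne = 5 };
    };

    // The assembler exposes the type directly; no Traits::Cond hop.
    class AssemblerDemo {
    public:
      using CondX86 = CondX86Demo;
    };

    int main() {
      using Cond = AssemblerDemo::CondX86; // mirrors the TestUtil.h alias
      return Cond::Br_e == 4 ? 0 : 1;
    }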