Add initial integrated assembler with some XMM ops.

Add a flag to use the integrated assembler.

Handle simple XMM binary op instructions as an initial example of how
instructions might be handled. This tests fixups in a very limited sense:
it tracks the buffer locations of fixups for floating-point immediates.

Patchset one shows the original dart assembler code (revision 39313), so that
it can be diffed.

BUG=none
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/574133002
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index e1255df..d2548a6 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -28,7 +28,10 @@
       IsInternalLinkage(false), HasError(false), ErrorMessage(""), Entry(NULL),
       NextInstNumber(1), Live(NULL),
       Target(TargetLowering::createLowering(Ctx->getTargetArch(), this)),
-      VMetadata(new VariablesMetadata(this)), CurrentNode(NULL) {}
+      VMetadata(new VariablesMetadata(this)),
+      TargetAssembler(
+          TargetLowering::createAssembler(Ctx->getTargetArch(), this)),
+      CurrentNode(NULL) {}
 
 Cfg::~Cfg() {}
 
diff --git a/src/IceCfg.h b/src/IceCfg.h
index 73dd814..1de7795 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -17,6 +17,9 @@
 
 #include "IceDefs.h"
 #include "IceTypes.h"
+
+#include "assembler.h"
+#include "IceClFlags.h"
 #include "IceGlobalContext.h"
 
 #include "llvm/ADT/OwningPtr.h"
@@ -86,6 +89,12 @@
   TargetLowering *getTarget() const { return Target.get(); }
   VariablesMetadata *getVMetadata() const { return VMetadata.get(); }
   Liveness *getLiveness() const { return Live.get(); }
+  template <typename T> T *getAssembler() const {
+    return static_cast<T *>(TargetAssembler.get());
+  }
+  bool UseIntegratedAssembler() const {
+    return getContext()->getFlags().UseIntegratedAssembler;
+  }
   bool hasComputedFrame() const;
 
   // Passes over the CFG.
@@ -166,6 +175,7 @@
   llvm::OwningPtr<Liveness> Live;
   llvm::OwningPtr<TargetLowering> Target;
   llvm::OwningPtr<VariablesMetadata> VMetadata;
+  llvm::OwningPtr<Assembler> TargetAssembler;
 
   // CurrentNode is maintained during dumping/emitting just for
   // validating Variable::DefNode.  Normally, a traversal over
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index 3277564..eff4e97 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -492,7 +492,11 @@
     // suppress them.
     if (Inst->isRedundantAssign())
       continue;
-    (*I)->emit(Func);
+    if (Func->UseIntegratedAssembler()) {
+      (*I)->emitIAS(Func);
+    } else {
+      (*I)->emit(Func);
+    }
     // Update emitted instruction count, plus fill/spill count for
     // Variable operands without a physical register.
     if (uint32_t Count = (*I)->getEmitInstCount()) {
diff --git a/src/IceClFlags.h b/src/IceClFlags.h
index 2d14a93..d6c232f 100644
--- a/src/IceClFlags.h
+++ b/src/IceClFlags.h
@@ -24,13 +24,15 @@
   ClFlags()
       : DisableInternal(false), SubzeroTimingEnabled(false),
         DisableTranslation(false), DisableGlobals(false),
-        FunctionSections(false), UseSandboxing(false), DumpStats(false),
-        DefaultGlobalPrefix(""), DefaultFunctionPrefix("") {}
+        FunctionSections(false), UseIntegratedAssembler(false),
+        UseSandboxing(false), DumpStats(false), DefaultGlobalPrefix(""),
+        DefaultFunctionPrefix("") {}
   bool DisableInternal;
   bool SubzeroTimingEnabled;
   bool DisableTranslation;
   bool DisableGlobals;
   bool FunctionSections;
+  bool UseIntegratedAssembler;
   bool UseSandboxing;
   bool DumpStats;
   IceString DefaultGlobalPrefix;
diff --git a/src/IceDefs.h b/src/IceDefs.h
index 0f200bc..322e9b4 100644
--- a/src/IceDefs.h
+++ b/src/IceDefs.h
@@ -130,11 +130,6 @@
   Timer &operator=(const Timer &) LLVM_DELETED_FUNCTION;
 };
 
-template <typename T> bool WouldOverflowAdd(T X, T Y) {
-  return ((X > 0 && Y > 0 && (X > std::numeric_limits<T>::max() - Y)) ||
-          (X < 0 && Y < 0 && (X < std::numeric_limits<T>::min() - Y)));
-}
-
 } // end of namespace Ice
 
 #endif // SUBZERO_SRC_ICEDEFS_H
diff --git a/src/IceFixups.h b/src/IceFixups.h
new file mode 100644
index 0000000..7144aa8
--- /dev/null
+++ b/src/IceFixups.h
@@ -0,0 +1,32 @@
+//===- subzero/src/IceFixups.h - Assembler fixup kinds ----------*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares generic fixup types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICEFIXUPS_H
+#define SUBZERO_SRC_ICEFIXUPS_H
+
+#include "IceTypes.def"
+
+namespace Ice {
+
+enum FixupKind {
+  // Specify some of the most common relocation types.
+  FK_Abs_4 = 0,
+  FK_PcRel_4 = 1,
+
+  // Target specific relocation types follow this.
+  FK_FirstTargetSpecific = 1 << 4
+};
+
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICEFIXUPS_H
diff --git a/src/IceInst.cpp b/src/IceInst.cpp
index 629c6bd..a88194e 100644
--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -458,6 +458,8 @@
   llvm_unreachable("emit() called on a non-lowered instruction");
 }
 
+void Inst::emitIAS(const Cfg *Func) const { emit(Func); }
+
 void Inst::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   dumpDest(Func);
diff --git a/src/IceInst.h b/src/IceInst.h
index 6db4971..5c07dc0 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -102,6 +102,7 @@
   // instruction results in a single native instruction.
   virtual uint32_t getEmitInstCount() const { return 0; }
   virtual void emit(const Cfg *Func) const;
+  virtual void emitIAS(const Cfg *Func) const;
   virtual void dump(const Cfg *Func) const;
   virtual void dumpExtras(const Cfg *Func) const;
   void dumpDecorated(const Cfg *Func) const;
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 93c193f..8c6b99a 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -12,6 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "assembler_ia32.h"
 #include "IceCfg.h"
 #include "IceCfgNode.h"
 #include "IceConditionCodesX8632.h"
@@ -331,6 +332,47 @@
 
 // ======================== Dump routines ======================== //
 
+namespace {
+
+void emitIASBytes(Ostream &Str, const x86::AssemblerX86 *Asm,
+                  intptr_t StartPosition) {
+  intptr_t EndPosition = Asm->GetPosition();
+  intptr_t LastFixupLoc = -1;
+  AssemblerFixup *LastFixup = NULL;
+  if (Asm->GetLatestFixup()) {
+    LastFixup = Asm->GetLatestFixup();
+    LastFixupLoc = LastFixup->position();
+  }
+  if (LastFixupLoc < StartPosition) {
+    // The fixup doesn't apply to this current block.
+    for (intptr_t i = 0; i < EndPosition - StartPosition; ++i) {
+      Str << "\t.byte "
+          << static_cast<uint32_t>(Asm->LoadBuffer<uint8_t>(StartPosition + i))
+          << "\n";
+    }
+    return;
+  }
+  const intptr_t FixupSize = 4;
+  assert(LastFixupLoc + FixupSize <= EndPosition);
+  // The fixup does apply to this current block.
+  for (intptr_t i = 0; i < LastFixupLoc - StartPosition; ++i) {
+    Str << "\t.byte "
+        << static_cast<uint32_t>(Asm->LoadBuffer<uint8_t>(StartPosition + i))
+        << "\n";
+  }
+  Str << "\t.long " << LastFixup->value()->getName();
+  if (LastFixup->value()->getOffset()) {
+    Str << " + " << LastFixup->value()->getOffset();
+  }
+  Str << "\n";
+  for (intptr_t i = LastFixupLoc + FixupSize; i < EndPosition; ++i) {
+    Str << "\t.byte " << static_cast<uint32_t>(Asm->LoadBuffer<uint8_t>(i))
+        << "\n";
+  }
+}
+
+} // end of anonymous namespace
+
 void InstX8632::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   Str << "[X8632] ";
@@ -436,6 +478,38 @@
   Str << "\n";
 }
 
+void
+emitIASVarOperandTyXMM(const Cfg *Func, Type Ty, const Variable *Var,
+                       const Operand *Src,
+                       const x86::AssemblerX86::TypedXmmEmitters &Emitter) {
+  x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
+  intptr_t StartPosition = Asm->GetPosition();
+  assert(Var->hasReg());
+  RegX8632::XmmRegister VarReg = RegX8632::getEncodedXmm(Var->getRegNum());
+  if (const Variable *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    if (SrcVar->hasReg()) {
+      RegX8632::XmmRegister SrcReg =
+          RegX8632::getEncodedXmm(SrcVar->getRegNum());
+      (Asm->*(Emitter.XmmXmm))(Ty, VarReg, SrcReg);
+    } else {
+      x86::Address SrcStackAddr = static_cast<TargetX8632 *>(Func->getTarget())
+                                      ->stackVarToAsmOperand(SrcVar);
+      (Asm->*(Emitter.XmmAddr))(Ty, VarReg, SrcStackAddr);
+    }
+  } else if (const OperandX8632Mem *Mem =
+                 llvm::dyn_cast<OperandX8632Mem>(Src)) {
+    x86::Address SrcAddr = Mem->toAsmAddress(Asm);
+    (Asm->*(Emitter.XmmAddr))(Ty, VarReg, SrcAddr);
+  } else if (const Constant *Imm = llvm::dyn_cast<Constant>(Src)) {
+    (Asm->*(Emitter.XmmAddr))(
+        Ty, VarReg, x86::Address::ofConstPool(Func->getContext(), Asm, Imm));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+  Ostream &Str = Func->getContext()->getStrEmit();
+  emitIASBytes(Str, Asm, StartPosition);
+}
+
 bool checkForRedundantAssign(const Variable *Dest, const Operand *Source) {
   const Variable *Src = llvm::dyn_cast<const Variable>(Source);
   if (Src == NULL)
@@ -512,6 +586,56 @@
 template <> const char *InstX8632Pextr::Opcode = "pextr";
 template <> const char *InstX8632Pshufd::Opcode = "pshufd";
 
+// Binary XMM ops
+template <>
+const x86::AssemblerX86::TypedXmmEmitters InstX8632Addss::Emitter = {
+    &x86::AssemblerX86::addss, &x86::AssemblerX86::addss, NULL};
+template <>
+const x86::AssemblerX86::TypedXmmEmitters InstX8632Addps::Emitter = {
+    &x86::AssemblerX86::addps, &x86::AssemblerX86::addps, NULL};
+template <>
+const x86::AssemblerX86::TypedXmmEmitters InstX8632Divss::Emitter = {
+    &x86::AssemblerX86::divss, &x86::AssemblerX86::divss, NULL};
+template <>
+const x86::AssemblerX86::TypedXmmEmitters InstX8632Divps::Emitter = {
+    &x86::AssemblerX86::divps, &x86::AssemblerX86::divps, NULL};
+template <>
+const x86::AssemblerX86::TypedXmmEmitters InstX8632Mulss::Emitter = {
+    &x86::AssemblerX86::mulss, &x86::AssemblerX86::mulss, NULL};
+template <>
+const x86::AssemblerX86::TypedXmmEmitters InstX8632Mulps::Emitter = {
+    &x86::AssemblerX86::mulps, &x86::AssemblerX86::mulps, NULL};
+template <>
+const x86::AssemblerX86::TypedXmmEmitters InstX8632Padd::Emitter = {
+    &x86::AssemblerX86::padd, &x86::AssemblerX86::padd, NULL};
+template <>
+const x86::AssemblerX86::TypedXmmEmitters InstX8632Pand::Emitter = {
+    &x86::AssemblerX86::pand, &x86::AssemblerX86::pand, NULL};
+template <>
+const x86::AssemblerX86::TypedXmmEmitters InstX8632Pandn::Emitter = {
+    &x86::AssemblerX86::pandn, &x86::AssemblerX86::pandn, NULL};
+template <>
+const x86::AssemblerX86::TypedXmmEmitters InstX8632Pmuludq::Emitter = {
+    &x86::AssemblerX86::pmuludq, &x86::AssemblerX86::pmuludq, NULL};
+template <>
+const x86::AssemblerX86::TypedXmmEmitters InstX8632Por::Emitter = {
+    &x86::AssemblerX86::por, &x86::AssemblerX86::por, NULL};
+template <>
+const x86::AssemblerX86::TypedXmmEmitters InstX8632Psub::Emitter = {
+    &x86::AssemblerX86::psub, &x86::AssemblerX86::psub, NULL};
+template <>
+const x86::AssemblerX86::TypedXmmEmitters InstX8632Pxor::Emitter = {
+    &x86::AssemblerX86::pxor, &x86::AssemblerX86::pxor, NULL};
+template <>
+const x86::AssemblerX86::TypedXmmEmitters InstX8632Sqrtss::Emitter = {
+    &x86::AssemblerX86::sqrtss, &x86::AssemblerX86::sqrtss, NULL};
+template <>
+const x86::AssemblerX86::TypedXmmEmitters InstX8632Subss::Emitter = {
+    &x86::AssemblerX86::subss, &x86::AssemblerX86::subss, NULL};
+template <>
+const x86::AssemblerX86::TypedXmmEmitters InstX8632Subps::Emitter = {
+    &x86::AssemblerX86::subps, &x86::AssemblerX86::subps, NULL};
+
 template <> void InstX8632Sqrtss::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 1);
@@ -790,6 +914,28 @@
   Str << "\n";
 }
 
+void InstX8632Cmpps::emitIAS(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
+  intptr_t StartPosition = Asm->GetPosition();
+  assert(getSrcSize() == 2);
+  assert(Condition < CondX86::Cmpps_Invalid);
+  // Assuming there isn't any load folding for cmpps, and vector constants
+  // are not allowed in PNaCl.
+  assert(llvm::isa<Variable>(getSrc(1)));
+  const Variable *SrcVar = llvm::cast<Variable>(getSrc(1));
+  if (SrcVar->hasReg()) {
+    Asm->cmpps(RegX8632::getEncodedXmm(getDest()->getRegNum()),
+               RegX8632::getEncodedXmm(SrcVar->getRegNum()), Condition);
+  } else {
+    x86::Address SrcStackAddr = static_cast<TargetX8632 *>(Func->getTarget())
+                                    ->stackVarToAsmOperand(SrcVar);
+    Asm->cmpps(RegX8632::getEncodedXmm(getDest()->getRegNum()), SrcStackAddr,
+               Condition);
+  }
+  emitIASBytes(Str, Asm, StartPosition);
+}
+
 void InstX8632Cmpps::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   assert(Condition < CondX86::Cmpps_Invalid);
@@ -893,6 +1039,18 @@
   Str << "\n";
 }
 
+void InstX8632Ucomiss::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 2);
+  // Currently src0 is always a variable by convention, to avoid having
+  // two memory operands.
+  assert(llvm::isa<Variable>(getSrc(0)));
+  const Variable *Src0 = llvm::cast<Variable>(getSrc(0));
+  Type Ty = Src0->getType();
+  const static x86::AssemblerX86::TypedXmmEmitters Emitter = {
+      &x86::AssemblerX86::ucomiss, &x86::AssemblerX86::ucomiss, NULL};
+  emitIASVarOperandTyXMM(Func, Ty, Src0, getSrc(1), Emitter);
+}
+
 void InstX8632Ucomiss::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   Str << "ucomiss." << getSrc(0)->getType() << " ";
@@ -1133,6 +1291,15 @@
   Str << "\tnop\t# variant = " << Variant << "\n";
 }
 
+void InstX8632Nop::emitIAS(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
+  intptr_t StartPosition = Asm->GetPosition();
+  // TODO: Emit the right code for the variant.
+  Asm->nop();
+  emitIASBytes(Str, Asm, StartPosition);
+}
+
 void InstX8632Nop::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   Str << "nop (variant = " << Variant << ")";
@@ -1272,6 +1439,20 @@
   Str << "\n";
 }
 
+void InstX8632Pop::emitIAS(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 0);
+  x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
+  intptr_t StartPosition = Asm->GetPosition();
+  if (getDest()->hasReg()) {
+    Asm->popl(RegX8632::getEncodedGPR(getDest()->getRegNum()));
+  } else {
+    Asm->popl(static_cast<TargetX8632 *>(Func->getTarget())
+                  ->stackVarToAsmOperand(getDest()));
+  }
+  emitIASBytes(Str, Asm, StartPosition);
+}
+
 void InstX8632Pop::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   dumpDest(Func);
@@ -1284,6 +1465,15 @@
   Func->getTarget()->updateStackAdjustment(Amount);
 }
 
+void InstX8632AdjustStack::emitIAS(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
+  intptr_t StartPosition = Asm->GetPosition();
+  Asm->subl(RegX8632::Encoded_Reg_esp, x86::Immediate(Amount));
+  emitIASBytes(Str, Asm, StartPosition);
+  Func->getTarget()->updateStackAdjustment(Amount);
+}
+
 void InstX8632AdjustStack::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   Str << "esp = sub.i32 esp, " << Amount;
@@ -1356,6 +1546,14 @@
   Str << "\tret\n";
 }
 
+void InstX8632Ret::emitIAS(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
+  intptr_t StartPosition = Asm->GetPosition();
+  Asm->ret();
+  emitIASBytes(Str, Asm, StartPosition);
+}
+
 void InstX8632Ret::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   Type Ty = (getSrcSize() == 0 ? IceType_void : getSrc(0)->getType());
@@ -1498,6 +1696,41 @@
   Str << "]";
 }
 
+x86::Address OperandX8632Mem::toAsmAddress(Assembler *Asm) const {
+  int32_t Disp = 0;
+  AssemblerFixup *Fixup = NULL;
+  // Determine the offset (is it relocatable?)
+  if (getOffset()) {
+    if (ConstantInteger32 *CI =
+            llvm::dyn_cast<ConstantInteger32>(getOffset())) {
+      Disp = static_cast<int32_t>(CI->getValue());
+    } else if (ConstantRelocatable *CR =
+                   llvm::dyn_cast<ConstantRelocatable>(getOffset())) {
+      // TODO(jvoung): CR + non-zero-offset isn't really tested yet,
+      // since the addressing mode optimization doesn't try to combine
+      // ConstantRelocatable with something else.
+      assert(CR->getOffset() == 0);
+      Fixup = x86::DisplacementRelocation::create(Asm, FK_Abs_4, CR);
+    } else {
+      llvm_unreachable("Unexpected offset type");
+    }
+  }
+
+  // Now convert to the various possible forms.
+  if (getBase() && getIndex()) {
+    return x86::Address(RegX8632::getEncodedGPR(getBase()->getRegNum()),
+                        RegX8632::getEncodedGPR(getIndex()->getRegNum()),
+                        x86::ScaleFactor(getShift()), Disp);
+  } else if (getBase()) {
+    return x86::Address(RegX8632::getEncodedGPR(getBase()->getRegNum()), Disp);
+  } else if (getIndex()) {
+    return x86::Address(RegX8632::getEncodedGPR(getIndex()->getRegNum()),
+                        x86::ScaleFactor(getShift()), Disp);
+  } else {
+    return x86::Address::Absolute(Disp, Fixup);
+  }
+}
+
 void VariableSplit::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(!Var->hasReg());
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 7ab8592..514c374 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -16,6 +16,7 @@
 #ifndef SUBZERO_SRC_ICEINSTX8632_H
 #define SUBZERO_SRC_ICEINSTX8632_H
 
+#include "assembler_ia32.h"
 #include "IceDefs.h"
 #include "IceInst.h"
 #include "IceConditionCodesX8632.h"
@@ -75,6 +76,7 @@
   Variable *getIndex() const { return Index; }
   uint16_t getShift() const { return Shift; }
   SegmentRegisters getSegmentRegister() const { return SegmentReg; }
+  x86::Address toAsmAddress(Assembler *Asm) const;
   virtual void emit(const Cfg *Func) const;
   using OperandX8632::dump;
   virtual void dump(const Cfg *Func, Ostream &Str) const;
@@ -396,6 +398,7 @@
         InstX8632AdjustStack(Func, Amount, Esp);
   }
   virtual void emit(const Cfg *Func) const;
+  virtual void emitIAS(const Cfg *Func) const;
   virtual void dump(const Cfg *Func) const;
   static bool classof(const Inst *Inst) { return isClassof(Inst, Adjuststack); }
 
@@ -478,6 +481,7 @@
     getSrc(0)->emit(Func);
     Str << "\n";
   }
+  virtual void emitIAS(const Cfg *Func) const { emit(Func); }
   virtual void dump(const Cfg *Func) const {
     Ostream &Str = Func->getContext()->getStrDump();
     dumpDest(Func);
@@ -497,6 +501,52 @@
   static const char *Opcode;
 };
 
+void emitIASVarOperandTyXMM(const Cfg *Func, Type Ty, const Variable *Var,
+                            const Operand *Src,
+                            const x86::AssemblerX86::TypedXmmEmitters &Emitter);
+
+template <InstX8632::InstKindX8632 K>
+class InstX8632UnaryopXmm : public InstX8632 {
+public:
+  static InstX8632UnaryopXmm *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    return new (Func->allocate<InstX8632UnaryopXmm>())
+        InstX8632UnaryopXmm(Func, Dest, Src);
+  }
+  virtual void emit(const Cfg *Func) const {
+    Ostream &Str = Func->getContext()->getStrEmit();
+    assert(getSrcSize() == 1);
+    Str << "\t" << Opcode << "\t";
+    getDest()->emit(Func);
+    Str << ", ";
+    getSrc(0)->emit(Func);
+    Str << "\n";
+  }
+  virtual void emitIAS(const Cfg *Func) const {
+    Type Ty = getDest()->getType();
+    assert(getSrcSize() == 1);
+    emitIASVarOperandTyXMM(Func, Ty, getDest(), getSrc(0), Emitter);
+  }
+  virtual void dump(const Cfg *Func) const {
+    Ostream &Str = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Str << " = " << Opcode << "." << getDest()->getType() << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+  InstX8632UnaryopXmm(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX8632(Func, K, 1, Dest) {
+    addSource(Src);
+  }
+  InstX8632UnaryopXmm(const InstX8632UnaryopXmm &) LLVM_DELETED_FUNCTION;
+  InstX8632UnaryopXmm &
+  operator=(const InstX8632UnaryopXmm &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632UnaryopXmm() {}
+  static const char *Opcode;
+  static const x86::AssemblerX86::TypedXmmEmitters Emitter;
+};
+
 // See the definition of emitTwoAddress() for a description of
 // ShiftHack.
 void emitTwoAddress(const char *Opcode, const Inst *Inst, const Cfg *Func,
@@ -533,6 +583,46 @@
   static const char *Opcode;
 };
 
+template <InstX8632::InstKindX8632 K, bool NeedsElementType>
+class InstX8632BinopXmm : public InstX8632 {
+public:
+  // Create an XMM binary-op instruction like addss or addps.
+  static InstX8632BinopXmm *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX8632BinopXmm>())
+        InstX8632BinopXmm(Func, Dest, Source);
+  }
+  virtual void emit(const Cfg *Func) const {
+    const bool ShiftHack = false;
+    emitTwoAddress(Opcode, this, Func, ShiftHack);
+  }
+  virtual void emitIAS(const Cfg *Func) const {
+    Type Ty = getDest()->getType();
+    if (NeedsElementType)
+      Ty = typeElementType(Ty);
+    assert(getSrcSize() == 2);
+    emitIASVarOperandTyXMM(Func, Ty, getDest(), getSrc(1), Emitter);
+  }
+  virtual void dump(const Cfg *Func) const {
+    Ostream &Str = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Str << " = " << Opcode << "." << getDest()->getType() << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+  InstX8632BinopXmm(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX8632(Func, K, 2, Dest) {
+    addSource(Dest);
+    addSource(Source);
+  }
+  InstX8632BinopXmm(const InstX8632BinopXmm &) LLVM_DELETED_FUNCTION;
+  InstX8632BinopXmm &operator=(const InstX8632BinopXmm &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632BinopXmm() {}
+  static const char *Opcode;
+  static const x86::AssemblerX86::TypedXmmEmitters Emitter;
+};
+
 template <InstX8632::InstKindX8632 K> class InstX8632Ternop : public InstX8632 {
 public:
   // Create a ternary-op instruction like div or idiv.
@@ -657,7 +747,7 @@
 typedef InstX8632Unaryop<InstX8632::Bsr> InstX8632Bsr;
 typedef InstX8632Unaryop<InstX8632::Lea> InstX8632Lea;
 typedef InstX8632Unaryop<InstX8632::Movd> InstX8632Movd;
-typedef InstX8632Unaryop<InstX8632::Sqrtss> InstX8632Sqrtss;
+typedef InstX8632UnaryopXmm<InstX8632::Sqrtss> InstX8632Sqrtss;
 // Cbwdq instruction - wrapper for cbw, cwd, and cdq
 typedef InstX8632Unaryop<InstX8632::Cbwdq> InstX8632Cbwdq;
 // Move/assignment instruction - wrapper for mov/movss/movsd.
@@ -668,29 +758,29 @@
 // Movq - copy between XMM registers, or mem64 and XMM registers.
 typedef InstX8632Movlike<InstX8632::Movq> InstX8632Movq;
 typedef InstX8632Binop<InstX8632::Add> InstX8632Add;
-typedef InstX8632Binop<InstX8632::Addps> InstX8632Addps;
+typedef InstX8632BinopXmm<InstX8632::Addps, true> InstX8632Addps;
 typedef InstX8632Binop<InstX8632::Adc> InstX8632Adc;
-typedef InstX8632Binop<InstX8632::Addss> InstX8632Addss;
-typedef InstX8632Binop<InstX8632::Padd> InstX8632Padd;
+typedef InstX8632BinopXmm<InstX8632::Addss, false> InstX8632Addss;
+typedef InstX8632BinopXmm<InstX8632::Padd, true> InstX8632Padd;
 typedef InstX8632Binop<InstX8632::Sub> InstX8632Sub;
-typedef InstX8632Binop<InstX8632::Subps> InstX8632Subps;
-typedef InstX8632Binop<InstX8632::Subss> InstX8632Subss;
+typedef InstX8632BinopXmm<InstX8632::Subps, true> InstX8632Subps;
+typedef InstX8632BinopXmm<InstX8632::Subss, false> InstX8632Subss;
 typedef InstX8632Binop<InstX8632::Sbb> InstX8632Sbb;
-typedef InstX8632Binop<InstX8632::Psub> InstX8632Psub;
+typedef InstX8632BinopXmm<InstX8632::Psub, true> InstX8632Psub;
 typedef InstX8632Binop<InstX8632::And> InstX8632And;
-typedef InstX8632Binop<InstX8632::Pand> InstX8632Pand;
-typedef InstX8632Binop<InstX8632::Pandn> InstX8632Pandn;
+typedef InstX8632BinopXmm<InstX8632::Pand, false> InstX8632Pand;
+typedef InstX8632BinopXmm<InstX8632::Pandn, false> InstX8632Pandn;
 typedef InstX8632Binop<InstX8632::Or> InstX8632Or;
-typedef InstX8632Binop<InstX8632::Por> InstX8632Por;
+typedef InstX8632BinopXmm<InstX8632::Por, false> InstX8632Por;
 typedef InstX8632Binop<InstX8632::Xor> InstX8632Xor;
-typedef InstX8632Binop<InstX8632::Pxor> InstX8632Pxor;
+typedef InstX8632BinopXmm<InstX8632::Pxor, false> InstX8632Pxor;
 typedef InstX8632Binop<InstX8632::Imul> InstX8632Imul;
-typedef InstX8632Binop<InstX8632::Mulps> InstX8632Mulps;
-typedef InstX8632Binop<InstX8632::Mulss> InstX8632Mulss;
+typedef InstX8632BinopXmm<InstX8632::Mulps, true> InstX8632Mulps;
+typedef InstX8632BinopXmm<InstX8632::Mulss, false> InstX8632Mulss;
 typedef InstX8632Binop<InstX8632::Pmull> InstX8632Pmull;
-typedef InstX8632Binop<InstX8632::Pmuludq> InstX8632Pmuludq;
-typedef InstX8632Binop<InstX8632::Divps> InstX8632Divps;
-typedef InstX8632Binop<InstX8632::Divss> InstX8632Divss;
+typedef InstX8632BinopXmm<InstX8632::Pmuludq, false> InstX8632Pmuludq;
+typedef InstX8632BinopXmm<InstX8632::Divps, true> InstX8632Divps;
+typedef InstX8632BinopXmm<InstX8632::Divss, false> InstX8632Divss;
 typedef InstX8632Binop<InstX8632::Rol, true> InstX8632Rol;
 typedef InstX8632Binop<InstX8632::Shl, true> InstX8632Shl;
 typedef InstX8632Binop<InstX8632::Psll> InstX8632Psll;
@@ -828,6 +918,7 @@
         InstX8632Cmpps(Func, Dest, Source, Condition);
   }
   virtual void emit(const Cfg *Func) const;
+  virtual void emitIAS(const Cfg *Func) const;
   virtual void dump(const Cfg *Func) const;
   static bool classof(const Inst *Inst) { return isClassof(Inst, Cmpps); }
 
@@ -941,6 +1032,7 @@
         InstX8632Ucomiss(Func, Src1, Src2);
   }
   virtual void emit(const Cfg *Func) const;
+  virtual void emitIAS(const Cfg *Func) const;
   virtual void dump(const Cfg *Func) const;
   static bool classof(const Inst *Inst) { return isClassof(Inst, Ucomiss); }
 
@@ -1108,6 +1200,7 @@
     return new (Func->allocate<InstX8632Nop>()) InstX8632Nop(Func, Variant);
   }
   virtual void emit(const Cfg *Func) const;
+  virtual void emitIAS(const Cfg *Func) const;
   virtual void dump(const Cfg *Func) const;
   static bool classof(const Inst *Inst) { return isClassof(Inst, Nop); }
 
@@ -1160,6 +1253,7 @@
     return new (Func->allocate<InstX8632Pop>()) InstX8632Pop(Func, Dest);
   }
   virtual void emit(const Cfg *Func) const;
+  virtual void emitIAS(const Cfg *Func) const;
   virtual void dump(const Cfg *Func) const;
   static bool classof(const Inst *Inst) { return isClassof(Inst, Pop); }
 
@@ -1199,6 +1293,7 @@
     return new (Func->allocate<InstX8632Ret>()) InstX8632Ret(Func, Source);
   }
   virtual void emit(const Cfg *Func) const;
+  virtual void emitIAS(const Cfg *Func) const;
   virtual void dump(const Cfg *Func) const;
   static bool classof(const Inst *Inst) { return isClassof(Inst, Ret); }
 
diff --git a/src/IceMemoryRegion.cpp b/src/IceMemoryRegion.cpp
new file mode 100644
index 0000000..9c42940
--- /dev/null
+++ b/src/IceMemoryRegion.cpp
@@ -0,0 +1,32 @@
+// Copyright (c) 2011, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+//
+// Modified by the Subzero authors.
+//
+//===- subzero/src/IceMemoryRegion.cpp - Memory region --------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MemoryRegion class. It tracks a pointer plus its
+// bounds for bounds-checking in debug mode.
+//===----------------------------------------------------------------------===//
+
+#include "IceMemoryRegion.h"
+
+namespace Ice {
+
+void MemoryRegion::CopyFrom(uintptr_t offset, const MemoryRegion &from) const {
+  assert(from.pointer() != NULL && from.size() > 0);
+  assert(this->size() >= from.size());
+  assert(offset <= this->size() - from.size());
+  memmove(reinterpret_cast<void *>(start() + offset), from.pointer(),
+          from.size());
+}
+
+} // end of namespace Ice
diff --git a/src/IceMemoryRegion.h b/src/IceMemoryRegion.h
new file mode 100644
index 0000000..2a55a41
--- /dev/null
+++ b/src/IceMemoryRegion.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2012, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+//
+// Modified by the Subzero authors.
+//
+//===- subzero/src/IceMemoryRegion.h - Memory region ------------*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MemoryRegion class. It tracks a pointer
+// plus its bounds for bounds-checking in debug mode.
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICE_MEMORY_REGION_H_
+#define SUBZERO_SRC_ICE_MEMORY_REGION_H_
+
+#include "IceDefs.h"
+#include "IceUtils.h"
+
+namespace Ice {
+
+// Memory regions are useful for accessing memory with bounds check in
+// debug mode. They can be safely passed by value and do not assume ownership
+// of the region.
+class MemoryRegion {
+public:
+  MemoryRegion() : pointer_(NULL), size_(0) {}
+  MemoryRegion(void *pointer, size_t size) : pointer_(pointer), size_(size) {}
+  MemoryRegion(const MemoryRegion &other) { *this = other; }
+  MemoryRegion &operator=(const MemoryRegion &other) {
+    pointer_ = other.pointer_;
+    size_ = other.size_;
+    return *this;
+  }
+
+  void *pointer() const { return pointer_; }
+  size_t size() const { return size_; }
+
+  size_t start() const { return reinterpret_cast<size_t>(pointer_); }
+  size_t end() const { return start() + size_; }
+
+  template <typename T> T Load(size_t offset) const {
+    return *ComputeInternalPointer<T>(offset);
+  }
+
+  template <typename T> void Store(size_t offset, T value) const {
+    *ComputeInternalPointer<T>(offset) = value;
+  }
+
+  template <typename T> T *PointerTo(size_t offset) const {
+    return ComputeInternalPointer<T>(offset);
+  }
+
+  bool Contains(size_t address) const {
+    return (address >= start()) && (address < end());
+  }
+
+  void CopyFrom(size_t offset, const MemoryRegion &from) const;
+
+  // Compute a sub memory region based on an existing one.
+  void Subregion(const MemoryRegion &from, size_t offset, size_t size) {
+    assert(from.size() >= size);
+    assert(offset <= (from.size() - size));
+    pointer_ = reinterpret_cast<void *>(from.start() + offset);
+    size_ = size;
+  }
+
+  // Compute an extended memory region based on an existing one.
+  void Extend(const MemoryRegion &region, size_t extra) {
+    pointer_ = region.pointer();
+    size_ = (region.size() + extra);
+  }
+
+private:
+  template <typename T> T *ComputeInternalPointer(size_t offset) const {
+    assert(size() >= sizeof(T));
+    assert(offset <= size() - sizeof(T));
+    return reinterpret_cast<T *>(start() + offset);
+  }
+
+  void *pointer_;
+  size_t size_;
+};
+
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICE_MEMORY_REGION_H_
diff --git a/src/IceRegistersX8632.h b/src/IceRegistersX8632.h
index effbf36..3aa8178 100644
--- a/src/IceRegistersX8632.h
+++ b/src/IceRegistersX8632.h
@@ -19,68 +19,72 @@
 
 namespace Ice {
 
-class RegX8632 {
-public:
-  // An enum of every register. The enum value may not match the encoding
-  // used to binary encode register operands in instructions.
-  enum AllRegisters {
+namespace RegX8632 {
+
+// An enum of every register. The enum value may not match the encoding
+// used to binary encode register operands in instructions.
+enum AllRegisters {
 #define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
           frameptr, isI8, isInt, isFP)                                         \
   val,
-    REGX8632_TABLE
+  REGX8632_TABLE
 #undef X
-        Reg_NUM,
+      Reg_NUM,
 #define X(val, init) val init,
-    REGX8632_TABLE_BOUNDS
+  REGX8632_TABLE_BOUNDS
 #undef X
-  };
-
-  // An enum of GPR Registers. The enum value does match encoding used
-  // to binary encode register operands in instructions.
-  enum GPRRegister {
-#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
-          frameptr, isI8, isInt, isFP)                                         \
-  Encoded_##val encode,
-    REGX8632_GPR_TABLE
-#undef X
-  };
-
-  // An enum of XMM Registers. The enum value does match encoding used
-  // to binary encode register operands in instructions.
-  enum XmmRegister {
-#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
-          frameptr, isI8, isInt, isFP)                                         \
-  Encoded_##val encode,
-    REGX8632_XMM_TABLE
-#undef X
-  };
-
-  // An enum of Byte Registers. The enum value does match encoding used
-  // to binary encode register operands in instructions.
-  enum ByteRegister {
-#define X(val, encode) Encoded_##val encode,
-    REGX8632_BYTEREG_TABLE
-#undef X
-  };
-
-  static GPRRegister getEncodedGPR(int32_t RegNum) {
-    assert(Reg_GPR_First <= RegNum && RegNum <= Reg_GPR_Last);
-    return GPRRegister(RegNum - Reg_GPR_First);
-  }
-
-  static XmmRegister getEncodedXmm(int32_t RegNum) {
-    assert(Reg_XMM_First <= RegNum && RegNum <= Reg_XMM_Last);
-    return XmmRegister(RegNum - Reg_XMM_First);
-  }
-
-  static ByteRegister getEncodedByteReg(int32_t RegNum) {
-    assert(RegNum == Reg_ah || (Reg_GPR_First <= RegNum && RegNum <= Reg_ebx));
-    if (RegNum == Reg_ah)
-      return Encoded_Reg_ah;
-    return ByteRegister(RegNum - Reg_GPR_First);
-  }
 };
 
+// An enum of GPR Registers. The enum value does match encoding used
+// to binary encode register operands in instructions.
+enum GPRRegister {
+#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
+          frameptr, isI8, isInt, isFP)                                         \
+  Encoded_##val encode,
+  REGX8632_GPR_TABLE
+#undef X
+      Encoded_Not_GPR = -1
+};
+
+// An enum of XMM Registers. The enum value does match encoding used
+// to binary encode register operands in instructions.
+enum XmmRegister {
+#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
+          frameptr, isI8, isInt, isFP)                                         \
+  Encoded_##val encode,
+  REGX8632_XMM_TABLE
+#undef X
+      Encoded_Not_Xmm = -1
+};
+
+// An enum of Byte Registers. The enum value does match encoding used
+// to binary encode register operands in instructions.
+enum ByteRegister {
+#define X(val, encode) Encoded_##val encode,
+  REGX8632_BYTEREG_TABLE
+#undef X
+      Encoded_Not_ByteReg = -1
+};
+
// Map an AllRegisters-space GPR number to its hardware encoding.
// Only valid for RegNum in [Reg_GPR_First, Reg_GPR_Last].
static inline GPRRegister getEncodedGPR(int32_t RegNum) {
  assert(Reg_GPR_First <= RegNum && RegNum <= Reg_GPR_Last);
  return GPRRegister(RegNum - Reg_GPR_First);
}
+
// Map an AllRegisters-space XMM register number to its hardware encoding.
// Only valid for RegNum in [Reg_XMM_First, Reg_XMM_Last].
static inline XmmRegister getEncodedXmm(int32_t RegNum) {
  assert(Reg_XMM_First <= RegNum && RegNum <= Reg_XMM_Last);
  return XmmRegister(RegNum - Reg_XMM_First);
}
+
// Map a register number to its byte-register encoding.  Only eax..ebx
// have byte sub-registers; ah is special-cased since it does not follow
// the First-relative numbering.
static inline ByteRegister getEncodedByteReg(int32_t RegNum) {
  assert(RegNum == Reg_ah || (Reg_GPR_First <= RegNum && RegNum <= Reg_ebx));
  if (RegNum == Reg_ah)
    return Encoded_Reg_ah;
  return ByteRegister(RegNum - Reg_GPR_First);
}
+
+} // end of namespace RegX8632
+
 } // end of namespace Ice
 
 #endif // SUBZERO_SRC_ICEREGISTERSX8632_H
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 3bae591..f663155 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -15,6 +15,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "assembler_ia32.h"
 #include "IceCfg.h" // setError()
 #include "IceCfgNode.h"
 #include "IceOperand.h"
@@ -97,6 +98,15 @@
   return NULL;
 }
 
// Factory for the target-specific integrated assembler.  On an
// unsupported target, flags an error on Func and returns NULL (the
// caller is expected to check Func for errors).
Assembler *TargetLowering::createAssembler(TargetArch Target, Cfg *Func) {
  // These statements can be #ifdef'd to specialize the assembler
  // to a subset of the available targets.  TODO: use CRTP.
  if (Target == Target_X8632)
    return new x86::AssemblerX86();
  Func->setError("Unsupported target");
  return NULL;
}
+
 void TargetLowering::doAddressOpt() {
   if (llvm::isa<InstLoad>(*Context.getCur()))
     doAddressOptLoad();
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 99716be..37e14df 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -25,6 +25,8 @@
 
 namespace Ice {
 
+class Assembler;
+
 // LoweringContext makes it easy to iterate through non-deleted
 // instructions in a node, and insert new (lowered) instructions at
 // the current point.  Along with the instruction list container and
@@ -87,6 +89,7 @@
 class TargetLowering {
 public:
   static TargetLowering *createLowering(TargetArch Target, Cfg *Func);
+  static Assembler *createAssembler(TargetArch Target, Cfg *Func);
   void translate() {
     switch (Ctx->getOptLevel()) {
     case Opt_m1:
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 33a5741..6828940 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -24,6 +24,7 @@
 #include "IceRegistersX8632.h"
 #include "IceTargetLoweringX8632.def"
 #include "IceTargetLoweringX8632.h"
+#include "IceUtils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/CommandLine.h"
@@ -528,6 +529,14 @@
   Str << "]";
 }
 
// Translate a stack-allocated Variable into an x86 memory operand based
// on the frame or stack pointer.  Without a frame pointer, offsets are
// relative to the stack pointer, so the current stack adjustment
// (e.g. from outgoing call arguments) must be added in.
x86::Address TargetX8632::stackVarToAsmOperand(const Variable *Var) const {
  assert(!Var->hasReg());
  int32_t Offset = Var->getStackOffset();
  if (!hasFramePointer())
    Offset += getStackAdjustment();
  return x86::Address(RegX8632::getEncodedGPR(getFrameOrStackReg()), Offset);
}
+
 void TargetX8632::lowerArguments() {
   VarList &Args = Func->getArgs();
   // The first four arguments of vector type, regardless of their
@@ -3710,7 +3719,7 @@
     if (Var == NULL || Const == NULL || VMetadata->isMultiDef(Var))
       return false;
     int32_t MoreOffset = IsAdd ? Const->getValue() : -Const->getValue();
-    if (WouldOverflowAdd(Offset, MoreOffset))
+    if (Utils::WouldOverflowAdd(Offset, MoreOffset))
       return false;
     Base = Var;
     Offset += MoreOffset;
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 39609cf..71062cc 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -18,6 +18,7 @@
 
 #include "IceDefs.h"
 #include "IceTargetLowering.h"
+#include "assembler_ia32.h"
 #include "IceInstX8632.h"
 #include "IceRegistersX8632.h"
 
@@ -68,6 +69,7 @@
                               size_t BasicFrameOffset, size_t &InArgsSizeBytes);
   Operand *loOperand(Operand *Operand);
   Operand *hiOperand(Operand *Operand);
+  x86::Address stackVarToAsmOperand(const Variable *Var) const;
 
   enum X86InstructionSet {
     // SSE2 is the PNaCl baseline instruction set.
diff --git a/src/IceUtils.h b/src/IceUtils.h
new file mode 100644
index 0000000..ffeb792
--- /dev/null
+++ b/src/IceUtils.h
@@ -0,0 +1,59 @@
+//===- subzero/src/IceUtils.h - Utility functions ---------------*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares some utility functions
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICEUTILS_H
+#define SUBZERO_SRC_ICEUTILS_H
+
#include <cassert>
#include <climits>
#include <cstring>
#include <limits>
+
+namespace Ice {
+
// Reinterpret the object representation of |source| as a value of type
// D.  Unlike a strict bit_cast, the two types may differ in size; only
// sizeof(D) bytes are copied.  Going through memcpy keeps this
// well-defined under GCC's strict-aliasing rules (the call is optimized
// away), where a reinterpret_cast would be undefined behavior.
template <class D, class S> inline D bit_copy(const S &source) {
  D result;
  // Safe use of memcpy: the two objects can never overlap.
  memcpy(&result, reinterpret_cast<const void *>(&source), sizeof(result));
  return result;
}
+
// Static helpers for integer range and overflow reasoning.
class Utils {
public:
  // Check whether an N-bit two's-complement representation can hold value.
  template <typename T> static inline bool IsInt(int N, T value) {
    assert((0 < N) &&
           (static_cast<unsigned int>(N) < (CHAR_BIT * sizeof(value))));
    const T Limit = static_cast<T>(1) << (N - 1);
    return (value >= -Limit) && (value < Limit);
  }

  // Check whether an N-bit unsigned representation can hold value.
  template <typename T> static inline bool IsUint(int N, T value) {
    assert((0 < N) &&
           (static_cast<unsigned int>(N) < (CHAR_BIT * sizeof(value))));
    const T Limit = static_cast<T>(1) << N;
    return (value >= 0) && (value < Limit);
  }

  // Return true if computing X + Y in type T would wrap past the
  // maximum (both positive) or minimum (both negative) value.
  template <typename T> static inline bool WouldOverflowAdd(T X, T Y) {
    const bool BothPositive = X > 0 && Y > 0;
    const bool BothNegative = X < 0 && Y < 0;
    return (BothPositive && X > std::numeric_limits<T>::max() - Y) ||
           (BothNegative && X < std::numeric_limits<T>::min() - Y);
  }
};
+
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICEUTILS_H
diff --git a/src/assembler.cpp b/src/assembler.cpp
new file mode 100644
index 0000000..a169aff
--- /dev/null
+++ b/src/assembler.cpp
@@ -0,0 +1,129 @@
+// Copyright (c) 2012, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+//
+// Modified by the Subzero authors.
+//
+//===- subzero/src/assembler.cpp - Assembler base class -------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Assembler class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "assembler.h"
+#include "IceMemoryRegion.h"
+
+namespace Ice {
+
+static uintptr_t NewContents(Assembler &assembler, intptr_t capacity) {
+  uintptr_t result = assembler.AllocateBytes(capacity);
+  return result;
+}
+
#if defined(DEBUG)
// Debug-mode guard: constructing it guarantees the buffer can absorb
// at least kMinimumGap more bytes, and records state so the destructor
// can verify no single instruction exceeded that gap.
AssemblerBuffer::EnsureCapacity::EnsureCapacity(AssemblerBuffer *buffer) {
  if (buffer->cursor() >= buffer->limit())
    buffer->ExtendCapacity();
  // In debug mode, we save the assembler buffer along with the gap
  // size before we start emitting to the buffer. This allows us to
  // check that any single generated instruction doesn't overflow the
  // limit implied by the minimum gap size.
  buffer_ = buffer;
  gap_ = ComputeGap();
  // Make sure that extending the capacity leaves a big enough gap
  // for any kind of instruction.
  assert(gap_ >= kMinimumGap);
  // Mark the buffer as having ensured the capacity.
  assert(!buffer->HasEnsuredCapacity()); // Cannot nest.
  buffer->has_ensured_capacity_ = true;
}
+
AssemblerBuffer::EnsureCapacity::~EnsureCapacity() {
  // Unmark the buffer, so we cannot emit after this.
  buffer_->has_ensured_capacity_ = false;
  // Make sure the generated instruction doesn't take up more
  // space than the minimum gap.
  intptr_t delta = gap_ - ComputeGap(); // bytes emitted while guarded
  assert(delta <= kMinimumGap);
}
#endif
+
// Allocate the initial 4KB data area from the owning assembler's arena
// and reset the emit cursor to its start.
AssemblerBuffer::AssemblerBuffer(Assembler &assembler) : assembler_(assembler) {
  const intptr_t OneKB = 1024;
  static const intptr_t kInitialBufferCapacity = 4 * OneKB;
  contents_ = NewContents(assembler_, kInitialBufferCapacity);
  cursor_ = contents_;
  // The usable limit sits kMinimumGap short of the true end (see
  // ComputeLimit), so a single capacity check suffices per instruction.
  limit_ = ComputeLimit(contents_, kInitialBufferCapacity);
#if defined(DEBUG)
  has_ensured_capacity_ = false;
  fixups_processed_ = false;
#endif

  // Verify internal state.
  assert(Capacity() == kInitialBufferCapacity);
  assert(Size() == 0);
}
+
+AssemblerBuffer::~AssemblerBuffer() {}
+
+AssemblerFixup *AssemblerBuffer::GetLatestFixup() const {
+  if (fixups_.empty())
+    return NULL;
+  return fixups_.back();
+}
+
+void AssemblerBuffer::ProcessFixups(const MemoryRegion &region) {
+  for (SizeT I = 0; I < fixups_.size(); ++I) {
+    AssemblerFixup *fixup = fixups_[I];
+    fixup->Process(region, fixup->position());
+  }
+}
+
// Copy the assembled bytes into |instructions| and apply all fixups
// against their final location there.  After this the buffer's own
// contents are no longer authoritative.
void AssemblerBuffer::FinalizeInstructions(const MemoryRegion &instructions) {
  // Copy the instructions from the buffer.
  MemoryRegion from(reinterpret_cast<void *>(contents()), Size());
  instructions.CopyFrom(0, from);

  // Process fixups in the instructions.
  ProcessFixups(instructions);
#if defined(DEBUG)
  fixups_processed_ = true;
#endif
}
+
// Grow the data area (doubling, capped at +1MB per step), copy the
// already-emitted bytes across, and rebase cursor_/limit_ onto the new
// allocation.  The old area stays behind in the assembler's arena.
void AssemblerBuffer::ExtendCapacity() {
  intptr_t old_size = Size();
  intptr_t old_capacity = Capacity();
  const intptr_t OneMB = 1 << 20;
  intptr_t new_capacity = std::min(old_capacity * 2, old_capacity + OneMB);
  if (new_capacity < old_capacity) {
    // FATAL
    llvm_unreachable("Unexpected overflow in AssemblerBuffer::ExtendCapacity");
  }

  // Allocate the new data area and copy contents of the old one to it.
  uintptr_t new_contents = NewContents(assembler_, new_capacity);
  memmove(reinterpret_cast<void *>(new_contents),
          reinterpret_cast<void *>(contents_), old_size);

  // Compute the relocation delta and switch to the new contents area.
  intptr_t delta = new_contents - contents_;
  contents_ = new_contents;

  // Update the cursor and recompute the limit.
  cursor_ += delta;
  limit_ = ComputeLimit(new_contents, new_capacity);

  // Verify internal state.
  assert(Capacity() == new_capacity);
  assert(Size() == old_size);
}
+
+} // end of namespace Ice
diff --git a/src/assembler.h b/src/assembler.h
new file mode 100644
index 0000000..92cc98d
--- /dev/null
+++ b/src/assembler.h
@@ -0,0 +1,222 @@
+// Copyright (c) 2012, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+//
+// Modified by the Subzero authors.
+//
+//===- subzero/src/assembler.h - Integrated assembler -----------*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Assembler base class.  Instructions are assembled
+// by architecture-specific assemblers that derive from this base class.
+// This base class manages buffers and fixups for emitting code, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ASSEMBLER_H
+#define SUBZERO_SRC_ASSEMBLER_H
+
+#include "IceDefs.h"
+
+#include "IceFixups.h"
+#include "llvm/Support/Allocator.h"
+
+namespace Ice {
+
+// Forward declarations.
+class Assembler;
+class AssemblerFixup;
+class AssemblerBuffer;
+class ConstantRelocatable;
+class MemoryRegion;
+
// Assembler fixups are positions in generated code that hold relocation
// information that needs to be processed before finalizing the code
// into executable memory.
class AssemblerFixup {
public:
  // Patch the finalized code in |region| at byte offset |position|.
  // Invoked once per fixup from AssemblerBuffer::ProcessFixups().
  virtual void Process(const MemoryRegion &region, intptr_t position) = 0;

  // It would be ideal if the destructor method could be made private,
  // but the g++ compiler complains when this is subclassed.
  // Fixups are arena-allocated and never individually destroyed, so
  // reaching this destructor indicates a bug.
  virtual ~AssemblerFixup() { llvm_unreachable("~AssemblerFixup used"); }

  // Byte offset of the fixup within the assembler buffer; set by
  // AssemblerBuffer::EmitFixup (a friend) via set_position().
  intptr_t position() const { return position_; }

  FixupKind kind() const { return kind_; }

  const ConstantRelocatable *value() const { return value_; }

protected:
  AssemblerFixup(FixupKind Kind, const ConstantRelocatable *Value)
      : position_(0), kind_(Kind), value_(Value) {}

private:
  intptr_t position_;
  FixupKind kind_;
  const ConstantRelocatable *value_;

  void set_position(intptr_t position) { position_ = position; }

  AssemblerFixup(const AssemblerFixup &) LLVM_DELETED_FUNCTION;
  AssemblerFixup &operator=(const AssemblerFixup &) LLVM_DELETED_FUNCTION;
  friend class AssemblerBuffer;
};
+
// Assembler buffers are used to emit binary code. They grow on demand.
class AssemblerBuffer {
public:
  AssemblerBuffer(Assembler &);
  ~AssemblerBuffer();

  // Basic support for emitting, loading, and storing.
  // Emit writes |value| at the cursor; callers must hold an
  // EnsureCapacity guard (checked in debug builds).
  template <typename T> void Emit(T value) {
    assert(HasEnsuredCapacity());
    *reinterpret_cast<T *>(cursor_) = value;
    cursor_ += sizeof(T);
  }

  // Read back a previously emitted value at byte offset |position|.
  template <typename T> T Load(intptr_t position) const {
    assert(position >= 0 &&
           position <= (Size() - static_cast<intptr_t>(sizeof(T))));
    return *reinterpret_cast<T *>(contents_ + position);
  }

  // Overwrite a previously emitted value at byte offset |position|.
  template <typename T> void Store(intptr_t position, T value) {
    assert(position >= 0 &&
           position <= (Size() - static_cast<intptr_t>(sizeof(T))));
    *reinterpret_cast<T *>(contents_ + position) = value;
  }

  // Emit a fixup at the current location.  The buffer does not take
  // ownership; fixups are expected to be arena-allocated (see
  // Assembler::Allocate).
  void EmitFixup(AssemblerFixup *fixup) {
    fixup->set_position(Size());
    fixups_.push_back(fixup);
  }

  // Get the size of the emitted code.
  intptr_t Size() const { return cursor_ - contents_; }
  uintptr_t contents() const { return contents_; }

  // Copy the assembled instructions into the specified memory block
  // and apply all fixups.
  // TODO(jvoung): This will be different. We'll be writing the text
  // and reloc section to a file?
  void FinalizeInstructions(const MemoryRegion &region);

// To emit an instruction to the assembler buffer, the EnsureCapacity helper
// must be used to guarantee that the underlying data area is big enough to
// hold the emitted instruction. Usage:
//
//     AssemblerBuffer buffer;
//     AssemblerBuffer::EnsureCapacity ensured(&buffer);
//     ... emit bytes for single instruction ...

#if defined(DEBUG)
  class EnsureCapacity {
  public:
    explicit EnsureCapacity(AssemblerBuffer *buffer);
    ~EnsureCapacity();

  private:
    AssemblerBuffer *buffer_;
    intptr_t gap_;

    // Free space remaining between the cursor and the true end of the
    // data area (capacity includes the kMinimumGap slack).
    intptr_t ComputeGap() { return buffer_->Capacity() - buffer_->Size(); }
  };

  bool has_ensured_capacity_;
  bool HasEnsuredCapacity() const { return has_ensured_capacity_; }
#else
  class EnsureCapacity {
  public:
    explicit EnsureCapacity(AssemblerBuffer *buffer) {
      if (buffer->cursor() >= buffer->limit())
        buffer->ExtendCapacity();
    }
  };

  // When building the C++ tests, assertion code is enabled. To allow
  // asserting that the user of the assembler buffer has ensured the
  // capacity needed for emitting, we add a dummy method in non-debug mode.
  bool HasEnsuredCapacity() const { return true; }
#endif

  // Returns the position in the instruction stream.
  intptr_t GetPosition() const { return cursor_ - contents_; }

  // For bringup only.
  AssemblerFixup *GetLatestFixup() const;

private:
  // The limit is set to kMinimumGap bytes before the end of the data area.
  // This leaves enough space for the longest possible instruction and allows
  // for a single, fast space check per instruction.
  static const intptr_t kMinimumGap = 32;

  uintptr_t contents_; // base address of the data area
  uintptr_t cursor_;   // next byte to be emitted
  uintptr_t limit_;    // contents_ + capacity - kMinimumGap
  Assembler &assembler_;
  std::vector<AssemblerFixup *> fixups_;
#if defined(DEBUG)
  bool fixups_processed_;
#endif

  uintptr_t cursor() const { return cursor_; }
  uintptr_t limit() const { return limit_; }
  intptr_t Capacity() const {
    assert(limit_ >= contents_);
    return (limit_ - contents_) + kMinimumGap;
  }

  // Process the fixup chain.
  void ProcessFixups(const MemoryRegion &region);

  // Compute the limit based on the data area and the capacity. See
  // description of kMinimumGap for the reasoning behind the value.
  static uintptr_t ComputeLimit(uintptr_t data, intptr_t capacity) {
    return data + capacity - kMinimumGap;
  }

  void ExtendCapacity();

  friend class AssemblerFixup;
};
+
+class Assembler {
+public:
+  Assembler() {}
+  ~Assembler() {}
+
+  // Allocate a chunk of bytes using the per-Assembler allocator.
+  uintptr_t AllocateBytes(size_t bytes) {
+    // For now, alignment is not related to NaCl bundle alignment, since
+    // the buffer's GetPosition is relative to the base. So NaCl bundle
+    // alignment checks can be relative to that base. Later, the buffer
+    // will be copied out to a ".text" section (or an in memory-buffer
+    // that can be mprotect'ed with executable permission), and that
+    // second buffer should be aligned for NaCl.
+    const size_t Alignment = 16;
+    return reinterpret_cast<uintptr_t>(Allocator.Allocate(bytes, Alignment));
+  }
+
+  // Allocate data of type T using the per-Assembler allocator.
+  template <typename T> T *Allocate() { return Allocator.Allocate<T>(); }
+
+private:
+  llvm::BumpPtrAllocator Allocator;
+
+  Assembler(const Assembler &) LLVM_DELETED_FUNCTION;
+  Assembler &operator=(const Assembler &) LLVM_DELETED_FUNCTION;
+};
+
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ASSEMBLER_H
diff --git a/src/assembler_ia32.cpp b/src/assembler_ia32.cpp
new file mode 100644
index 0000000..ceed1d1
--- /dev/null
+++ b/src/assembler_ia32.cpp
@@ -0,0 +1,1822 @@
+// Copyright (c) 2013, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+//
+// Modified by the Subzero authors.
+//
+//===- subzero/src/assembler_ia32.cpp - Assembler for x86-32  -------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Assembler class for x86-32.
+//
+//===----------------------------------------------------------------------===//
+
+#include "assembler_ia32.h"
+#include "IceCfg.h"
+#include "IceMemoryRegion.h"
+#include "IceOperand.h"
+
+namespace Ice {
+namespace x86 {
+
// Fixup for a direct (pc-relative) call to an external symbol.  At
// finalization the stored displacement is rewritten to be relative to
// the end of the 4-byte displacement field.
class DirectCallRelocation : public AssemblerFixup {
public:
  // Arena-allocate via the Assembler so the fixup outlives the buffer
  // without individual deallocation (see ~AssemblerFixup).
  static DirectCallRelocation *create(Assembler *Asm, FixupKind Kind,
                                      const ConstantRelocatable *Sym) {
    return new (Asm->Allocate<DirectCallRelocation>())
        DirectCallRelocation(Kind, Sym);
  }

  void Process(const MemoryRegion &region, intptr_t position) {
    // Direct calls are relative to the following instruction on x86.
    // region.start() is the address of the finalized code, so |delta|
    // is the address just past the displacement field.
    int32_t pointer = region.Load<int32_t>(position);
    int32_t delta = region.start() + position + sizeof(int32_t);
    region.Store<int32_t>(position, pointer - delta);
  }

private:
  DirectCallRelocation(FixupKind Kind, const ConstantRelocatable *Sym)
      : AssemblerFixup(Kind, Sym) {}
};
+
// Build a memory operand referring to a float/double constant-pool
// entry, attaching an absolute-displacement fixup so the pool entry's
// final address can be patched in at finalization.
Address Address::ofConstPool(GlobalContext *Ctx, Assembler *Asm,
                             const Constant *Imm) {
  // We should make this much lighter-weight. E.g., just record the const pool
  // entry ID.
  std::string Buffer;
  llvm::raw_string_ostream StrBuf(Buffer);
  Type Ty = Imm->getType();
  assert(llvm::isa<ConstantFloat>(Imm) || llvm::isa<ConstantDouble>(Imm));
  // Pool entries are labeled "L$<type>$<pool entry id>".
  StrBuf << "L$" << Ty << "$" << Imm->getPoolEntryID();
  const int64_t Offset = 0;
  const bool SuppressMangling = true;
  Constant *Sym =
      Ctx->getConstantSym(Ty, Offset, StrBuf.str(), SuppressMangling);
  AssemblerFixup *Fixup = x86::DisplacementRelocation::create(
      Asm, FK_Abs_4, llvm::cast<ConstantRelocatable>(Sym));
  return x86::Address::Absolute(Offset, Fixup);
}
+
+void AssemblerX86::call(GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xFF);
+  EmitRegisterOperand(2, reg);
+}
+
+void AssemblerX86::call(const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xFF);
+  EmitOperand(2, address);
+}
+
+void AssemblerX86::call(Label *label) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xE8);
+  static const int kSize = 5;
+  EmitLabel(label, kSize);
+}
+
// Emit a direct call to an external symbol.  The displacement is
// emitted as -4 (i.e., already relative to the end of the instruction)
// and the DirectCallRelocation fixup adds the symbol's address during
// finalization.
void AssemblerX86::call(const ConstantRelocatable *label) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
  intptr_t call_start = buffer_.GetPosition();
  EmitUint8(0xE8);
  EmitFixup(DirectCallRelocation::create(this, FK_PcRel_4, label));
  EmitInt32(-4);
  assert((buffer_.GetPosition() - call_start) == kCallExternalLabelSize);
}
+
+void AssemblerX86::pushl(GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x50 + reg);
+}
+
+void AssemblerX86::pushl(const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xFF);
+  EmitOperand(6, address);
+}
+
+void AssemblerX86::pushl(const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x68);
+  EmitImmediate(imm);
+}
+
+void AssemblerX86::popl(GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x58 + reg);
+}
+
+void AssemblerX86::popl(const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x8F);
+  EmitOperand(0, address);
+}
+
+void AssemblerX86::pushal() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x60);
+}
+
+void AssemblerX86::popal() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x61);
+}
+
+void AssemblerX86::setcc(CondX86::BrCond condition, ByteRegister dst) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x90 + condition);
+  EmitUint8(0xC0 + dst);
+}
+
+void AssemblerX86::movl(GPRRegister dst, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xB8 + dst);
+  EmitImmediate(imm);
+}
+
+void AssemblerX86::movl(GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x89);
+  EmitRegisterOperand(src, dst);
+}
+
+void AssemblerX86::movl(GPRRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x8B);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::movl(const Address &dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x89);
+  EmitOperand(src, dst);
+}
+
+void AssemblerX86::movl(const Address &dst, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xC7);
+  EmitOperand(0, dst);
+  EmitImmediate(imm);
+}
+
+void AssemblerX86::movzxb(GPRRegister dst, ByteRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xB6);
+  EmitRegisterOperand(dst, src);
+}
+
+void AssemblerX86::movzxb(GPRRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xB6);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::movsxb(GPRRegister dst, ByteRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xBE);
+  EmitRegisterOperand(dst, src);
+}
+
+void AssemblerX86::movsxb(GPRRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xBE);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::movb(ByteRegister dst, const Address &src) {
+  (void)dst;
+  (void)src;
+  // FATAL
+  llvm_unreachable("Use movzxb or movsxb instead.");
+}
+
+void AssemblerX86::movb(const Address &dst, ByteRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x88);
+  EmitOperand(src, dst);
+}
+
+void AssemblerX86::movb(const Address &dst, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xC6);
+  EmitOperand(RegX8632::Encoded_Reg_eax, dst);
+  assert(imm.is_int8());
+  EmitUint8(imm.value() & 0xFF);
+}
+
+void AssemblerX86::movzxw(GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xB7);
+  EmitRegisterOperand(dst, src);
+}
+
+void AssemblerX86::movzxw(GPRRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xB7);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::movsxw(GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xBF);
+  EmitRegisterOperand(dst, src);
+}
+
+void AssemblerX86::movsxw(GPRRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xBF);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::movw(GPRRegister dst, const Address &src) {
+  (void)dst;
+  (void)src;
+  // FATAL
+  llvm_unreachable("Use movzxw or movsxw instead.");
+}
+
+void AssemblerX86::movw(const Address &dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOperandSizeOverride();
+  EmitUint8(0x89);
+  EmitOperand(src, dst);
+}
+
+void AssemblerX86::leal(GPRRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x8D);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::cmov(CondX86::BrCond cond, GPRRegister dst,
+                        GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x40 + cond);
+  EmitRegisterOperand(dst, src);
+}
+
+void AssemblerX86::rep_movsb() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF3);
+  EmitUint8(0xA4);
+}
+
+void AssemblerX86::movss(XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF3);
+  EmitUint8(0x0F);
+  EmitUint8(0x10);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::movss(const Address &dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF3);
+  EmitUint8(0x0F);
+  EmitUint8(0x11);
+  EmitOperand(src, dst);
+}
+
// Register-to-register movss.  Deliberately uses the store form
// (0x11: MOVSS xmm/m32, xmm), so the operands passed to
// EmitXmmRegisterOperand are swapped: src occupies the reg field and
// dst the r/m field.  The net effect is still dst <- src.
void AssemblerX86::movss(XmmRegister dst, XmmRegister src) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
  EmitUint8(0xF3);
  EmitUint8(0x0F);
  EmitUint8(0x11);
  EmitXmmRegisterOperand(src, dst);
}
+
+void AssemblerX86::movd(XmmRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x6E);
+  EmitOperand(dst, Operand(src));
+}
+
+void AssemblerX86::movd(GPRRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x7E);
+  EmitOperand(src, Operand(dst));
+}
+
+void AssemblerX86::movq(const Address &dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xD6);
+  EmitOperand(src, Operand(dst));
+}
+
+void AssemblerX86::movq(XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF3);
+  EmitUint8(0x0F);
+  EmitUint8(0x7E);
+  EmitOperand(dst, Operand(src));
+}
+
// Scalar FP add.  Despite the "ss" name, Ty selects the width: f32
// emits addss (0xF3 prefix) while any other type emits addsd (0xF2).
void AssemblerX86::addss(Type Ty, XmmRegister dst, XmmRegister src) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
  EmitUint8(Ty == IceType_f32 ? 0xF3 : 0xF2);
  EmitUint8(0x0F);
  EmitUint8(0x58);
  EmitXmmRegisterOperand(dst, src);
}
+
+void AssemblerX86::addss(Type Ty, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(Ty == IceType_f32 ? 0xF3 : 0xF2);
+  EmitUint8(0x0F);
+  EmitUint8(0x58);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::subss(Type Ty, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(Ty == IceType_f32 ? 0xF3 : 0xF2);
+  EmitUint8(0x0F);
+  EmitUint8(0x5C);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::subss(Type Ty, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(Ty == IceType_f32 ? 0xF3 : 0xF2);
+  EmitUint8(0x0F);
+  EmitUint8(0x5C);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::mulss(Type Ty, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(Ty == IceType_f32 ? 0xF3 : 0xF2);
+  EmitUint8(0x0F);
+  EmitUint8(0x59);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::mulss(Type Ty, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(Ty == IceType_f32 ? 0xF3 : 0xF2);
+  EmitUint8(0x0F);
+  EmitUint8(0x59);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::divss(Type Ty, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(Ty == IceType_f32 ? 0xF3 : 0xF2);
+  EmitUint8(0x0F);
+  EmitUint8(0x5E);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::divss(Type Ty, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(Ty == IceType_f32 ? 0xF3 : 0xF2);
+  EmitUint8(0x0F);
+  EmitUint8(0x5E);
+  EmitOperand(dst, src);
+}
+
+// flds [src]: D9 /0 — push a 32-bit float onto the x87 stack.
+// The integer first argument to EmitOperand is the /digit opcode extension.
+void AssemblerX86::flds(const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xD9);
+  EmitOperand(0, src);
+}
+
+// fstps [dst]: D9 /3 — store st(0) as a 32-bit float and pop.
+void AssemblerX86::fstps(const Address &dst) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xD9);
+  EmitOperand(3, dst);
+}
+
+// movsd dst, [src]: F2 0F 10 /r — load a 64-bit double into the low lane.
+void AssemblerX86::movsd(XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF2);
+  EmitUint8(0x0F);
+  EmitUint8(0x10);
+  EmitOperand(dst, src);
+}
+
+// movsd [dst], src: F2 0F 11 /r — store the low 64-bit double lane.
+void AssemblerX86::movsd(const Address &dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF2);
+  EmitUint8(0x0F);
+  EmitUint8(0x11);
+  EmitOperand(src, dst);
+}
+
+// movsd dst, src (reg-reg): uses the store form (0x11) with swapped
+// operands, an equivalent encoding of the 0x10 load form.
+void AssemblerX86::movsd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF2);
+  EmitUint8(0x0F);
+  EmitUint8(0x11);
+  EmitXmmRegisterOperand(src, dst);
+}
+
+// movaps dst, src: 0F 28 /r — aligned 128-bit register-to-register move.
+void AssemblerX86::movaps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x28);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// movups dst, [src]: 0F 10 /r — unaligned 128-bit load.
+void AssemblerX86::movups(XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x10);
+  EmitOperand(dst, src);
+}
+
+// movups [dst], src: 0F 11 /r — unaligned 128-bit store.
+void AssemblerX86::movups(const Address &dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x11);
+  EmitOperand(src, dst);
+}
+
+// Packed integer ops (66-prefixed). For padd/psub the element type picks the
+// opcode byte: paddb/psubb for i8/i1, paddw/psubw for i16, paddd/psubd
+// otherwise. The bitwise ops (pand, pandn, por, pxor) and pmuludq are
+// element-size agnostic and ignore their Ty parameter, which exists only for
+// a uniform call signature.
+void AssemblerX86::padd(Type Ty, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  if (Ty == IceType_i8 || Ty == IceType_i1) {
+    EmitUint8(0xFC);
+  } else if (Ty == IceType_i16) {
+    EmitUint8(0xFD);
+  } else {
+    EmitUint8(0xFE);
+  }
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::padd(Type Ty, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  if (Ty == IceType_i8 || Ty == IceType_i1) {
+    EmitUint8(0xFC);
+  } else if (Ty == IceType_i16) {
+    EmitUint8(0xFD);
+  } else {
+    EmitUint8(0xFE);
+  }
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::pand(Type /* Ty */, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xDB);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::pand(Type /* Ty */, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xDB);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::pandn(Type /* Ty */, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xDF);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::pandn(Type /* Ty */, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xDF);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::pmuludq(Type /* Ty */, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xF4);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::pmuludq(Type /* Ty */, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xF4);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::por(Type /* Ty */, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xEB);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::por(Type /* Ty */, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xEB);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::psub(Type Ty, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  if (Ty == IceType_i8 || Ty == IceType_i1) {
+    EmitUint8(0xF8);
+  } else if (Ty == IceType_i16) {
+    EmitUint8(0xF9);
+  } else {
+    EmitUint8(0xFA);
+  }
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::psub(Type Ty, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  if (Ty == IceType_i8 || Ty == IceType_i1) {
+    EmitUint8(0xF8);
+  } else if (Ty == IceType_i16) {
+    EmitUint8(0xF9);
+  } else {
+    EmitUint8(0xFA);
+  }
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::pxor(Type /* Ty */, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xEF);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::pxor(Type /* Ty */, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xEF);
+  EmitOperand(dst, src);
+}
+
+// {add,sub,mul,div}ps are given a Ty parameter for consistency with
+// {add,sub,mul,div}ss. In the future, when the PNaCl ABI allows
+// addpd, etc., we can use the Ty parameter to decide on adding
+// a 0x66 prefix.
+void AssemblerX86::addps(Type /* Ty */, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x58);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::addps(Type /* Ty */, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x58);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::subps(Type /* Ty */, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x5C);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::subps(Type /* Ty */, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x5C);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::divps(Type /* Ty */, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x5E);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::divps(Type /* Ty */, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x5E);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::mulps(Type /* Ty */, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x59);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::mulps(Type /* Ty */, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x59);
+  EmitOperand(dst, src);
+}
+
+// minps dst, src: 0F 5D /r — packed single-precision minimum.
+void AssemblerX86::minps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x5D);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// maxps dst, src: 0F 5F /r — packed single-precision maximum.
+void AssemblerX86::maxps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x5F);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// andps dst, src: 0F 54 /r — bitwise AND of packed singles.
+void AssemblerX86::andps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x54);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::andps(XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x54);
+  EmitOperand(dst, src);
+}
+
+// orps dst, src: 0F 56 /r — bitwise OR of packed singles.
+void AssemblerX86::orps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x56);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// cmpps dst, src, cc: 0F C2 /r ib — packed single compare; the predicate
+// (eq/lt/le/unord/...) is carried in the trailing immediate byte.
+void AssemblerX86::cmpps(XmmRegister dst, XmmRegister src,
+                         CondX86::CmppsCond CmpCondition) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xC2);
+  EmitXmmRegisterOperand(dst, src);
+  EmitUint8(CmpCondition);
+}
+
+void AssemblerX86::cmpps(XmmRegister dst, const Address &src,
+                         CondX86::CmppsCond CmpCondition) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xC2);
+  EmitOperand(dst, src);
+  EmitUint8(CmpCondition);
+}
+
+// sqrtps dst: 0F 51 — in-place packed square root (dst is both source and
+// destination).
+void AssemblerX86::sqrtps(XmmRegister dst) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x51);
+  EmitXmmRegisterOperand(dst, dst);
+}
+
+// rsqrtps dst: 0F 52 — in-place approximate reciprocal square root.
+void AssemblerX86::rsqrtps(XmmRegister dst) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x52);
+  EmitXmmRegisterOperand(dst, dst);
+}
+
+// rcpps dst: 0F 53 — in-place approximate reciprocal.
+void AssemblerX86::reciprocalps(XmmRegister dst) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x53);
+  EmitXmmRegisterOperand(dst, dst);
+}
+
+// movhlps dst, src: 0F 12 /r — copy src's high quadword to dst's low.
+void AssemblerX86::movhlps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x12);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// movlhps dst, src: 0F 16 /r — copy src's low quadword to dst's high.
+void AssemblerX86::movlhps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x16);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// unpcklps dst, src: 0F 14 /r — interleave low single-precision lanes.
+void AssemblerX86::unpcklps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x14);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// unpckhps dst, src: 0F 15 /r — interleave high single-precision lanes.
+void AssemblerX86::unpckhps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x15);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// unpcklpd dst, src: 66 0F 14 /r — double-precision variant of unpcklps.
+void AssemblerX86::unpcklpd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x14);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// unpckhpd dst, src: 66 0F 15 /r — double-precision variant of unpckhps.
+void AssemblerX86::unpckhpd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x15);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// Pseudo-instruction: broadcast a 32-bit immediate to all four lanes of dst,
+// clobbering tmp1 as a scratch GPR. Emits a movl/movd/shufps sequence.
+void AssemblerX86::set1ps(XmmRegister dst, GPRRegister tmp1,
+                          const Immediate &imm) {
+  // Load 32-bit immediate value into tmp1.
+  movl(tmp1, imm);
+  // Move value from tmp1 into dst.
+  movd(dst, tmp1);
+  // Broadcast low lane into other three lanes.
+  shufps(dst, dst, Immediate(0x0));
+}
+
+// shufps dst, src, imm8: 0F C6 /r ib — shuffle packed singles; the two-bit
+// fields of imm8 select the source lanes.
+void AssemblerX86::shufps(XmmRegister dst, XmmRegister src,
+                          const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xC6);
+  EmitXmmRegisterOperand(dst, src);
+  assert(imm.is_uint8());
+  EmitUint8(imm.value());
+}
+
+// Packed-double ops: same opcodes as their *ps counterparts plus the 66
+// operand-size prefix.
+void AssemblerX86::minpd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x5D);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::maxpd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x5F);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// sqrtpd dst: in-place packed double square root (dst is src and dest).
+void AssemblerX86::sqrtpd(XmmRegister dst) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x51);
+  EmitXmmRegisterOperand(dst, dst);
+}
+
+// cvtps2pd dst, src: 0F 5A /r — widen two low singles to doubles.
+void AssemblerX86::cvtps2pd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x5A);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// cvtpd2ps dst, src: 66 0F 5A /r — narrow two doubles to singles.
+void AssemblerX86::cvtpd2ps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x5A);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// shufpd dst, src, imm8: 66 0F C6 /r ib — shuffle packed doubles.
+void AssemblerX86::shufpd(XmmRegister dst, XmmRegister src,
+                          const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xC6);
+  EmitXmmRegisterOperand(dst, src);
+  assert(imm.is_uint8());
+  EmitUint8(imm.value());
+}
+
+// Scalar conversions. The prefix selects the float width: F3 = single
+// (ss), F2 = double (sd). Opcode 0x2A converts int->float, 0x2D
+// float->int (rounded), 0x2C float->int with truncation, 0x5A between
+// float widths.
+void AssemblerX86::cvtsi2ss(XmmRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF3);
+  EmitUint8(0x0F);
+  EmitUint8(0x2A);
+  EmitOperand(dst, Operand(src));
+}
+
+void AssemblerX86::cvtsi2sd(XmmRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF2);
+  EmitUint8(0x0F);
+  EmitUint8(0x2A);
+  EmitOperand(dst, Operand(src));
+}
+
+void AssemblerX86::cvtss2si(GPRRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF3);
+  EmitUint8(0x0F);
+  EmitUint8(0x2D);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::cvtss2sd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF3);
+  EmitUint8(0x0F);
+  EmitUint8(0x5A);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::cvtsd2si(GPRRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF2);
+  EmitUint8(0x0F);
+  EmitUint8(0x2D);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::cvttss2si(GPRRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF3);
+  EmitUint8(0x0F);
+  EmitUint8(0x2C);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::cvttsd2si(GPRRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF2);
+  EmitUint8(0x0F);
+  EmitUint8(0x2C);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::cvtsd2ss(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF2);
+  EmitUint8(0x0F);
+  EmitUint8(0x5A);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// cvtdq2pd dst, src: F3 0F E6 /r — two low packed int32s to doubles.
+void AssemblerX86::cvtdq2pd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF3);
+  EmitUint8(0x0F);
+  EmitUint8(0xE6);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// ucomiss a, b: 0F 2E /r — unordered scalar compare, sets EFLAGS. A 66
+// prefix for f64 turns it into ucomisd, so this method covers both widths.
+void AssemblerX86::ucomiss(Type Ty, XmmRegister a, XmmRegister b) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_f64)
+    EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x2E);
+  EmitXmmRegisterOperand(a, b);
+}
+
+void AssemblerX86::ucomiss(Type Ty, XmmRegister a, const Address &b) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_f64)
+    EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x2E);
+  EmitOperand(a, b);
+}
+
+// movmskpd dst, src: 66 0F 50 /r — extract the two double sign bits to a GPR.
+void AssemblerX86::movmskpd(GPRRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x50);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// movmskps dst, src: 0F 50 /r — extract the four single sign bits to a GPR.
+void AssemblerX86::movmskps(GPRRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x50);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// sqrtss/sqrtsd dst, src: scalar square root; Ty picks the F3 (f32) or F2
+// (f64) prefix, as with the scalar arithmetic ops above.
+void AssemblerX86::sqrtss(Type Ty, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(Ty == IceType_f32 ? 0xF3 : 0xF2);
+  EmitUint8(0x0F);
+  EmitUint8(0x51);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::sqrtss(Type Ty, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(Ty == IceType_f32 ? 0xF3 : 0xF2);
+  EmitUint8(0x0F);
+  EmitUint8(0x51);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// Packed-double bitwise ops: 66-prefixed versions of the *ps encodings.
+void AssemblerX86::xorpd(XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x57);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::xorpd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x57);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::orpd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x56);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::xorps(XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x57);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::xorps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x57);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::andpd(XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x54);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::andpd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x54);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// pextrd r32, xmm, imm8: 66 0F 3A 16 /r ib (SSE4.1) — extract the dword lane
+// selected by imm8. Note the xmm register goes in the ModRM reg field, so the
+// operands are swapped in the EmitOperand call.
+void AssemblerX86::pextrd(GPRRegister dst, XmmRegister src,
+                          const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x3A);
+  EmitUint8(0x16);
+  EmitOperand(src, Operand(dst));
+  assert(imm.is_uint8());
+  EmitUint8(imm.value());
+}
+
+// pmovsxdq dst, src: 66 0F 38 25 /r (SSE4.1) — sign-extend two low dwords
+// to quadwords.
+void AssemblerX86::pmovsxdq(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x25);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// pcmpeqq dst, src: 66 0F 38 29 /r (SSE4.1) — packed quadword equality.
+void AssemblerX86::pcmpeqq(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x38);
+  EmitUint8(0x29);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// roundsd dst, src, mode: 66 0F 3A 0B /r ib (SSE4.1) — round scalar double;
+// the immediate holds the rounding mode.
+void AssemblerX86::roundsd(XmmRegister dst, XmmRegister src,
+                           RoundingMode mode) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x3A);
+  EmitUint8(0x0B);
+  EmitXmmRegisterOperand(dst, src);
+  // Mask precision exception.
+  EmitUint8(static_cast<uint8_t>(mode) | 0x8);
+}
+
+// x87 memory ops. The integer argument to EmitOperand is the /digit opcode
+// extension that selects the operation within the escape opcode.
+// fldl [src]: DD /0 — push a 64-bit double onto the x87 stack.
+void AssemblerX86::fldl(const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xDD);
+  EmitOperand(0, src);
+}
+
+// fstpl [dst]: DD /3 — store st(0) as a 64-bit double and pop.
+void AssemblerX86::fstpl(const Address &dst) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xDD);
+  EmitOperand(3, dst);
+}
+
+// fnstcw [dst]: D9 /7 — store the x87 control word (no pending-exception
+// check).
+void AssemblerX86::fnstcw(const Address &dst) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xD9);
+  EmitOperand(7, dst);
+}
+
+// fldcw [src]: D9 /5 — load the x87 control word.
+void AssemblerX86::fldcw(const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xD9);
+  EmitOperand(5, src);
+}
+
+// fistpl [dst]: DF /7 — store st(0) as a 64-bit integer and pop.
+void AssemblerX86::fistpl(const Address &dst) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xDF);
+  EmitOperand(7, dst);
+}
+
+// fistps [dst]: DB /3 — store st(0) as a 32-bit integer and pop.
+void AssemblerX86::fistps(const Address &dst) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xDB);
+  EmitOperand(3, dst);
+}
+
+// fildl [src]: DF /5 — push a 64-bit integer onto the x87 stack.
+void AssemblerX86::fildl(const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xDF);
+  EmitOperand(5, src);
+}
+
+// filds [src]: DB /0 — push a 32-bit integer onto the x87 stack.
+void AssemblerX86::filds(const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xDB);
+  EmitOperand(0, src);
+}
+
+// fincstp: D9 F7 — increment the x87 stack-top pointer.
+void AssemblerX86::fincstp() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xD9);
+  EmitUint8(0xF7);
+}
+
+// xchgl dst, src: 87 /r — exchange two 32-bit registers.
+void AssemblerX86::xchgl(GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x87);
+  EmitRegisterOperand(dst, src);
+}
+
+// cmpl reg, imm — /7 is the cmp opcode extension; EmitComplex picks the
+// short (imm8) or long (imm32) form.
+void AssemblerX86::cmpl(GPRRegister reg, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitComplex(7, Operand(reg), imm);
+}
+
+void AssemblerX86::cmpl(GPRRegister reg0, GPRRegister reg1) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x3B);
+  EmitOperand(reg0, Operand(reg1));
+}
+
+void AssemblerX86::cmpl(GPRRegister reg, const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x3B);
+  EmitOperand(reg, address);
+}
+
+// addl dst, src: 03 /r.
+void AssemblerX86::addl(GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x03);
+  EmitRegisterOperand(dst, src);
+}
+
+void AssemblerX86::addl(GPRRegister reg, const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x03);
+  EmitOperand(reg, address);
+}
+
+// cmpl [mem], reg: 39 /r (memory-destination form).
+void AssemblerX86::cmpl(const Address &address, GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x39);
+  EmitOperand(reg, address);
+}
+
+void AssemblerX86::cmpl(const Address &address, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitComplex(7, address, imm);
+}
+
+// cmpb [mem], imm8: 80 /7 ib — byte-sized compare with immediate.
+void AssemblerX86::cmpb(const Address &address, const Immediate &imm) {
+  assert(imm.is_int8());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x80);
+  EmitOperand(7, address);
+  EmitUint8(imm.value() & 0xFF);
+}
+
+// testl reg1, reg2: 85 /r — AND without writing the result; sets flags only.
+void AssemblerX86::testl(GPRRegister reg1, GPRRegister reg2) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x85);
+  EmitRegisterOperand(reg1, reg2);
+}
+
+// testl reg, imm — chooses among three encodings: byte-register test (A8 for
+// al, F6 /0 for cl/dl/bl) when the immediate fits in 8 bits, the short eax
+// form (A9), or the general F7 /0 form with imm32.
+void AssemblerX86::testl(GPRRegister reg, const Immediate &immediate) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  // For registers that have a byte variant (EAX, EBX, ECX, and EDX)
+  // we only test the byte register to keep the encoding short.
+  if (immediate.is_uint8() && reg < 4) {
+    // Use zero-extended 8-bit immediate.
+    if (reg == RegX8632::Encoded_Reg_eax) {
+      EmitUint8(0xA8);
+    } else {
+      EmitUint8(0xF6);
+      EmitUint8(0xC0 + reg);
+    }
+    EmitUint8(immediate.value() & 0xFF);
+  } else if (reg == RegX8632::Encoded_Reg_eax) {
+    // Use short form if the destination is EAX.
+    EmitUint8(0xA9);
+    EmitImmediate(immediate);
+  } else {
+    EmitUint8(0xF7);
+    EmitOperand(0, Operand(reg));
+    EmitImmediate(immediate);
+  }
+}
+
+// 32-bit bitwise ops. The register-immediate forms go through EmitComplex,
+// whose first argument is the ALU /digit extension (and=/4, or=/1, xor=/6).
+void AssemblerX86::andl(GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x23);
+  EmitOperand(dst, Operand(src));
+}
+
+void AssemblerX86::andl(GPRRegister dst, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitComplex(4, Operand(dst), imm);
+}
+
+void AssemblerX86::andl(GPRRegister dst, const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x23);
+  EmitOperand(dst, address);
+}
+
+void AssemblerX86::orl(GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0B);
+  EmitOperand(dst, Operand(src));
+}
+
+void AssemblerX86::orl(GPRRegister dst, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitComplex(1, Operand(dst), imm);
+}
+
+void AssemblerX86::orl(GPRRegister dst, const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0B);
+  EmitOperand(dst, address);
+}
+
+void AssemblerX86::xorl(GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x33);
+  EmitOperand(dst, Operand(src));
+}
+
+void AssemblerX86::xorl(GPRRegister dst, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitComplex(6, Operand(dst), imm);
+}
+
+void AssemblerX86::xorl(GPRRegister dst, const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x33);
+  EmitOperand(dst, address);
+}
+
+// 32-bit add/adc/sub. EmitComplex's /digit extensions: add=/0, adc=/2,
+// sub=/5.
+void AssemblerX86::addl(GPRRegister reg, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitComplex(0, Operand(reg), imm);
+}
+
+void AssemblerX86::addl(const Address &address, GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x01);
+  EmitOperand(reg, address);
+}
+
+void AssemblerX86::addl(const Address &address, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitComplex(0, address, imm);
+}
+
+// adcl — add with carry, used for the high word of 64-bit adds.
+void AssemblerX86::adcl(GPRRegister reg, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitComplex(2, Operand(reg), imm);
+}
+
+void AssemblerX86::adcl(GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x13);
+  EmitOperand(dst, Operand(src));
+}
+
+void AssemblerX86::adcl(GPRRegister dst, const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x13);
+  EmitOperand(dst, address);
+}
+
+void AssemblerX86::adcl(const Address &address, GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x11);
+  EmitOperand(reg, address);
+}
+
+void AssemblerX86::subl(GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x2B);
+  EmitOperand(dst, Operand(src));
+}
+
+void AssemblerX86::subl(GPRRegister reg, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitComplex(5, Operand(reg), imm);
+}
+
+void AssemblerX86::subl(GPRRegister reg, const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x2B);
+  EmitOperand(reg, address);
+}
+
+void AssemblerX86::subl(const Address &address, GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x29);
+  EmitOperand(reg, address);
+}
+
+// cdq: 99 — sign-extend eax into edx:eax (precedes idivl).
+void AssemblerX86::cdq() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x99);
+}
+
+// idivl reg: F7 /7 — signed divide edx:eax by reg; /7 is folded into the
+// ModRM byte directly (0xF8 | reg).
+void AssemblerX86::idivl(GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF7);
+  EmitUint8(0xF8 | reg);
+}
+
+// imull dst, src: 0F AF /r — two-operand signed multiply.
+void AssemblerX86::imull(GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xAF);
+  EmitOperand(dst, Operand(src));
+}
+
+// imull reg, imm: 69 /r id — three-operand form with reg as both source
+// and destination (reg = reg * imm).
+void AssemblerX86::imull(GPRRegister reg, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x69);
+  EmitOperand(reg, Operand(reg));
+  EmitImmediate(imm);
+}
+
+void AssemblerX86::imull(GPRRegister reg, const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xAF);
+  EmitOperand(reg, address);
+}
+
+// imull reg: F7 /5 — one-operand form; edx:eax = eax * reg.
+void AssemblerX86::imull(GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF7);
+  EmitOperand(5, Operand(reg));
+}
+
+void AssemblerX86::imull(const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF7);
+  EmitOperand(5, address);
+}
+
+// mull: F7 /4 — unsigned one-operand multiply; edx:eax = eax * operand.
+void AssemblerX86::mull(GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF7);
+  EmitOperand(4, Operand(reg));
+}
+
+void AssemblerX86::mull(const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF7);
+  EmitOperand(4, address);
+}
+
+// sbbl — subtract with borrow, used for the high word of 64-bit subtracts.
+// The immediate form uses EmitComplex with /3.
+void AssemblerX86::sbbl(GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x1B);
+  EmitOperand(dst, Operand(src));
+}
+
+void AssemblerX86::sbbl(GPRRegister reg, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitComplex(3, Operand(reg), imm);
+}
+
+void AssemblerX86::sbbl(GPRRegister dst, const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x1B);
+  EmitOperand(dst, address);
+}
+
+void AssemblerX86::sbbl(const Address &address, GPRRegister dst) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x19);
+  EmitOperand(dst, address);
+}
+
+// incl reg: 40+reg — one-byte increment (x86-32 only; 0x40-0x4F are REX
+// prefixes in 64-bit mode).
+void AssemblerX86::incl(GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x40 + reg);
+}
+
+void AssemblerX86::incl(const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xFF);
+  EmitOperand(0, address);
+}
+
+// decl reg: 48+reg — one-byte decrement (x86-32 only).
+void AssemblerX86::decl(GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x48 + reg);
+}
+
+void AssemblerX86::decl(const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xFF);
+  EmitOperand(1, address);
+}
+
+// Shift instructions — all delegate to EmitGenericShift with the /digit
+// extension: shl=/4, shr=/5, sar=/7. The register-shifter forms implicitly
+// require the count in cl.
+void AssemblerX86::shll(GPRRegister reg, const Immediate &imm) {
+  EmitGenericShift(4, reg, imm);
+}
+
+void AssemblerX86::shll(GPRRegister operand, GPRRegister shifter) {
+  EmitGenericShift(4, Operand(operand), shifter);
+}
+
+void AssemblerX86::shll(const Address &operand, GPRRegister shifter) {
+  EmitGenericShift(4, Operand(operand), shifter);
+}
+
+void AssemblerX86::shrl(GPRRegister reg, const Immediate &imm) {
+  EmitGenericShift(5, reg, imm);
+}
+
+void AssemblerX86::shrl(GPRRegister operand, GPRRegister shifter) {
+  EmitGenericShift(5, Operand(operand), shifter);
+}
+
+void AssemblerX86::sarl(GPRRegister reg, const Immediate &imm) {
+  EmitGenericShift(7, reg, imm);
+}
+
+void AssemblerX86::sarl(GPRRegister operand, GPRRegister shifter) {
+  EmitGenericShift(7, Operand(operand), shifter);
+}
+
+void AssemblerX86::sarl(const Address &address, GPRRegister shifter) {
+  EmitGenericShift(7, Operand(address), shifter);
+}
+
+// Double-precision shifts. shld/shrd put the *source* register in the ModRM
+// reg field, hence the swapped (src, dst) order in EmitRegisterOperand.
+// The forms without an immediate use the cl-count variants (0F A5 / 0F AD).
+void AssemblerX86::shld(GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xA5);
+  EmitRegisterOperand(src, dst);
+}
+
+void AssemblerX86::shld(GPRRegister dst, GPRRegister src,
+                        const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  assert(imm.is_int8());
+  EmitUint8(0x0F);
+  EmitUint8(0xA4);
+  EmitRegisterOperand(src, dst);
+  EmitUint8(imm.value() & 0xFF);
+}
+
+void AssemblerX86::shld(const Address &operand, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xA5);
+  EmitOperand(src, Operand(operand));
+}
+
+void AssemblerX86::shrd(GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xAD);
+  EmitRegisterOperand(src, dst);
+}
+
+void AssemblerX86::shrd(GPRRegister dst, GPRRegister src,
+                        const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  assert(imm.is_int8());
+  EmitUint8(0x0F);
+  EmitUint8(0xAC);
+  EmitRegisterOperand(src, dst);
+  EmitUint8(imm.value() & 0xFF);
+}
+
+void AssemblerX86::shrd(const Address &dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xAD);
+  EmitOperand(src, Operand(dst));
+}
+
+void AssemblerX86::negl(GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF7);
+  EmitOperand(3, Operand(reg));
+}
+
+void AssemblerX86::notl(GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF7);
+  EmitUint8(0xD0 | reg);
+}
+
+void AssemblerX86::bsrl(GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xBD);
+  EmitRegisterOperand(dst, src);
+}
+
+void AssemblerX86::bt(GPRRegister base, GPRRegister offset) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xA3);
+  EmitRegisterOperand(offset, base);
+}
+
+void AssemblerX86::ret() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xC3);
+}
+
+void AssemblerX86::ret(const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xC2);
+  assert(imm.is_uint16());
+  EmitUint8(imm.value() & 0xFF);
+  EmitUint8((imm.value() >> 8) & 0xFF);
+}
+
+void AssemblerX86::nop(int size) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  // There are nops up to size 15, but for now just provide up to size 8.
+  assert(0 < size && size <= MAX_NOP_SIZE);
+  switch (size) {
+  case 1:
+    EmitUint8(0x90);
+    break;
+  case 2:
+    EmitUint8(0x66);
+    EmitUint8(0x90);
+    break;
+  case 3:
+    EmitUint8(0x0F);
+    EmitUint8(0x1F);
+    EmitUint8(0x00);
+    break;
+  case 4:
+    EmitUint8(0x0F);
+    EmitUint8(0x1F);
+    EmitUint8(0x40);
+    EmitUint8(0x00);
+    break;
+  case 5:
+    EmitUint8(0x0F);
+    EmitUint8(0x1F);
+    EmitUint8(0x44);
+    EmitUint8(0x00);
+    EmitUint8(0x00);
+    break;
+  case 6:
+    EmitUint8(0x66);
+    EmitUint8(0x0F);
+    EmitUint8(0x1F);
+    EmitUint8(0x44);
+    EmitUint8(0x00);
+    EmitUint8(0x00);
+    break;
+  case 7:
+    EmitUint8(0x0F);
+    EmitUint8(0x1F);
+    EmitUint8(0x80);
+    EmitUint8(0x00);
+    EmitUint8(0x00);
+    EmitUint8(0x00);
+    EmitUint8(0x00);
+    break;
+  case 8:
+    EmitUint8(0x0F);
+    EmitUint8(0x1F);
+    EmitUint8(0x84);
+    EmitUint8(0x00);
+    EmitUint8(0x00);
+    EmitUint8(0x00);
+    EmitUint8(0x00);
+    EmitUint8(0x00);
+    break;
+  default:
+    llvm_unreachable("Unimplemented");
+  }
+}
+
+void AssemblerX86::int3() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xCC);
+}
+
+void AssemblerX86::hlt() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF4);
+}
+
+void AssemblerX86::j(CondX86::BrCond condition, Label *label, bool near) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (label->IsBound()) {
+    static const int kShortSize = 2;
+    static const int kLongSize = 6;
+    intptr_t offset = label->Position() - buffer_.Size();
+    assert(offset <= 0);
+    if (Utils::IsInt(8, offset - kShortSize)) {
+      EmitUint8(0x70 + condition);
+      EmitUint8((offset - kShortSize) & 0xFF);
+    } else {
+      EmitUint8(0x0F);
+      EmitUint8(0x80 + condition);
+      EmitInt32(offset - kLongSize);
+    }
+  } else if (near) {
+    EmitUint8(0x70 + condition);
+    EmitNearLabelLink(label);
+  } else {
+    EmitUint8(0x0F);
+    EmitUint8(0x80 + condition);
+    EmitLabelLink(label);
+  }
+}
+
+void AssemblerX86::j(CondX86::BrCond condition,
+                     const ConstantRelocatable *label) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x80 + condition);
+  EmitFixup(DirectCallRelocation::create(this, FK_PcRel_4, label));
+  EmitInt32(-4);
+}
+
+void AssemblerX86::jmp(GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xFF);
+  EmitRegisterOperand(4, reg);
+}
+
+void AssemblerX86::jmp(Label *label, bool near) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (label->IsBound()) {
+    static const int kShortSize = 2;
+    static const int kLongSize = 5;
+    intptr_t offset = label->Position() - buffer_.Size();
+    assert(offset <= 0);
+    if (Utils::IsInt(8, offset - kShortSize)) {
+      EmitUint8(0xEB);
+      EmitUint8((offset - kShortSize) & 0xFF);
+    } else {
+      EmitUint8(0xE9);
+      EmitInt32(offset - kLongSize);
+    }
+  } else if (near) {
+    EmitUint8(0xEB);
+    EmitNearLabelLink(label);
+  } else {
+    EmitUint8(0xE9);
+    EmitLabelLink(label);
+  }
+}
+
+void AssemblerX86::jmp(const ConstantRelocatable *label) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xE9);
+  EmitFixup(DirectCallRelocation::create(this, FK_PcRel_4, label));
+  EmitInt32(-4);
+}
+
+void AssemblerX86::lock() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF0);
+}
+
+void AssemblerX86::cmpxchgl(const Address &address, GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xB1);
+  EmitOperand(reg, address);
+}
+
+void AssemblerX86::Align(intptr_t alignment, intptr_t offset) {
+  assert(llvm::isPowerOf2_32(alignment));
+  intptr_t pos = offset + buffer_.GetPosition();
+  intptr_t mod = pos & (alignment - 1);
+  if (mod == 0) {
+    return;
+  }
+  intptr_t bytes_needed = alignment - mod;
+  while (bytes_needed > MAX_NOP_SIZE) {
+    nop(MAX_NOP_SIZE);
+    bytes_needed -= MAX_NOP_SIZE;
+  }
+  if (bytes_needed) {
+    nop(bytes_needed);
+  }
+  assert(((offset + buffer_.GetPosition()) & (alignment - 1)) == 0);
+}
+
+void AssemblerX86::Bind(Label *label) {
+  intptr_t bound = buffer_.Size();
+  assert(!label->IsBound()); // Labels can only be bound once.
+  while (label->IsLinked()) {
+    intptr_t position = label->LinkPosition();
+    intptr_t next = buffer_.Load<int32_t>(position);
+    buffer_.Store<int32_t>(position, bound - (position + 4));
+    label->position_ = next;
+  }
+  while (label->HasNear()) {
+    intptr_t position = label->NearPosition();
+    intptr_t offset = bound - (position + 1);
+    assert(Utils::IsInt(8, offset));
+    buffer_.Store<int8_t>(position, offset);
+  }
+  label->BindTo(bound);
+}
+
+void AssemblerX86::EmitOperand(int rm, const Operand &operand) {
+  assert(rm >= 0 && rm < 8);
+  const intptr_t length = operand.length_;
+  assert(length > 0);
+  // Emit the ModRM byte updated with the given RM value.
+  assert((operand.encoding_[0] & 0x38) == 0);
+  EmitUint8(operand.encoding_[0] + (rm << 3));
+  if (operand.fixup()) {
+    EmitFixup(operand.fixup());
+  }
+  // Emit the rest of the encoded operand.
+  for (intptr_t i = 1; i < length; i++) {
+    EmitUint8(operand.encoding_[i]);
+  }
+}
+
+void AssemblerX86::EmitImmediate(const Immediate &imm) {
+  EmitInt32(imm.value());
+}
+
+void AssemblerX86::EmitComplexI8(int rm, const Operand &operand,
+                                 const Immediate &immediate) {
+  assert(rm >= 0 && rm < 8);
+  assert(immediate.is_int8());
+  if (operand.IsRegister(RegX8632::Encoded_Reg_eax)) {
+    // Use short form if the destination is al.
+    EmitUint8(0x04 + (rm << 3));
+    EmitUint8(immediate.value() & 0xFF);
+  } else {
+    // Use the group-1 byte-operand opcode (0x80 /rm) with an 8-bit immediate.
+    EmitUint8(0x80);
+    EmitOperand(rm, operand);
+    EmitUint8(immediate.value() & 0xFF);
+  }
+}
+
+void AssemblerX86::EmitComplex(int rm, const Operand &operand,
+                               const Immediate &immediate) {
+  assert(rm >= 0 && rm < 8);
+  if (immediate.is_int8()) {
+    // Use sign-extended 8-bit immediate.
+    EmitUint8(0x83);
+    EmitOperand(rm, operand);
+    EmitUint8(immediate.value() & 0xFF);
+  } else if (operand.IsRegister(RegX8632::Encoded_Reg_eax)) {
+    // Use short form if the destination is eax.
+    EmitUint8(0x05 + (rm << 3));
+    EmitImmediate(immediate);
+  } else {
+    EmitUint8(0x81);
+    EmitOperand(rm, operand);
+    EmitImmediate(immediate);
+  }
+}
+
+void AssemblerX86::EmitLabel(Label *label, intptr_t instruction_size) {
+  if (label->IsBound()) {
+    intptr_t offset = label->Position() - buffer_.Size();
+    assert(offset <= 0);
+    EmitInt32(offset - instruction_size);
+  } else {
+    EmitLabelLink(label);
+  }
+}
+
+void AssemblerX86::EmitLabelLink(Label *label) {
+  assert(!label->IsBound());
+  intptr_t position = buffer_.Size();
+  EmitInt32(label->position_);
+  label->LinkTo(position);
+}
+
+void AssemblerX86::EmitNearLabelLink(Label *label) {
+  assert(!label->IsBound());
+  intptr_t position = buffer_.Size();
+  EmitUint8(0);
+  label->NearLinkTo(position);
+}
+
+void AssemblerX86::EmitGenericShift(int rm, GPRRegister reg,
+                                    const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  assert(imm.is_int8());
+  if (imm.value() == 1) {
+    EmitUint8(0xD1);
+    EmitOperand(rm, Operand(reg));
+  } else {
+    EmitUint8(0xC1);
+    EmitOperand(rm, Operand(reg));
+    EmitUint8(imm.value() & 0xFF);
+  }
+}
+
+void AssemblerX86::EmitGenericShift(int rm, const Operand &operand,
+                                    GPRRegister shifter) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  assert(shifter == RegX8632::Encoded_Reg_ecx);
+  EmitUint8(0xD3);
+  EmitOperand(rm, Operand(operand));
+}
+
+} // end of namespace x86
+} // end of namespace Ice
diff --git a/src/assembler_ia32.h b/src/assembler_ia32.h
new file mode 100644
index 0000000..810fab3
--- /dev/null
+++ b/src/assembler_ia32.h
@@ -0,0 +1,724 @@
+// Copyright (c) 2013, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+//
+// Modified by the Subzero authors.
+//
+//===- subzero/src/assembler_ia32.h - Assembler for x86-32 ----------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares (and partly inlines) the Assembler class for x86-32.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ASSEMBLER_IA32_H_
+#define SUBZERO_SRC_ASSEMBLER_IA32_H_
+
+#include "IceDefs.h"
+#include "IceConditionCodesX8632.h"
+#include "IceRegistersX8632.h"
+#include "IceTypes.h"
+#include "IceUtils.h"
+
+#include "assembler.h"
+
+namespace Ice {
+
+class Assembler;
+class ConstantRelocatable;
+
+using RegX8632::GPRRegister;
+using RegX8632::XmmRegister;
+using RegX8632::ByteRegister;
+
+namespace x86 {
+
+const int MAX_NOP_SIZE = 8;
+
+enum ScaleFactor { TIMES_1 = 0, TIMES_2 = 1, TIMES_4 = 2, TIMES_8 = 3 };
+
+class DisplacementRelocation : public AssemblerFixup {
+public:
+  static DisplacementRelocation *create(Assembler *Asm, FixupKind Kind,
+                                        const ConstantRelocatable *Sym) {
+    return new (Asm->Allocate<DisplacementRelocation>())
+        DisplacementRelocation(Kind, Sym);
+  }
+
+  void Process(const MemoryRegion &region, intptr_t position) {
+    (void)region;
+    (void)position;
+    llvm_unreachable("We might not be using this Process() method later.");
+  }
+
+private:
+  DisplacementRelocation(FixupKind Kind, const ConstantRelocatable *Sym)
+      : AssemblerFixup(Kind, Sym) {}
+  DisplacementRelocation(const DisplacementRelocation &) LLVM_DELETED_FUNCTION;
+  DisplacementRelocation &
+  operator=(const DisplacementRelocation &) LLVM_DELETED_FUNCTION;
+};
+
+class Immediate {
+public:
+  explicit Immediate(int32_t value) : value_(value) {}
+
+  Immediate(const Immediate &other) : value_(other.value_) {}
+
+  int32_t value() const { return value_; }
+
+  bool is_int8() const { return Utils::IsInt(8, value_); }
+  bool is_uint8() const { return Utils::IsUint(8, value_); }
+  bool is_uint16() const { return Utils::IsUint(16, value_); }
+
+private:
+  const int32_t value_;
+};
+
+class Operand {
+public:
+  uint8_t mod() const { return (encoding_at(0) >> 6) & 3; }
+
+  GPRRegister rm() const {
+    return static_cast<GPRRegister>(encoding_at(0) & 7);
+  }
+
+  ScaleFactor scale() const {
+    return static_cast<ScaleFactor>((encoding_at(1) >> 6) & 3);
+  }
+
+  GPRRegister index() const {
+    return static_cast<GPRRegister>((encoding_at(1) >> 3) & 7);
+  }
+
+  GPRRegister base() const {
+    return static_cast<GPRRegister>(encoding_at(1) & 7);
+  }
+
+  int8_t disp8() const {
+    assert(length_ >= 2);
+    return static_cast<int8_t>(encoding_[length_ - 1]);
+  }
+
+  int32_t disp32() const {
+    assert(length_ >= 5);
+    return bit_copy<int32_t>(encoding_[length_ - 4]);
+  }
+
+  AssemblerFixup *fixup() const { return fixup_; }
+
+  Operand(const Operand &other) : length_(other.length_), fixup_(other.fixup_) {
+    memmove(&encoding_[0], &other.encoding_[0], other.length_);
+  }
+
+  Operand &operator=(const Operand &other) {
+    length_ = other.length_;
+    fixup_ = other.fixup_;
+    memmove(&encoding_[0], &other.encoding_[0], other.length_);
+    return *this;
+  }
+
+protected:
+  Operand() : length_(0), fixup_(NULL) {} // Needed by subclass Address.
+
+  void SetModRM(int mod, GPRRegister rm) {
+    assert((mod & ~3) == 0);
+    encoding_[0] = (mod << 6) | rm;
+    length_ = 1;
+  }
+
+  void SetSIB(ScaleFactor scale, GPRRegister index, GPRRegister base) {
+    assert(length_ == 1);
+    assert((scale & ~3) == 0);
+    encoding_[1] = (scale << 6) | (index << 3) | base;
+    length_ = 2;
+  }
+
+  void SetDisp8(int8_t disp) {
+    assert(length_ == 1 || length_ == 2);
+    encoding_[length_++] = static_cast<uint8_t>(disp);
+  }
+
+  void SetDisp32(int32_t disp) {
+    assert(length_ == 1 || length_ == 2);
+    intptr_t disp_size = sizeof(disp);
+    memmove(&encoding_[length_], &disp, disp_size);
+    length_ += disp_size;
+  }
+
+  void SetFixup(AssemblerFixup *fixup) { fixup_ = fixup; }
+
+private:
+  uint8_t length_;
+  uint8_t encoding_[6];
+  uint8_t padding_;
+  AssemblerFixup *fixup_;
+
+  explicit Operand(GPRRegister reg) : fixup_(NULL) { SetModRM(3, reg); }
+
+  // Get the operand encoding byte at the given index.
+  uint8_t encoding_at(intptr_t index) const {
+    assert(index >= 0 && index < length_);
+    return encoding_[index];
+  }
+
+  // Returns whether or not this operand is really the given register in
+  // disguise. Used from the assembler to generate better encodings.
+  bool IsRegister(GPRRegister reg) const {
+    return ((encoding_[0] & 0xF8) == 0xC0) // Addressing mode is register only.
+           && ((encoding_[0] & 0x07) == reg); // Register codes match.
+  }
+
+  friend class AssemblerX86;
+};
+
+class Address : public Operand {
+public:
+  Address(GPRRegister base, int32_t disp) {
+    if (disp == 0 && base != RegX8632::Encoded_Reg_ebp) {
+      SetModRM(0, base);
+      if (base == RegX8632::Encoded_Reg_esp)
+        SetSIB(TIMES_1, RegX8632::Encoded_Reg_esp, base);
+    } else if (Utils::IsInt(8, disp)) {
+      SetModRM(1, base);
+      if (base == RegX8632::Encoded_Reg_esp)
+        SetSIB(TIMES_1, RegX8632::Encoded_Reg_esp, base);
+      SetDisp8(disp);
+    } else {
+      SetModRM(2, base);
+      if (base == RegX8632::Encoded_Reg_esp)
+        SetSIB(TIMES_1, RegX8632::Encoded_Reg_esp, base);
+      SetDisp32(disp);
+    }
+  }
+
+  Address(GPRRegister index, ScaleFactor scale, int32_t disp) {
+    assert(index != RegX8632::Encoded_Reg_esp); // Illegal addressing mode.
+    SetModRM(0, RegX8632::Encoded_Reg_esp);
+    SetSIB(scale, index, RegX8632::Encoded_Reg_ebp);
+    SetDisp32(disp);
+  }
+
+  Address(GPRRegister base, GPRRegister index, ScaleFactor scale,
+          int32_t disp) {
+    assert(index != RegX8632::Encoded_Reg_esp); // Illegal addressing mode.
+    if (disp == 0 && base != RegX8632::Encoded_Reg_ebp) {
+      SetModRM(0, RegX8632::Encoded_Reg_esp);
+      SetSIB(scale, index, base);
+    } else if (Utils::IsInt(8, disp)) {
+      SetModRM(1, RegX8632::Encoded_Reg_esp);
+      SetSIB(scale, index, base);
+      SetDisp8(disp);
+    } else {
+      SetModRM(2, RegX8632::Encoded_Reg_esp);
+      SetSIB(scale, index, base);
+      SetDisp32(disp);
+    }
+  }
+
+  Address(const Address &other) : Operand(other) {}
+
+  Address &operator=(const Address &other) {
+    Operand::operator=(other);
+    return *this;
+  }
+
+  static Address Absolute(const uintptr_t addr, AssemblerFixup *fixup) {
+    Address result;
+    result.SetModRM(0, RegX8632::Encoded_Reg_ebp);
+    result.SetDisp32(addr);
+    result.SetFixup(fixup);
+    return result;
+  }
+
+  static Address ofConstPool(GlobalContext *Ctx, Assembler *Asm,
+                             const Constant *Imm);
+
+private:
+  Address() {} // Needed by Address::Absolute.
+};
+
+class Label {
+public:
+  Label() : position_(0), num_unresolved_(0) {
+#ifdef DEBUG
+    for (int i = 0; i < kMaxUnresolvedBranches; i++) {
+      unresolved_near_positions_[i] = -1;
+    }
+#endif // DEBUG
+  }
+
+  ~Label() {
+    // Assert if label is being destroyed with unresolved branches pending.
+    assert(!IsLinked());
+    assert(!HasNear());
+  }
+
+  // TODO(jvoung): document why label positions are biased by kWordSize.
+  static const uint32_t kWordSize = sizeof(uint32_t);
+
+  // Returns the position for bound labels (branches that come after this
+  // are considered backward branches). Cannot be used for unused or linked
+  // labels.
+  intptr_t Position() const {
+    assert(IsBound());
+    return -position_ - kWordSize;
+  }
+
+  // Returns the position of an earlier branch instruction that was linked
+  // to this label (branches that use this are considered forward branches).
+  // The linked instructions form a linked list, of sorts, using the
+  // instruction's displacement field for the location of the next
+  // instruction that is also linked to this label.
+  intptr_t LinkPosition() const {
+    assert(IsLinked());
+    return position_ - kWordSize;
+  }
+
+  // Returns the position of an earlier branch instruction which
+  // assumes that this label is "near", and bumps iterator to the
+  // next near position.
+  intptr_t NearPosition() {
+    assert(HasNear());
+    return unresolved_near_positions_[--num_unresolved_];
+  }
+
+  bool IsBound() const { return position_ < 0; }
+  bool IsLinked() const { return position_ > 0; }
+  bool IsUnused() const { return (position_ == 0) && (num_unresolved_ == 0); }
+  bool HasNear() const { return num_unresolved_ != 0; }
+
+private:
+  void BindTo(intptr_t position) {
+    assert(!IsBound());
+    assert(!HasNear());
+    position_ = -position - kWordSize;
+    assert(IsBound());
+  }
+
+  void LinkTo(intptr_t position) {
+    assert(!IsBound());
+    position_ = position + kWordSize;
+    assert(IsLinked());
+  }
+
+  void NearLinkTo(intptr_t position) {
+    assert(!IsBound());
+    assert(num_unresolved_ < kMaxUnresolvedBranches);
+    unresolved_near_positions_[num_unresolved_++] = position;
+  }
+
+  static const int kMaxUnresolvedBranches = 20;
+
+  intptr_t position_;
+  intptr_t num_unresolved_;
+  intptr_t unresolved_near_positions_[kMaxUnresolvedBranches];
+
+  friend class AssemblerX86;
+  Label(const Label &) LLVM_DELETED_FUNCTION;
+  Label &operator=(const Label &) LLVM_DELETED_FUNCTION;
+};
+
+class AssemblerX86 : public Assembler {
+public:
+  explicit AssemblerX86(bool use_far_branches = false) : buffer_(*this) {
+    // This mode is only needed and implemented for MIPS and ARM.
+    assert(!use_far_branches);
+  }
+  ~AssemblerX86() {}
+
+  static const bool kNearJump = true;
+  static const bool kFarJump = false;
+
+  // Operations to emit XMM instructions (and dispatch on operand type).
+  typedef void (AssemblerX86::*TypedEmitXmmXmm)(Type, XmmRegister, XmmRegister);
+  typedef void (AssemblerX86::*TypedEmitXmmAddr)(Type, XmmRegister,
+                                                 const Address &);
+  typedef void (AssemblerX86::*TypedEmitAddrXmm)(Type, const Address &,
+                                                 XmmRegister);
+  struct TypedXmmEmitters {
+    TypedEmitXmmXmm XmmXmm;
+    TypedEmitXmmAddr XmmAddr;
+    TypedEmitAddrXmm AddrXmm;
+  };
+
+  /*
+   * Emit Machine Instructions.
+   */
+  void call(GPRRegister reg);
+  void call(const Address &address);
+  void call(Label *label);
+  void call(const ConstantRelocatable *label);
+
+  static const intptr_t kCallExternalLabelSize = 5;
+
+  void pushl(GPRRegister reg);
+  void pushl(const Address &address);
+  void pushl(const Immediate &imm);
+
+  void popl(GPRRegister reg);
+  void popl(const Address &address);
+
+  void pushal();
+  void popal();
+
+  void setcc(CondX86::BrCond condition, ByteRegister dst);
+
+  void movl(GPRRegister dst, const Immediate &src);
+  void movl(GPRRegister dst, GPRRegister src);
+
+  void movl(GPRRegister dst, const Address &src);
+  void movl(const Address &dst, GPRRegister src);
+  void movl(const Address &dst, const Immediate &imm);
+
+  void movzxb(GPRRegister dst, ByteRegister src);
+  void movzxb(GPRRegister dst, const Address &src);
+  void movsxb(GPRRegister dst, ByteRegister src);
+  void movsxb(GPRRegister dst, const Address &src);
+
+  void movb(ByteRegister dst, const Address &src);
+  void movb(const Address &dst, ByteRegister src);
+  void movb(const Address &dst, const Immediate &imm);
+
+  void movzxw(GPRRegister dst, GPRRegister src);
+  void movzxw(GPRRegister dst, const Address &src);
+  void movsxw(GPRRegister dst, GPRRegister src);
+  void movsxw(GPRRegister dst, const Address &src);
+  void movw(GPRRegister dst, const Address &src);
+  void movw(const Address &dst, GPRRegister src);
+
+  void leal(GPRRegister dst, const Address &src);
+
+  void cmov(CondX86::BrCond cond, GPRRegister dst, GPRRegister src);
+
+  void rep_movsb();
+
+  void movss(XmmRegister dst, const Address &src);
+  void movss(const Address &dst, XmmRegister src);
+  void movss(XmmRegister dst, XmmRegister src);
+
+  void movd(XmmRegister dst, GPRRegister src);
+  void movd(GPRRegister dst, XmmRegister src);
+
+  void movq(const Address &dst, XmmRegister src);
+  void movq(XmmRegister dst, const Address &src);
+
+  void addss(Type Ty, XmmRegister dst, XmmRegister src);
+  void addss(Type Ty, XmmRegister dst, const Address &src);
+  void subss(Type Ty, XmmRegister dst, XmmRegister src);
+  void subss(Type Ty, XmmRegister dst, const Address &src);
+  void mulss(Type Ty, XmmRegister dst, XmmRegister src);
+  void mulss(Type Ty, XmmRegister dst, const Address &src);
+  void divss(Type Ty, XmmRegister dst, XmmRegister src);
+  void divss(Type Ty, XmmRegister dst, const Address &src);
+
+  void movsd(XmmRegister dst, const Address &src);
+  void movsd(const Address &dst, XmmRegister src);
+  void movsd(XmmRegister dst, XmmRegister src);
+
+  void movaps(XmmRegister dst, XmmRegister src);
+
+  void movups(XmmRegister dst, const Address &src);
+  void movups(const Address &dst, XmmRegister src);
+
+  void padd(Type Ty, XmmRegister dst, XmmRegister src);
+  void padd(Type Ty, XmmRegister dst, const Address &src);
+  void pand(Type Ty, XmmRegister dst, XmmRegister src);
+  void pand(Type Ty, XmmRegister dst, const Address &src);
+  void pandn(Type Ty, XmmRegister dst, XmmRegister src);
+  void pandn(Type Ty, XmmRegister dst, const Address &src);
+  void pmuludq(Type Ty, XmmRegister dst, XmmRegister src);
+  void pmuludq(Type Ty, XmmRegister dst, const Address &src);
+  void por(Type Ty, XmmRegister dst, XmmRegister src);
+  void por(Type Ty, XmmRegister dst, const Address &src);
+  void psub(Type Ty, XmmRegister dst, XmmRegister src);
+  void psub(Type Ty, XmmRegister dst, const Address &src);
+  void pxor(Type Ty, XmmRegister dst, XmmRegister src);
+  void pxor(Type Ty, XmmRegister dst, const Address &src);
+
+  void addps(Type Ty, XmmRegister dst, XmmRegister src);
+  void addps(Type Ty, XmmRegister dst, const Address &src);
+  void subps(Type Ty, XmmRegister dst, XmmRegister src);
+  void subps(Type Ty, XmmRegister dst, const Address &src);
+  void divps(Type Ty, XmmRegister dst, XmmRegister src);
+  void divps(Type Ty, XmmRegister dst, const Address &src);
+  void mulps(Type Ty, XmmRegister dst, XmmRegister src);
+  void mulps(Type Ty, XmmRegister dst, const Address &src);
+  void minps(XmmRegister dst, XmmRegister src);
+  void maxps(XmmRegister dst, XmmRegister src);
+  void andps(XmmRegister dst, XmmRegister src);
+  void andps(XmmRegister dst, const Address &src);
+  void orps(XmmRegister dst, XmmRegister src);
+
+  void cmpps(XmmRegister dst, XmmRegister src, CondX86::CmppsCond CmpCondition);
+  void cmpps(XmmRegister dst, const Address &src,
+             CondX86::CmppsCond CmpCondition);
+
+  void sqrtps(XmmRegister dst);
+  void rsqrtps(XmmRegister dst);
+  void reciprocalps(XmmRegister dst);
+  void movhlps(XmmRegister dst, XmmRegister src);
+  void movlhps(XmmRegister dst, XmmRegister src);
+  void unpcklps(XmmRegister dst, XmmRegister src);
+  void unpckhps(XmmRegister dst, XmmRegister src);
+  void unpcklpd(XmmRegister dst, XmmRegister src);
+  void unpckhpd(XmmRegister dst, XmmRegister src);
+
+  void set1ps(XmmRegister dst, GPRRegister tmp, const Immediate &imm);
+  void shufps(XmmRegister dst, XmmRegister src, const Immediate &mask);
+
+  void minpd(XmmRegister dst, XmmRegister src);
+  void maxpd(XmmRegister dst, XmmRegister src);
+  void sqrtpd(XmmRegister dst);
+  void cvtps2pd(XmmRegister dst, XmmRegister src);
+  void cvtpd2ps(XmmRegister dst, XmmRegister src);
+  void shufpd(XmmRegister dst, XmmRegister src, const Immediate &mask);
+
+  void cvtsi2ss(XmmRegister dst, GPRRegister src);
+  void cvtsi2sd(XmmRegister dst, GPRRegister src);
+
+  void cvtss2si(GPRRegister dst, XmmRegister src);
+  void cvtss2sd(XmmRegister dst, XmmRegister src);
+
+  void cvtsd2si(GPRRegister dst, XmmRegister src);
+  void cvtsd2ss(XmmRegister dst, XmmRegister src);
+
+  void cvttss2si(GPRRegister dst, XmmRegister src);
+  void cvttsd2si(GPRRegister dst, XmmRegister src);
+
+  void cvtdq2pd(XmmRegister dst, XmmRegister src);
+
+  void ucomiss(Type Ty, XmmRegister a, XmmRegister b);
+  void ucomiss(Type Ty, XmmRegister a, const Address &b);
+
+  void movmskpd(GPRRegister dst, XmmRegister src);
+  void movmskps(GPRRegister dst, XmmRegister src);
+
+  void sqrtss(Type Ty, XmmRegister dst, const Address &src);
+  void sqrtss(Type Ty, XmmRegister dst, XmmRegister src);
+
+  void xorpd(XmmRegister dst, const Address &src);
+  void xorpd(XmmRegister dst, XmmRegister src);
+  void xorps(XmmRegister dst, const Address &src);
+  void xorps(XmmRegister dst, XmmRegister src);
+
+  void andpd(XmmRegister dst, const Address &src);
+  void andpd(XmmRegister dst, XmmRegister src);
+
+  void orpd(XmmRegister dst, XmmRegister src);
+
+  void pextrd(GPRRegister dst, XmmRegister src, const Immediate &imm);
+  void pmovsxdq(XmmRegister dst, XmmRegister src);
+  void pcmpeqq(XmmRegister dst, XmmRegister src);
+
+  enum RoundingMode {
+    kRoundToNearest = 0x0,
+    kRoundDown = 0x1,
+    kRoundUp = 0x2,
+    kRoundToZero = 0x3
+  };
+  void roundsd(XmmRegister dst, XmmRegister src, RoundingMode mode);
+
+  void flds(const Address &src);
+  void fstps(const Address &dst);
+
+  void fldl(const Address &src);
+  void fstpl(const Address &dst);
+
+  void fnstcw(const Address &dst);
+  void fldcw(const Address &src);
+
+  void fistpl(const Address &dst);
+  void fistps(const Address &dst);
+  void fildl(const Address &src);
+  void filds(const Address &src);
+
+  void fincstp();
+
+  void xchgl(GPRRegister dst, GPRRegister src);
+
+  void cmpl(GPRRegister reg, const Immediate &imm);
+  void cmpl(GPRRegister reg0, GPRRegister reg1);
+  void cmpl(GPRRegister reg, const Address &address);
+  void cmpl(const Address &address, GPRRegister reg);
+  void cmpl(const Address &address, const Immediate &imm);
+  void cmpb(const Address &address, const Immediate &imm);
+
+  void testl(GPRRegister reg1, GPRRegister reg2);
+  void testl(GPRRegister reg, const Immediate &imm);
+
+  void andl(GPRRegister dst, const Immediate &imm);
+  void andl(GPRRegister dst, GPRRegister src);
+  void andl(GPRRegister dst, const Address &address);
+
+  void orl(GPRRegister dst, const Immediate &imm);
+  void orl(GPRRegister dst, GPRRegister src);
+  void orl(GPRRegister dst, const Address &address);
+
+  void xorl(GPRRegister dst, const Immediate &imm);
+  void xorl(GPRRegister dst, GPRRegister src);
+  void xorl(GPRRegister dst, const Address &address);
+
+  void addl(GPRRegister dst, GPRRegister src);
+  void addl(GPRRegister reg, const Immediate &imm);
+  void addl(GPRRegister reg, const Address &address);
+
+  void addl(const Address &address, GPRRegister reg);
+  void addl(const Address &address, const Immediate &imm);
+
+  void adcl(GPRRegister dst, GPRRegister src);
+  void adcl(GPRRegister reg, const Immediate &imm);
+  void adcl(GPRRegister dst, const Address &address);
+  void adcl(const Address &dst, GPRRegister src);
+
+  void subl(GPRRegister dst, GPRRegister src);
+  void subl(GPRRegister reg, const Immediate &imm);
+  void subl(GPRRegister reg, const Address &address);
+  void subl(const Address &address, GPRRegister reg);
+
+  void cdq();
+
+  void idivl(GPRRegister reg);
+
+  void imull(GPRRegister dst, GPRRegister src);
+  void imull(GPRRegister reg, const Immediate &imm);
+  void imull(GPRRegister reg, const Address &address);
+
+  void imull(GPRRegister reg);
+  void imull(const Address &address);
+
+  void mull(GPRRegister reg);
+  void mull(const Address &address);
+
+  void sbbl(GPRRegister dst, GPRRegister src);
+  void sbbl(GPRRegister reg, const Immediate &imm);
+  void sbbl(GPRRegister reg, const Address &address);
+  void sbbl(const Address &address, GPRRegister reg);
+
+  void incl(GPRRegister reg);
+  void incl(const Address &address);
+
+  void decl(GPRRegister reg);
+  void decl(const Address &address);
+
+  void shll(GPRRegister reg, const Immediate &imm);
+  void shll(GPRRegister operand, GPRRegister shifter);
+  void shll(const Address &operand, GPRRegister shifter);
+  void shrl(GPRRegister reg, const Immediate &imm);
+  void shrl(GPRRegister operand, GPRRegister shifter);
+  void sarl(GPRRegister reg, const Immediate &imm);
+  void sarl(GPRRegister operand, GPRRegister shifter);
+  void sarl(const Address &address, GPRRegister shifter);
+  void shld(GPRRegister dst, GPRRegister src);
+  void shld(GPRRegister dst, GPRRegister src, const Immediate &imm);
+  void shld(const Address &operand, GPRRegister src);
+  void shrd(GPRRegister dst, GPRRegister src);
+  void shrd(GPRRegister dst, GPRRegister src, const Immediate &imm);
+  void shrd(const Address &dst, GPRRegister src);
+
+  void negl(GPRRegister reg);
+  void notl(GPRRegister reg);
+
+  void bsrl(GPRRegister dst, GPRRegister src);
+
+  void bt(GPRRegister base, GPRRegister offset);
+
+  void ret();
+  void ret(const Immediate &imm);
+
+  // 'size' indicates size in bytes and must be in the range 1..8.
+  void nop(int size = 1);
+  void int3();
+  void hlt();
+
+  void j(CondX86::BrCond condition, Label *label, bool near = kFarJump);
+  void j(CondX86::BrCond condition, const ConstantRelocatable *label);
+
+  void jmp(GPRRegister reg);
+  void jmp(Label *label, bool near = kFarJump);
+  void jmp(const ConstantRelocatable *label);
+
+  void lock();
+  void cmpxchgl(const Address &address, GPRRegister reg);
+
+  void LockCmpxchgl(const Address &address, GPRRegister reg) {
+    lock();
+    cmpxchgl(address, reg);
+  }
+
+  intptr_t PreferredLoopAlignment() { return 16; }
+  void Align(intptr_t alignment, intptr_t offset);
+  void Bind(Label *label);
+
+  intptr_t CodeSize() const { return buffer_.Size(); }
+
+  void FinalizeInstructions(const MemoryRegion &region) {
+    buffer_.FinalizeInstructions(region);
+  }
+
+  // Expose the buffer, for bringup...
+  intptr_t GetPosition() const { return buffer_.GetPosition(); }
+  template <typename T> T LoadBuffer(intptr_t position) const {
+    return buffer_.Load<T>(position);
+  }
+  AssemblerFixup *GetLatestFixup() const { return buffer_.GetLatestFixup(); }
+
+private:
+  inline void EmitUint8(uint8_t value);
+  inline void EmitInt32(int32_t value);
+  inline void EmitRegisterOperand(int rm, int reg);
+  inline void EmitXmmRegisterOperand(int rm, XmmRegister reg);
+  inline void EmitFixup(AssemblerFixup *fixup);
+  inline void EmitOperandSizeOverride();
+
+  void EmitOperand(int rm, const Operand &operand);
+  void EmitImmediate(const Immediate &imm);
+  void EmitComplexI8(int rm, const Operand &operand,
+                     const Immediate &immediate);
+  void EmitComplex(int rm, const Operand &operand, const Immediate &immediate);
+  void EmitLabel(Label *label, intptr_t instruction_size);
+  void EmitLabelLink(Label *label);
+  void EmitNearLabelLink(Label *label);
+
+  void EmitGenericShift(int rm, GPRRegister reg, const Immediate &imm);
+  void EmitGenericShift(int rm, const Operand &operand, GPRRegister shifter);
+
+  AssemblerBuffer buffer_;
+
+  AssemblerX86(const AssemblerX86 &) LLVM_DELETED_FUNCTION;
+  AssemblerX86 &operator=(const AssemblerX86 &) LLVM_DELETED_FUNCTION;
+};
+
+inline void AssemblerX86::EmitUint8(uint8_t value) {
+  buffer_.Emit<uint8_t>(value);
+}
+
+inline void AssemblerX86::EmitInt32(int32_t value) {
+  buffer_.Emit<int32_t>(value);
+}
+
+inline void AssemblerX86::EmitRegisterOperand(int rm, int reg) {
+  assert(rm >= 0 && rm < 8);
+  buffer_.Emit<uint8_t>(0xC0 + (rm << 3) + reg);
+}
+
+inline void AssemblerX86::EmitXmmRegisterOperand(int rm, XmmRegister reg) {
+  EmitRegisterOperand(rm, static_cast<GPRRegister>(reg));
+}
+
+inline void AssemblerX86::EmitFixup(AssemblerFixup *fixup) {
+  buffer_.EmitFixup(fixup);
+}
+
+inline void AssemblerX86::EmitOperandSizeOverride() { EmitUint8(0x66); }
+
+} // end of namespace x86
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ASSEMBLER_IA32_H_
diff --git a/src/llvm2ice.cpp b/src/llvm2ice.cpp
index f147877..3da3f4f 100644
--- a/src/llvm2ice.cpp
+++ b/src/llvm2ice.cpp
@@ -130,6 +130,11 @@
             cl::desc("Build ICE instructions when reading bitcode"),
             cl::init(false));
 
+static cl::opt<bool>
+    UseIntegratedAssembler("integrated-as",
+                           cl::desc("Use integrated assembler (default yes)"),
+                           cl::init(true));
+
 int main(int argc, char **argv) {
 
   cl::ParseCommandLineOptions(argc, argv);
@@ -158,6 +163,7 @@
   Flags.DisableTranslation = DisableTranslation;
   Flags.DisableGlobals = DisableGlobals;
   Flags.FunctionSections = FunctionSections;
+  Flags.UseIntegratedAssembler = UseIntegratedAssembler;
   Flags.UseSandboxing = UseSandboxing;
   Flags.DumpStats = DumpStats;
   Flags.DefaultGlobalPrefix = DefaultGlobalPrefix;