Adds the x86-64 assembler.

As part of this CL, x86-32 assembler tests are also introduced. They were implemented before the x86 base assembler template was modified for x86-64 support.

BUG=https://code.google.com/p/nativeclient/issues/detail?id=4077
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1224173006.
diff --git a/src/IceAssemblerX8664.cpp b/src/IceAssemblerX8664.cpp
deleted file mode 100644
index 910924d..0000000
--- a/src/IceAssemblerX8664.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-//===- subzero/src/IceAssemblerX8664.cpp ----------------------------------===//
-//
-//                        The Subzero Code Generator
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements the Assembler class for x86-64.
-///
-//===----------------------------------------------------------------------===//
-
-#include "IceAssemblerX8664.h"
-
-namespace Ice {
-namespace X8664 {
-
-void AssemblerX8664::alignFunction() {
-  llvm::report_fatal_error("Not yet implemented");
-}
-
-void AssemblerX8664::padWithNop(intptr_t) {
-  llvm::report_fatal_error("Not yet implemented");
-}
-
-SizeT AssemblerX8664::getBundleAlignLog2Bytes() const {
-  llvm::report_fatal_error("Not yet implemented");
-}
-const char *AssemblerX8664::getNonExecPadDirective() const {
-  llvm::report_fatal_error("Not yet implemented");
-}
-
-llvm::ArrayRef<uint8_t> AssemblerX8664::getNonExecBundlePadding() const {
-  llvm::report_fatal_error("Not yet implemented");
-}
-
-void AssemblerX8664::bindCfgNodeLabel(SizeT) {
-  llvm::report_fatal_error("Not yet implemented");
-}
-
-bool AssemblerX8664::fixupIsPCRel(FixupKind) const {
-  llvm::report_fatal_error("Not yet implemented");
-}
-
-} // namespace X8664
-} // namespace Ice
diff --git a/src/IceAssemblerX8664.h b/src/IceAssemblerX8664.h
index f2ffd7f..5666810 100644
--- a/src/IceAssemblerX8664.h
+++ b/src/IceAssemblerX8664.h
@@ -1,4 +1,12 @@
-//===- subzero/src/IceAssemblerX8664.h - Assembler for x86-64 -*- C++ -*---===//
+//===- subzero/src/IceAssemblerX8664.h - Assembler for x86-64 ---*- C++ -*-===//
+//
+// Copyright (c) 2013, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+//
+// Modified by the Subzero authors.
+//
+//===----------------------------------------------------------------------===//
 //
 //                        The Subzero Code Generator
 //
@@ -16,33 +24,32 @@
 #define SUBZERO_SRC_ICEASSEMBLERX8664_H
 
 #include "IceAssembler.h"
+#include "IceAssemblerX86Base.h"
 #include "IceDefs.h"
+#include "IceOperand.h"
+#include "IceTargetLoweringX8664Traits.h"
+#include "IceTypes.h"
+#include "IceUtils.h"
 
 namespace Ice {
+
+class TargetX8664;
+
 namespace X8664 {
 
-class AssemblerX8664 final : public Assembler {
+using Immediate = ::Ice::X86Internal::Immediate;
+using Label = ::Ice::X86Internal::Label;
+
+class AssemblerX8664 : public X86Internal::AssemblerX86Base<TargetX8664> {
   AssemblerX8664(const AssemblerX8664 &) = delete;
   AssemblerX8664 &operator=(const AssemblerX8664 &) = delete;
 
 public:
   explicit AssemblerX8664(bool use_far_branches = false)
-      : Assembler(Asm_X8664) {
-    assert(!use_far_branches);
-    (void)use_far_branches;
-    llvm::report_fatal_error("Not yet implemented");
-  }
-
+      : X86Internal::AssemblerX86Base<TargetX8664>(Asm_X8664,
+                                                   use_far_branches) {}
   ~AssemblerX8664() override = default;
 
-  void alignFunction() override;
-  void padWithNop(intptr_t Padding) override;
-  SizeT getBundleAlignLog2Bytes() const override;
-  const char *getNonExecPadDirective() const override;
-  llvm::ArrayRef<uint8_t> getNonExecBundlePadding() const override;
-  void bindCfgNodeLabel(SizeT NodeNumber) override;
-  bool fixupIsPCRel(FixupKind Kind) const override;
-
   static bool classof(const Assembler *Asm) {
     return Asm->getKind() == Asm_X8664;
   }
diff --git a/src/IceAssemblerX86Base.h b/src/IceAssemblerX86Base.h
index 1db270a..4d0333a 100644
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -314,7 +314,7 @@
    */
   void call(typename Traits::GPRRegister reg);
   void call(const typename Traits::Address &address);
-  void call(const ConstantRelocatable *label);
+  void call(const ConstantRelocatable *label); // not testable.
   void call(const Immediate &abs_address);
 
   static const intptr_t kCallExternalLabelSize = 5;
@@ -324,7 +324,11 @@
   void popl(typename Traits::GPRRegister reg);
   void popl(const typename Traits::Address &address);
 
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::HasPusha>::type>
   void pushal();
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::HasPopa>::type>
   void popal();
 
   void setcc(typename Traits::Cond::BrCond condition,
@@ -332,7 +336,6 @@
   void setcc(typename Traits::Cond::BrCond condition,
              const typename Traits::Address &address);
 
-  // All mov() overloads are tested.
   void mov(Type Ty, typename Traits::GPRRegister dst, const Immediate &src);
   void mov(Type Ty, typename Traits::GPRRegister dst,
            typename Traits::GPRRegister src);
@@ -342,6 +345,8 @@
            typename Traits::GPRRegister src);
   void mov(Type Ty, const typename Traits::Address &dst, const Immediate &imm);
 
+  void movFromAh(const typename Traits::GPRRegister dst);
+
   void movzx(Type Ty, typename Traits::GPRRegister dst,
              typename Traits::GPRRegister src);
   void movzx(Type Ty, typename Traits::GPRRegister dst,
@@ -503,6 +508,7 @@
   void sqrtps(typename Traits::XmmRegister dst);
   void rsqrtps(typename Traits::XmmRegister dst);
   void reciprocalps(typename Traits::XmmRegister dst);
+
   void movhlps(typename Traits::XmmRegister dst,
                typename Traits::XmmRegister src);
   void movlhps(typename Traits::XmmRegister dst,
@@ -518,16 +524,12 @@
 
   void set1ps(typename Traits::XmmRegister dst,
               typename Traits::GPRRegister tmp, const Immediate &imm);
-  void shufps(typename Traits::XmmRegister dst,
-              typename Traits::XmmRegister src, const Immediate &mask);
 
   void minpd(typename Traits::XmmRegister dst,
              typename Traits::XmmRegister src);
   void maxpd(typename Traits::XmmRegister dst,
              typename Traits::XmmRegister src);
   void sqrtpd(typename Traits::XmmRegister dst);
-  void shufpd(typename Traits::XmmRegister dst,
-              typename Traits::XmmRegister src, const Immediate &mask);
 
   void pshufd(Type Ty, typename Traits::XmmRegister dst,
               typename Traits::XmmRegister src, const Immediate &mask);
@@ -606,8 +608,6 @@
 
   void pextr(Type Ty, typename Traits::GPRRegister dst,
              typename Traits::XmmRegister src, const Immediate &imm);
-  void pextr(Type Ty, typename Traits::GPRRegister dst,
-             const typename Traits::Address &src, const Immediate &imm);
 
   void pmovsxdq(typename Traits::XmmRegister dst,
                 typename Traits::XmmRegister src);
@@ -630,19 +630,49 @@
   void roundsd(typename Traits::XmmRegister dst,
                typename Traits::XmmRegister src, RoundingMode mode);
 
-  void fld(Type Ty, const typename Traits::Address &src);
-  void fstp(Type Ty, const typename Traits::Address &dst);
-  void fstp(typename Traits::X87STRegister st);
+  //----------------------------------------------------------------------------
+  //
+  // Begin: X87 instructions. Only available when Traits::UsesX87.
+  //
+  //----------------------------------------------------------------------------
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void fld(Type Ty, const typename T::Address &src);
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void fstp(Type Ty, const typename T::Address &dst);
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void fstp(typename T::X87STRegister st);
 
-  void fnstcw(const typename Traits::Address &dst);
-  void fldcw(const typename Traits::Address &src);
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void fnstcw(const typename T::Address &dst);
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void fldcw(const typename T::Address &src);
 
-  void fistpl(const typename Traits::Address &dst);
-  void fistps(const typename Traits::Address &dst);
-  void fildl(const typename Traits::Address &src);
-  void filds(const typename Traits::Address &src);
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void fistpl(const typename T::Address &dst);
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void fistps(const typename T::Address &dst);
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void fildl(const typename T::Address &src);
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void filds(const typename T::Address &src);
 
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
   void fincstp();
+  //----------------------------------------------------------------------------
+  //
+  // End: X87 instructions.
+  //
+  //----------------------------------------------------------------------------
 
   void cmp(Type Ty, typename Traits::GPRRegister reg0,
            typename Traits::GPRRegister reg1);
@@ -754,9 +784,13 @@
   void mul(Type Ty, typename Traits::GPRRegister reg);
   void mul(Type Ty, const typename Traits::Address &address);
 
+  template <class T = Traits,
+            typename = typename std::enable_if<!T::Is64Bit>::type>
   void incl(typename Traits::GPRRegister reg);
   void incl(const typename Traits::Address &address);
 
+  template <class T = Traits,
+            typename = typename std::enable_if<!T::Is64Bit>::type>
   void decl(typename Traits::GPRRegister reg);
   void decl(const typename Traits::Address &address);
 
@@ -825,16 +859,14 @@
   void ud2();
 
   // j(Label) is fully tested.
-  // j(ConstantRelocatable) is not tested as the test can not easily create such
-  // an argument.
   void j(typename Traits::Cond::BrCond condition, Label *label,
          bool near = kFarJump);
   void j(typename Traits::Cond::BrCond condition,
-         const ConstantRelocatable *label);
+         const ConstantRelocatable *label); // not testable.
 
   void jmp(typename Traits::GPRRegister reg);
   void jmp(Label *label, bool near = kFarJump);
-  void jmp(const ConstantRelocatable *label);
+  void jmp(const ConstantRelocatable *label); // not testable.
 
   void mfence();
 
@@ -855,12 +887,20 @@
 
   intptr_t CodeSize() const { return Buffer.size(); }
 
-private:
+protected:
   inline void emitUint8(uint8_t value);
+
+private:
+  static constexpr Type RexTypeIrrelevant = IceType_i32;
+  static constexpr Type IceType_ForceRexW = IceType_i64;
+  static constexpr typename Traits::GPRRegister RexRegIrrelevant =
+      Traits::GPRRegister::Encoded_Reg_eax;
+
   inline void emitInt16(int16_t value);
   inline void emitInt32(int32_t value);
   inline void emitRegisterOperand(int rm, int reg);
-  inline void emitXmmRegisterOperand(int rm, typename Traits::XmmRegister reg);
+  template <typename RegType, typename RmType>
+  inline void emitXmmRegisterOperand(RegType reg, RmType rm);
   inline void emitFixup(AssemblerFixup *fixup);
   inline void emitOperandSizeOverride();
 
@@ -910,6 +950,100 @@
   template <uint32_t Tag>
   void arith_int(Type Ty, const typename Traits::Address &address,
                  const Immediate &imm);
+
+  // gprEncoding returns Reg encoding for operand emission. For x86-64 we mask
+  // out the 4th bit (0x08) as it is encoded in the REX.[RXB] bits. No other
+  // bits are touched because we don't want to mask errors.
+  template <typename RegType, typename T = Traits>
+  typename std::enable_if<T::Is64Bit, typename T::GPRRegister>::type
+  gprEncoding(const RegType Reg) {
+    return static_cast<typename Traits::GPRRegister>(static_cast<uint8_t>(Reg) &
+                                                     ~0x08);
+  }
+
+  template <typename RegType, typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, typename T::GPRRegister>::type
+  gprEncoding(const RegType Reg) {
+    return static_cast<typename T::GPRRegister>(Reg);
+  }
+
+  template <typename RegType>
+  bool is8BitRegisterRequiringRex(const Type Ty, const RegType Reg) {
+    static constexpr bool IsGPR =
+        std::is_same<typename std::decay<RegType>::type,
+                     typename Traits::ByteRegister>::value ||
+        std::is_same<typename std::decay<RegType>::type,
+                     typename Traits::GPRRegister>::value;
+
+    return IsGPR && (Reg & 0x04) != 0 && (Reg & 0x08) == 0 &&
+           isByteSizedArithType(Ty);
+  };
+
+  // assembleAndEmitRex is used for determining which (if any) rex prefix should
+  // be emitted for the current instruction. It allows different types for Reg
+  // and Rm because they could be of different types (e.g., in mov[sz]x
+  // instructions.) If Addr is not nullptr, then Rm is ignored, and Rex.B is
+  // determined by Addr instead. TyRm is still used to determine Addr's size.
+  template <typename RegType, typename RmType, typename T = Traits>
+  typename std::enable_if<T::Is64Bit, void>::type
+  assembleAndEmitRex(const Type TyReg, const RegType Reg, const Type TyRm,
+                     const RmType Rm,
+                     const typename T::Address *Addr = nullptr) {
+    const uint8_t W = (TyReg == IceType_i64 || TyRm == IceType_i64)
+                          ? T::Operand::RexW
+                          : T::Operand::RexNone;
+    const uint8_t R = (Reg & 0x08) ? T::Operand::RexR : T::Operand::RexNone;
+    const uint8_t X = (Addr != nullptr) ? Addr->rexX() : T::Operand::RexNone;
+    const uint8_t B =
+        (Addr != nullptr) ? Addr->rexB() : (Rm & 0x08) ? T::Operand::RexB
+                                                       : T::Operand::RexNone;
+    const uint8_t Prefix = W | R | X | B;
+    if (Prefix != T::Operand::RexNone) {
+      emitUint8(Prefix);
+    } else if (is8BitRegisterRequiringRex(TyReg, Reg) ||
+               (Addr == nullptr && is8BitRegisterRequiringRex(TyRm, Rm))) {
+      emitUint8(T::Operand::RexBase);
+    }
+  }
+
+  template <typename RegType, typename RmType, typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, void>::type
+  assembleAndEmitRex(const Type, const RegType, const Type, const RmType,
+                     const typename T::Address * = nullptr) {}
+
+  // emitRexRB is used for emitting a Rex prefix for instructions with two
+  // explicit register operands in their mod-rm byte.
+  template <typename RegType, typename RmType>
+  void emitRexRB(const Type Ty, const RegType Reg, const RmType Rm) {
+    assembleAndEmitRex(Ty, Reg, Ty, Rm);
+  }
+
+  template <typename RegType, typename RmType>
+  void emitRexRB(const Type TyReg, const RegType Reg, const Type TyRm,
+                 const RmType Rm) {
+    assembleAndEmitRex(TyReg, Reg, TyRm, Rm);
+  }
+
+  // emitRexB is used for emitting a Rex prefix if one is needed on encoding the
+  // single register operand of an x86 instruction (e.g., push Reg.) It is
+  // passed as Rm, so that operand's 4th bit (0x08) is what selects Rex.B.
+  template <typename RmType> void emitRexB(const Type Ty, const RmType Rm) {
+    emitRexRB(Ty, RexRegIrrelevant, Ty, Rm);
+  }
+
+  // emitRex is used for emitting a Rex prefix for an address and a GPR. The
+  // address may contain zero, one, or two registers.
+  template <typename RegType>
+  void emitRex(const Type Ty, const typename Traits::Address &Addr,
+               const RegType Reg) {
+    assembleAndEmitRex(Ty, Reg, Ty, RexRegIrrelevant, &Addr);
+  }
+
+  template <typename RegType>
+  void emitRex(const Type AddrTy, const typename Traits::Address &Addr,
+               const Type TyReg, const RegType Reg) {
+    assembleAndEmitRex(TyReg, Reg, AddrTy, RexRegIrrelevant, &Addr);
+  }
 };
 
 template <class Machine>
@@ -928,15 +1062,17 @@
 }
 
 template <class Machine>
-inline void AssemblerX86Base<Machine>::emitRegisterOperand(int rm, int reg) {
+inline void AssemblerX86Base<Machine>::emitRegisterOperand(int reg, int rm) {
+  assert(reg >= 0 && reg < 8);
   assert(rm >= 0 && rm < 8);
-  Buffer.emit<uint8_t>(0xC0 + (rm << 3) + reg);
+  Buffer.emit<uint8_t>(0xC0 + (reg << 3) + rm);
 }
 
 template <class Machine>
-inline void AssemblerX86Base<Machine>::emitXmmRegisterOperand(
-    int rm, typename Traits::XmmRegister reg) {
-  emitRegisterOperand(rm, static_cast<typename Traits::GPRRegister>(reg));
+template <typename RegType, typename RmType>
+inline void AssemblerX86Base<Machine>::emitXmmRegisterOperand(RegType reg,
+                                                              RmType rm) {
+  emitRegisterOperand(gprEncoding(reg), gprEncoding(rm));
 }
 
 template <class Machine>
diff --git a/src/IceAssemblerX86BaseImpl.h b/src/IceAssemblerX86BaseImpl.h
index f8ba4d4..0661c9f 100644
--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -96,13 +96,15 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::call(typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexB(RexTypeIrrelevant, reg);
   emitUint8(0xFF);
-  emitRegisterOperand(2, reg);
+  emitRegisterOperand(2, gprEncoding(reg));
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::call(const typename Traits::Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, address, RexRegIrrelevant);
   emitUint8(0xFF);
   emitOperand(2, address);
 }
@@ -132,28 +134,37 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::pushl(typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  emitUint8(0x50 + reg);
+  emitRexB(RexTypeIrrelevant, reg);
+  emitUint8(0x50 + gprEncoding(reg));
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::popl(typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  emitUint8(0x58 + reg);
+  // Any type that would not force a REX prefix to be emitted can be provided
+  // here.
+  emitRexB(RexTypeIrrelevant, reg);
+  emitUint8(0x58 + gprEncoding(reg));
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::popl(const typename Traits::Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, address, RexRegIrrelevant);
   emitUint8(0x8F);
   emitOperand(0, address);
 }
 
-template <class Machine> void AssemblerX86Base<Machine>::pushal() {
+template <class Machine>
+template <typename, typename>
+void AssemblerX86Base<Machine>::pushal() {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x60);
 }
 
-template <class Machine> void AssemblerX86Base<Machine>::popal() {
+template <class Machine>
+template <typename, typename>
+void AssemblerX86Base<Machine>::popal() {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x61);
 }
@@ -162,15 +173,17 @@
 void AssemblerX86Base<Machine>::setcc(typename Traits::Cond::BrCond condition,
                                       typename Traits::ByteRegister dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexB(IceType_i8, dst);
   emitUint8(0x0F);
   emitUint8(0x90 + condition);
-  emitUint8(0xC0 + dst);
+  emitUint8(0xC0 + gprEncoding(dst));
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::setcc(typename Traits::Cond::BrCond condition,
                                       const typename Traits::Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, address, RexRegIrrelevant);
   emitUint8(0x0F);
   emitUint8(0x90 + condition);
   emitOperand(0, address);
@@ -179,16 +192,18 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::mov(Type Ty, typename Traits::GPRRegister dst,
                                     const Immediate &imm) {
+  assert(Ty != IceType_i64 && "i64 not supported yet.");
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  if (isByteSizedType(Ty)) {
-    emitUint8(0xB0 + dst);
-    emitUint8(imm.value() & 0xFF);
-    return;
-  }
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
-  emitUint8(0xB8 + dst);
-  emitImmediate(Ty, imm);
+  emitRexB(Ty, dst);
+  if (isByteSizedType(Ty)) {
+    emitUint8(0xB0 + gprEncoding(dst));
+    emitUint8(imm.value() & 0xFF);
+  } else {
+    emitUint8(0xB8 + gprEncoding(dst));
+    emitImmediate(Ty, imm);
+  }
 }
 
 template <class Machine>
@@ -197,12 +212,13 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, src, dst);
   if (isByteSizedType(Ty)) {
     emitUint8(0x88);
   } else {
     emitUint8(0x89);
   }
-  emitRegisterOperand(src, dst);
+  emitRegisterOperand(gprEncoding(src), gprEncoding(dst));
 }
 
 template <class Machine>
@@ -211,12 +227,13 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, src, dst);
   if (isByteSizedType(Ty)) {
     emitUint8(0x8A);
   } else {
     emitUint8(0x8B);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -226,21 +243,24 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, dst, src);
   if (isByteSizedType(Ty)) {
     emitUint8(0x88);
   } else {
     emitUint8(0x89);
   }
-  emitOperand(src, dst);
+  emitOperand(gprEncoding(src), dst);
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::mov(Type Ty,
                                     const typename Traits::Address &dst,
                                     const Immediate &imm) {
+  assert(Ty != IceType_i64 && "i64 not supported yet.");
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, dst, RexRegIrrelevant);
   if (isByteSizedType(Ty)) {
     emitUint8(0xC6);
     emitOperand(0, dst);
@@ -259,9 +279,10 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   bool ByteSized = isByteSizedType(SrcTy);
   assert(ByteSized || SrcTy == IceType_i16);
+  emitRexRB(RexTypeIrrelevant, dst, SrcTy, src);
   emitUint8(0x0F);
   emitUint8(ByteSized ? 0xB6 : 0xB7);
-  emitRegisterOperand(dst, src);
+  emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
 }
 
 template <class Machine>
@@ -271,9 +292,10 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   bool ByteSized = isByteSizedType(SrcTy);
   assert(ByteSized || SrcTy == IceType_i16);
+  emitRex(SrcTy, src, RexTypeIrrelevant, dst);
   emitUint8(0x0F);
   emitUint8(ByteSized ? 0xB6 : 0xB7);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -282,10 +304,15 @@
                                       typename Traits::GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   bool ByteSized = isByteSizedType(SrcTy);
-  assert(ByteSized || SrcTy == IceType_i16);
-  emitUint8(0x0F);
-  emitUint8(ByteSized ? 0xBE : 0xBF);
-  emitRegisterOperand(dst, src);
+  emitRexRB(IceType_ForceRexW, dst, SrcTy, src);
+  if (ByteSized || SrcTy == IceType_i16) {
+    emitUint8(0x0F);
+    emitUint8(ByteSized ? 0xBE : 0xBF);
+  } else {
+    assert(Traits::Is64Bit && SrcTy == IceType_i32);
+    emitUint8(0x63);
+  }
+  emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
 }
 
 template <class Machine>
@@ -294,10 +321,15 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   bool ByteSized = isByteSizedType(SrcTy);
-  assert(ByteSized || SrcTy == IceType_i16);
-  emitUint8(0x0F);
-  emitUint8(ByteSized ? 0xBE : 0xBF);
-  emitOperand(dst, src);
+  emitRex(SrcTy, src, IceType_ForceRexW, dst);
+  if (ByteSized || SrcTy == IceType_i16) {
+    emitUint8(0x0F);
+    emitUint8(ByteSized ? 0xBE : 0xBF);
+  } else {
+    assert(Traits::Is64Bit && SrcTy == IceType_i32);
+    emitUint8(0x63);
+  }
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -307,8 +339,9 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, src, dst);
   emitUint8(0x8D);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -321,9 +354,10 @@
     emitOperandSizeOverride();
   else
     assert(Ty == IceType_i32);
+  emitRexRB(Ty, dst, src);
   emitUint8(0x0F);
   emitUint8(0x40 + cond);
-  emitRegisterOperand(dst, src);
+  emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
 }
 
 template <class Machine>
@@ -336,9 +370,10 @@
     emitOperandSizeOverride();
   else
     assert(Ty == IceType_i32);
+  emitRex(Ty, src, dst);
   emitUint8(0x0F);
   emitUint8(0x40 + cond);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine> void AssemblerX86Base<Machine>::rep_movsb() {
@@ -352,9 +387,10 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x10);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -363,9 +399,10 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x11);
-  emitOperand(src, dst);
+  emitOperand(gprEncoding(src), dst);
 }
 
 template <class Machine>
@@ -373,6 +410,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x11);
   emitXmmRegisterOperand(src, dst);
@@ -383,9 +421,10 @@
                                      typename Traits::GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x6E);
-  emitRegisterOperand(dst, src);
+  emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
 }
 
 template <class Machine>
@@ -393,9 +432,10 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x6E);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -403,9 +443,10 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x7E);
-  emitRegisterOperand(src, dst);
+  emitRegisterOperand(gprEncoding(src), gprEncoding(dst));
 }
 
 template <class Machine>
@@ -413,9 +454,10 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x7E);
-  emitOperand(src, dst);
+  emitOperand(gprEncoding(src), dst);
 }
 
 template <class Machine>
@@ -423,9 +465,10 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xF3);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x7E);
-  emitRegisterOperand(dst, src);
+  emitXmmRegisterOperand(dst, src);
 }
 
 template <class Machine>
@@ -433,9 +476,10 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0xD6);
-  emitOperand(src, dst);
+  emitOperand(gprEncoding(src), dst);
 }
 
 template <class Machine>
@@ -443,9 +487,10 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xF3);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x7E);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -453,6 +498,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x58);
   emitXmmRegisterOperand(dst, src);
@@ -463,9 +509,10 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x58);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -473,6 +520,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5C);
   emitXmmRegisterOperand(dst, src);
@@ -483,9 +531,10 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x5C);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -493,6 +542,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x59);
   emitXmmRegisterOperand(dst, src);
@@ -503,9 +553,10 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x59);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -513,6 +564,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5E);
   emitXmmRegisterOperand(dst, src);
@@ -523,29 +575,31 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x5E);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::fld(Type Ty,
-                                    const typename Traits::Address &src) {
+template <typename T, typename> // T is non-deduced; any defaults are in-class
+void AssemblerX86Base<Machine>::fld(Type Ty, const typename T::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xD9 : 0xDD);
   emitOperand(0, src);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::fstp(Type Ty,
-                                     const typename Traits::Address &dst) {
+template <typename T, typename> // T is non-deduced; any defaults are in-class
+void AssemblerX86Base<Machine>::fstp(Type Ty, const typename T::Address &dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xD9 : 0xDD);
   emitOperand(3, dst);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::fstp(typename Traits::X87STRegister st) {
+template <typename T, typename>
+void AssemblerX86Base<Machine>::fstp(typename T::X87STRegister st) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xDD);
   emitUint8(0xD8 + st);
@@ -555,6 +609,7 @@
 void AssemblerX86Base<Machine>::movaps(typename Traits::XmmRegister dst,
                                        typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x28);
   emitXmmRegisterOperand(dst, src);
@@ -564,27 +619,30 @@
 void AssemblerX86Base<Machine>::movups(typename Traits::XmmRegister dst,
                                        typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x10);
-  emitRegisterOperand(dst, src);
+  emitXmmRegisterOperand(dst, src);
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::movups(typename Traits::XmmRegister dst,
                                        const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst); // presumably REX; must precede 0x0F
   emitUint8(0x0F);
   emitUint8(0x10);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src); // reg arg is dst; memory arg is src
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::movups(const typename Traits::Address &dst,
                                        typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, dst, src); // presumably REX; must precede 0x0F
   emitUint8(0x0F);
   emitUint8(0x11);
-  emitOperand(src, dst);
+  emitOperand(gprEncoding(src), dst); // reg arg is src; memory arg is dst
 }
 
 template <class Machine>
@@ -592,6 +650,7 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty)) {
     emitUint8(0xFC);
@@ -608,6 +667,7 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty)) {
     emitUint8(0xFC);
@@ -616,7 +676,7 @@
   } else {
     emitUint8(0xFE);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -625,6 +685,7 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0xDB);
   emitXmmRegisterOperand(dst, src);
@@ -636,9 +697,10 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0xDB);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -647,6 +709,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0xDF);
   emitXmmRegisterOperand(dst, src);
@@ -658,9 +721,10 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0xDF);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -668,6 +732,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0xD5);
@@ -684,6 +749,7 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0xD5);
@@ -692,7 +758,7 @@
     emitUint8(0x38);
     emitUint8(0x40);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -701,6 +767,7 @@
                                         typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0xF4);
   emitXmmRegisterOperand(dst, src);
@@ -712,9 +779,10 @@
                                         const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0xF4);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -723,6 +791,7 @@
                                     typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0xEB);
   emitXmmRegisterOperand(dst, src);
@@ -734,9 +803,10 @@
                                     const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0xEB);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -744,6 +814,7 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty)) {
     emitUint8(0xF8);
@@ -760,6 +831,7 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty)) {
     emitUint8(0xF8);
@@ -768,7 +840,7 @@
   } else {
     emitUint8(0xFA);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -777,6 +849,7 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0xEF);
   emitXmmRegisterOperand(dst, src);
@@ -788,9 +861,10 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0xEF);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -798,6 +872,7 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0xF1);
@@ -813,6 +888,7 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0xF1);
@@ -820,7 +896,7 @@
     assert(Ty == IceType_i32);
     emitUint8(0xF2);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -829,6 +905,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   assert(imm.is_int8());
   emitUint8(0x66);
+  emitRexB(RexTypeIrrelevant, dst);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0x71);
@@ -836,7 +913,7 @@
     assert(Ty == IceType_i32);
     emitUint8(0x72);
   }
-  emitRegisterOperand(6, dst);
+  emitRegisterOperand(6, gprEncoding(dst));
   emitUint8(imm.value() & 0xFF);
 }
 
@@ -845,6 +922,7 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0xE1);
@@ -860,6 +938,7 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0xE1);
@@ -867,7 +946,7 @@
     assert(Ty == IceType_i32);
     emitUint8(0xE2);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -876,6 +955,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   assert(imm.is_int8());
   emitUint8(0x66);
+  emitRexB(RexTypeIrrelevant, dst);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0x71);
@@ -883,7 +963,7 @@
     assert(Ty == IceType_i32);
     emitUint8(0x72);
   }
-  emitRegisterOperand(4, dst);
+  emitRegisterOperand(4, gprEncoding(dst));
   emitUint8(imm.value() & 0xFF);
 }
 
@@ -892,6 +972,7 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0xD1);
@@ -909,6 +990,7 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0xD1);
@@ -918,7 +1000,7 @@
     assert(Ty == IceType_i32 || Ty == IceType_f32 || Ty == IceType_v4f32);
     emitUint8(0xD2);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -927,6 +1009,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   assert(imm.is_int8());
   emitUint8(0x66);
+  emitRexB(RexTypeIrrelevant, dst);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0x71);
@@ -936,7 +1019,7 @@
     assert(Ty == IceType_i32 || Ty == IceType_f32 || Ty == IceType_v4f32);
     emitUint8(0x72);
   }
-  emitRegisterOperand(2, dst);
+  emitRegisterOperand(2, gprEncoding(dst));
   emitUint8(imm.value() & 0xFF);
 }
 
@@ -949,6 +1032,7 @@
                                       typename Traits::XmmRegister dst,
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x58);
   emitXmmRegisterOperand(dst, src);
@@ -959,9 +1043,10 @@
                                       typename Traits::XmmRegister dst,
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x58);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -969,6 +1054,7 @@
                                       typename Traits::XmmRegister dst,
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5C);
   emitXmmRegisterOperand(dst, src);
@@ -979,9 +1065,10 @@
                                       typename Traits::XmmRegister dst,
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x5C);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -989,6 +1076,7 @@
                                       typename Traits::XmmRegister dst,
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5E);
   emitXmmRegisterOperand(dst, src);
@@ -999,9 +1087,10 @@
                                       typename Traits::XmmRegister dst,
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x5E);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1009,6 +1098,7 @@
                                       typename Traits::XmmRegister dst,
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x59);
   emitXmmRegisterOperand(dst, src);
@@ -1019,15 +1109,17 @@
                                       typename Traits::XmmRegister dst,
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x59);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::minps(typename Traits::XmmRegister dst,
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5D);
   emitXmmRegisterOperand(dst, src);
@@ -1037,6 +1129,7 @@
 void AssemblerX86Base<Machine>::maxps(typename Traits::XmmRegister dst,
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5F);
   emitXmmRegisterOperand(dst, src);
@@ -1046,6 +1139,7 @@
 void AssemblerX86Base<Machine>::andps(typename Traits::XmmRegister dst,
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x54);
   emitXmmRegisterOperand(dst, src);
@@ -1055,15 +1149,17 @@
 void AssemblerX86Base<Machine>::andps(typename Traits::XmmRegister dst,
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x54);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::orps(typename Traits::XmmRegister dst,
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x56);
   emitXmmRegisterOperand(dst, src);
@@ -1075,6 +1171,7 @@
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x38);
   emitUint8(0x14);
@@ -1087,10 +1184,11 @@
                                          const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x38);
   emitUint8(0x14);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1099,6 +1197,7 @@
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x38);
   emitUint8(0x10);
@@ -1111,10 +1210,11 @@
                                          const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x38);
   emitUint8(0x10);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1122,6 +1222,7 @@
     typename Traits::XmmRegister dst, typename Traits::XmmRegister src,
     typename Traits::Cond::CmppsCond CmpCondition) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0xC2);
   emitXmmRegisterOperand(dst, src);
@@ -1133,15 +1234,17 @@
     typename Traits::XmmRegister dst, const typename Traits::Address &src,
     typename Traits::Cond::CmppsCond CmpCondition) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0xC2);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
   emitUint8(CmpCondition);
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::sqrtps(typename Traits::XmmRegister dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, dst);
   emitUint8(0x0F);
   emitUint8(0x51);
   emitXmmRegisterOperand(dst, dst);
@@ -1150,6 +1253,7 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::rsqrtps(typename Traits::XmmRegister dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, dst);
   emitUint8(0x0F);
   emitUint8(0x52);
   emitXmmRegisterOperand(dst, dst);
@@ -1158,6 +1262,7 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::reciprocalps(typename Traits::XmmRegister dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, dst);
   emitUint8(0x0F);
   emitUint8(0x53);
   emitXmmRegisterOperand(dst, dst);
@@ -1167,6 +1272,7 @@
 void AssemblerX86Base<Machine>::movhlps(typename Traits::XmmRegister dst,
                                         typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x12);
   emitXmmRegisterOperand(dst, src);
@@ -1176,6 +1282,7 @@
 void AssemblerX86Base<Machine>::movlhps(typename Traits::XmmRegister dst,
                                         typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x16);
   emitXmmRegisterOperand(dst, src);
@@ -1185,6 +1292,7 @@
 void AssemblerX86Base<Machine>::unpcklps(typename Traits::XmmRegister dst,
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x14);
   emitXmmRegisterOperand(dst, src);
@@ -1194,6 +1302,7 @@
 void AssemblerX86Base<Machine>::unpckhps(typename Traits::XmmRegister dst,
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x15);
   emitXmmRegisterOperand(dst, src);
@@ -1204,6 +1313,7 @@
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x14);
   emitXmmRegisterOperand(dst, src);
@@ -1214,6 +1324,7 @@
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x15);
   emitXmmRegisterOperand(dst, src);
@@ -1228,19 +1339,7 @@
   // Move value from tmp1 into dst.
   movd(dst, tmp1);
   // Broadcast low lane into other three lanes.
-  shufps(dst, dst, Immediate(0x0));
-}
-
-template <class Machine>
-void AssemblerX86Base<Machine>::shufps(typename Traits::XmmRegister dst,
-                                       typename Traits::XmmRegister src,
-                                       const Immediate &imm) {
-  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  emitUint8(0x0F);
-  emitUint8(0xC6);
-  emitXmmRegisterOperand(dst, src);
-  assert(imm.is_uint8());
-  emitUint8(imm.value());
+  shufps(RexTypeIrrelevant, dst, dst, Immediate(0x0));
 }
 
 template <class Machine>
@@ -1250,6 +1349,7 @@
                                        const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x70);
   emitXmmRegisterOperand(dst, src);
@@ -1264,9 +1364,10 @@
                                        const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x70);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
   assert(imm.is_uint8());
   emitUint8(imm.value());
 }
@@ -1277,6 +1378,7 @@
                                        typename Traits::XmmRegister src,
                                        const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0xC6);
   emitXmmRegisterOperand(dst, src);
@@ -1290,9 +1392,10 @@
                                        const typename Traits::Address &src,
                                        const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0xC6);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
   assert(imm.is_uint8());
   emitUint8(imm.value());
 }
@@ -1302,6 +1405,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5D);
   emitXmmRegisterOperand(dst, src);
@@ -1312,6 +1416,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5F);
   emitXmmRegisterOperand(dst, src);
@@ -1321,29 +1426,18 @@
 void AssemblerX86Base<Machine>::sqrtpd(typename Traits::XmmRegister dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, dst);
   emitUint8(0x0F);
   emitUint8(0x51);
   emitXmmRegisterOperand(dst, dst);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::shufpd(typename Traits::XmmRegister dst,
-                                       typename Traits::XmmRegister src,
-                                       const Immediate &imm) {
-  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  emitUint8(0x66);
-  emitUint8(0x0F);
-  emitUint8(0xC6);
-  emitXmmRegisterOperand(dst, src);
-  assert(imm.is_uint8());
-  emitUint8(imm.value());
-}
-
-template <class Machine>
 void AssemblerX86Base<Machine>::cvtdq2ps(Type /* Ignore */,
                                          typename Traits::XmmRegister dst,
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5B);
   emitXmmRegisterOperand(dst, src);
@@ -1354,9 +1448,10 @@
                                          typename Traits::XmmRegister dst,
                                          const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x5B);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1365,6 +1460,7 @@
                                           typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xF3);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5B);
   emitXmmRegisterOperand(dst, src);
@@ -1376,9 +1472,10 @@
                                           const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xF3);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x5B);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1387,9 +1484,10 @@
                                          typename Traits::GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(DestTy) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x2A);
-  emitRegisterOperand(dst, src);
+  emitXmmRegisterOperand(dst, src);
 }
 
 template <class Machine>
@@ -1398,9 +1496,10 @@
                                          const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(DestTy) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x2A);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1410,6 +1509,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   // ss2sd or sd2ss
   emitUint8(isFloat32Asserting32Or64(SrcTy) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5A);
   emitXmmRegisterOperand(dst, src);
@@ -1421,9 +1521,10 @@
     const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(SrcTy) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x5A);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1432,6 +1533,7 @@
                                           typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(SrcTy) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x2C);
   emitXmmRegisterOperand(dst, src);
@@ -1443,9 +1545,10 @@
                                           const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(SrcTy) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x2C);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1454,6 +1557,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_f64)
     emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, a, b);
   emitUint8(0x0F);
   emitUint8(0x2E);
   emitXmmRegisterOperand(a, b);
@@ -1465,9 +1569,10 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_f64)
     emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, b, a);
   emitUint8(0x0F);
   emitUint8(0x2E);
-  emitOperand(a, b);
+  emitOperand(gprEncoding(a), b);
 }
 
 template <class Machine>
@@ -1475,6 +1580,7 @@
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x50);
   emitXmmRegisterOperand(dst, src);
@@ -1484,6 +1590,7 @@
 void AssemblerX86Base<Machine>::movmskps(typename Traits::GPRRegister dst,
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x50);
   emitXmmRegisterOperand(dst, src);
@@ -1495,9 +1602,10 @@
                                        const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x51);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1506,6 +1614,7 @@
                                        typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x51);
   emitXmmRegisterOperand(dst, src);
@@ -1516,9 +1625,10 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x57);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1526,6 +1636,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x57);
   emitXmmRegisterOperand(dst, src);
@@ -1536,6 +1647,7 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x56);
   emitXmmRegisterOperand(dst, src);
@@ -1545,15 +1657,17 @@
 void AssemblerX86Base<Machine>::xorps(typename Traits::XmmRegister dst,
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x57);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::xorps(typename Traits::XmmRegister dst,
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x57);
   emitXmmRegisterOperand(dst, src);
@@ -1564,9 +1678,10 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x54);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1574,6 +1689,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x54);
   emitXmmRegisterOperand(dst, src);
@@ -1589,6 +1705,7 @@
   assert(isVectorFloatingType(Ty));
   (void)Ty;
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x3A);
   emitUint8(0x21);
@@ -1606,10 +1723,11 @@
   assert(isVectorFloatingType(Ty));
   (void)Ty;
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x3A);
   emitUint8(0x21);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
   emitUint8(imm.value());
 }
 
@@ -1619,20 +1737,17 @@
                                       const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   assert(imm.is_uint8());
+  emitUint8(0x66);
+  emitRexRB(Ty, dst, src);
+  emitUint8(0x0F);
   if (Ty == IceType_i16) {
-    emitUint8(0x66);
-    emitUint8(0x0F);
     emitUint8(0xC4);
-    emitXmmRegisterOperand(dst, typename Traits::XmmRegister(src));
-    emitUint8(imm.value());
   } else {
-    emitUint8(0x66);
-    emitUint8(0x0F);
     emitUint8(0x3A);
     emitUint8(isByteSizedType(Ty) ? 0x20 : 0x22);
-    emitXmmRegisterOperand(dst, typename Traits::XmmRegister(src));
-    emitUint8(imm.value());
   }
+  emitXmmRegisterOperand(dst, src);
+  emitUint8(imm.value());
 }
 
 template <class Machine>
@@ -1641,20 +1756,17 @@
                                       const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   assert(imm.is_uint8());
+  emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
+  emitUint8(0x0F);
   if (Ty == IceType_i16) {
-    emitUint8(0x66);
-    emitUint8(0x0F);
     emitUint8(0xC4);
-    emitOperand(dst, src);
-    emitUint8(imm.value());
   } else {
-    emitUint8(0x66);
-    emitUint8(0x0F);
     emitUint8(0x3A);
     emitUint8(isByteSizedType(Ty) ? 0x20 : 0x22);
-    emitOperand(dst, src);
-    emitUint8(imm.value());
   }
+  emitOperand(gprEncoding(dst), src);
+  emitUint8(imm.value());
 }
 
 template <class Machine>
@@ -1665,18 +1777,20 @@
   assert(imm.is_uint8());
   if (Ty == IceType_i16) {
     emitUint8(0x66);
+    emitRexRB(Ty, dst, src);
     emitUint8(0x0F);
     emitUint8(0xC5);
-    emitXmmRegisterOperand(typename Traits::XmmRegister(dst), src);
+    emitXmmRegisterOperand(dst, src);
     emitUint8(imm.value());
   } else {
     emitUint8(0x66);
+    emitRexRB(Ty, src, dst);
     emitUint8(0x0F);
     emitUint8(0x3A);
     emitUint8(isByteSizedType(Ty) ? 0x14 : 0x16);
     // SSE 4.1 versions are "MRI" because dst can be mem, while
     // pextrw (SSE2) is RMI because dst must be reg.
-    emitXmmRegisterOperand(src, typename Traits::XmmRegister(dst));
+    emitXmmRegisterOperand(src, dst);
     emitUint8(imm.value());
   }
 }
@@ -1686,6 +1800,7 @@
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x38);
   emitUint8(0x25);
@@ -1698,6 +1813,7 @@
                                        typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty)) {
     emitUint8(0x74);
@@ -1715,6 +1831,7 @@
                                        const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty)) {
     emitUint8(0x74);
@@ -1723,7 +1840,7 @@
   } else {
     emitUint8(0x76);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1732,6 +1849,7 @@
                                        typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty)) {
     emitUint8(0x64);
@@ -1749,6 +1867,7 @@
                                        const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty)) {
     emitUint8(0x64);
@@ -1757,7 +1876,7 @@
   } else {
     emitUint8(0x66);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1766,6 +1885,7 @@
                                         RoundingMode mode) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x3A);
   emitUint8(0x0B);
@@ -1775,48 +1895,56 @@
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::fnstcw(const typename Traits::Address &dst) {
+template <typename T, typename>
+void AssemblerX86Base<Machine>::fnstcw(const typename T::Address &dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xD9);
   emitOperand(7, dst);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::fldcw(const typename Traits::Address &src) {
+template <typename T, typename>
+void AssemblerX86Base<Machine>::fldcw(const typename T::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xD9);
   emitOperand(5, src);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::fistpl(const typename Traits::Address &dst) {
+template <typename T, typename>
+void AssemblerX86Base<Machine>::fistpl(const typename T::Address &dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xDF);
   emitOperand(7, dst);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::fistps(const typename Traits::Address &dst) {
+template <typename T, typename>
+void AssemblerX86Base<Machine>::fistps(const typename T::Address &dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xDB);
   emitOperand(3, dst);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::fildl(const typename Traits::Address &src) {
+template <typename T, typename>
+void AssemblerX86Base<Machine>::fildl(const typename T::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xDF);
   emitOperand(5, src);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::filds(const typename Traits::Address &src) {
+template <typename T, typename>
+void AssemblerX86Base<Machine>::filds(const typename T::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xDB);
   emitOperand(0, src);
 }
 
-template <class Machine> void AssemblerX86Base<Machine>::fincstp() {
+template <class Machine>
+template <typename, typename>
+void AssemblerX86Base<Machine>::fincstp() {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xD9);
   emitUint8(0xF7);
@@ -1829,13 +1957,14 @@
                                           const Immediate &imm) {
   static_assert(Tag < 8, "Tag must be between 0..7");
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  if (isByteSizedType(Ty)) {
-    emitComplexI8(Tag, typename Traits::Operand(reg), imm);
-    return;
-  }
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
-  emitComplex(Ty, Tag, typename Traits::Operand(reg), imm);
+  emitRexB(Ty, reg);
+  if (isByteSizedType(Ty)) {
+    emitComplexI8(Tag, typename Traits::Operand(reg), imm);
+  } else {
+    emitComplex(Ty, Tag, typename Traits::Operand(reg), imm);
+  }
 }
 
 template <class Machine>
@@ -1847,11 +1976,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, reg0, reg1);
   if (isByteSizedType(Ty))
     emitUint8(Tag * 8 + 2);
   else
     emitUint8(Tag * 8 + 3);
-  emitRegisterOperand(reg0, reg1);
+  emitRegisterOperand(gprEncoding(reg0), gprEncoding(reg1));
 }
 
 template <class Machine>
@@ -1863,11 +1993,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, address, reg);
   if (isByteSizedType(Ty))
     emitUint8(Tag * 8 + 2);
   else
     emitUint8(Tag * 8 + 3);
-  emitOperand(reg, address);
+  emitOperand(gprEncoding(reg), address);
 }
 
 template <class Machine>
@@ -1879,11 +2010,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, address, reg);
   if (isByteSizedType(Ty))
     emitUint8(Tag * 8 + 0);
   else
     emitUint8(Tag * 8 + 1);
-  emitOperand(reg, address);
+  emitOperand(gprEncoding(reg), address);
 }
 
 template <class Machine>
@@ -1892,13 +2024,14 @@
     Type Ty, const typename Traits::Address &address, const Immediate &imm) {
   static_assert(Tag < 8, "Tag must be between 0..7");
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  if (isByteSizedType(Ty)) {
-    emitComplexI8(Tag, address, imm);
-    return;
-  }
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
-  emitComplex(Ty, Tag, address, imm);
+  emitRex(Ty, address, RexRegIrrelevant);
+  if (isByteSizedType(Ty)) {
+    emitComplexI8(Tag, address, imm);
+  } else {
+    emitComplex(Ty, Tag, address, imm);
+  }
 }
 
 template <class Machine>
@@ -1939,11 +2072,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, reg1, reg2);
   if (isByteSizedType(Ty))
     emitUint8(0x84);
   else
     emitUint8(0x85);
-  emitRegisterOperand(reg1, reg2);
+  emitRegisterOperand(gprEncoding(reg1), gprEncoding(reg2));
 }
 
 template <class Machine>
@@ -1953,11 +2087,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, addr, reg);
   if (isByteSizedType(Ty))
     emitUint8(0x84);
   else
     emitUint8(0x85);
-  emitOperand(reg, addr);
+  emitOperand(gprEncoding(reg), addr);
 }
 
 template <class Machine>
@@ -1969,13 +2104,14 @@
   // This is legal even if the register had high bits set since
   // this only sets flags registers based on the "AND" of the two operands,
   // and the immediate had zeros at those high bits.
-  if (immediate.is_uint8() && reg < 4) {
+  if (immediate.is_uint8() && reg <= Traits::Last8BitGPR) {
     // Use zero-extended 8-bit immediate.
+    emitRexB(Ty, reg);
     if (reg == Traits::Encoded_Reg_Accumulator) {
       emitUint8(0xA8);
     } else {
       emitUint8(0xF6);
-      emitUint8(0xC0 + reg);
+      emitUint8(0xC0 + gprEncoding(reg));
     }
     emitUint8(immediate.value() & 0xFF);
   } else if (reg == Traits::Encoded_Reg_Accumulator) {
@@ -1987,8 +2123,9 @@
   } else {
     if (Ty == IceType_i16)
       emitOperandSizeOverride();
+    emitRexB(Ty, reg);
     emitUint8(0xF7);
-    emitRegisterOperand(0, reg);
+    emitRegisterOperand(0, gprEncoding(reg));
     emitImmediate(Ty, immediate);
   }
 }
@@ -2002,12 +2139,14 @@
   // encoding short.
   if (immediate.is_uint8()) {
     // Use zero-extended 8-bit immediate.
+    emitRex(Ty, addr, RexRegIrrelevant);
     emitUint8(0xF6);
     emitOperand(0, addr);
     emitUint8(immediate.value() & 0xFF);
   } else {
     if (Ty == IceType_i16)
       emitOperandSizeOverride();
+    emitRex(Ty, addr, RexRegIrrelevant);
     emitUint8(0xF7);
     emitOperand(0, addr);
     emitImmediate(Ty, immediate);
@@ -2260,11 +2399,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexB(Ty, reg);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
     emitUint8(0xF7);
-  emitRegisterOperand(6, reg);
+  emitRegisterOperand(6, gprEncoding(reg));
 }
 
 template <class Machine>
@@ -2273,6 +2413,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, addr, RexRegIrrelevant);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
@@ -2286,11 +2427,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexB(Ty, reg);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
     emitUint8(0xF7);
-  emitRegisterOperand(7, reg);
+  emitRegisterOperand(7, gprEncoding(reg));
 }
 
 template <class Machine>
@@ -2299,6 +2441,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, addr, RexRegIrrelevant);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
@@ -2313,9 +2456,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, dst, src);
   emitUint8(0x0F);
   emitUint8(0xAF);
-  emitRegisterOperand(dst, src);
+  emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
 }
 
 template <class Machine>
@@ -2325,9 +2469,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, address, reg);
   emitUint8(0x0F);
   emitUint8(0xAF);
-  emitOperand(reg, address);
+  emitOperand(gprEncoding(reg), address);
 }
 
 template <class Machine>
@@ -2337,13 +2482,14 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, reg, reg);
   if (imm.is_int8()) {
     emitUint8(0x6B);
-    emitRegisterOperand(reg, reg);
+    emitRegisterOperand(gprEncoding(reg), gprEncoding(reg));
     emitUint8(imm.value() & 0xFF);
   } else {
     emitUint8(0x69);
-    emitRegisterOperand(reg, reg);
+    emitRegisterOperand(gprEncoding(reg), gprEncoding(reg));
     emitImmediate(Ty, imm);
   }
 }
@@ -2354,11 +2500,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexB(Ty, reg);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
     emitUint8(0xF7);
-  emitRegisterOperand(5, reg);
+  emitRegisterOperand(5, gprEncoding(reg));
 }
 
 template <class Machine>
@@ -2367,6 +2514,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, address, RexRegIrrelevant);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
@@ -2379,11 +2527,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexB(Ty, reg);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
     emitUint8(0xF7);
-  emitRegisterOperand(4, reg);
+  emitRegisterOperand(4, gprEncoding(reg));
 }
 
 template <class Machine>
@@ -2392,6 +2541,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, address, RexRegIrrelevant);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
@@ -2400,6 +2550,7 @@
 }
 
 template <class Machine>
+template <typename, typename>
 void AssemblerX86Base<Machine>::incl(typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x40 + reg);
@@ -2408,11 +2559,13 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::incl(const typename Traits::Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(IceType_i32, address, RexRegIrrelevant);
   emitUint8(0xFF);
   emitOperand(0, address);
 }
 
 template <class Machine>
+template <typename, typename>
 void AssemblerX86Base<Machine>::decl(typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x48 + reg);
@@ -2421,6 +2574,7 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::decl(const typename Traits::Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(IceType_i32, address, RexRegIrrelevant);
   emitUint8(0xFF);
   emitOperand(1, address);
 }
@@ -2512,9 +2666,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, src, dst);
   emitUint8(0x0F);
   emitUint8(0xA5);
-  emitRegisterOperand(src, dst);
+  emitRegisterOperand(gprEncoding(src), gprEncoding(dst));
 }
 
 template <class Machine>
@@ -2526,9 +2681,10 @@
   assert(imm.is_int8());
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, src, dst);
   emitUint8(0x0F);
   emitUint8(0xA4);
-  emitRegisterOperand(src, dst);
+  emitRegisterOperand(gprEncoding(src), gprEncoding(dst));
   emitUint8(imm.value() & 0xFF);
 }
 
@@ -2540,9 +2696,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, operand, src);
   emitUint8(0x0F);
   emitUint8(0xA5);
-  emitOperand(src, operand);
+  emitOperand(gprEncoding(src), operand);
 }
 
 template <class Machine>
@@ -2552,9 +2709,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, src, dst);
   emitUint8(0x0F);
   emitUint8(0xAD);
-  emitRegisterOperand(src, dst);
+  emitRegisterOperand(gprEncoding(src), gprEncoding(dst));
 }
 
 template <class Machine>
@@ -2566,9 +2724,10 @@
   assert(imm.is_int8());
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, src, dst);
   emitUint8(0x0F);
   emitUint8(0xAC);
-  emitRegisterOperand(src, dst);
+  emitRegisterOperand(gprEncoding(src), gprEncoding(dst));
   emitUint8(imm.value() & 0xFF);
 }
 
@@ -2580,9 +2739,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, dst, src);
   emitUint8(0x0F);
   emitUint8(0xAD);
-  emitOperand(src, dst);
+  emitOperand(gprEncoding(src), dst);
 }
 
 template <class Machine>
@@ -2590,11 +2750,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexB(Ty, reg);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
     emitUint8(0xF7);
-  emitRegisterOperand(3, reg);
+  emitRegisterOperand(3, gprEncoding(reg));
 }
 
 template <class Machine>
@@ -2603,6 +2764,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, addr, RexRegIrrelevant);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
@@ -2613,8 +2775,9 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::notl(typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexB(IceType_i32, reg);
   emitUint8(0xF7);
-  emitUint8(0xD0 | reg);
+  emitUint8(0xD0 | gprEncoding(reg));
 }
 
 template <class Machine>
@@ -2623,8 +2786,9 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   assert(Ty == IceType_i32);
   (void)Ty;
+  emitRexB(Ty, reg);
   emitUint8(0x0F);
-  emitUint8(0xC8 | reg);
+  emitUint8(0xC8 | gprEncoding(reg));
 }
 
 template <class Machine>
@@ -2634,9 +2798,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, dst, src);
   emitUint8(0x0F);
   emitUint8(0xBC);
-  emitRegisterOperand(dst, src);
+  emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
 }
 
 template <class Machine>
@@ -2646,9 +2811,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, src, dst);
   emitUint8(0x0F);
   emitUint8(0xBC);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -2658,9 +2824,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, dst, src);
   emitUint8(0x0F);
   emitUint8(0xBD);
-  emitRegisterOperand(dst, src);
+  emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
 }
 
 template <class Machine>
@@ -2670,18 +2837,20 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, src, dst);
   emitUint8(0x0F);
   emitUint8(0xBD);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::bt(typename Traits::GPRRegister base,
                                    typename Traits::GPRRegister offset) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(IceType_i32, offset, base);
   emitUint8(0x0F);
   emitUint8(0xA3);
-  emitRegisterOperand(offset, base);
+  emitRegisterOperand(gprEncoding(offset), gprEncoding(base));
 }
 
 template <class Machine> void AssemblerX86Base<Machine>::ret() {
@@ -2823,8 +2992,9 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::jmp(typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexB(RexTypeIrrelevant, reg);
   emitUint8(0xFF);
-  emitRegisterOperand(4, reg);
+  emitRegisterOperand(4, gprEncoding(reg));
 }
 
 template <class Machine>
@@ -2881,12 +3051,13 @@
     emitOperandSizeOverride();
   if (Locked)
     emitUint8(0xF0);
+  emitRex(Ty, address, reg);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty))
     emitUint8(0xB0);
   else
     emitUint8(0xB1);
-  emitOperand(reg, address);
+  emitOperand(gprEncoding(reg), address);
 }
 
 template <class Machine>
@@ -2895,6 +3066,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Locked)
     emitUint8(0xF0);
+  emitRex(IceType_i32, address, RexRegIrrelevant);
   emitUint8(0x0F);
   emitUint8(0xC7);
   emitOperand(1, address);
@@ -2910,12 +3082,13 @@
     emitOperandSizeOverride();
   if (Locked)
     emitUint8(0xF0);
+  emitRex(Ty, addr, reg);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty))
     emitUint8(0xC0);
   else
     emitUint8(0xC1);
-  emitOperand(reg, addr);
+  emitOperand(gprEncoding(reg), addr);
 }
 
 template <class Machine>
@@ -2925,11 +3098,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, addr, reg);
   if (isByteSizedArithType(Ty))
     emitUint8(0x86);
   else
     emitUint8(0x87);
-  emitOperand(reg, addr);
+  emitOperand(gprEncoding(reg), addr);
 }
 
 template <class Machine>
@@ -3082,6 +3256,7 @@
   assert(imm.is_int8());
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexB(Ty, reg);
   if (imm.value() == 1) {
     emitUint8(isByteSizedArithType(Ty) ? 0xD0 : 0xD1);
     emitOperand(rm, typename Traits::Operand(reg));
@@ -3101,6 +3276,7 @@
   (void)shifter;
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexB(Ty, operand.rm());
   emitUint8(isByteSizedArithType(Ty) ? 0xD2 : 0xD3);
   emitOperand(rm, operand);
 }
diff --git a/src/IceConditionCodesX8664.h b/src/IceConditionCodesX8664.h
index 06155ea..d1d9dd8 100644
--- a/src/IceConditionCodesX8664.h
+++ b/src/IceConditionCodesX8664.h
@@ -21,6 +21,7 @@
 namespace Ice {
 
 class CondX8664 {
+public:
   /// An enum of condition codes used for branches and cmov. The enum value
   /// should match the value used to encode operands in binary instructions.
   enum BrCond {
diff --git a/src/IceInstX8664.def b/src/IceInstX8664.def
index 6857ed6..dd4b712 100644
--- a/src/IceInstX8664.def
+++ b/src/IceInstX8664.def
@@ -15,46 +15,51 @@
 #ifndef SUBZERO_SRC_ICEINSTX8664_DEF
 #define SUBZERO_SRC_ICEINSTX8664_DEF
 
+// NOTE: we use the 32-bit register names for two reasons:
+//       (1) it makes it easier to implement the x86 assembler template.
+//       (2) when generating code, Subzero defaults to using 32-bit registers,
+//       so using the 32-bit register names hopefully makes this design
+//       more explicit.
 // NOTE: esp is not considered isInt, to avoid register allocating it.
 #define REGX8664_GPR_TABLE                                                     \
   /* val, encode, name64, name, name16, name8, scratch, preserved, stackptr,   \
      frameptr, isInt, isFP */                                                  \
-  X(Reg_rax, =            0, "rax", "eax" ,  "ax" , "al"  , 1, 0, 0, 0, 1, 0)  \
-  X(Reg_rcx, = Reg_rax +  1, "rcx", "ecx" ,  "cx" , "cl"  , 1, 0, 0, 0, 1, 0)  \
-  X(Reg_rdx, = Reg_rax +  2, "rdx", "edx" ,  "dx" , "dl"  , 1, 0, 0, 0, 1, 0)  \
-  X(Reg_rbx, = Reg_rax +  3, "rbx", "ebx" ,  "bx" , "bl"  , 0, 1, 0, 0, 1, 0)  \
-  X(Reg_rsp, = Reg_rax +  4, "rsp", "esp" ,  "sp" , "spl" , 0, 0, 1, 0, 0, 0)  \
-  X(Reg_rbp, = Reg_rax +  5, "rbp", "ebp" ,  "bp" , "bpl" , 0, 0, 0, 1, 1, 0)  \
-  X(Reg_rsi, = Reg_rax +  6, "rsi", "esi" ,  "si" , "sil" , 1, 0, 0, 0, 1, 0)  \
-  X(Reg_rdi, = Reg_rax +  7, "rdi", "edi" ,  "di" , "dil" , 1, 0, 0, 0, 1, 0)  \
-  X(Reg_r8,  = Reg_rax +  8, "r8" , "r8d" ,  "r8w", "r8l" , 1, 0, 0, 0, 1, 0)  \
-  X(Reg_r9,  = Reg_rax +  9, "r9" , "r9d" ,  "r9w", "r9l" , 1, 0, 0, 0, 1, 0)  \
-  X(Reg_r10, = Reg_rax + 10, "r10", "r10d", "r10w", "r10l", 1, 0, 0, 0, 1, 0)  \
-  X(Reg_r11, = Reg_rax + 11, "r11", "r11d", "r11w", "r11l", 1, 0, 0, 0, 1, 0)  \
-  X(Reg_r12, = Reg_rax + 12, "r12", "r12d", "r12w", "r12l", 0, 1, 0, 0, 1, 0)  \
-  X(Reg_r13, = Reg_rax + 13, "r13", "r13d", "r13w", "r12l", 0, 1, 0, 0, 1, 0)  \
-  X(Reg_r14, = Reg_rax + 14, "r14", "r14d", "r14w", "r14l", 0, 1, 0, 0, 1, 0)  \
-  X(Reg_r15, = Reg_rax + 15, "r15", "r15d", "r15w", "r15l", 0, 1, 0, 0, 1, 0)
+  X(Reg_eax,  =  0, "rax", "eax" ,  "ax" , "al"  , 1, 0, 0, 0, 1, 0)           \
+  X(Reg_ecx,  =  1, "rcx", "ecx" ,  "cx" , "cl"  , 1, 0, 0, 0, 1, 0)           \
+  X(Reg_edx,  =  2, "rdx", "edx" ,  "dx" , "dl"  , 1, 0, 0, 0, 1, 0)           \
+  X(Reg_ebx,  =  3, "rbx", "ebx" ,  "bx" , "bl"  , 0, 1, 0, 0, 1, 0)           \
+  X(Reg_esp,  =  4, "rsp", "esp" ,  "sp" , "spl" , 0, 0, 1, 0, 0, 0)           \
+  X(Reg_ebp,  =  5, "rbp", "ebp" ,  "bp" , "bpl" , 0, 0, 0, 1, 1, 0)           \
+  X(Reg_esi,  =  6, "rsi", "esi" ,  "si" , "sil" , 1, 0, 0, 0, 1, 0)           \
+  X(Reg_edi,  =  7, "rdi", "edi" ,  "di" , "dil" , 1, 0, 0, 0, 1, 0)           \
+  X(Reg_r8d,  =  8, "r8" , "r8d" ,  "r8w", "r8l" , 1, 0, 0, 0, 1, 0)           \
+  X(Reg_r9d,  =  9, "r9" , "r9d" ,  "r9w", "r9l" , 1, 0, 0, 0, 1, 0)           \
+  X(Reg_r10d, = 10, "r10", "r10d", "r10w", "r10l", 1, 0, 0, 0, 1, 0)           \
+  X(Reg_r11d, = 11, "r11", "r11d", "r11w", "r11l", 1, 0, 0, 0, 1, 0)           \
+  X(Reg_r12d, = 12, "r12", "r12d", "r12w", "r12l", 0, 1, 0, 0, 1, 0)           \
+  X(Reg_r13d, = 13, "r13", "r13d", "r13w", "r13l", 0, 1, 0, 0, 1, 0)           \
+  X(Reg_r14d, = 14, "r14", "r14d", "r14w", "r14l", 0, 1, 0, 0, 1, 0)           \
+  X(Reg_r15d, = 15, "r15", "r15d", "r15w", "r15l", 0, 1, 0, 0, 1, 0)
 
 #define REGX8664_XMM_TABLE                                                     \
   /* val, encode, name64, name, name16, name8, scratch, preserved, stackptr,   \
      frameptr, isInt, isFP */                                                  \
-  X(Reg_xmm0,  =             0, "xmm0" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm1,  = Reg_xmm0 +  1, "xmm1" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm2,  = Reg_xmm0 +  2, "xmm2" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm3,  = Reg_xmm0 +  3, "xmm3" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm4,  = Reg_xmm0 +  4, "xmm4" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm5,  = Reg_xmm0 +  5, "xmm5" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm6,  = Reg_xmm0 +  6, "xmm6" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm7,  = Reg_xmm0 +  7, "xmm7" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm8,  = Reg_xmm0 +  8, "xmm8" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm9,  = Reg_xmm0 +  9, "xmm9" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm10, = Reg_xmm0 + 10, "xmm10", "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm11, = Reg_xmm0 + 11, "xmm11", "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm12, = Reg_xmm0 + 12, "xmm12", "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm13, = Reg_xmm0 + 13, "xmm13", "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm14, = Reg_xmm0 + 14, "xmm14", "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm15, = Reg_xmm0 + 15, "xmm15", "", "", "", 1, 0, 0, 0, 0, 1)
+  X(Reg_xmm0,  =  0, "xmm0" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm1,  =  1, "xmm1" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm2,  =  2, "xmm2" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm3,  =  3, "xmm3" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm4,  =  4, "xmm4" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm5,  =  5, "xmm5" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm6,  =  6, "xmm6" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm7,  =  7, "xmm7" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm8,  =  8, "xmm8" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm9,  =  9, "xmm9" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm10, = 10, "xmm10", "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm11, = 11, "xmm11", "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm12, = 12, "xmm12", "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm13, = 13, "xmm13", "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm14, = 14, "xmm14", "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm15, = 15, "xmm15", "", "", "", 1, 0, 0, 0, 0, 1)
 //#define X(val, encode, name, name32, name16, name8, scratch, preserved,
 //          stackptr, frameptr, isI8, isInt, isFP)
 
@@ -77,8 +82,8 @@
 
 #define REGX8664_TABLE_BOUNDS                                                  \
   /* val         , init */                                                     \
-  X(Reg_GPR_First, = Reg_rax  )                                                \
-  X(Reg_GPR_Last , = Reg_r15  )                                                \
+  X(Reg_GPR_First, = Reg_eax  )                                                \
+  X(Reg_GPR_Last , = Reg_r15d )                                                \
   X(Reg_XMM_First, = Reg_xmm0 )                                                \
   X(Reg_XMM_Last , = Reg_xmm15)
 // define X(val, init)
diff --git a/src/IceRegistersX8664.h b/src/IceRegistersX8664.h
index 3e4c868..f95fe9e 100644
--- a/src/IceRegistersX8664.h
+++ b/src/IceRegistersX8664.h
@@ -79,10 +79,6 @@
   }
 
   static inline ByteRegister getEncodedByteReg(int32_t RegNum) {
-    // In x86-64, AH is not encodable when the REX prefix is used; the same
-    // encoding is used for spl. Therefore, ah needs special handling.
-    if (RegNum == Reg_ah)
-      return Encoded_Reg_spl;
     return ByteRegister(RegNum - Reg_GPR_First);
   }
 
diff --git a/src/IceTargetLoweringX8632Traits.h b/src/IceTargetLoweringX8632Traits.h
index ca15ea5..29066aa 100644
--- a/src/IceTargetLoweringX8632Traits.h
+++ b/src/IceTargetLoweringX8632Traits.h
@@ -48,6 +48,13 @@
   //      \/_/\/_/\/_____/\/_/  \/_/
   //
   //----------------------------------------------------------------------------
+  static constexpr bool Is64Bit = false;
+  static constexpr bool HasPopa = true;
+  static constexpr bool HasPusha = true;
+  static constexpr bool UsesX87 = true;
+  static constexpr ::Ice::RegX8632::GPRRegister Last8BitGPR =
+      ::Ice::RegX8632::GPRRegister::Encoded_Reg_ebx;
+
   enum ScaleFactor { TIMES_1 = 0, TIMES_2 = 1, TIMES_4 = 2, TIMES_8 = 3 };
 
   using GPRRegister = ::Ice::RegX8632::GPRRegister;
diff --git a/src/IceTargetLoweringX8664Traits.h b/src/IceTargetLoweringX8664Traits.h
new file mode 100644
index 0000000..fea1a8f
--- /dev/null
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -0,0 +1,296 @@
+//===- subzero/src/IceTargetLoweringX8664Traits.h - x86-64 traits -*- C++ -*-=//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the X8664 Target Lowering Traits.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICETARGETLOWERINGX8664TRAITS_H
+#define SUBZERO_SRC_ICETARGETLOWERINGX8664TRAITS_H
+
+#include "IceAssembler.h"
+#include "IceConditionCodesX8664.h"
+#include "IceDefs.h"
+#include "IceInst.h"
+#include "IceInstX8664.def"
+#include "IceOperand.h"
+#include "IceRegistersX8664.h"
+#include "IceTargetLowering.h"
+
+namespace Ice {
+
+class TargetX8664;
+
+namespace X8664 {
+class AssemblerX8664;
+} // end of namespace X8664
+
+namespace X86Internal {
+
+template <class Machine> struct Insts;
+template <class Machine> struct MachineTraits;
+
+template <> struct MachineTraits<TargetX8664> {
+  //----------------------------------------------------------------------------
+  //     ______  ______  __    __
+  //    /\  __ \/\  ___\/\ "-./  \
+  //    \ \  __ \ \___  \ \ \-./\ \
+  //     \ \_\ \_\/\_____\ \_\ \ \_\
+  //      \/_/\/_/\/_____/\/_/  \/_/
+  //
+  //----------------------------------------------------------------------------
+  static constexpr bool Is64Bit = true;
+  static constexpr bool HasPopa = false;
+  static constexpr bool HasPusha = false;
+  static constexpr bool UsesX87 = false;
+  static constexpr ::Ice::RegX8664::GPRRegister Last8BitGPR =
+      ::Ice::RegX8664::GPRRegister::Encoded_Reg_r15d;
+
+  enum ScaleFactor { TIMES_1 = 0, TIMES_2 = 1, TIMES_4 = 2, TIMES_8 = 3 };
+
+  using GPRRegister = ::Ice::RegX8664::GPRRegister;
+  using XmmRegister = ::Ice::RegX8664::XmmRegister;
+  using ByteRegister = ::Ice::RegX8664::ByteRegister;
+
+  using Cond = ::Ice::CondX8664;
+
+  using RegisterSet = ::Ice::RegX8664;
+  static const GPRRegister Encoded_Reg_Accumulator = RegX8664::Encoded_Reg_eax;
+  static const GPRRegister Encoded_Reg_Counter = RegX8664::Encoded_Reg_ecx;
+  static const FixupKind PcRelFixup = llvm::ELF::R_386_PC32; // TODO(jpp): ???
+
+  class Operand {
+  public:
+    enum RexBits {
+      RexNone = 0x00,
+      RexBase = 0x40, // OR-ed into every flag below.
+      RexW = RexBase | (1 << 3),
+      RexR = RexBase | (1 << 2),
+      RexX = RexBase | (1 << 1), // Set when the SIB index has bit 3 set.
+      RexB = RexBase | (1 << 0), // Set when rm or the SIB base has bit 3 set.
+    };
+
+    Operand(const Operand &other)
+        : fixup_(other.fixup_), rex_(other.rex_), length_(other.length_) {
+      memmove(encoding_, other.encoding_, length_); // Only the bytes in use.
+    }
+
+    Operand &operator=(const Operand &other) {
+      memmove(encoding_, other.encoding_, other.length_); // Self-assign safe.
+      fixup_ = other.fixup_;
+      rex_ = other.rex_;
+      length_ = other.length_;
+      return *this;
+    }
+
+    uint8_t mod() const { return (encoding_at(0) >> 6) & 3; }
+
+    uint8_t rexX() const { return (rex_ & RexX) == RexX ? RexX : RexNone; }
+    uint8_t rexB() const { return (rex_ & RexB) == RexB ? RexB : RexNone; }
+
+    GPRRegister rm() const {
+      const int HighBit = rexB() == RexNone ? 0x00 : 0x08; // From REX.B.
+      return static_cast<GPRRegister>(HighBit | (encoding_at(0) & 0x07));
+    }
+
+    ScaleFactor scale() const {
+      return static_cast<ScaleFactor>((encoding_at(1) >> 6) & 3);
+    }
+
+    GPRRegister index() const {
+      const int HighBit = rexX() == RexNone ? 0x00 : 0x08; // From REX.X.
+      return static_cast<GPRRegister>(HighBit | ((encoding_at(1) >> 3) & 7));
+    }
+
+    GPRRegister base() const {
+      const int HighBit = rexB() == RexNone ? 0x00 : 0x08; // From REX.B.
+      return static_cast<GPRRegister>(HighBit | (encoding_at(1) & 0x07));
+    }
+
+    int8_t disp8() const {
+      assert(length_ >= 2);
+      return static_cast<int8_t>(encoding_[length_ - 1]);
+    }
+
+    int32_t disp32() const {
+      assert(length_ >= 5);
+      return bit_copy<int32_t>(encoding_[length_ - 4]);
+    }
+
+    AssemblerFixup *fixup() const { return fixup_; }
+
+  protected:
+    Operand() : fixup_(nullptr), length_(0) {} // Needed by subclass Address.
+
+    void SetModRM(int mod, GPRRegister rm) {
+      assert((mod & ~3) == 0);
+      rex_ = (rm & 0x08) != 0 ? RexB : RexNone; // Bit 3 travels as REX.B.
+      encoding_[0] = (mod << 6) | (rm & 0x07);
+      length_ = 1;
+    }
+
+    void SetSIB(ScaleFactor scale, GPRRegister index, GPRRegister base) {
+      assert(length_ == 1);
+      assert((scale & ~3) == 0);
+      const uint8_t IndexRex = (index & 0x08) != 0 ? RexX : RexNone;
+      rex_ = ((base & 0x08) != 0 ? RexB : RexNone) | IndexRex;
+      encoding_[1] = (scale << 6) | ((index & 0x07) << 3) | (base & 0x07);
+      length_ = 2;
+    }
+
+    void SetDisp8(int8_t disp) {
+      assert(length_ == 1 || length_ == 2);
+      encoding_[length_++] = static_cast<uint8_t>(disp);
+    }
+
+    void SetDisp32(int32_t disp) {
+      assert(length_ == 1 || length_ == 2);
+      // Append the four displacement bytes in host byte order.
+      memmove(&encoding_[length_], &disp, sizeof(disp));
+      length_ += sizeof(disp);
+    }
+
+    void SetFixup(AssemblerFixup *fixup) { fixup_ = fixup; }
+
+  private:
+    AssemblerFixup *fixup_;
+    uint8_t rex_ = 0;
+    uint8_t encoding_[6];
+    uint8_t length_;
+
+    explicit Operand(GPRRegister reg) : fixup_(nullptr) { SetModRM(3, reg); }
+
+    /// Get the operand encoding byte at the given index.
+    uint8_t encoding_at(intptr_t index) const {
+      assert(index >= 0 && index < length_);
+      return encoding_[index];
+    }
+
+    /// Returns whether this operand is just the register \p reg, i.e. it was
+    /// built with the register-direct form. Used from the assembler to
+    /// generate better encodings.
+    bool IsRegister(GPRRegister reg) const {
+      // mod == 3 (register direct) with the middle three bits clear.
+      const bool IsRegDirect = (encoding_[0] & 0xF8) == 0xC0;
+      return IsRegDirect && rm() == reg;
+    }
+
+    template <class> friend class AssemblerX86Base;
+  };
+
+  class Address : public Operand {
+    Address() = delete;
+
+  public:
+    Address(const Address &other) : Operand(other) {}
+
+    Address &operator=(const Address &other) {
+      Operand::operator=(other);
+      return *this;
+    }
+
+    Address(GPRRegister base, int32_t disp) {
+      // A base whose low three bits match esp cannot be named in ModRM alone;
+      // it is routed through a SIB byte whose index field is esp.
+      const bool NeedsSIB = (base & 7) == RegX8664::Encoded_Reg_esp;
+      // The displacement may only be dropped when it is zero and the low three
+      // bits of the base do not match ebp.
+      const bool OmitDisp =
+          disp == 0 && (base & 7) != RegX8664::Encoded_Reg_ebp;
+      const int Mod = OmitDisp ? 0 : (Utils::IsInt(8, disp) ? 1 : 2);
+      SetModRM(Mod, base);
+      if (NeedsSIB)
+        SetSIB(TIMES_1, RegX8664::Encoded_Reg_esp, base);
+      if (Mod == 1)
+        SetDisp8(disp);
+      else if (Mod == 2)
+        SetDisp32(disp);
+    }
+
+    Address(GPRRegister index, ScaleFactor scale, int32_t disp) {
+      assert(index != RegX8664::Encoded_Reg_esp); // Illegal addressing mode.
+      SetModRM(0, RegX8664::Encoded_Reg_esp); // rm == esp: a SIB byte follows.
+      SetSIB(scale, index, RegX8664::Encoded_Reg_ebp);
+      SetDisp32(disp);
+    }
+
+    Address(GPRRegister base, GPRRegister index, ScaleFactor scale,
+            int32_t disp) {
+      assert(index != RegX8664::Encoded_Reg_esp); // Illegal addressing mode.
+      // Pick the shortest displacement form. A base whose low three bits match
+      // ebp always carries a displacement, even when it is zero.
+      const bool OmitDisp =
+          disp == 0 && (base & 7) != RegX8664::Encoded_Reg_ebp;
+      const int Mod = OmitDisp ? 0 : (Utils::IsInt(8, disp) ? 1 : 2);
+      // rm == esp announces the SIB byte that holds the real base and index.
+      SetModRM(Mod, RegX8664::Encoded_Reg_esp);
+      SetSIB(scale, index, base);
+      if (Mod == 1)
+        SetDisp8(disp);
+      else if (Mod == 2)
+        SetDisp32(disp);
+    }
+
+    // AbsoluteTag is a special tag for requesting the disp32-only addressing
+    // form, which x86-64 decodes as rip-relative rather than absolute.
+    // TODO(jpp): this is bogus. remove.
+    enum AbsoluteTag { ABSOLUTE };
+
+    Address(AbsoluteTag, const uintptr_t Addr) {
+      SetModRM(0, RegX8664::Encoded_Reg_ebp);
+      SetDisp32(Addr);
+    }
+
+    // TODO(jpp): remove this.
+    static Address Absolute(const uintptr_t Addr) {
+      return Address(ABSOLUTE, Addr);
+    }
+
+    Address(AbsoluteTag, RelocOffsetT Offset, AssemblerFixup *Fixup) {
+      SetModRM(0, RegX8664::Encoded_Reg_ebp);
+      // Use the Offset in the displacement for now. If we decide to process
+      // fixups later, we'll need to patch up the emitted displacement.
+      SetDisp32(Offset);
+      SetFixup(Fixup);
+    }
+
+    // TODO(jpp): remove this.
+    static Address Absolute(RelocOffsetT Offset, AssemblerFixup *Fixup) {
+      return Address(ABSOLUTE, Offset, Fixup);
+    }
+
+    static Address ofConstPool(Assembler *Asm, const Constant *Imm) {
+      // TODO(jpp): ???
+      AssemblerFixup *Fixup = Asm->createFixup(llvm::ELF::R_386_32, Imm);
+      const RelocOffsetT Offset = 0;
+      return Address(ABSOLUTE, Offset, Fixup);
+    }
+  };
+
+  //----------------------------------------------------------------------------
+  //     __      ______  __     __  ______  ______  __  __   __  ______
+  //    /\ \    /\  __ \/\ \  _ \ \/\  ___\/\  == \/\ \/\ "-.\ \/\  ___\
+  //    \ \ \___\ \ \/\ \ \ \/ ".\ \ \  __\\ \  __<\ \ \ \ \-.  \ \ \__ \
+  //     \ \_____\ \_____\ \__/".~\_\ \_____\ \_\ \_\ \_\ \_\\"\_\ \_____\
+  //      \/_____/\/_____/\/_/   \/_/\/_____/\/_/ /_/\/_/\/_/ \/_/\/_____/
+  //
+  //----------------------------------------------------------------------------
+  using Assembler = X8664::AssemblerX8664;
+};
+
+} // end of namespace X86Internal
+
+namespace X8664 {
+using Traits = ::Ice::X86Internal::MachineTraits<TargetX8664>;
+} // end of namespace X8664
+
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICETARGETLOWERINGX8664TRAITS_H