Adds the x86-64 assembler.

As part of this CL, x86-32 assembler tests are also introduced. They were implemented before the x86 base assembler template was modified for x86-64 support.

BUG=https://code.google.com/p/nativeclient/issues/detail?id=4077
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1224173006.
diff --git a/Makefile.standalone b/Makefile.standalone
index c39b926..746d216 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -168,7 +168,6 @@
 
 SRCS = \
 	IceAssembler.cpp \
-	IceAssemblerX8664.cpp \
 	IceBrowserCompileServer.cpp \
 	IceCfg.cpp \
 	IceCfgNode.cpp \
@@ -212,9 +211,24 @@
 
 UNITTEST_SRCS = \
 	BitcodeMunge.cpp \
-        IceAssemblerX8632Test.cpp \
 	IceELFSectionTest.cpp \
-	IceParseInstsTest.cpp
+	IceParseInstsTest.cpp \
+        AssemblerX8632/LowLevel.cpp \
+        AssemblerX8632/DataMov.cpp \
+        AssemblerX8632/Locked.cpp \
+        AssemblerX8632/GPRArith.cpp \
+        AssemblerX8632/XmmArith.cpp \
+        AssemblerX8632/ControlFlow.cpp \
+        AssemblerX8632/Other.cpp \
+        AssemblerX8632/X87.cpp \
+        AssemblerX8664/LowLevel.cpp \
+        AssemblerX8664/DataMov.cpp \
+        AssemblerX8664/Locked.cpp \
+        AssemblerX8664/GPRArith.cpp \
+        AssemblerX8664/XmmArith.cpp \
+        AssemblerX8664/ControlFlow.cpp \
+        AssemblerX8664/Other.cpp
+
 
 UNITTEST_OBJS = $(patsubst %.cpp, $(OBJDIR)/unittest/%.o, $(UNITTEST_SRCS))
 UNITTEST_LIB_OBJS = $(filter-out $(OBJDIR)/main.o,$(OBJS))
@@ -271,6 +285,7 @@
 		unittest/*.h src/*.h src/*.def
 	$(CXX) -c $(CXXFLAGS) \
 		-Isrc/ \
+                -Iunittest/ \
 		-I$(LLVM_SRC_PATH)/utils/unittest/googletest/include \
 		-I$(LLVM_SRC_PATH) \
 		-DGTEST_HAS_RTTI=0 -DGTEST_USE_OWN_TR1_TUPLE \
@@ -279,7 +294,7 @@
 $(OBJS): | $(OBJDIR)
 $(SB_OBJS): | $(SB_OBJDIR)
 
-$(UNITTEST_OBJS): | $(OBJDIR)/unittest
+$(UNITTEST_OBJS): | $(OBJDIR)/unittest $(OBJDIR)/unittest/AssemblerX8632 $(OBJDIR)/unittest/AssemblerX8664
 
 $(OBJDIR):
 	@mkdir -p $@
@@ -289,6 +304,11 @@
 $(OBJDIR)/unittest: $(OBJDIR)
 	@mkdir -p $@
 
+$(OBJDIR)/unittest/AssemblerX8632: $(OBJDIR)/unittest
+	@mkdir -p $@
+$(OBJDIR)/unittest/AssemblerX8664: $(OBJDIR)/unittest
+	@mkdir -p $@
+
 RT_SRC := runtime/szrt.c runtime/szrt_ll.ll runtime/szrt_profiler.c
 RT_OBJ := build/runtime/szrt_native_x8632.o build/runtime/szrt_sb_x8632.o \
 	build/runtime/szrt_native_arm32.o build/runtime/szrt_sb_arm32.o
diff --git a/src/IceAssemblerX8664.cpp b/src/IceAssemblerX8664.cpp
deleted file mode 100644
index 910924d..0000000
--- a/src/IceAssemblerX8664.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-//===- subzero/src/IceAssemblerX8664.cpp ----------------------------------===//
-//
-//                        The Subzero Code Generator
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements the Assembler class for x86-64.
-///
-//===----------------------------------------------------------------------===//
-
-#include "IceAssemblerX8664.h"
-
-namespace Ice {
-namespace X8664 {
-
-void AssemblerX8664::alignFunction() {
-  llvm::report_fatal_error("Not yet implemented");
-}
-
-void AssemblerX8664::padWithNop(intptr_t) {
-  llvm::report_fatal_error("Not yet implemented");
-}
-
-SizeT AssemblerX8664::getBundleAlignLog2Bytes() const {
-  llvm::report_fatal_error("Not yet implemented");
-}
-const char *AssemblerX8664::getNonExecPadDirective() const {
-  llvm::report_fatal_error("Not yet implemented");
-}
-
-llvm::ArrayRef<uint8_t> AssemblerX8664::getNonExecBundlePadding() const {
-  llvm::report_fatal_error("Not yet implemented");
-}
-
-void AssemblerX8664::bindCfgNodeLabel(SizeT) {
-  llvm::report_fatal_error("Not yet implemented");
-}
-
-bool AssemblerX8664::fixupIsPCRel(FixupKind) const {
-  llvm::report_fatal_error("Not yet implemented");
-}
-
-} // namespace X8664
-} // namespace Ice
diff --git a/src/IceAssemblerX8664.h b/src/IceAssemblerX8664.h
index f2ffd7f..5666810 100644
--- a/src/IceAssemblerX8664.h
+++ b/src/IceAssemblerX8664.h
@@ -1,4 +1,12 @@
-//===- subzero/src/IceAssemblerX8664.h - Assembler for x86-64 -*- C++ -*---===//
+//===- subzero/src/IceAssemblerX8664.h - Assembler for x86-64 ---*- C++ -*-===//
+//
+// Copyright (c) 2013, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+//
+// Modified by the Subzero authors.
+//
+//===----------------------------------------------------------------------===//
 //
 //                        The Subzero Code Generator
 //
@@ -16,33 +24,32 @@
 #define SUBZERO_SRC_ICEASSEMBLERX8664_H
 
 #include "IceAssembler.h"
+#include "IceAssemblerX86Base.h"
 #include "IceDefs.h"
+#include "IceOperand.h"
+#include "IceTargetLoweringX8664Traits.h"
+#include "IceTypes.h"
+#include "IceUtils.h"
 
 namespace Ice {
+
+class TargetX8664;
+
 namespace X8664 {
 
-class AssemblerX8664 final : public Assembler {
+using Immediate = ::Ice::X86Internal::Immediate;
+using Label = ::Ice::X86Internal::Label;
+
+class AssemblerX8664 : public X86Internal::AssemblerX86Base<TargetX8664> {
   AssemblerX8664(const AssemblerX8664 &) = delete;
   AssemblerX8664 &operator=(const AssemblerX8664 &) = delete;
 
 public:
   explicit AssemblerX8664(bool use_far_branches = false)
-      : Assembler(Asm_X8664) {
-    assert(!use_far_branches);
-    (void)use_far_branches;
-    llvm::report_fatal_error("Not yet implemented");
-  }
-
+      : X86Internal::AssemblerX86Base<TargetX8664>(Asm_X8664,
+                                                   use_far_branches) {}
   ~AssemblerX8664() override = default;
 
-  void alignFunction() override;
-  void padWithNop(intptr_t Padding) override;
-  SizeT getBundleAlignLog2Bytes() const override;
-  const char *getNonExecPadDirective() const override;
-  llvm::ArrayRef<uint8_t> getNonExecBundlePadding() const override;
-  void bindCfgNodeLabel(SizeT NodeNumber) override;
-  bool fixupIsPCRel(FixupKind Kind) const override;
-
   static bool classof(const Assembler *Asm) {
     return Asm->getKind() == Asm_X8664;
   }
diff --git a/src/IceAssemblerX86Base.h b/src/IceAssemblerX86Base.h
index 1db270a..4d0333a 100644
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -314,7 +314,7 @@
    */
   void call(typename Traits::GPRRegister reg);
   void call(const typename Traits::Address &address);
-  void call(const ConstantRelocatable *label);
+  void call(const ConstantRelocatable *label); // not testable.
   void call(const Immediate &abs_address);
 
   static const intptr_t kCallExternalLabelSize = 5;
@@ -324,7 +324,11 @@
   void popl(typename Traits::GPRRegister reg);
   void popl(const typename Traits::Address &address);
 
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::HasPusha>::type>
   void pushal();
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::HasPopa>::type>
   void popal();
 
   void setcc(typename Traits::Cond::BrCond condition,
@@ -332,7 +336,6 @@
   void setcc(typename Traits::Cond::BrCond condition,
              const typename Traits::Address &address);
 
-  // All mov() overloads are tested.
   void mov(Type Ty, typename Traits::GPRRegister dst, const Immediate &src);
   void mov(Type Ty, typename Traits::GPRRegister dst,
            typename Traits::GPRRegister src);
@@ -342,6 +345,8 @@
            typename Traits::GPRRegister src);
   void mov(Type Ty, const typename Traits::Address &dst, const Immediate &imm);
 
+  void movFromAh(const typename Traits::GPRRegister dst);
+
   void movzx(Type Ty, typename Traits::GPRRegister dst,
              typename Traits::GPRRegister src);
   void movzx(Type Ty, typename Traits::GPRRegister dst,
@@ -503,6 +508,7 @@
   void sqrtps(typename Traits::XmmRegister dst);
   void rsqrtps(typename Traits::XmmRegister dst);
   void reciprocalps(typename Traits::XmmRegister dst);
+
   void movhlps(typename Traits::XmmRegister dst,
                typename Traits::XmmRegister src);
   void movlhps(typename Traits::XmmRegister dst,
@@ -518,16 +524,12 @@
 
   void set1ps(typename Traits::XmmRegister dst,
               typename Traits::GPRRegister tmp, const Immediate &imm);
-  void shufps(typename Traits::XmmRegister dst,
-              typename Traits::XmmRegister src, const Immediate &mask);
 
   void minpd(typename Traits::XmmRegister dst,
              typename Traits::XmmRegister src);
   void maxpd(typename Traits::XmmRegister dst,
              typename Traits::XmmRegister src);
   void sqrtpd(typename Traits::XmmRegister dst);
-  void shufpd(typename Traits::XmmRegister dst,
-              typename Traits::XmmRegister src, const Immediate &mask);
 
   void pshufd(Type Ty, typename Traits::XmmRegister dst,
               typename Traits::XmmRegister src, const Immediate &mask);
@@ -606,8 +608,6 @@
 
   void pextr(Type Ty, typename Traits::GPRRegister dst,
              typename Traits::XmmRegister src, const Immediate &imm);
-  void pextr(Type Ty, typename Traits::GPRRegister dst,
-             const typename Traits::Address &src, const Immediate &imm);
 
   void pmovsxdq(typename Traits::XmmRegister dst,
                 typename Traits::XmmRegister src);
@@ -630,19 +630,49 @@
   void roundsd(typename Traits::XmmRegister dst,
                typename Traits::XmmRegister src, RoundingMode mode);
 
-  void fld(Type Ty, const typename Traits::Address &src);
-  void fstp(Type Ty, const typename Traits::Address &dst);
-  void fstp(typename Traits::X87STRegister st);
+  //----------------------------------------------------------------------------
+  //
+  // Begin: X87 instructions. Only available when Traits::UsesX87.
+  //
+  //----------------------------------------------------------------------------
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void fld(Type Ty, const typename T::Address &src);
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void fstp(Type Ty, const typename T::Address &dst);
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void fstp(typename T::X87STRegister st);
 
-  void fnstcw(const typename Traits::Address &dst);
-  void fldcw(const typename Traits::Address &src);
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void fnstcw(const typename T::Address &dst);
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void fldcw(const typename T::Address &src);
 
-  void fistpl(const typename Traits::Address &dst);
-  void fistps(const typename Traits::Address &dst);
-  void fildl(const typename Traits::Address &src);
-  void filds(const typename Traits::Address &src);
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void fistpl(const typename T::Address &dst);
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void fistps(const typename T::Address &dst);
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void fildl(const typename T::Address &src);
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
+  void filds(const typename T::Address &src);
 
+  template <typename T = Traits,
+            typename = typename std::enable_if<T::UsesX87>::type>
   void fincstp();
+  //----------------------------------------------------------------------------
+  //
+  // End: X87 instructions.
+  //
+  //----------------------------------------------------------------------------
 
   void cmp(Type Ty, typename Traits::GPRRegister reg0,
            typename Traits::GPRRegister reg1);
@@ -754,9 +784,13 @@
   void mul(Type Ty, typename Traits::GPRRegister reg);
   void mul(Type Ty, const typename Traits::Address &address);
 
+  template <class T = Traits,
+            typename = typename std::enable_if<!T::Is64Bit>::type>
   void incl(typename Traits::GPRRegister reg);
   void incl(const typename Traits::Address &address);
 
+  template <class T = Traits,
+            typename = typename std::enable_if<!T::Is64Bit>::type>
   void decl(typename Traits::GPRRegister reg);
   void decl(const typename Traits::Address &address);
 
@@ -825,16 +859,14 @@
   void ud2();
 
   // j(Label) is fully tested.
-  // j(ConstantRelocatable) is not tested as the test can not easily create such
-  // an argument.
   void j(typename Traits::Cond::BrCond condition, Label *label,
          bool near = kFarJump);
   void j(typename Traits::Cond::BrCond condition,
-         const ConstantRelocatable *label);
+         const ConstantRelocatable *label); // not testable.
 
   void jmp(typename Traits::GPRRegister reg);
   void jmp(Label *label, bool near = kFarJump);
-  void jmp(const ConstantRelocatable *label);
+  void jmp(const ConstantRelocatable *label); // not testable.
 
   void mfence();
 
@@ -855,12 +887,20 @@
 
   intptr_t CodeSize() const { return Buffer.size(); }
 
-private:
+protected:
   inline void emitUint8(uint8_t value);
+
+private:
+  static constexpr Type RexTypeIrrelevant = IceType_i32;
+  static constexpr Type IceType_ForceRexW = IceType_i64;
+  static constexpr typename Traits::GPRRegister RexRegIrrelevant =
+      Traits::GPRRegister::Encoded_Reg_eax;
+
   inline void emitInt16(int16_t value);
   inline void emitInt32(int32_t value);
   inline void emitRegisterOperand(int rm, int reg);
-  inline void emitXmmRegisterOperand(int rm, typename Traits::XmmRegister reg);
+  template <typename RegType, typename RmType>
+  inline void emitXmmRegisterOperand(RegType reg, RmType rm);
   inline void emitFixup(AssemblerFixup *fixup);
   inline void emitOperandSizeOverride();
 
@@ -910,6 +950,100 @@
   template <uint32_t Tag>
   void arith_int(Type Ty, const typename Traits::Address &address,
                  const Immediate &imm);
+
+  // gprEncoding returns Reg encoding for operand emission. For x86-64 we mask
+  // out the 4th bit as it is encoded in the REX.[RXB] bits. No other bits are
+  // touched because we don't want to mask errors.
+  template <typename RegType, typename T = Traits>
+  typename std::enable_if<T::Is64Bit, typename T::GPRRegister>::type
+  gprEncoding(const RegType Reg) {
+    return static_cast<typename Traits::GPRRegister>(static_cast<uint8_t>(Reg) &
+                                                     ~0x08);
+  }
+
+  template <typename RegType, typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, typename T::GPRRegister>::type
+  gprEncoding(const RegType Reg) {
+    return static_cast<typename T::GPRRegister>(Reg);
+  }
+
+  template <typename RegType>
+  bool is8BitRegisterRequiringRex(const Type Ty, const RegType Reg) {
+    static constexpr bool IsGPR =
+        std::is_same<typename std::decay<RegType>::type,
+                     typename Traits::ByteRegister>::value ||
+        std::is_same<typename std::decay<RegType>::type,
+                     typename Traits::GPRRegister>::value;
+
+    return IsGPR && (Reg & 0x04) != 0 && (Reg & 0x08) == 0 &&
+           isByteSizedArithType(Ty);
+  };
+
+  // assembleAndEmitRex is used for determining which (if any) rex prefix should
+  // be emitted for the current instruction. It allows different types for Reg
+  // and Rm because they could be of different types (e.g., in mov[sz]x
+  // instructions.) If Addr is not nullptr, then Rm is ignored, and Rex.B is
+  // determined by Addr instead. TyRm is still used to determine Addr's size.
+  template <typename RegType, typename RmType, typename T = Traits>
+  typename std::enable_if<T::Is64Bit, void>::type
+  assembleAndEmitRex(const Type TyReg, const RegType Reg, const Type TyRm,
+                     const RmType Rm,
+                     const typename T::Address *Addr = nullptr) {
+    const uint8_t W = (TyReg == IceType_i64 || TyRm == IceType_i64)
+                          ? T::Operand::RexW
+                          : T::Operand::RexNone;
+    const uint8_t R = (Reg & 0x08) ? T::Operand::RexR : T::Operand::RexNone;
+    const uint8_t X = (Addr != nullptr) ? Addr->rexX() : T::Operand::RexNone;
+    const uint8_t B =
+        (Addr != nullptr) ? Addr->rexB() : (Rm & 0x08) ? T::Operand::RexB
+                                                       : T::Operand::RexNone;
+    const uint8_t Prefix = W | R | X | B;
+    if (Prefix != T::Operand::RexNone) {
+      emitUint8(Prefix);
+    } else if (is8BitRegisterRequiringRex(TyReg, Reg) ||
+               (Addr == nullptr && is8BitRegisterRequiringRex(TyRm, Rm))) {
+      emitUint8(T::Operand::RexBase);
+    }
+  }
+
+  template <typename RegType, typename RmType, typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, void>::type
+  assembleAndEmitRex(const Type, const RegType, const Type, const RmType,
+                     const typename T::Address * = nullptr) {}
+
+  // emitRexRB is used for emitting a Rex prefix for instructions with two
+  // explicit register operands in their mod-rm byte.
+  template <typename RegType, typename RmType>
+  void emitRexRB(const Type Ty, const RegType Reg, const RmType Rm) {
+    assembleAndEmitRex(Ty, Reg, Ty, Rm);
+  }
+
+  template <typename RegType, typename RmType>
+  void emitRexRB(const Type TyReg, const RegType Reg, const Type TyRm,
+                 const RmType Rm) {
+    assembleAndEmitRex(TyReg, Reg, TyRm, Rm);
+  }
+
+  // emitRexB is used for emitting a Rex prefix if one is needed on encoding Rm,
+  // the single register operand of an x86 instruction. It is invoked by the
+  // template when the instruction has only that operand (e.g., push Reg.)
+  template <typename RmType> void emitRexB(const Type Ty, const RmType Rm) {
+    emitRexRB(Ty, RexRegIrrelevant, Ty, Rm);
+  }
+
+  // emitRex is used for emitting a Rex prefix for an address and a GPR. The
+  // address may contain zero, one, or two registers.
+  template <typename RegType>
+  void emitRex(const Type Ty, const typename Traits::Address &Addr,
+               const RegType Reg) {
+    assembleAndEmitRex(Ty, Reg, Ty, RexRegIrrelevant, &Addr);
+  }
+
+  template <typename RegType>
+  void emitRex(const Type AddrTy, const typename Traits::Address &Addr,
+               const Type TyReg, const RegType Reg) {
+    assembleAndEmitRex(TyReg, Reg, AddrTy, RexRegIrrelevant, &Addr);
+  }
 };
 
 template <class Machine>
@@ -928,15 +1062,17 @@
 }
 
 template <class Machine>
-inline void AssemblerX86Base<Machine>::emitRegisterOperand(int rm, int reg) {
+inline void AssemblerX86Base<Machine>::emitRegisterOperand(int reg, int rm) {
+  assert(reg >= 0 && reg < 8);
   assert(rm >= 0 && rm < 8);
-  Buffer.emit<uint8_t>(0xC0 + (rm << 3) + reg);
+  Buffer.emit<uint8_t>(0xC0 + (reg << 3) + rm);
 }
 
 template <class Machine>
-inline void AssemblerX86Base<Machine>::emitXmmRegisterOperand(
-    int rm, typename Traits::XmmRegister reg) {
-  emitRegisterOperand(rm, static_cast<typename Traits::GPRRegister>(reg));
+template <typename RegType, typename RmType>
+inline void AssemblerX86Base<Machine>::emitXmmRegisterOperand(RegType reg,
+                                                              RmType rm) {
+  emitRegisterOperand(gprEncoding(reg), gprEncoding(rm));
 }
 
 template <class Machine>
diff --git a/src/IceAssemblerX86BaseImpl.h b/src/IceAssemblerX86BaseImpl.h
index f8ba4d4..0661c9f 100644
--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -96,13 +96,15 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::call(typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexB(RexTypeIrrelevant, reg);
   emitUint8(0xFF);
-  emitRegisterOperand(2, reg);
+  emitRegisterOperand(2, gprEncoding(reg));
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::call(const typename Traits::Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, address, RexRegIrrelevant);
   emitUint8(0xFF);
   emitOperand(2, address);
 }
@@ -132,28 +134,37 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::pushl(typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  emitUint8(0x50 + reg);
+  emitRexB(RexTypeIrrelevant, reg);
+  emitUint8(0x50 + gprEncoding(reg));
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::popl(typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  emitUint8(0x58 + reg);
+  // Any type that would not force a REX prefix to be emitted can be provided
+  // here.
+  emitRexB(RexTypeIrrelevant, reg);
+  emitUint8(0x58 + gprEncoding(reg));
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::popl(const typename Traits::Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, address, RexRegIrrelevant);
   emitUint8(0x8F);
   emitOperand(0, address);
 }
 
-template <class Machine> void AssemblerX86Base<Machine>::pushal() {
+template <class Machine>
+template <typename, typename>
+void AssemblerX86Base<Machine>::pushal() {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x60);
 }
 
-template <class Machine> void AssemblerX86Base<Machine>::popal() {
+template <class Machine>
+template <typename, typename>
+void AssemblerX86Base<Machine>::popal() {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x61);
 }
@@ -162,15 +173,17 @@
 void AssemblerX86Base<Machine>::setcc(typename Traits::Cond::BrCond condition,
                                       typename Traits::ByteRegister dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexB(IceType_i8, dst);
   emitUint8(0x0F);
   emitUint8(0x90 + condition);
-  emitUint8(0xC0 + dst);
+  emitUint8(0xC0 + gprEncoding(dst));
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::setcc(typename Traits::Cond::BrCond condition,
                                       const typename Traits::Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, address, RexRegIrrelevant);
   emitUint8(0x0F);
   emitUint8(0x90 + condition);
   emitOperand(0, address);
@@ -179,16 +192,18 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::mov(Type Ty, typename Traits::GPRRegister dst,
                                     const Immediate &imm) {
+  assert(Ty != IceType_i64 && "i64 not supported yet.");
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  if (isByteSizedType(Ty)) {
-    emitUint8(0xB0 + dst);
-    emitUint8(imm.value() & 0xFF);
-    return;
-  }
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
-  emitUint8(0xB8 + dst);
-  emitImmediate(Ty, imm);
+  emitRexB(Ty, dst);
+  if (isByteSizedType(Ty)) {
+    emitUint8(0xB0 + gprEncoding(dst));
+    emitUint8(imm.value() & 0xFF);
+  } else {
+    emitUint8(0xB8 + gprEncoding(dst));
+    emitImmediate(Ty, imm);
+  }
 }
 
 template <class Machine>
@@ -197,12 +212,13 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, src, dst);
   if (isByteSizedType(Ty)) {
     emitUint8(0x88);
   } else {
     emitUint8(0x89);
   }
-  emitRegisterOperand(src, dst);
+  emitRegisterOperand(gprEncoding(src), gprEncoding(dst));
 }
 
 template <class Machine>
@@ -211,12 +227,13 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, src, dst);
   if (isByteSizedType(Ty)) {
     emitUint8(0x8A);
   } else {
     emitUint8(0x8B);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -226,21 +243,24 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, dst, src);
   if (isByteSizedType(Ty)) {
     emitUint8(0x88);
   } else {
     emitUint8(0x89);
   }
-  emitOperand(src, dst);
+  emitOperand(gprEncoding(src), dst);
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::mov(Type Ty,
                                     const typename Traits::Address &dst,
                                     const Immediate &imm) {
+  assert(Ty != IceType_i64 && "i64 not supported yet.");
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, dst, RexRegIrrelevant);
   if (isByteSizedType(Ty)) {
     emitUint8(0xC6);
     emitOperand(0, dst);
@@ -259,9 +279,10 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   bool ByteSized = isByteSizedType(SrcTy);
   assert(ByteSized || SrcTy == IceType_i16);
+  emitRexRB(RexTypeIrrelevant, dst, SrcTy, src);
   emitUint8(0x0F);
   emitUint8(ByteSized ? 0xB6 : 0xB7);
-  emitRegisterOperand(dst, src);
+  emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
 }
 
 template <class Machine>
@@ -271,9 +292,10 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   bool ByteSized = isByteSizedType(SrcTy);
   assert(ByteSized || SrcTy == IceType_i16);
+  emitRex(SrcTy, src, RexTypeIrrelevant, dst);
   emitUint8(0x0F);
   emitUint8(ByteSized ? 0xB6 : 0xB7);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -282,10 +304,15 @@
                                       typename Traits::GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   bool ByteSized = isByteSizedType(SrcTy);
-  assert(ByteSized || SrcTy == IceType_i16);
-  emitUint8(0x0F);
-  emitUint8(ByteSized ? 0xBE : 0xBF);
-  emitRegisterOperand(dst, src);
+  emitRexRB(IceType_ForceRexW, dst, SrcTy, src);
+  if (ByteSized || SrcTy == IceType_i16) {
+    emitUint8(0x0F);
+    emitUint8(ByteSized ? 0xBE : 0xBF);
+  } else {
+    assert(Traits::Is64Bit && SrcTy == IceType_i32);
+    emitUint8(0x63);
+  }
+  emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
 }
 
 template <class Machine>
@@ -294,10 +321,15 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   bool ByteSized = isByteSizedType(SrcTy);
-  assert(ByteSized || SrcTy == IceType_i16);
-  emitUint8(0x0F);
-  emitUint8(ByteSized ? 0xBE : 0xBF);
-  emitOperand(dst, src);
+  emitRex(SrcTy, src, IceType_ForceRexW, dst);
+  if (ByteSized || SrcTy == IceType_i16) {
+    emitUint8(0x0F);
+    emitUint8(ByteSized ? 0xBE : 0xBF);
+  } else {
+    assert(Traits::Is64Bit && SrcTy == IceType_i32);
+    emitUint8(0x63);
+  }
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -307,8 +339,9 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, src, dst);
   emitUint8(0x8D);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -321,9 +354,10 @@
     emitOperandSizeOverride();
   else
     assert(Ty == IceType_i32);
+  emitRexRB(Ty, dst, src);
   emitUint8(0x0F);
   emitUint8(0x40 + cond);
-  emitRegisterOperand(dst, src);
+  emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
 }
 
 template <class Machine>
@@ -336,9 +370,10 @@
     emitOperandSizeOverride();
   else
     assert(Ty == IceType_i32);
+  emitRex(Ty, src, dst);
   emitUint8(0x0F);
   emitUint8(0x40 + cond);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine> void AssemblerX86Base<Machine>::rep_movsb() {
@@ -352,9 +387,10 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x10);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -363,9 +399,10 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x11);
-  emitOperand(src, dst);
+  emitOperand(gprEncoding(src), dst);
 }
 
 template <class Machine>
@@ -373,6 +410,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x11);
   emitXmmRegisterOperand(src, dst);
@@ -383,9 +421,10 @@
                                      typename Traits::GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x6E);
-  emitRegisterOperand(dst, src);
+  emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
 }
 
 template <class Machine>
@@ -393,9 +432,10 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x6E);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -403,9 +443,10 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x7E);
-  emitRegisterOperand(src, dst);
+  emitRegisterOperand(gprEncoding(src), gprEncoding(dst));
 }
 
 template <class Machine>
@@ -413,9 +454,10 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x7E);
-  emitOperand(src, dst);
+  emitOperand(gprEncoding(src), dst);
 }
 
 template <class Machine>
@@ -423,9 +465,10 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xF3);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x7E);
-  emitRegisterOperand(dst, src);
+  emitXmmRegisterOperand(dst, src);
 }
 
 template <class Machine>
@@ -433,9 +476,10 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0xD6);
-  emitOperand(src, dst);
+  emitOperand(gprEncoding(src), dst);
 }
 
 template <class Machine>
@@ -443,9 +487,10 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xF3);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x7E);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -453,6 +498,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x58);
   emitXmmRegisterOperand(dst, src);
@@ -463,9 +509,10 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x58);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -473,6 +520,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5C);
   emitXmmRegisterOperand(dst, src);
@@ -483,9 +531,10 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x5C);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -493,6 +542,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x59);
   emitXmmRegisterOperand(dst, src);
@@ -503,9 +553,10 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x59);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -513,6 +564,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5E);
   emitXmmRegisterOperand(dst, src);
@@ -523,29 +575,31 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x5E);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::fld(Type Ty,
-                                    const typename Traits::Address &src) {
+template <typename T, typename>
+void AssemblerX86Base<Machine>::fld(Type Ty, const typename T::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xD9 : 0xDD);
   emitOperand(0, src);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::fstp(Type Ty,
-                                     const typename Traits::Address &dst) {
+template <typename T, typename>
+void AssemblerX86Base<Machine>::fstp(Type Ty, const typename T::Address &dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xD9 : 0xDD);
   emitOperand(3, dst);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::fstp(typename Traits::X87STRegister st) {
+template <typename T, typename>
+void AssemblerX86Base<Machine>::fstp(typename T::X87STRegister st) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xDD);
   emitUint8(0xD8 + st);
@@ -555,6 +609,7 @@
 void AssemblerX86Base<Machine>::movaps(typename Traits::XmmRegister dst,
                                        typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x28);
   emitXmmRegisterOperand(dst, src);
@@ -564,27 +619,30 @@
 void AssemblerX86Base<Machine>::movups(typename Traits::XmmRegister dst,
                                        typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x10);
-  emitRegisterOperand(dst, src);
+  emitXmmRegisterOperand(dst, src);
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::movups(typename Traits::XmmRegister dst,
                                        const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x10);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::movups(const typename Traits::Address &dst,
                                        typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x11);
-  emitOperand(src, dst);
+  emitOperand(gprEncoding(src), dst);
 }
 
 template <class Machine>
@@ -592,6 +650,7 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty)) {
     emitUint8(0xFC);
@@ -608,6 +667,7 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty)) {
     emitUint8(0xFC);
@@ -616,7 +676,7 @@
   } else {
     emitUint8(0xFE);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -625,6 +685,7 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0xDB);
   emitXmmRegisterOperand(dst, src);
@@ -636,9 +697,10 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0xDB);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -647,6 +709,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0xDF);
   emitXmmRegisterOperand(dst, src);
@@ -658,9 +721,10 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0xDF);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -668,6 +732,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0xD5);
@@ -684,6 +749,7 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0xD5);
@@ -692,7 +758,7 @@
     emitUint8(0x38);
     emitUint8(0x40);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -701,6 +767,7 @@
                                         typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0xF4);
   emitXmmRegisterOperand(dst, src);
@@ -712,9 +779,10 @@
                                         const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0xF4);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -723,6 +791,7 @@
                                     typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0xEB);
   emitXmmRegisterOperand(dst, src);
@@ -734,9 +803,10 @@
                                     const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0xEB);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -744,6 +814,7 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty)) {
     emitUint8(0xF8);
@@ -760,6 +831,7 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty)) {
     emitUint8(0xF8);
@@ -768,7 +840,7 @@
   } else {
     emitUint8(0xFA);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -777,6 +849,7 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0xEF);
   emitXmmRegisterOperand(dst, src);
@@ -788,9 +861,10 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0xEF);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -798,6 +872,7 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0xF1);
@@ -813,6 +888,7 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0xF1);
@@ -820,7 +896,7 @@
     assert(Ty == IceType_i32);
     emitUint8(0xF2);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -829,6 +905,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   assert(imm.is_int8());
   emitUint8(0x66);
+  emitRexB(RexTypeIrrelevant, dst);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0x71);
@@ -836,7 +913,7 @@
     assert(Ty == IceType_i32);
     emitUint8(0x72);
   }
-  emitRegisterOperand(6, dst);
+  emitRegisterOperand(6, gprEncoding(dst));
   emitUint8(imm.value() & 0xFF);
 }
 
@@ -845,6 +922,7 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0xE1);
@@ -860,6 +938,7 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0xE1);
@@ -867,7 +946,7 @@
     assert(Ty == IceType_i32);
     emitUint8(0xE2);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -876,6 +955,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   assert(imm.is_int8());
   emitUint8(0x66);
+  emitRexB(RexTypeIrrelevant, dst);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0x71);
@@ -883,7 +963,7 @@
     assert(Ty == IceType_i32);
     emitUint8(0x72);
   }
-  emitRegisterOperand(4, dst);
+  emitRegisterOperand(4, gprEncoding(dst));
   emitUint8(imm.value() & 0xFF);
 }
 
@@ -892,6 +972,7 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0xD1);
@@ -909,6 +990,7 @@
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0xD1);
@@ -918,7 +1000,7 @@
     assert(Ty == IceType_i32 || Ty == IceType_f32 || Ty == IceType_v4f32);
     emitUint8(0xD2);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -927,6 +1009,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   assert(imm.is_int8());
   emitUint8(0x66);
+  emitRexB(RexTypeIrrelevant, dst);
   emitUint8(0x0F);
   if (Ty == IceType_i16) {
     emitUint8(0x71);
@@ -936,7 +1019,7 @@
     assert(Ty == IceType_i32 || Ty == IceType_f32 || Ty == IceType_v4f32);
     emitUint8(0x72);
   }
-  emitRegisterOperand(2, dst);
+  emitRegisterOperand(2, gprEncoding(dst));
   emitUint8(imm.value() & 0xFF);
 }
 
@@ -949,6 +1032,7 @@
                                       typename Traits::XmmRegister dst,
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x58);
   emitXmmRegisterOperand(dst, src);
@@ -959,9 +1043,10 @@
                                       typename Traits::XmmRegister dst,
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x58);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -969,6 +1054,7 @@
                                       typename Traits::XmmRegister dst,
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5C);
   emitXmmRegisterOperand(dst, src);
@@ -979,9 +1065,10 @@
                                       typename Traits::XmmRegister dst,
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x5C);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -989,6 +1076,7 @@
                                       typename Traits::XmmRegister dst,
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5E);
   emitXmmRegisterOperand(dst, src);
@@ -999,9 +1087,10 @@
                                       typename Traits::XmmRegister dst,
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x5E);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1009,6 +1098,7 @@
                                       typename Traits::XmmRegister dst,
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x59);
   emitXmmRegisterOperand(dst, src);
@@ -1019,15 +1109,17 @@
                                       typename Traits::XmmRegister dst,
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x59);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::minps(typename Traits::XmmRegister dst,
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5D);
   emitXmmRegisterOperand(dst, src);
@@ -1037,6 +1129,7 @@
 void AssemblerX86Base<Machine>::maxps(typename Traits::XmmRegister dst,
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5F);
   emitXmmRegisterOperand(dst, src);
@@ -1046,6 +1139,7 @@
 void AssemblerX86Base<Machine>::andps(typename Traits::XmmRegister dst,
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x54);
   emitXmmRegisterOperand(dst, src);
@@ -1055,15 +1149,17 @@
 void AssemblerX86Base<Machine>::andps(typename Traits::XmmRegister dst,
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x54);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::orps(typename Traits::XmmRegister dst,
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x56);
   emitXmmRegisterOperand(dst, src);
@@ -1075,6 +1171,7 @@
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x38);
   emitUint8(0x14);
@@ -1087,10 +1184,11 @@
                                          const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x38);
   emitUint8(0x14);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1099,6 +1197,7 @@
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x38);
   emitUint8(0x10);
@@ -1111,10 +1210,11 @@
                                          const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x38);
   emitUint8(0x10);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1122,6 +1222,7 @@
     typename Traits::XmmRegister dst, typename Traits::XmmRegister src,
     typename Traits::Cond::CmppsCond CmpCondition) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0xC2);
   emitXmmRegisterOperand(dst, src);
@@ -1133,15 +1234,17 @@
     typename Traits::XmmRegister dst, const typename Traits::Address &src,
     typename Traits::Cond::CmppsCond CmpCondition) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0xC2);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
   emitUint8(CmpCondition);
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::sqrtps(typename Traits::XmmRegister dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, dst);
   emitUint8(0x0F);
   emitUint8(0x51);
   emitXmmRegisterOperand(dst, dst);
@@ -1150,6 +1253,7 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::rsqrtps(typename Traits::XmmRegister dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, dst);
   emitUint8(0x0F);
   emitUint8(0x52);
   emitXmmRegisterOperand(dst, dst);
@@ -1158,6 +1262,7 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::reciprocalps(typename Traits::XmmRegister dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, dst);
   emitUint8(0x0F);
   emitUint8(0x53);
   emitXmmRegisterOperand(dst, dst);
@@ -1167,6 +1272,7 @@
 void AssemblerX86Base<Machine>::movhlps(typename Traits::XmmRegister dst,
                                         typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x12);
   emitXmmRegisterOperand(dst, src);
@@ -1176,6 +1282,7 @@
 void AssemblerX86Base<Machine>::movlhps(typename Traits::XmmRegister dst,
                                         typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x16);
   emitXmmRegisterOperand(dst, src);
@@ -1185,6 +1292,7 @@
 void AssemblerX86Base<Machine>::unpcklps(typename Traits::XmmRegister dst,
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x14);
   emitXmmRegisterOperand(dst, src);
@@ -1194,6 +1302,7 @@
 void AssemblerX86Base<Machine>::unpckhps(typename Traits::XmmRegister dst,
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x15);
   emitXmmRegisterOperand(dst, src);
@@ -1204,6 +1313,7 @@
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x14);
   emitXmmRegisterOperand(dst, src);
@@ -1214,6 +1324,7 @@
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x15);
   emitXmmRegisterOperand(dst, src);
@@ -1228,19 +1339,7 @@
   // Move value from tmp1 into dst.
   movd(dst, tmp1);
   // Broadcast low lane into other three lanes.
-  shufps(dst, dst, Immediate(0x0));
-}
-
-template <class Machine>
-void AssemblerX86Base<Machine>::shufps(typename Traits::XmmRegister dst,
-                                       typename Traits::XmmRegister src,
-                                       const Immediate &imm) {
-  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  emitUint8(0x0F);
-  emitUint8(0xC6);
-  emitXmmRegisterOperand(dst, src);
-  assert(imm.is_uint8());
-  emitUint8(imm.value());
+  shufps(RexTypeIrrelevant, dst, dst, Immediate(0x0));
 }
 
 template <class Machine>
@@ -1250,6 +1349,7 @@
                                        const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x70);
   emitXmmRegisterOperand(dst, src);
@@ -1264,9 +1364,10 @@
                                        const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x70);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
   assert(imm.is_uint8());
   emitUint8(imm.value());
 }
@@ -1277,6 +1378,7 @@
                                        typename Traits::XmmRegister src,
                                        const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0xC6);
   emitXmmRegisterOperand(dst, src);
@@ -1290,9 +1392,10 @@
                                        const typename Traits::Address &src,
                                        const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0xC6);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
   assert(imm.is_uint8());
   emitUint8(imm.value());
 }
@@ -1302,6 +1405,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5D);
   emitXmmRegisterOperand(dst, src);
@@ -1312,6 +1416,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5F);
   emitXmmRegisterOperand(dst, src);
@@ -1321,29 +1426,18 @@
 void AssemblerX86Base<Machine>::sqrtpd(typename Traits::XmmRegister dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, dst);
   emitUint8(0x0F);
   emitUint8(0x51);
   emitXmmRegisterOperand(dst, dst);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::shufpd(typename Traits::XmmRegister dst,
-                                       typename Traits::XmmRegister src,
-                                       const Immediate &imm) {
-  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  emitUint8(0x66);
-  emitUint8(0x0F);
-  emitUint8(0xC6);
-  emitXmmRegisterOperand(dst, src);
-  assert(imm.is_uint8());
-  emitUint8(imm.value());
-}
-
-template <class Machine>
 void AssemblerX86Base<Machine>::cvtdq2ps(Type /* Ignore */,
                                          typename Traits::XmmRegister dst,
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5B);
   emitXmmRegisterOperand(dst, src);
@@ -1354,9 +1448,10 @@
                                          typename Traits::XmmRegister dst,
                                          const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x5B);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1365,6 +1460,7 @@
                                           typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xF3);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5B);
   emitXmmRegisterOperand(dst, src);
@@ -1376,9 +1472,10 @@
                                           const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xF3);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x5B);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1387,9 +1484,10 @@
                                          typename Traits::GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(DestTy) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x2A);
-  emitRegisterOperand(dst, src);
+  emitXmmRegisterOperand(dst, src);
 }
 
 template <class Machine>
@@ -1398,9 +1496,10 @@
                                          const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(DestTy) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x2A);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1410,6 +1509,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   // ss2sd or sd2ss
   emitUint8(isFloat32Asserting32Or64(SrcTy) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x5A);
   emitXmmRegisterOperand(dst, src);
@@ -1421,9 +1521,10 @@
     const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(SrcTy) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x5A);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1432,6 +1533,7 @@
                                           typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(SrcTy) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x2C);
   emitXmmRegisterOperand(dst, src);
@@ -1443,9 +1545,10 @@
                                           const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(SrcTy) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x2C);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1454,6 +1557,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_f64)
     emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, a, b);
   emitUint8(0x0F);
   emitUint8(0x2E);
   emitXmmRegisterOperand(a, b);
@@ -1465,9 +1569,10 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_f64)
     emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, b, a);
   emitUint8(0x0F);
   emitUint8(0x2E);
-  emitOperand(a, b);
+  emitOperand(gprEncoding(a), b);
 }
 
 template <class Machine>
@@ -1475,6 +1580,7 @@
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x50);
   emitXmmRegisterOperand(dst, src);
@@ -1484,6 +1590,7 @@
 void AssemblerX86Base<Machine>::movmskps(typename Traits::GPRRegister dst,
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x50);
   emitXmmRegisterOperand(dst, src);
@@ -1495,9 +1602,10 @@
                                        const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x51);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1506,6 +1614,7 @@
                                        typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(Ty) ? 0xF3 : 0xF2);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x51);
   emitXmmRegisterOperand(dst, src);
@@ -1516,9 +1625,10 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x57);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1526,6 +1636,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x57);
   emitXmmRegisterOperand(dst, src);
@@ -1536,6 +1647,7 @@
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x56);
   emitXmmRegisterOperand(dst, src);
@@ -1545,15 +1657,17 @@
 void AssemblerX86Base<Machine>::xorps(typename Traits::XmmRegister dst,
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x57);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::xorps(typename Traits::XmmRegister dst,
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x57);
   emitXmmRegisterOperand(dst, src);
@@ -1564,9 +1678,10 @@
                                       const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x54);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1574,6 +1689,7 @@
                                       typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x54);
   emitXmmRegisterOperand(dst, src);
@@ -1589,6 +1705,7 @@
   assert(isVectorFloatingType(Ty));
   (void)Ty;
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x3A);
   emitUint8(0x21);
@@ -1606,10 +1723,11 @@
   assert(isVectorFloatingType(Ty));
   (void)Ty;
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   emitUint8(0x3A);
   emitUint8(0x21);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
   emitUint8(imm.value());
 }
 
@@ -1619,20 +1737,17 @@
                                       const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   assert(imm.is_uint8());
+  emitUint8(0x66);
+  emitRexRB(Ty, dst, src);
+  emitUint8(0x0F);
   if (Ty == IceType_i16) {
-    emitUint8(0x66);
-    emitUint8(0x0F);
     emitUint8(0xC4);
-    emitXmmRegisterOperand(dst, typename Traits::XmmRegister(src));
-    emitUint8(imm.value());
   } else {
-    emitUint8(0x66);
-    emitUint8(0x0F);
     emitUint8(0x3A);
     emitUint8(isByteSizedType(Ty) ? 0x20 : 0x22);
-    emitXmmRegisterOperand(dst, typename Traits::XmmRegister(src));
-    emitUint8(imm.value());
   }
+  emitXmmRegisterOperand(dst, src);
+  emitUint8(imm.value());
 }
 
 template <class Machine>
@@ -1641,20 +1756,17 @@
                                       const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   assert(imm.is_uint8());
+  emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
+  emitUint8(0x0F);
   if (Ty == IceType_i16) {
-    emitUint8(0x66);
-    emitUint8(0x0F);
     emitUint8(0xC4);
-    emitOperand(dst, src);
-    emitUint8(imm.value());
   } else {
-    emitUint8(0x66);
-    emitUint8(0x0F);
     emitUint8(0x3A);
     emitUint8(isByteSizedType(Ty) ? 0x20 : 0x22);
-    emitOperand(dst, src);
-    emitUint8(imm.value());
   }
+  emitOperand(gprEncoding(dst), src);
+  emitUint8(imm.value());
 }
 
 template <class Machine>
@@ -1665,18 +1777,20 @@
   assert(imm.is_uint8());
   if (Ty == IceType_i16) {
     emitUint8(0x66);
+    emitRexRB(Ty, dst, src);
     emitUint8(0x0F);
     emitUint8(0xC5);
-    emitXmmRegisterOperand(typename Traits::XmmRegister(dst), src);
+    emitXmmRegisterOperand(dst, src);
     emitUint8(imm.value());
   } else {
     emitUint8(0x66);
+    emitRexRB(Ty, src, dst);
     emitUint8(0x0F);
     emitUint8(0x3A);
     emitUint8(isByteSizedType(Ty) ? 0x14 : 0x16);
     // SSE 4.1 versions are "MRI" because dst can be mem, while
     // pextrw (SSE2) is RMI because dst must be reg.
-    emitXmmRegisterOperand(src, typename Traits::XmmRegister(dst));
+    emitXmmRegisterOperand(src, dst);
     emitUint8(imm.value());
   }
 }
@@ -1686,6 +1800,7 @@
                                          typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x38);
   emitUint8(0x25);
@@ -1698,6 +1813,7 @@
                                        typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty)) {
     emitUint8(0x74);
@@ -1715,6 +1831,7 @@
                                        const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty)) {
     emitUint8(0x74);
@@ -1723,7 +1840,7 @@
   } else {
     emitUint8(0x76);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1732,6 +1849,7 @@
                                        typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty)) {
     emitUint8(0x64);
@@ -1749,6 +1867,7 @@
                                        const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRex(RexTypeIrrelevant, src, dst);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty)) {
     emitUint8(0x64);
@@ -1757,7 +1876,7 @@
   } else {
     emitUint8(0x66);
   }
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -1766,6 +1885,7 @@
                                         RoundingMode mode) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
+  emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x3A);
   emitUint8(0x0B);
@@ -1775,48 +1895,56 @@
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::fnstcw(const typename Traits::Address &dst) {
+template <typename T, typename>
+void AssemblerX86Base<Machine>::fnstcw(const typename T::Address &dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xD9);
   emitOperand(7, dst);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::fldcw(const typename Traits::Address &src) {
+template <typename T, typename>
+void AssemblerX86Base<Machine>::fldcw(const typename T::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xD9);
   emitOperand(5, src);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::fistpl(const typename Traits::Address &dst) {
+template <typename T, typename>
+void AssemblerX86Base<Machine>::fistpl(const typename T::Address &dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xDF);
   emitOperand(7, dst);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::fistps(const typename Traits::Address &dst) {
+template <typename T, typename>
+void AssemblerX86Base<Machine>::fistps(const typename T::Address &dst) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xDB);
   emitOperand(3, dst);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::fildl(const typename Traits::Address &src) {
+template <typename T, typename>
+void AssemblerX86Base<Machine>::fildl(const typename T::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xDF);
   emitOperand(5, src);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::filds(const typename Traits::Address &src) {
+template <typename T, typename>
+void AssemblerX86Base<Machine>::filds(const typename T::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xDB);
   emitOperand(0, src);
 }
 
-template <class Machine> void AssemblerX86Base<Machine>::fincstp() {
+template <class Machine>
+template <typename, typename>
+void AssemblerX86Base<Machine>::fincstp() {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xD9);
   emitUint8(0xF7);
@@ -1829,13 +1957,14 @@
                                           const Immediate &imm) {
   static_assert(Tag < 8, "Tag must be between 0..7");
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  if (isByteSizedType(Ty)) {
-    emitComplexI8(Tag, typename Traits::Operand(reg), imm);
-    return;
-  }
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
-  emitComplex(Ty, Tag, typename Traits::Operand(reg), imm);
+  emitRexB(Ty, reg);
+  if (isByteSizedType(Ty)) {
+    emitComplexI8(Tag, typename Traits::Operand(reg), imm);
+  } else {
+    emitComplex(Ty, Tag, typename Traits::Operand(reg), imm);
+  }
 }
 
 template <class Machine>
@@ -1847,11 +1976,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, reg0, reg1);
   if (isByteSizedType(Ty))
     emitUint8(Tag * 8 + 2);
   else
     emitUint8(Tag * 8 + 3);
-  emitRegisterOperand(reg0, reg1);
+  emitRegisterOperand(gprEncoding(reg0), gprEncoding(reg1));
 }
 
 template <class Machine>
@@ -1863,11 +1993,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, address, reg);
   if (isByteSizedType(Ty))
     emitUint8(Tag * 8 + 2);
   else
     emitUint8(Tag * 8 + 3);
-  emitOperand(reg, address);
+  emitOperand(gprEncoding(reg), address);
 }
 
 template <class Machine>
@@ -1879,11 +2010,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, address, reg);
   if (isByteSizedType(Ty))
     emitUint8(Tag * 8 + 0);
   else
     emitUint8(Tag * 8 + 1);
-  emitOperand(reg, address);
+  emitOperand(gprEncoding(reg), address);
 }
 
 template <class Machine>
@@ -1892,13 +2024,14 @@
     Type Ty, const typename Traits::Address &address, const Immediate &imm) {
   static_assert(Tag < 8, "Tag must be between 0..7");
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  if (isByteSizedType(Ty)) {
-    emitComplexI8(Tag, address, imm);
-    return;
-  }
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
-  emitComplex(Ty, Tag, address, imm);
+  emitRex(Ty, address, RexRegIrrelevant);
+  if (isByteSizedType(Ty)) {
+    emitComplexI8(Tag, address, imm);
+  } else {
+    emitComplex(Ty, Tag, address, imm);
+  }
 }
 
 template <class Machine>
@@ -1939,11 +2072,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, reg1, reg2);
   if (isByteSizedType(Ty))
     emitUint8(0x84);
   else
     emitUint8(0x85);
-  emitRegisterOperand(reg1, reg2);
+  emitRegisterOperand(gprEncoding(reg1), gprEncoding(reg2));
 }
 
 template <class Machine>
@@ -1953,11 +2087,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, addr, reg);
   if (isByteSizedType(Ty))
     emitUint8(0x84);
   else
     emitUint8(0x85);
-  emitOperand(reg, addr);
+  emitOperand(gprEncoding(reg), addr);
 }
 
 template <class Machine>
@@ -1969,13 +2104,14 @@
   // This is legal even if the register had high bits set since
   // this only sets flags registers based on the "AND" of the two operands,
   // and the immediate had zeros at those high bits.
-  if (immediate.is_uint8() && reg < 4) {
+  if (immediate.is_uint8() && reg <= Traits::Last8BitGPR) {
     // Use zero-extended 8-bit immediate.
+    emitRexB(Ty, reg);
     if (reg == Traits::Encoded_Reg_Accumulator) {
       emitUint8(0xA8);
     } else {
       emitUint8(0xF6);
-      emitUint8(0xC0 + reg);
+      emitUint8(0xC0 + gprEncoding(reg));
     }
     emitUint8(immediate.value() & 0xFF);
   } else if (reg == Traits::Encoded_Reg_Accumulator) {
@@ -1987,8 +2123,9 @@
   } else {
     if (Ty == IceType_i16)
       emitOperandSizeOverride();
+    emitRexB(Ty, reg);
     emitUint8(0xF7);
-    emitRegisterOperand(0, reg);
+    emitRegisterOperand(0, gprEncoding(reg));
     emitImmediate(Ty, immediate);
   }
 }
@@ -2002,12 +2139,14 @@
   // encoding short.
   if (immediate.is_uint8()) {
     // Use zero-extended 8-bit immediate.
+    emitRex(Ty, addr, RexRegIrrelevant);
     emitUint8(0xF6);
     emitOperand(0, addr);
     emitUint8(immediate.value() & 0xFF);
   } else {
     if (Ty == IceType_i16)
       emitOperandSizeOverride();
+    emitRex(Ty, addr, RexRegIrrelevant);
     emitUint8(0xF7);
     emitOperand(0, addr);
     emitImmediate(Ty, immediate);
@@ -2260,11 +2399,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexB(Ty, reg);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
     emitUint8(0xF7);
-  emitRegisterOperand(6, reg);
+  emitRegisterOperand(6, gprEncoding(reg));
 }
 
 template <class Machine>
@@ -2273,6 +2413,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, addr, RexRegIrrelevant);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
@@ -2286,11 +2427,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexB(Ty, reg);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
     emitUint8(0xF7);
-  emitRegisterOperand(7, reg);
+  emitRegisterOperand(7, gprEncoding(reg));
 }
 
 template <class Machine>
@@ -2299,6 +2441,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, addr, RexRegIrrelevant);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
@@ -2313,9 +2456,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, dst, src);
   emitUint8(0x0F);
   emitUint8(0xAF);
-  emitRegisterOperand(dst, src);
+  emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
 }
 
 template <class Machine>
@@ -2325,9 +2469,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, address, reg);
   emitUint8(0x0F);
   emitUint8(0xAF);
-  emitOperand(reg, address);
+  emitOperand(gprEncoding(reg), address);
 }
 
 template <class Machine>
@@ -2337,13 +2482,14 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, reg, reg);
   if (imm.is_int8()) {
     emitUint8(0x6B);
-    emitRegisterOperand(reg, reg);
+    emitRegisterOperand(gprEncoding(reg), gprEncoding(reg));
     emitUint8(imm.value() & 0xFF);
   } else {
     emitUint8(0x69);
-    emitRegisterOperand(reg, reg);
+    emitRegisterOperand(gprEncoding(reg), gprEncoding(reg));
     emitImmediate(Ty, imm);
   }
 }
@@ -2354,11 +2500,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexB(Ty, reg);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
     emitUint8(0xF7);
-  emitRegisterOperand(5, reg);
+  emitRegisterOperand(5, gprEncoding(reg));
 }
 
 template <class Machine>
@@ -2367,6 +2514,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, address, RexRegIrrelevant);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
@@ -2379,11 +2527,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexB(Ty, reg);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
     emitUint8(0xF7);
-  emitRegisterOperand(4, reg);
+  emitRegisterOperand(4, gprEncoding(reg));
 }
 
 template <class Machine>
@@ -2392,6 +2541,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, address, RexRegIrrelevant);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
@@ -2400,6 +2550,7 @@
 }
 
 template <class Machine>
+template <typename, typename>
 void AssemblerX86Base<Machine>::incl(typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x40 + reg);
@@ -2408,11 +2559,13 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::incl(const typename Traits::Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(IceType_i32, address, RexRegIrrelevant);
   emitUint8(0xFF);
   emitOperand(0, address);
 }
 
 template <class Machine>
+template <typename, typename>
 void AssemblerX86Base<Machine>::decl(typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x48 + reg);
@@ -2421,6 +2574,7 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::decl(const typename Traits::Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRex(IceType_i32, address, RexRegIrrelevant);
   emitUint8(0xFF);
   emitOperand(1, address);
 }
@@ -2512,9 +2666,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, src, dst);
   emitUint8(0x0F);
   emitUint8(0xA5);
-  emitRegisterOperand(src, dst);
+  emitRegisterOperand(gprEncoding(src), gprEncoding(dst));
 }
 
 template <class Machine>
@@ -2526,9 +2681,10 @@
   assert(imm.is_int8());
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, src, dst);
   emitUint8(0x0F);
   emitUint8(0xA4);
-  emitRegisterOperand(src, dst);
+  emitRegisterOperand(gprEncoding(src), gprEncoding(dst));
   emitUint8(imm.value() & 0xFF);
 }
 
@@ -2540,9 +2696,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, operand, src);
   emitUint8(0x0F);
   emitUint8(0xA5);
-  emitOperand(src, operand);
+  emitOperand(gprEncoding(src), operand);
 }
 
 template <class Machine>
@@ -2552,9 +2709,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, src, dst);
   emitUint8(0x0F);
   emitUint8(0xAD);
-  emitRegisterOperand(src, dst);
+  emitRegisterOperand(gprEncoding(src), gprEncoding(dst));
 }
 
 template <class Machine>
@@ -2566,9 +2724,10 @@
   assert(imm.is_int8());
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, src, dst);
   emitUint8(0x0F);
   emitUint8(0xAC);
-  emitRegisterOperand(src, dst);
+  emitRegisterOperand(gprEncoding(src), gprEncoding(dst));
   emitUint8(imm.value() & 0xFF);
 }
 
@@ -2580,9 +2739,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, dst, src);
   emitUint8(0x0F);
   emitUint8(0xAD);
-  emitOperand(src, dst);
+  emitOperand(gprEncoding(src), dst);
 }
 
 template <class Machine>
@@ -2590,11 +2750,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexB(Ty, reg);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
     emitUint8(0xF7);
-  emitRegisterOperand(3, reg);
+  emitRegisterOperand(3, gprEncoding(reg));
 }
 
 template <class Machine>
@@ -2603,6 +2764,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, addr, RexRegIrrelevant);
   if (isByteSizedArithType(Ty))
     emitUint8(0xF6);
   else
@@ -2613,8 +2775,9 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::notl(typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexB(IceType_i32, reg);
   emitUint8(0xF7);
-  emitUint8(0xD0 | reg);
+  emitUint8(0xD0 | gprEncoding(reg));
 }
 
 template <class Machine>
@@ -2623,8 +2786,9 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   assert(Ty == IceType_i32);
   (void)Ty;
+  emitRexB(Ty, reg);
   emitUint8(0x0F);
-  emitUint8(0xC8 | reg);
+  emitUint8(0xC8 | gprEncoding(reg));
 }
 
 template <class Machine>
@@ -2634,9 +2798,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, dst, src);
   emitUint8(0x0F);
   emitUint8(0xBC);
-  emitRegisterOperand(dst, src);
+  emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
 }
 
 template <class Machine>
@@ -2646,9 +2811,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, src, dst);
   emitUint8(0x0F);
   emitUint8(0xBC);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
@@ -2658,9 +2824,10 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexRB(Ty, dst, src);
   emitUint8(0x0F);
   emitUint8(0xBD);
-  emitRegisterOperand(dst, src);
+  emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
 }
 
 template <class Machine>
@@ -2670,18 +2837,20 @@
   assert(Ty == IceType_i16 || Ty == IceType_i32);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, src, dst);
   emitUint8(0x0F);
   emitUint8(0xBD);
-  emitOperand(dst, src);
+  emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
 void AssemblerX86Base<Machine>::bt(typename Traits::GPRRegister base,
                                    typename Traits::GPRRegister offset) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexRB(IceType_i32, offset, base);
   emitUint8(0x0F);
   emitUint8(0xA3);
-  emitRegisterOperand(offset, base);
+  emitRegisterOperand(gprEncoding(offset), gprEncoding(base));
 }
 
 template <class Machine> void AssemblerX86Base<Machine>::ret() {
@@ -2823,8 +2992,9 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::jmp(typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexB(RexTypeIrrelevant, reg);
   emitUint8(0xFF);
-  emitRegisterOperand(4, reg);
+  emitRegisterOperand(4, gprEncoding(reg));
 }
 
 template <class Machine>
@@ -2881,12 +3051,13 @@
     emitOperandSizeOverride();
   if (Locked)
     emitUint8(0xF0);
+  emitRex(Ty, address, reg);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty))
     emitUint8(0xB0);
   else
     emitUint8(0xB1);
-  emitOperand(reg, address);
+  emitOperand(gprEncoding(reg), address);
 }
 
 template <class Machine>
@@ -2895,6 +3066,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Locked)
     emitUint8(0xF0);
+  emitRex(IceType_i32, address, RexRegIrrelevant);
   emitUint8(0x0F);
   emitUint8(0xC7);
   emitOperand(1, address);
@@ -2910,12 +3082,13 @@
     emitOperandSizeOverride();
   if (Locked)
     emitUint8(0xF0);
+  emitRex(Ty, addr, reg);
   emitUint8(0x0F);
   if (isByteSizedArithType(Ty))
     emitUint8(0xC0);
   else
     emitUint8(0xC1);
-  emitOperand(reg, addr);
+  emitOperand(gprEncoding(reg), addr);
 }
 
 template <class Machine>
@@ -2925,11 +3098,12 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRex(Ty, addr, reg);
   if (isByteSizedArithType(Ty))
     emitUint8(0x86);
   else
     emitUint8(0x87);
-  emitOperand(reg, addr);
+  emitOperand(gprEncoding(reg), addr);
 }
 
 template <class Machine>
@@ -3082,6 +3256,7 @@
   assert(imm.is_int8());
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexB(Ty, reg);
   if (imm.value() == 1) {
     emitUint8(isByteSizedArithType(Ty) ? 0xD0 : 0xD1);
     emitOperand(rm, typename Traits::Operand(reg));
@@ -3101,6 +3276,7 @@
   (void)shifter;
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
+  emitRexB(Ty, operand.rm());
   emitUint8(isByteSizedArithType(Ty) ? 0xD2 : 0xD3);
   emitOperand(rm, operand);
 }
diff --git a/src/IceConditionCodesX8664.h b/src/IceConditionCodesX8664.h
index 06155ea..d1d9dd8 100644
--- a/src/IceConditionCodesX8664.h
+++ b/src/IceConditionCodesX8664.h
@@ -21,6 +21,7 @@
 namespace Ice {
 
 class CondX8664 {
+public:
   /// An enum of condition codes used for branches and cmov. The enum value
   /// should match the value used to encode operands in binary instructions.
   enum BrCond {
diff --git a/src/IceInstX8664.def b/src/IceInstX8664.def
index 6857ed6..dd4b712 100644
--- a/src/IceInstX8664.def
+++ b/src/IceInstX8664.def
@@ -15,46 +15,51 @@
 #ifndef SUBZERO_SRC_ICEINSTX8664_DEF
 #define SUBZERO_SRC_ICEINSTX8664_DEF
 
+// NOTE: we use the 32-bit register names for two reasons:
+//       (1) it makes it easier to implement the x86 assembler template.
+//       (2) when generating code, Subzero defaults to using 32-bit registers,
+//       so using the 32-bit register names makes that design choice
+//       explicit.
 // NOTE: esp is not considered isInt, to avoid register allocating it.
 #define REGX8664_GPR_TABLE                                                     \
   /* val, encode, name64, name, name16, name8, scratch, preserved, stackptr,   \
      frameptr, isInt, isFP */                                                  \
-  X(Reg_rax, =            0, "rax", "eax" ,  "ax" , "al"  , 1, 0, 0, 0, 1, 0)  \
-  X(Reg_rcx, = Reg_rax +  1, "rcx", "ecx" ,  "cx" , "cl"  , 1, 0, 0, 0, 1, 0)  \
-  X(Reg_rdx, = Reg_rax +  2, "rdx", "edx" ,  "dx" , "dl"  , 1, 0, 0, 0, 1, 0)  \
-  X(Reg_rbx, = Reg_rax +  3, "rbx", "ebx" ,  "bx" , "bl"  , 0, 1, 0, 0, 1, 0)  \
-  X(Reg_rsp, = Reg_rax +  4, "rsp", "esp" ,  "sp" , "spl" , 0, 0, 1, 0, 0, 0)  \
-  X(Reg_rbp, = Reg_rax +  5, "rbp", "ebp" ,  "bp" , "bpl" , 0, 0, 0, 1, 1, 0)  \
-  X(Reg_rsi, = Reg_rax +  6, "rsi", "esi" ,  "si" , "sil" , 1, 0, 0, 0, 1, 0)  \
-  X(Reg_rdi, = Reg_rax +  7, "rdi", "edi" ,  "di" , "dil" , 1, 0, 0, 0, 1, 0)  \
-  X(Reg_r8,  = Reg_rax +  8, "r8" , "r8d" ,  "r8w", "r8l" , 1, 0, 0, 0, 1, 0)  \
-  X(Reg_r9,  = Reg_rax +  9, "r9" , "r9d" ,  "r9w", "r9l" , 1, 0, 0, 0, 1, 0)  \
-  X(Reg_r10, = Reg_rax + 10, "r10", "r10d", "r10w", "r10l", 1, 0, 0, 0, 1, 0)  \
-  X(Reg_r11, = Reg_rax + 11, "r11", "r11d", "r11w", "r11l", 1, 0, 0, 0, 1, 0)  \
-  X(Reg_r12, = Reg_rax + 12, "r12", "r12d", "r12w", "r12l", 0, 1, 0, 0, 1, 0)  \
-  X(Reg_r13, = Reg_rax + 13, "r13", "r13d", "r13w", "r12l", 0, 1, 0, 0, 1, 0)  \
-  X(Reg_r14, = Reg_rax + 14, "r14", "r14d", "r14w", "r14l", 0, 1, 0, 0, 1, 0)  \
-  X(Reg_r15, = Reg_rax + 15, "r15", "r15d", "r15w", "r15l", 0, 1, 0, 0, 1, 0)
+  X(Reg_eax,  =  0, "rax", "eax" ,  "ax" , "al"  , 1, 0, 0, 0, 1, 0)           \
+  X(Reg_ecx,  =  1, "rcx", "ecx" ,  "cx" , "cl"  , 1, 0, 0, 0, 1, 0)           \
+  X(Reg_edx,  =  2, "rdx", "edx" ,  "dx" , "dl"  , 1, 0, 0, 0, 1, 0)           \
+  X(Reg_ebx,  =  3, "rbx", "ebx" ,  "bx" , "bl"  , 0, 1, 0, 0, 1, 0)           \
+  X(Reg_esp,  =  4, "rsp", "esp" ,  "sp" , "spl" , 0, 0, 1, 0, 0, 0)           \
+  X(Reg_ebp,  =  5, "rbp", "ebp" ,  "bp" , "bpl" , 0, 0, 0, 1, 1, 0)           \
+  X(Reg_esi,  =  6, "rsi", "esi" ,  "si" , "sil" , 1, 0, 0, 0, 1, 0)           \
+  X(Reg_edi,  =  7, "rdi", "edi" ,  "di" , "dil" , 1, 0, 0, 0, 1, 0)           \
+  X(Reg_r8d,  =  8, "r8" , "r8d" ,  "r8w", "r8l" , 1, 0, 0, 0, 1, 0)           \
+  X(Reg_r9d,  =  9, "r9" , "r9d" ,  "r9w", "r9l" , 1, 0, 0, 0, 1, 0)           \
+  X(Reg_r10d, = 10, "r10", "r10d", "r10w", "r10l", 1, 0, 0, 0, 1, 0)           \
+  X(Reg_r11d, = 11, "r11", "r11d", "r11w", "r11l", 1, 0, 0, 0, 1, 0)           \
+  X(Reg_r12d, = 12, "r12", "r12d", "r12w", "r12l", 0, 1, 0, 0, 1, 0)           \
+  X(Reg_r13d, = 13, "r13", "r13d", "r13w", "r13l", 0, 1, 0, 0, 1, 0)           \
+  X(Reg_r14d, = 14, "r14", "r14d", "r14w", "r14l", 0, 1, 0, 0, 1, 0)           \
+  X(Reg_r15d, = 15, "r15", "r15d", "r15w", "r15l", 0, 1, 0, 0, 1, 0)
 
 #define REGX8664_XMM_TABLE                                                     \
   /* val, encode, name64, name, name16, name8, scratch, preserved, stackptr,   \
      frameptr, isInt, isFP */                                                  \
-  X(Reg_xmm0,  =             0, "xmm0" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm1,  = Reg_xmm0 +  1, "xmm1" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm2,  = Reg_xmm0 +  2, "xmm2" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm3,  = Reg_xmm0 +  3, "xmm3" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm4,  = Reg_xmm0 +  4, "xmm4" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm5,  = Reg_xmm0 +  5, "xmm5" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm6,  = Reg_xmm0 +  6, "xmm6" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm7,  = Reg_xmm0 +  7, "xmm7" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm8,  = Reg_xmm0 +  8, "xmm8" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm9,  = Reg_xmm0 +  9, "xmm9" , "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm10, = Reg_xmm0 + 10, "xmm10", "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm11, = Reg_xmm0 + 11, "xmm11", "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm12, = Reg_xmm0 + 12, "xmm12", "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm13, = Reg_xmm0 + 13, "xmm13", "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm14, = Reg_xmm0 + 14, "xmm14", "", "", "", 1, 0, 0, 0, 0, 1)         \
-  X(Reg_xmm15, = Reg_xmm0 + 15, "xmm15", "", "", "", 1, 0, 0, 0, 0, 1)
+  X(Reg_xmm0,  =  0, "xmm0" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm1,  =  1, "xmm1" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm2,  =  2, "xmm2" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm3,  =  3, "xmm3" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm4,  =  4, "xmm4" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm5,  =  5, "xmm5" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm6,  =  6, "xmm6" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm7,  =  7, "xmm7" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm8,  =  8, "xmm8" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm9,  =  9, "xmm9" , "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm10, = 10, "xmm10", "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm11, = 11, "xmm11", "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm12, = 12, "xmm12", "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm13, = 13, "xmm13", "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm14, = 14, "xmm14", "", "", "", 1, 0, 0, 0, 0, 1)                    \
+  X(Reg_xmm15, = 15, "xmm15", "", "", "", 1, 0, 0, 0, 0, 1)
 //#define X(val, encode, name, name32, name16, name8, scratch, preserved,
 //          stackptr, frameptr, isI8, isInt, isFP)
 
@@ -77,8 +82,8 @@
 
 #define REGX8664_TABLE_BOUNDS                                                  \
   /* val         , init */                                                     \
-  X(Reg_GPR_First, = Reg_rax  )                                                \
-  X(Reg_GPR_Last , = Reg_r15  )                                                \
+  X(Reg_GPR_First, = Reg_eax  )                                                \
+  X(Reg_GPR_Last , = Reg_r15d )                                                \
   X(Reg_XMM_First, = Reg_xmm0 )                                                \
   X(Reg_XMM_Last , = Reg_xmm15)
 // define X(val, init)
diff --git a/src/IceRegistersX8664.h b/src/IceRegistersX8664.h
index 3e4c868..f95fe9e 100644
--- a/src/IceRegistersX8664.h
+++ b/src/IceRegistersX8664.h
@@ -79,10 +79,6 @@
   }
 
   static inline ByteRegister getEncodedByteReg(int32_t RegNum) {
-    // In x86-64, AH is not encodable when the REX prefix is used; the same
-    // encoding is used for spl. Therefore, ah needs special handling.
-    if (RegNum == Reg_ah)
-      return Encoded_Reg_spl;
     return ByteRegister(RegNum - Reg_GPR_First);
   }
 
diff --git a/src/IceTargetLoweringX8632Traits.h b/src/IceTargetLoweringX8632Traits.h
index ca15ea5..29066aa 100644
--- a/src/IceTargetLoweringX8632Traits.h
+++ b/src/IceTargetLoweringX8632Traits.h
@@ -48,6 +48,13 @@
   //      \/_/\/_/\/_____/\/_/  \/_/
   //
   //----------------------------------------------------------------------------
+  static constexpr bool Is64Bit = false;
+  static constexpr bool HasPopa = true;
+  static constexpr bool HasPusha = true;
+  static constexpr bool UsesX87 = true;
+  static constexpr ::Ice::RegX8632::GPRRegister Last8BitGPR =
+      ::Ice::RegX8632::GPRRegister::Encoded_Reg_ebx;
+
   enum ScaleFactor { TIMES_1 = 0, TIMES_2 = 1, TIMES_4 = 2, TIMES_8 = 3 };
 
   using GPRRegister = ::Ice::RegX8632::GPRRegister;
diff --git a/src/IceTargetLoweringX8664Traits.h b/src/IceTargetLoweringX8664Traits.h
new file mode 100644
index 0000000..fea1a8f
--- /dev/null
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -0,0 +1,296 @@
+//===- subzero/src/IceTargetLoweringX8664Traits.h - x86-64 traits -*- C++ -*-=//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the X8664 Target Lowering Traits.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICETARGETLOWERINGX8664TRAITS_H
+#define SUBZERO_SRC_ICETARGETLOWERINGX8664TRAITS_H
+
+#include "IceAssembler.h"
+#include "IceConditionCodesX8664.h"
+#include "IceDefs.h"
+#include "IceInst.h"
+#include "IceInstX8664.def"
+#include "IceOperand.h"
+#include "IceRegistersX8664.h"
+#include "IceTargetLowering.h"
+
+namespace Ice {
+
+class TargetX8664;
+
+namespace X8664 {
+class AssemblerX8664;
+} // end of namespace X8664
+
+namespace X86Internal {
+
+template <class Machine> struct Insts;
+template <class Machine> struct MachineTraits;
+
+template <> struct MachineTraits<TargetX8664> {
+  //----------------------------------------------------------------------------
+  //     ______  ______  __    __
+  //    /\  __ \/\  ___\/\ "-./  \
+  //    \ \  __ \ \___  \ \ \-./\ \
+  //     \ \_\ \_\/\_____\ \_\ \ \_\
+  //      \/_/\/_/\/_____/\/_/  \/_/
+  //
+  //----------------------------------------------------------------------------
+  static constexpr bool Is64Bit = true;
+  static constexpr bool HasPopa = false;
+  static constexpr bool HasPusha = false;
+  static constexpr bool UsesX87 = false;
+  static constexpr ::Ice::RegX8664::GPRRegister Last8BitGPR =
+      ::Ice::RegX8664::GPRRegister::Encoded_Reg_r15d;
+
+  enum ScaleFactor { TIMES_1 = 0, TIMES_2 = 1, TIMES_4 = 2, TIMES_8 = 3 };
+
+  using GPRRegister = ::Ice::RegX8664::GPRRegister;
+  using XmmRegister = ::Ice::RegX8664::XmmRegister;
+  using ByteRegister = ::Ice::RegX8664::ByteRegister;
+
+  using Cond = ::Ice::CondX8664;
+
+  using RegisterSet = ::Ice::RegX8664;
+  static const GPRRegister Encoded_Reg_Accumulator = RegX8664::Encoded_Reg_eax;
+  static const GPRRegister Encoded_Reg_Counter = RegX8664::Encoded_Reg_ecx;
+  static const FixupKind PcRelFixup = llvm::ELF::R_386_PC32; // TODO(jpp): ???
+
+  class Operand { // ModRM [+ SIB] [+ disp] bytes plus REX.X/REX.B bits.
+  public:
+    enum RexBits {
+      RexNone = 0x00,
+      RexBase = 0x40,
+      RexW = RexBase | (1 << 3),
+      RexR = RexBase | (1 << 2),
+      RexX = RexBase | (1 << 1), // Recorded by SetSIB when index bit 3 is set.
+      RexB = RexBase | (1 << 0), // Recorded when rm/base bit 3 is set.
+    };
+
+    Operand(const Operand &other)
+        : fixup_(other.fixup_), rex_(other.rex_), length_(other.length_) {
+      memmove(&encoding_[0], &other.encoding_[0], other.length_);
+    }
+
+    Operand &operator=(const Operand &other) {
+      length_ = other.length_;
+      fixup_ = other.fixup_;
+      rex_ = other.rex_;
+      memmove(&encoding_[0], &other.encoding_[0], other.length_);
+      return *this;
+    }
+
+    uint8_t mod() const { return (encoding_at(0) >> 6) & 3; } // ModRM bits 7:6.
+
+    uint8_t rexX() const { return (rex_ & RexX) != RexX ? RexNone : RexX; }
+    uint8_t rexB() const { return (rex_ & RexB) != RexB ? RexNone : RexB; }
+
+    GPRRegister rm() const { // ModRM bits 2:0, with bit 3 taken from RexB.
+      return static_cast<GPRRegister>((rexB() != 0 ? 0x08 : 0) |
+                                      (encoding_at(0) & 7));
+    }
+
+    ScaleFactor scale() const { // SIB bits 7:6.
+      return static_cast<ScaleFactor>((encoding_at(1) >> 6) & 3);
+    }
+
+    GPRRegister index() const { // SIB bits 5:3, with bit 3 taken from RexX.
+      return static_cast<GPRRegister>((rexX() != 0 ? 0x08 : 0) |
+                                      ((encoding_at(1) >> 3) & 7));
+    }
+
+    GPRRegister base() const { // SIB bits 2:0, with bit 3 taken from RexB.
+      return static_cast<GPRRegister>((rexB() != 0 ? 0x08 : 0) |
+                                      (encoding_at(1) & 7));
+    }
+
+    int8_t disp8() const { // Last encoded byte, reinterpreted as signed.
+      assert(length_ >= 2);
+      return static_cast<int8_t>(encoding_[length_ - 1]);
+    }
+
+    int32_t disp32() const { // Presumably the trailing 4 bytes, via bit_copy.
+      assert(length_ >= 5);
+      return bit_copy<int32_t>(encoding_[length_ - 4]);
+    }
+
+    AssemblerFixup *fixup() const { return fixup_; }
+
+  protected:
+    Operand() : fixup_(nullptr), length_(0) {} // Needed by subclass Address.
+
+    void SetModRM(int mod, GPRRegister rm) { // Starts a fresh encoding.
+      assert((mod & ~3) == 0);
+      encoding_[0] = (mod << 6) | (rm & 0x07);
+      rex_ = (rm & 0x08) ? RexB : RexNone;
+      length_ = 1;
+    }
+
+    void SetSIB(ScaleFactor scale, GPRRegister index, GPRRegister base) {
+      assert(length_ == 1); // Only valid right after SetModRM.
+      assert((scale & ~3) == 0);
+      encoding_[1] = (scale << 6) | ((index & 0x07) << 3) | (base & 0x07);
+      rex_ = // Replaces whatever SetModRM recorded.
+          ((base & 0x08) ? RexB : RexNone) | ((index & 0x08) ? RexX : RexNone);
+      length_ = 2;
+    }
+
+    void SetDisp8(int8_t disp) {
+      assert(length_ == 1 || length_ == 2); // After ModRM or ModRM+SIB.
+      encoding_[length_++] = static_cast<uint8_t>(disp);
+    }
+
+    void SetDisp32(int32_t disp) {
+      assert(length_ == 1 || length_ == 2); // After ModRM or ModRM+SIB.
+      intptr_t disp_size = sizeof(disp);
+      memmove(&encoding_[length_], &disp, disp_size);
+      length_ += disp_size;
+    }
+
+    void SetFixup(AssemblerFixup *fixup) { fixup_ = fixup; }
+
+  private:
+    AssemblerFixup *fixup_;
+    uint8_t rex_ = 0;     // RexX/RexB bits gathered by SetModRM/SetSIB.
+    uint8_t encoding_[6]; // ModRM, optional SIB, up to 4 displacement bytes.
+    uint8_t length_;      // Number of valid bytes in encoding_.
+
+    explicit Operand(GPRRegister reg) : fixup_(nullptr) { SetModRM(3, reg); }
+
+    /// Get the operand encoding byte at the given index.
+    uint8_t encoding_at(intptr_t index) const {
+      assert(index >= 0 && index < length_);
+      return encoding_[index];
+    }
+
+    /// Returns whether or not this operand is really the given register in
+    /// disguise. Used from the assembler to generate better encodings.
+    bool IsRegister(GPRRegister reg) const {
+      return ((encoding_[0] & 0xF8) ==
+              0xC0) // mod == 3 (register-direct) and reg field == 0.
+             &&
+             (rm() == reg); // Register codes match.
+    }
+
+    template <class> friend class AssemblerX86Base;
+  };
+
+  class Address : public Operand { // Memory-operand encodings.
+    Address() = delete;
+
+  public:
+    Address(const Address &other) : Operand(other) {}
+
+    Address &operator=(const Address &other) {
+      Operand::operator=(other);
+      return *this;
+    }
+
+    Address(GPRRegister base, int32_t disp) { // [base + disp]
+      if (disp == 0 && (base & 7) != RegX8664::Encoded_Reg_ebp) {
+        SetModRM(0, base); // mod=0 cannot encode ebp/r13 as base.
+        if ((base & 7) == RegX8664::Encoded_Reg_esp) // rm=4 needs a SIB.
+          SetSIB(TIMES_1, RegX8664::Encoded_Reg_esp, base);
+      } else if (Utils::IsInt(8, disp)) { // Presumably: disp fits in int8.
+        SetModRM(1, base);
+        if ((base & 7) == RegX8664::Encoded_Reg_esp) // rm=4 needs a SIB.
+          SetSIB(TIMES_1, RegX8664::Encoded_Reg_esp, base);
+        SetDisp8(disp);
+      } else {
+        SetModRM(2, base);
+        if ((base & 7) == RegX8664::Encoded_Reg_esp) // rm=4 needs a SIB.
+          SetSIB(TIMES_1, RegX8664::Encoded_Reg_esp, base);
+        SetDisp32(disp);
+      }
+    }
+
+    Address(GPRRegister index, ScaleFactor scale, int32_t disp) {
+      assert(index != RegX8664::Encoded_Reg_esp); // Illegal addressing mode.
+      SetModRM(0, RegX8664::Encoded_Reg_esp); // rm=esp selects the SIB byte.
+      SetSIB(scale, index, RegX8664::Encoded_Reg_ebp); // mod=0: no base.
+      SetDisp32(disp);
+    }
+
+    Address(GPRRegister base, GPRRegister index, ScaleFactor scale,
+            int32_t disp) { // [base + index * scale + disp]
+      assert(index != RegX8664::Encoded_Reg_esp); // Illegal addressing mode.
+      if (disp == 0 && (base & 7) != RegX8664::Encoded_Reg_ebp) {
+        SetModRM(0, RegX8664::Encoded_Reg_esp); // rm=esp selects the SIB byte.
+        SetSIB(scale, index, base);
+      } else if (Utils::IsInt(8, disp)) { // Presumably: disp fits in int8.
+        SetModRM(1, RegX8664::Encoded_Reg_esp);
+        SetSIB(scale, index, base);
+        SetDisp8(disp);
+      } else {
+        SetModRM(2, RegX8664::Encoded_Reg_esp);
+        SetSIB(scale, index, base);
+        SetDisp32(disp);
+      }
+    }
+
+    // AbsoluteTag selects the constructors below, which emit mod=0, rm=ebp and
+    // a 32-bit displacement. In 64-bit mode that form is rip-relative.
+    // TODO(jpp): this is bogus. remove.
+    enum AbsoluteTag { ABSOLUTE };
+
+    Address(AbsoluteTag, const uintptr_t Addr) {
+      SetModRM(0, RegX8664::Encoded_Reg_ebp);
+      SetDisp32(Addr); // Implicitly narrows Addr to int32_t.
+    }
+
+    // TODO(jpp): remove this.
+    static Address Absolute(const uintptr_t Addr) {
+      return Address(ABSOLUTE, Addr);
+    }
+
+    Address(AbsoluteTag, RelocOffsetT Offset, AssemblerFixup *Fixup) {
+      SetModRM(0, RegX8664::Encoded_Reg_ebp);
+      // Use the Offset in the displacement for now. If we decide to process
+      // fixups later, we'll need to patch up the emitted displacement.
+      SetDisp32(Offset);
+      SetFixup(Fixup);
+    }
+
+    // TODO(jpp): remove this.
+    static Address Absolute(RelocOffsetT Offset, AssemblerFixup *Fixup) {
+      return Address(ABSOLUTE, Offset, Fixup);
+    }
+
+    static Address ofConstPool(Assembler *Asm, const Constant *Imm) {
+      // TODO(jpp): ??? R_386_32 is an x86-32 relocation type.
+      AssemblerFixup *Fixup = Asm->createFixup(llvm::ELF::R_386_32, Imm);
+      const RelocOffsetT Offset = 0;
+      return Address(ABSOLUTE, Offset, Fixup);
+    }
+  };
+
+  //----------------------------------------------------------------------------
+  //     __      ______  __     __  ______  ______  __  __   __  ______
+  //    /\ \    /\  __ \/\ \  _ \ \/\  ___\/\  == \/\ \/\ "-.\ \/\  ___\
+  //    \ \ \___\ \ \/\ \ \ \/ ".\ \ \  __\\ \  __<\ \ \ \ \-.  \ \ \__ \
+  //     \ \_____\ \_____\ \__/".~\_\ \_____\ \_\ \_\ \_\ \_\\"\_\ \_____\
+  //      \/_____/\/_____/\/_/   \/_/\/_____/\/_/ /_/\/_/\/_/ \/_/\/_____/
+  //
+  //----------------------------------------------------------------------------
+  using Assembler = X8664::AssemblerX8664;
+};
+
+} // end of namespace X86Internal
+
+namespace X8664 {
+using Traits = ::Ice::X86Internal::MachineTraits<TargetX8664>;
+} // end of namespace X8664
+
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICETARGETLOWERINGX8664TRAITS_H
diff --git a/unittest/AssemblerX8632/ControlFlow.cpp b/unittest/AssemblerX8632/ControlFlow.cpp
new file mode 100644
index 0000000..35adf07
--- /dev/null
+++ b/unittest/AssemblerX8632/ControlFlow.cpp
@@ -0,0 +1,244 @@
+//===- subzero/unittest/AssemblerX8632/ControlFlow.cpp --------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "IceAssemblerX8632.h"
+#include "AssemblerX8632/TestUtil.h"
+
+namespace Ice {
+namespace X8632 {
+namespace Test {
+namespace {
+
+TEST_F(AssemblerX8632Test, J) {
+#define TestJ(C, Near, Src0, Value0, Src1, Value1, Dest)                       \
+  do {                                                                         \
+    const bool NearJmp = AssemblerX8632::k##Near##Jump;                        \
+    Label ShouldBeTaken;                                                       \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Src0, Immediate(Value0));   \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Src1, Immediate(Value1));   \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Dest, Immediate(0xBEEF));   \
+    __ cmp(IceType_i32, GPRRegister::Encoded_Reg_##Src0,                       \
+           GPRRegister::Encoded_Reg_##Src1);                                   \
+    __ j(Cond::Br_##C, &ShouldBeTaken, NearJmp);                               \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Dest, Immediate(0xC0FFEE)); \
+    __ bind(&ShouldBeTaken);                                                   \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    EXPECT_EQ(Value0, test.Src0()) << "Br_" #C ", " #Near;                     \
+    EXPECT_EQ(Value1, test.Src1()) << "Br_" #C ", " #Near;                     \
+    EXPECT_EQ(0xBEEFul, test.Dest()) << "Br_" #C ", " #Near;                   \
+    reset();                                                                   \
+  } while (0)
+
+  TestJ(o, Near, eax, 0x80000000ul, ebx, 0x1ul, ecx);
+  TestJ(o, Far, ebx, 0x80000000ul, ecx, 0x1ul, edx);
+  TestJ(no, Near, ecx, 0x1ul, edx, 0x1ul, edi);
+  TestJ(no, Far, edx, 0x1ul, edi, 0x1ul, esi);
+  TestJ(b, Near, edi, 0x1ul, esi, 0x80000000ul, eax);
+  TestJ(b, Far, esi, 0x1ul, eax, 0x80000000ul, ebx);
+  TestJ(ae, Near, eax, 0x80000000ul, ebx, 0x1ul, ecx);
+  TestJ(ae, Far, ebx, 0x80000000ul, ecx, 0x1ul, edx);
+  TestJ(e, Near, ecx, 0x80000000ul, edx, 0x80000000ul, edi);
+  TestJ(e, Far, edx, 0x80000000ul, edi, 0x80000000ul, esi);
+  TestJ(ne, Near, edi, 0x80000000ul, esi, 0x1ul, eax);
+  TestJ(ne, Far, esi, 0x80000000ul, eax, 0x1ul, ebx);
+  TestJ(be, Near, eax, 0x1ul, ebx, 0x80000000ul, ecx);
+  TestJ(be, Far, ebx, 0x1ul, ecx, 0x80000000ul, edx);
+  TestJ(a, Near, ecx, 0x80000000ul, edx, 0x1ul, edi);
+  TestJ(a, Far, edx, 0x80000000ul, edi, 0x1ul, esi);
+  TestJ(s, Near, edi, 0x1ul, esi, 0x80000000ul, eax);
+  TestJ(s, Far, esi, 0x1ul, eax, 0x80000000ul, ebx);
+  TestJ(ns, Near, eax, 0x80000000ul, ebx, 0x1ul, ecx);
+  TestJ(ns, Far, ebx, 0x80000000ul, ecx, 0x1ul, edx);
+  TestJ(p, Near, ecx, 0x80000000ul, edx, 0x1ul, edi);
+  TestJ(p, Far, edx, 0x80000000ul, edi, 0x1ul, esi);
+  TestJ(np, Near, edi, 0x1ul, esi, 0x80000000ul, eax);
+  TestJ(np, Far, esi, 0x1ul, eax, 0x80000000ul, ebx);
+  TestJ(l, Near, eax, 0x80000000ul, ebx, 0x1ul, ecx);
+  TestJ(l, Far, ebx, 0x80000000ul, ecx, 0x1ul, edx);
+  TestJ(ge, Near, ecx, 0x1ul, edx, 0x80000000ul, edi);
+  TestJ(ge, Far, edx, 0x1ul, edi, 0x80000000ul, esi);
+  TestJ(le, Near, edi, 0x80000000ul, esi, 0x1ul, eax);
+  TestJ(le, Far, esi, 0x80000000ul, eax, 0x1ul, ebx);
+  TestJ(g, Near, eax, 0x1ul, ebx, 0x80000000ul, ecx);
+  TestJ(g, Far, ebx, 0x1ul, ecx, 0x80000000ul, edx);
+
+#undef TestJ
+}
+
+TEST_F(AssemblerX8632Test, CallImm) {
+  __ call(Immediate(16));
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(0xf00f));
+  __ popl(GPRRegister::Encoded_Reg_ebx);
+
+  AssembledTest test = assemble();
+
+  test.run();
+
+  EXPECT_EQ(0xF00Fu, test.eax());
+}
+
+TEST_F(AssemblerX8632Test, CallReg) {
+  __ call(Immediate(16));
+  __ popl(GPRRegister::Encoded_Reg_edx);
+  __ pushl(GPRRegister::Encoded_Reg_edx);
+  __ ret();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ popl(GPRRegister::Encoded_Reg_ebx);
+  __ call(GPRRegister::Encoded_Reg_ebx);
+
+  AssembledTest test = assemble();
+
+  test.run();
+
+  EXPECT_EQ(15u, test.edx() - test.ebx());
+}
+
+TEST_F(AssemblerX8632Test, CallAddr) {
+  __ call(Immediate(16));
+  __ mov(IceType_i8, GPRRegister::Encoded_Reg_eax, Immediate(0xf4));
+  __ ret();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(0xf1f2f300));
+  __ call(Address(GPRRegister::Encoded_Reg_esp, 0));
+  __ popl(GPRRegister::Encoded_Reg_edx);
+
+  AssembledTest test = assemble();
+
+  test.run();
+
+  EXPECT_EQ(0xf1f2f3f4, test.eax());
+}
+
+TEST_F(AssemblerX8632Test, Jmp) { // Covers jmp(Label) and jmp(GPRRegister).
+// TestImplReg uses jmp(Label), so jmp(Label) needs to be tested before it.
+#define TestImplAddr(Near)                                                     \
+  do {                                                                         \
+    Label ForwardJmp;                                                          \
+    Label BackwardJmp;                                                         \
+    Label Done;                                                                \
+                                                                               \
+    __ jmp(&ForwardJmp, AssemblerX8632::k##Near##Jump);                        \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ bind(&BackwardJmp);                                                     \
+    __ jmp(&Done, AssemblerX8632::k##Near##Jump);                              \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ bind(&ForwardJmp);                                                      \
+    __ jmp(&BackwardJmp, AssemblerX8632::k##Near##Jump);                       \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ bind(&Done);                                                            \
+  } while (0)
+
+#define TestImplReg(Dst)                                                       \
+  do {                                                                         \
+    __ call(Immediate(16));                                                    \
+    Label Done;                                                                \
+    __ jmp(&Done, AssemblerX8632::kNearJump);                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ popl(GPRRegister::Encoded_Reg_##Dst);                                   \
+    __ jmp(GPRRegister::Encoded_Reg_##Dst);                                    \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ bind(&Done);                                                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+  TestImplAddr(Near); // Emit-only; no assemble()/run() of its own.
+  TestImplAddr(Far);  // Emit-only; no assemble()/run() of its own.
+
+  TestImplReg(eax);
+  TestImplReg(ebx);
+  TestImplReg(ecx);
+  TestImplReg(edx);
+  TestImplReg(esi);
+  TestImplReg(edi);
+
+#undef TestImplReg
+#undef TestImplAddr
+}
+
+} // end of anonymous namespace
+} // end of namespace Test
+} // end of namespace X8632
+} // end of namespace Ice
diff --git a/unittest/AssemblerX8632/DataMov.cpp b/unittest/AssemblerX8632/DataMov.cpp
new file mode 100644
index 0000000..cb2012e
--- /dev/null
+++ b/unittest/AssemblerX8632/DataMov.cpp
@@ -0,0 +1,1192 @@
+//===- subzero/unittest/AssemblerX8632/DataMov.cpp ------------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AssemblerX8632/TestUtil.h"
+
+namespace Ice {
+namespace X8632 {
+namespace Test {
+namespace {
+
+TEST_F(AssemblerX8632Test, MovRegImm) {
+  constexpr uint32_t ExpectedEax = 0x000000FFul;
+  constexpr uint32_t ExpectedEbx = 0x0000FF00ul;
+  constexpr uint32_t ExpectedEcx = 0x00FF0000ul;
+  constexpr uint32_t ExpectedEdx = 0xFF000000ul;
+  constexpr uint32_t ExpectedEdi = 0x6AAA0006ul;
+  constexpr uint32_t ExpectedEsi = 0x6000AAA6ul;
+
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(ExpectedEax));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ebx, Immediate(ExpectedEbx));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ecx, Immediate(ExpectedEcx));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edx, Immediate(ExpectedEdx));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edi, Immediate(ExpectedEdi));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_esi, Immediate(ExpectedEsi));
+
+  AssembledTest test = assemble();
+  test.run();
+  EXPECT_EQ(ExpectedEax, test.eax());
+  EXPECT_EQ(ExpectedEbx, test.ebx());
+  EXPECT_EQ(ExpectedEcx, test.ecx());
+  EXPECT_EQ(ExpectedEdx, test.edx());
+  EXPECT_EQ(ExpectedEdi, test.edi());
+  EXPECT_EQ(ExpectedEsi, test.esi());
+}
+
+TEST_F(AssemblerX8632Test, MovMemImm) {
+  const uint32_t T0 = allocateDword();
+  constexpr uint32_t ExpectedT0 = 0x00111100ul;
+  const uint32_t T1 = allocateDword();
+  constexpr uint32_t ExpectedT1 = 0x00222200ul;
+  const uint32_t T2 = allocateDword();
+  constexpr uint32_t ExpectedT2 = 0x03333000ul;
+  const uint32_t T3 = allocateDword();
+  constexpr uint32_t ExpectedT3 = 0x00444400ul;
+
+  __ mov(IceType_i32, dwordAddress(T0), Immediate(ExpectedT0));
+  __ mov(IceType_i32, dwordAddress(T1), Immediate(ExpectedT1));
+  __ mov(IceType_i32, dwordAddress(T2), Immediate(ExpectedT2));
+  __ mov(IceType_i32, dwordAddress(T3), Immediate(ExpectedT3));
+
+  AssembledTest test = assemble();
+  test.run();
+  EXPECT_EQ(0ul, test.eax());
+  EXPECT_EQ(0ul, test.ebx());
+  EXPECT_EQ(0ul, test.ecx());
+  EXPECT_EQ(0ul, test.edx());
+  EXPECT_EQ(0ul, test.edi());
+  EXPECT_EQ(0ul, test.esi());
+  EXPECT_EQ(ExpectedT0, test.contentsOfDword(T0));
+  EXPECT_EQ(ExpectedT1, test.contentsOfDword(T1));
+  EXPECT_EQ(ExpectedT2, test.contentsOfDword(T2));
+  EXPECT_EQ(ExpectedT3, test.contentsOfDword(T3));
+}
+
+TEST_F(AssemblerX8632Test, MovMemReg) {
+  const uint32_t T0 = allocateDword();
+  constexpr uint32_t ExpectedT0 = 0x00111100ul;
+  const uint32_t T1 = allocateDword();
+  constexpr uint32_t ExpectedT1 = 0x00222200ul;
+  const uint32_t T2 = allocateDword();
+  constexpr uint32_t ExpectedT2 = 0x00333300ul;
+  const uint32_t T3 = allocateDword();
+  constexpr uint32_t ExpectedT3 = 0x00444400ul;
+  const uint32_t T4 = allocateDword();
+  constexpr uint32_t ExpectedT4 = 0x00555500ul;
+  const uint32_t T5 = allocateDword();
+  constexpr uint32_t ExpectedT5 = 0x00666600ul;
+
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(ExpectedT0));
+  __ mov(IceType_i32, dwordAddress(T0), GPRRegister::Encoded_Reg_eax);
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ebx, Immediate(ExpectedT1));
+  __ mov(IceType_i32, dwordAddress(T1), GPRRegister::Encoded_Reg_ebx);
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ecx, Immediate(ExpectedT2));
+  __ mov(IceType_i32, dwordAddress(T2), GPRRegister::Encoded_Reg_ecx);
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edx, Immediate(ExpectedT3));
+  __ mov(IceType_i32, dwordAddress(T3), GPRRegister::Encoded_Reg_edx);
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edi, Immediate(ExpectedT4));
+  __ mov(IceType_i32, dwordAddress(T4), GPRRegister::Encoded_Reg_edi);
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_esi, Immediate(ExpectedT5));
+  __ mov(IceType_i32, dwordAddress(T5), GPRRegister::Encoded_Reg_esi);
+
+  AssembledTest test = assemble();
+  test.run();
+  EXPECT_EQ(ExpectedT0, test.contentsOfDword(T0));
+  EXPECT_EQ(ExpectedT1, test.contentsOfDword(T1));
+  EXPECT_EQ(ExpectedT2, test.contentsOfDword(T2));
+  EXPECT_EQ(ExpectedT3, test.contentsOfDword(T3));
+  EXPECT_EQ(ExpectedT4, test.contentsOfDword(T4));
+  EXPECT_EQ(ExpectedT5, test.contentsOfDword(T5));
+}
+
+TEST_F(AssemblerX8632Test, MovRegReg) {
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(0x20));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ebx,
+         GPRRegister::Encoded_Reg_eax);
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ecx,
+         GPRRegister::Encoded_Reg_ebx);
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edx,
+         GPRRegister::Encoded_Reg_ecx);
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edi,
+         GPRRegister::Encoded_Reg_edx);
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_esi,
+         GPRRegister::Encoded_Reg_edi);
+
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_esi, Immediate(0x55000000ul));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax,
+         GPRRegister::Encoded_Reg_esi);
+
+  AssembledTest test = assemble();
+  test.run();
+  EXPECT_EQ(0x55000000ul, test.eax());
+  EXPECT_EQ(0x20ul, test.ebx());
+  EXPECT_EQ(0x20ul, test.ecx());
+  EXPECT_EQ(0x20ul, test.edx());
+  EXPECT_EQ(0x20ul, test.edi());
+  EXPECT_EQ(0x55000000ul, test.esi());
+}
+
+TEST_F(AssemblerX8632Test, MovRegMem) {
+  const uint32_t T0 = allocateDword();
+  constexpr uint32_t ExpectedT0 = 0x00111100ul;
+  const uint32_t T1 = allocateDword();
+  constexpr uint32_t ExpectedT1 = 0x00222200ul;
+  const uint32_t T2 = allocateDword();
+  constexpr uint32_t ExpectedT2 = 0x00333300ul;
+  const uint32_t T3 = allocateDword();
+  constexpr uint32_t ExpectedT3 = 0x00444400ul;
+  const uint32_t T4 = allocateDword();
+  constexpr uint32_t ExpectedT4 = 0x00555500ul;
+  const uint32_t T5 = allocateDword();
+  constexpr uint32_t ExpectedT5 = 0x00666600ul;
+
+  __ mov(IceType_i32, dwordAddress(T0), Immediate(ExpectedT0));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, dwordAddress(T0));
+
+  __ mov(IceType_i32, dwordAddress(T1), Immediate(ExpectedT1));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ebx, dwordAddress(T1));
+
+  __ mov(IceType_i32, dwordAddress(T2), Immediate(ExpectedT2));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ecx, dwordAddress(T2));
+
+  __ mov(IceType_i32, dwordAddress(T3), Immediate(ExpectedT3));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edx, dwordAddress(T3));
+
+  __ mov(IceType_i32, dwordAddress(T4), Immediate(ExpectedT4));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edi, dwordAddress(T4));
+
+  __ mov(IceType_i32, dwordAddress(T5), Immediate(ExpectedT5));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_esi, dwordAddress(T5));
+
+  AssembledTest test = assemble();
+  test.run();
+  EXPECT_EQ(ExpectedT0, test.eax());
+  EXPECT_EQ(ExpectedT1, test.ebx());
+  EXPECT_EQ(ExpectedT2, test.ecx());
+  EXPECT_EQ(ExpectedT3, test.edx());
+  EXPECT_EQ(ExpectedT4, test.edi());
+  EXPECT_EQ(ExpectedT5, test.esi());
+}
+
+TEST_F(AssemblerX8632Test, Movzx) {
+  // Verifies movzx for 8- and 16-bit sources, taken either from a register or
+  // from memory, into each 32-bit destination register listed at the bottom.
+  // Every immediate has its top bit set, so the expected value (the immediate
+  // itself) only matches if the upper destination bits were cleared instead of
+  // being sign-extended. Each helper assembles, runs and resets on its own.
+#define TestMovzx8bitWithRegDest(Src, Dst, Imm)                                \
+  do {                                                                         \
+    static_assert(((Imm)&0xFF) == (Imm), #Imm " is not an 8bit immediate");    \
+    __ mov(IceType_i8, GPRRegister::Encoded_Reg_##Src, Immediate(Imm));        \
+    __ movzx(IceType_i8, GPRRegister::Encoded_Reg_##Dst,                       \
+             GPRRegister::Encoded_Reg_##Src);                                  \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    ASSERT_EQ(Imm, test.Dst()) << "(" #Src ", " #Dst ", " #Imm ")";            \
+    reset();                                                                   \
+  } while (0)
+
+#define TestMovzx16bitWithRegDest(Src, Dst, Imm)                               \
+  do {                                                                         \
+    static_assert(((Imm)&0xFFFF) == (Imm), #Imm " is not a 16bit immediate");  \
+    __ mov(IceType_i16, GPRRegister::Encoded_Reg_##Src, Immediate(Imm));       \
+    __ movzx(IceType_i16, GPRRegister::Encoded_Reg_##Dst,                      \
+             GPRRegister::Encoded_Reg_##Src);                                  \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    ASSERT_EQ(Imm, test.Dst()) << "(" #Src ", " #Dst ", " #Imm ")";            \
+    reset();                                                                   \
+  } while (0)
+
+  // The memory-source helpers seed the dword T0 with the immediate before the
+  // assembled code runs.
+#define TestMovzx8bitWithAddrSrc(Dst, Imm)                                     \
+  do {                                                                         \
+    static_assert(((Imm)&0xFF) == (Imm), #Imm " is not an 8bit immediate");    \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = Imm;                                                   \
+    __ movzx(IceType_i8, GPRRegister::Encoded_Reg_##Dst, dwordAddress(T0));    \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+    ASSERT_EQ(Imm, test.Dst()) << "(Addr, " #Dst ", " #Imm ")";                \
+    reset();                                                                   \
+  } while (0)
+
+#define TestMovzx16bitWithAddrSrc(Dst, Imm)                                    \
+  do {                                                                         \
+    static_assert(((Imm)&0xFFFF) == (Imm), #Imm " is not a 16bit immediate");  \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = Imm;                                                   \
+    __ movzx(IceType_i16, GPRRegister::Encoded_Reg_##Dst, dwordAddress(T0));   \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+    ASSERT_EQ(Imm, test.Dst()) << "(Addr, " #Dst ", " #Imm ")";                \
+    reset();                                                                   \
+  } while (0)
+
+#define TestMovzx(Dst)                                                         \
+  do {                                                                         \
+    TestMovzx8bitWithRegDest(eax, Dst, 0x81u);                                 \
+    TestMovzx8bitWithRegDest(ebx, Dst, 0x82u);                                 \
+    TestMovzx8bitWithRegDest(ecx, Dst, 0x83u);                                 \
+    TestMovzx8bitWithRegDest(edx, Dst, 0x84u);                                 \
+    /* esi is encoded as dh */                                                 \
+    TestMovzx8bitWithRegDest(esi, Dst, 0x85u);                                 \
+    /* edi is encoded as bh */                                                 \
+    TestMovzx8bitWithRegDest(edi, Dst, 0x86u);                                 \
+    /* ebp is encoded as ch */                                                 \
+    TestMovzx8bitWithRegDest(ebp, Dst, 0x87u);                                 \
+    /* esp is encoded as ah */                                                 \
+    TestMovzx8bitWithRegDest(esp, Dst, 0x88u);                                 \
+    TestMovzx8bitWithAddrSrc(Dst, 0x8Fu);                                      \
+                                                                               \
+    TestMovzx16bitWithRegDest(eax, Dst, 0x8118u);                              \
+    TestMovzx16bitWithRegDest(ebx, Dst, 0x8228u);                              \
+    TestMovzx16bitWithRegDest(ecx, Dst, 0x8338u);                              \
+    TestMovzx16bitWithRegDest(edx, Dst, 0x8448u);                              \
+    TestMovzx16bitWithAddrSrc(Dst, 0x8FF8u);                                   \
+  } while (0)
+
+  TestMovzx(eax);
+  TestMovzx(ebx);
+  TestMovzx(ecx);
+  TestMovzx(edx);
+  TestMovzx(esi);
+  TestMovzx(edi);
+
+  // Undefine every helper under the name it was defined with, so that none of
+  // them remains visible to the tests that follow.
+#undef TestMovzx
+#undef TestMovzx16bitWithAddrSrc
+#undef TestMovzx8bitWithAddrSrc
+#undef TestMovzx16bitWithRegDest
+#undef TestMovzx8bitWithRegDest
+}
+
+TEST_F(AssemblerX8632Test, Movsx) {
+  // Verifies movsx for 8- and 16-bit sources, taken either from a register or
+  // from memory, into each 32-bit destination register listed at the bottom.
+  // Every immediate has its top bit set, so the expected value is the immediate
+  // with all upper destination bits set to one. Each helper assembles, runs and
+  // resets on its own.
+#define TestMovsx8bitWithRegDest(Src, Dst, Imm)                                \
+  do {                                                                         \
+    static_assert(((Imm)&0xFF) == (Imm), #Imm " is not an 8bit immediate");    \
+    __ mov(IceType_i8, GPRRegister::Encoded_Reg_##Src, Immediate(Imm));        \
+    __ movsx(IceType_i8, GPRRegister::Encoded_Reg_##Dst,                       \
+             GPRRegister::Encoded_Reg_##Src);                                  \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    ASSERT_EQ((0xFFFFFF00 | (Imm)), test.Dst())                                \
+        << "(" #Src ", " #Dst ", " #Imm ")";                                   \
+    reset();                                                                   \
+  } while (0)
+
+#define TestMovsx16bitWithRegDest(Src, Dst, Imm)                               \
+  do {                                                                         \
+    static_assert(((Imm)&0xFFFF) == (Imm), #Imm " is not a 16bit immediate");  \
+    __ mov(IceType_i16, GPRRegister::Encoded_Reg_##Src, Immediate(Imm));       \
+    __ movsx(IceType_i16, GPRRegister::Encoded_Reg_##Dst,                      \
+             GPRRegister::Encoded_Reg_##Src);                                  \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    ASSERT_EQ((0xFFFF0000 | (Imm)), test.Dst())                                \
+        << "(" #Src ", " #Dst ", " #Imm ")";                                   \
+    reset();                                                                   \
+  } while (0)
+
+  // The memory-source helpers seed the dword T0 with the immediate before the
+  // assembled code runs.
+#define TestMovsx8bitWithAddrSrc(Dst, Imm)                                     \
+  do {                                                                         \
+    static_assert(((Imm)&0xFF) == (Imm), #Imm " is not an 8bit immediate");    \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = Imm;                                                   \
+    __ movsx(IceType_i8, GPRRegister::Encoded_Reg_##Dst, dwordAddress(T0));    \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+    ASSERT_EQ((0xFFFFFF00 | (Imm)), test.Dst())                                \
+        << "(Addr, " #Dst ", " #Imm ")";                                       \
+    reset();                                                                   \
+  } while (0)
+
+#define TestMovsx16bitWithAddrSrc(Dst, Imm)                                    \
+  do {                                                                         \
+    static_assert(((Imm)&0xFFFF) == (Imm), #Imm " is not a 16bit immediate");  \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = Imm;                                                   \
+    __ movsx(IceType_i16, GPRRegister::Encoded_Reg_##Dst, dwordAddress(T0));   \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+    ASSERT_EQ((0xFFFF0000 | (Imm)), test.Dst())                                \
+        << "(Addr, " #Dst ", " #Imm ")";                                       \
+    reset();                                                                   \
+  } while (0)
+
+#define TestMovsx(Dst)                                                         \
+  do {                                                                         \
+    TestMovsx8bitWithRegDest(eax, Dst, 0x81u);                                 \
+    TestMovsx8bitWithRegDest(ebx, Dst, 0x82u);                                 \
+    TestMovsx8bitWithRegDest(ecx, Dst, 0x83u);                                 \
+    TestMovsx8bitWithRegDest(edx, Dst, 0x84u);                                 \
+    /* esi is encoded as dh */                                                 \
+    TestMovsx8bitWithRegDest(esi, Dst, 0x85u);                                 \
+    /* edi is encoded as bh */                                                 \
+    TestMovsx8bitWithRegDest(edi, Dst, 0x86u);                                 \
+    /* ebp is encoded as ch */                                                 \
+    TestMovsx8bitWithRegDest(ebp, Dst, 0x87u);                                 \
+    /* esp is encoded as ah */                                                 \
+    TestMovsx8bitWithRegDest(esp, Dst, 0x88u);                                 \
+    TestMovsx8bitWithAddrSrc(Dst, 0x8Fu);                                      \
+                                                                               \
+    TestMovsx16bitWithRegDest(eax, Dst, 0x8118u);                              \
+    TestMovsx16bitWithRegDest(ebx, Dst, 0x8228u);                              \
+    TestMovsx16bitWithRegDest(ecx, Dst, 0x8338u);                              \
+    TestMovsx16bitWithRegDest(edx, Dst, 0x8448u);                              \
+    TestMovsx16bitWithAddrSrc(Dst, 0x8FF8u);                                   \
+  } while (0)
+
+  TestMovsx(eax);
+  TestMovsx(ebx);
+  TestMovsx(ecx);
+  TestMovsx(edx);
+  TestMovsx(esi);
+  TestMovsx(edi);
+
+  // Undefine every helper under the name it was defined with, so that none of
+  // them remains visible to the tests that follow.
+#undef TestMovsx
+#undef TestMovsx16bitWithAddrSrc
+#undef TestMovsx8bitWithAddrSrc
+#undef TestMovsx16bitWithRegDest
+#undef TestMovsx8bitWithRegDest
+}
+
+TEST_F(AssemblerX8632LowLevelTest, RepMovsb) {
+  // rep_movsb() is expected to emit exactly two bytes: the prefix byte followed
+  // by the opcode byte.
+  static constexpr uint8_t RepPrefix = 0xF3;
+  static constexpr uint8_t MovsbOpcode = 0xA4;
+  static constexpr uint32_t ExpectedSize = 2;
+
+  __ rep_movsb();
+
+  ASSERT_EQ(ExpectedSize, codeBytesSize());
+  verifyBytes<ExpectedSize>(codeBytes(), RepPrefix, MovsbOpcode);
+}
+
+TEST_F(AssemblerX8632Test, MovssXmmAddr) {
+  // Verifies movss from memory into an xmm register, for both f32 and f64.
+  // T0 is seeded with Value after assembling; the destination register must
+  // hold Value once the code has run.
+#define TestMovssXmmAddrFloatLength(FloatLength, Xmm, Value)                   \
+  do {                                                                         \
+    static_assert((FloatLength) == 32 || (FloatLength) == 64,                  \
+                  "Invalid fp length " #FloatLength);                          \
+    using Type = std::conditional<FloatLength == 32, float, double>::type;     \
+                                                                               \
+    static constexpr char TestString[] = "(" #FloatLength ", " #Xmm ")";       \
+    static constexpr bool IsDouble = std::is_same<Type, double>::value;        \
+    const uint32_t T0 = allocateQword();                                       \
+    const Type V0 = Value;                                                     \
+                                                                               \
+    __ movss(IceType_f##FloatLength, XmmRegister::Encoded_Reg_##Xmm,           \
+             dwordAddress(T0));                                                \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    if (IsDouble) {                                                            \
+      test.setQwordTo(T0, static_cast<double>(V0));                            \
+    } else {                                                                   \
+      test.setDwordTo(T0, static_cast<float>(V0));                             \
+    }                                                                          \
+    test.run();                                                                \
+    ASSERT_DOUBLE_EQ(Value, test.Xmm<Type>()) << TestString << " value is "    \
+                                              << Value;                        \
+    reset();                                                                   \
+  } while (0)
+
+  // Runs the helper above for every xmm register and every sample value.
+#define TestMovssXmmAddr(FloatLength)                                          \
+  do {                                                                         \
+    using Type = std::conditional<FloatLength == 32, float, double>::type;     \
+    for (const Type Value : {0.0, -0.0, 1.0, -1.0, 3.14, 99999.9999}) {        \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm0, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm1, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm2, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm3, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm4, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm5, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm6, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm7, Value);                   \
+    }                                                                          \
+  } while (0)
+
+  TestMovssXmmAddr(32);
+  TestMovssXmmAddr(64);
+
+#undef TestMovssXmmAddr
+#undef TestMovssXmmAddrFloatLength
+}
+
+TEST_F(AssemblerX8632Test, MovssAddrXmm) {
+  // Loads Value from T0 into an xmm register with movss (f32 and f64) and
+  // checks the register afterwards.
+  //
+  // NOTE(review): despite the test's name, the only instruction emitted is the
+  // load "movss Xmm, [T0]". T1 is seeded with a quiet NaN but nothing is ever
+  // stored to it and it is never read back, so the xmm-to-memory form of movss
+  // is not exercised here. TODO: emit the store to T1 and assert on T1's
+  // contents instead of on the register.
+#define TestMovssAddrXmmFloatLength(FloatLength, Xmm, Value)                   \
+  do {                                                                         \
+    static_assert((FloatLength) == 32 || (FloatLength) == 64,                  \
+                  "Invalid fp length " #FloatLength);                          \
+    using Type = std::conditional<FloatLength == 32, float, double>::type;     \
+                                                                               \
+    static constexpr char TestString[] = "(" #FloatLength ", " #Xmm ")";       \
+    static constexpr bool IsDouble = std::is_same<Type, double>::value;        \
+    const uint32_t T0 = allocateQword();                                       \
+    const Type V0 = Value;                                                     \
+    const uint32_t T1 = allocateQword();                                       \
+    static_assert(std::numeric_limits<Type>::has_quiet_NaN,                    \
+                  "f" #FloatLength " does not have quiet nan.");               \
+    const Type V1 = std::numeric_limits<Type>::quiet_NaN();                    \
+                                                                               \
+    __ movss(IceType_f##FloatLength, XmmRegister::Encoded_Reg_##Xmm,           \
+             dwordAddress(T0));                                                \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    if (IsDouble) {                                                            \
+      test.setQwordTo(T0, static_cast<double>(V0));                            \
+      test.setQwordTo(T1, static_cast<double>(V1));                            \
+    } else {                                                                   \
+      test.setDwordTo(T0, static_cast<float>(V0));                             \
+      test.setDwordTo(T1, static_cast<float>(V1));                             \
+    }                                                                          \
+    test.run();                                                                \
+    ASSERT_DOUBLE_EQ(Value, test.Xmm<Type>()) << TestString << " value is "    \
+                                              << Value;                        \
+    reset();                                                                   \
+  } while (0)
+
+  // Runs the helper above for every xmm register and every sample value.
+#define TestMovssAddrXmm(FloatLength)                                          \
+  do {                                                                         \
+    using Type = std::conditional<FloatLength == 32, float, double>::type;     \
+    for (const Type Value : {0.0, -0.0, 1.0, -1.0, 3.14, 99999.9999}) {        \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm0, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm1, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm2, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm3, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm4, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm5, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm6, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm7, Value);                   \
+    }                                                                          \
+  } while (0)
+
+  TestMovssAddrXmm(32);
+  TestMovssAddrXmm(64);
+
+#undef TestMovssAddrXmm
+#undef TestMovssAddrXmmFloatLength
+}
+
+TEST_F(AssemblerX8632Test, MovssXmmXmm) {
+  // Verifies the register-to-register form of movss, for both f32 and f64.
+  // Src is loaded with Value from T0 and Dst with a quiet NaN from T1; after
+  // "movss Dst, Src" the destination must hold Value, which it cannot do unless
+  // the NaN was overwritten.
+#define TestMovssXmmXmmFloatLength(FloatLength, Src, Dst, Value)               \
+  do {                                                                         \
+    static_assert((FloatLength) == 32 || (FloatLength) == 64,                  \
+                  "Invalid fp length " #FloatLength);                          \
+    using Type = std::conditional<FloatLength == 32, float, double>::type;     \
+                                                                               \
+    static constexpr char TestString[] =                                       \
+        "(" #FloatLength ", " #Src ", " #Dst ")";                              \
+    static constexpr bool IsDouble = std::is_same<Type, double>::value;        \
+    const uint32_t T0 = allocateQword();                                       \
+    const Type V0 = Value;                                                     \
+    const uint32_t T1 = allocateQword();                                       \
+    static_assert(std::numeric_limits<Type>::has_quiet_NaN,                    \
+                  "f" #FloatLength " does not have quiet nan.");               \
+    const Type V1 = std::numeric_limits<Type>::quiet_NaN();                    \
+                                                                               \
+    __ movss(IceType_f##FloatLength, XmmRegister::Encoded_Reg_##Src,           \
+             dwordAddress(T0));                                                \
+    __ movss(IceType_f##FloatLength, XmmRegister::Encoded_Reg_##Dst,           \
+             dwordAddress(T1));                                                \
+    __ movss(IceType_f##FloatLength, XmmRegister::Encoded_Reg_##Dst,           \
+             XmmRegister::Encoded_Reg_##Src);                                  \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    if (IsDouble) {                                                            \
+      test.setQwordTo(T0, static_cast<double>(V0));                            \
+      test.setQwordTo(T1, static_cast<double>(V1));                            \
+    } else {                                                                   \
+      test.setDwordTo(T0, static_cast<float>(V0));                             \
+      test.setDwordTo(T1, static_cast<float>(V1));                             \
+    }                                                                          \
+    test.run();                                                                \
+    ASSERT_DOUBLE_EQ(Value, test.Dst<Type>()) << TestString << " value is "    \
+                                              << Value;                        \
+    reset();                                                                   \
+  } while (0)
+
+  // Walks the xmm registers as (source, destination) pairs, wrapping from xmm7
+  // back to xmm0, for every sample value.
+#define TestMovssXmmXmm(FloatLength)                                           \
+  do {                                                                         \
+    using Type = std::conditional<FloatLength == 32, float, double>::type;     \
+    for (const Type Value : {0.0, -0.0, 1.0, -1.0, 3.14, 99999.9999}) {        \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm0, xmm1, Value);              \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm1, xmm2, Value);              \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm2, xmm3, Value);              \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm3, xmm4, Value);              \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm4, xmm5, Value);              \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm5, xmm6, Value);              \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm6, xmm7, Value);              \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm7, xmm0, Value);              \
+    }                                                                          \
+  } while (0)
+
+  TestMovssXmmXmm(32);
+  TestMovssXmmXmm(64);
+
+#undef TestMovssXmmXmm
+#undef TestMovssXmmXmmFloatLength
+}
+
+TEST_F(AssemblerX8632Test, MovdToXmm) {
+  // Checks movd into an xmm register, from a GPR and from memory. In both
+  // helpers the destination is first loaded with a qword whose upper dword is
+  // all ones; the final comparison reads the register as a uint64_t and expects
+  // just the 32-bit value, so those upper bits must have been cleared.
+#define TestMovdXmmReg(Gpr, Xmm, Bits)                                         \
+  do {                                                                         \
+    assert(((Bits)&0xFFFFFFFF) == (Bits));                                     \
+    static constexpr char Label[] = "(" #Gpr ", " #Xmm ")";                    \
+    const uint64_t UpperOnes = 0xFFFFFFFF00000000ull;                          \
+    const uint32_t Slot = allocateQword();                                     \
+                                                                               \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Gpr, Immediate(Bits));      \
+    __ movss(IceType_f64, XmmRegister::Encoded_Reg_##Xmm, dwordAddress(Slot)); \
+    __ movd(XmmRegister::Encoded_Reg_##Xmm, GPRRegister::Encoded_Reg_##Gpr);   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setQwordTo(Slot, UpperOnes);                                          \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Bits, test.Xmm<uint64_t>()) << Label << " value is " << Bits;    \
+    reset();                                                                   \
+  } while (0)
+
+  // Same check, with the 32-bit source read from memory instead of a GPR.
+#define TestMovdXmmAddr(Xmm, Bits)                                             \
+  do {                                                                         \
+    assert(((Bits)&0xFFFFFFFF) == (Bits));                                     \
+    static constexpr char Label[] = "(" #Xmm ", Addr)";                        \
+    const uint32_t SrcSlot = allocateQword();                                  \
+    const uint32_t SrcBits = Bits;                                             \
+    const uint32_t InitSlot = allocateQword();                                 \
+    const uint64_t UpperOnes = 0xFFFFFFFF00000000ull;                          \
+                                                                               \
+    __ movss(IceType_f64, XmmRegister::Encoded_Reg_##Xmm,                      \
+             dwordAddress(InitSlot));                                          \
+    __ movd(XmmRegister::Encoded_Reg_##Xmm, dwordAddress(SrcSlot));            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(SrcSlot, SrcBits);                                         \
+    test.setQwordTo(InitSlot, UpperOnes);                                      \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Bits, test.Xmm<uint64_t>()) << Label << " value is " << Bits;    \
+    reset();                                                                   \
+  } while (0)
+
+  // For one xmm destination: every sample value through every GPR source, then
+  // through memory.
+#define TestMovd(Xmm)                                                          \
+  do {                                                                         \
+    for (uint32_t Value : {0u, 1u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFFu}) {   \
+      TestMovdXmmReg(eax, Xmm, Value);                                         \
+      TestMovdXmmReg(ebx, Xmm, Value);                                         \
+      TestMovdXmmReg(ecx, Xmm, Value);                                         \
+      TestMovdXmmReg(edx, Xmm, Value);                                         \
+      TestMovdXmmReg(esi, Xmm, Value);                                         \
+      TestMovdXmmReg(edi, Xmm, Value);                                         \
+      TestMovdXmmAddr(Xmm, Value);                                             \
+    }                                                                          \
+  } while (0)
+
+  TestMovd(xmm0);
+  TestMovd(xmm1);
+  TestMovd(xmm2);
+  TestMovd(xmm3);
+  TestMovd(xmm4);
+  TestMovd(xmm5);
+  TestMovd(xmm6);
+  TestMovd(xmm7);
+
+#undef TestMovd
+#undef TestMovdXmmReg
+#undef TestMovdXmmAddr
+}
+
+TEST_F(AssemblerX8632Test, MovdFromXmm) {
+  // Verifies movd out of an xmm register, into a GPR (TestMovdRegXmm) and into
+  // memory (TestMovdAddrXmm). In both helpers the source xmm register is first
+  // loaded from T0 with an IceType_f64 movss.
+  //
+  // TestMovdRegXmm asserts on the destination GPR itself; asserting on T0 would
+  // pass regardless of what movd did, because T0 is only ever an input. T0 is
+  // a fully initialized qword so that the 64-bit load stays inside the slot.
+#define TestMovdRegXmm(Src, Dst, Value)                                        \
+  do {                                                                         \
+    assert(((Value)&0xFFFFFFFF) == (Value));                                   \
+    static constexpr char TestString[] = "(" #Src ", " #Dst ")";               \
+    const uint32_t T0 = allocateQword();                                       \
+    const uint64_t V0 = Value;                                                 \
+                                                                               \
+    __ movss(IceType_f64, XmmRegister::Encoded_Reg_##Src, dwordAddress(T0));   \
+    __ movd(GPRRegister::Encoded_Reg_##Dst, XmmRegister::Encoded_Reg_##Src);   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+                                                                               \
+    test.setQwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Value, test.Dst()) << TestString << " value is " << Value;       \
+    reset();                                                                   \
+  } while (0)
+
+  // The destination dword T1 is seeded with ~Value, so the final check on T1
+  // fails if the store did not happen.
+#define TestMovdAddrXmm(Src, Value)                                            \
+  do {                                                                         \
+    assert(((Value)&0xFFFFFFFF) == (Value));                                   \
+    static constexpr char TestString[] = "(" #Src ", Addr)";                   \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = Value;                                                 \
+    const uint32_t T1 = allocateDword();                                       \
+    const uint32_t V1 = ~(Value);                                              \
+                                                                               \
+    __ movss(IceType_f64, XmmRegister::Encoded_Reg_##Src, dwordAddress(T0));   \
+    __ movd(dwordAddress(T1), XmmRegister::Encoded_Reg_##Src);                 \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+                                                                               \
+    test.setDwordTo(T0, V0);                                                   \
+    test.setDwordTo(T1, V1);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Value, test.contentsOfDword(T1)) << TestString << " value is "   \
+                                               << Value;                       \
+    reset();                                                                   \
+  } while (0)
+
+  // For one xmm source: every sample value into every GPR destination, then
+  // into memory.
+#define TestMovd(Src)                                                          \
+  do {                                                                         \
+    for (uint32_t Value : {0u, 1u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFFu}) {   \
+      TestMovdRegXmm(Src, eax, Value);                                         \
+      TestMovdRegXmm(Src, ebx, Value);                                         \
+      TestMovdRegXmm(Src, ecx, Value);                                         \
+      TestMovdRegXmm(Src, edx, Value);                                         \
+      TestMovdRegXmm(Src, esi, Value);                                         \
+      TestMovdRegXmm(Src, edi, Value);                                         \
+      TestMovdAddrXmm(Src, Value);                                             \
+    }                                                                          \
+  } while (0)
+
+  TestMovd(xmm0);
+  TestMovd(xmm1);
+  TestMovd(xmm2);
+  TestMovd(xmm3);
+  TestMovd(xmm4);
+  TestMovd(xmm5);
+  TestMovd(xmm6);
+  TestMovd(xmm7);
+
+#undef TestMovdAddrXmm
+#undef TestMovdRegXmm
+#undef TestMovd
+}
+
+TEST_F(AssemblerX8632Test, MovqXmmAddr) {
+  // Verifies movq from memory into each xmm register. Dst is preloaded from T1
+  // with the bitwise complement of the value under test, so each of the low 64
+  // bits has to be overwritten by the movq for the check to pass. V1 is derived
+  // from the 64-bit V0 so that the complement always covers the upper dword,
+  // whatever the type of Value is.
+#define TestMovq(Dst, Value)                                                   \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", Addr)";                   \
+    const uint32_t T0 = allocateQword();                                       \
+    const uint64_t V0 = Value;                                                 \
+    const uint32_t T1 = allocateQword();                                       \
+    const uint64_t V1 = ~V0;                                                   \
+                                                                               \
+    __ movss(IceType_f64, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));   \
+    __ movq(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));                 \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+                                                                               \
+    test.setQwordTo(T0, V0);                                                   \
+    test.setQwordTo(T1, V1);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Value, test.Dst<uint64_t>()) << TestString << " value is "       \
+                                           << Value;                           \
+    reset();                                                                   \
+  } while (0)
+
+  // The samples are 64 bits wide and include patterns with the upper dword
+  // set, so that the upper half of the quadword is exercised as well.
+  for (uint64_t Value :
+       {0ull, 1ull, 0x7FFFFFFFull, 0x80000000ull, 0xFFFFFFFFull,
+        0x100000000ull, 0x7FFFFFFFFFFFFFFFull, 0x8000000000000000ull,
+        0xFFFFFFFFFFFFFFFFull}) {
+    TestMovq(xmm0, Value);
+    TestMovq(xmm1, Value);
+    TestMovq(xmm2, Value);
+    TestMovq(xmm3, Value);
+    TestMovq(xmm4, Value);
+    TestMovq(xmm5, Value);
+    TestMovq(xmm6, Value);
+    TestMovq(xmm7, Value);
+  }
+
+#undef TestMovq
+}
+
+TEST_F(AssemblerX8632Test, MovqAddrXmm) {
+  // The destination qword T1 starts as ~Value, so the check on its contents
+  // fails unless movq m64, xmm really stored all 64 bits of the register.
+#define TestMovq(Dst, Value)                                                   \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", Addr)";                   \
+    const uint32_t T0 = allocateQword();                                       \
+    const uint64_t V0 = Value;                                                 \
+    const uint32_t T1 = allocateQword();                                       \
+    const uint64_t V1 = ~(Value);                                              \
+                                                                               \
+    __ movq(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));                 \
+    __ movq(dwordAddress(T1), XmmRegister::Encoded_Reg_##Dst);                 \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setQwordTo(T0, V0);                                                   \
+    test.setQwordTo(T1, V1);                                                   \
+    test.run();                                                                \
+    ASSERT_EQ(V0, test.contentsOfQword(T1)) << TestString << " value is "      \
+                                            << V0;                             \
+    reset();                                                                   \
+  } while (0)
+
+  for (uint64_t Value : {0x0ull, 0x1ull, 0x7FFFFFFFull, 0x80000000ull,
+                         0xFFFFFFFFull, 0x0123456789ABCDEFull, ~0x0ull}) {
+    TestMovq(xmm0, Value);
+    TestMovq(xmm1, Value);
+    TestMovq(xmm2, Value);
+    TestMovq(xmm3, Value);
+    TestMovq(xmm4, Value);
+    TestMovq(xmm5, Value);
+    TestMovq(xmm6, Value);
+    TestMovq(xmm7, Value);
+  }
+#undef TestMovq
+}
+
+TEST_F(AssemblerX8632Test, MovqXmmXmm) {
+  // Src holds Value and Dst holds ~Value before the register-to-register movq,
+  // so Dst reads back as Value only if all of its low 64 bits were replaced.
+#define TestMovq(Src, Dst, Value)                                              \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Src ", " #Dst ")";               \
+    const uint32_t T0 = allocateQword();                                       \
+    const uint64_t V0 = Value;                                                 \
+    const uint32_t T1 = allocateQword();                                       \
+    const uint64_t V1 = ~(Value);                                              \
+                                                                               \
+    __ movq(XmmRegister::Encoded_Reg_##Src, dwordAddress(T0));                 \
+    __ movq(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));                 \
+    __ movq(XmmRegister::Encoded_Reg_##Dst, XmmRegister::Encoded_Reg_##Src);   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setQwordTo(T0, V0);                                                   \
+    test.setQwordTo(T1, V1);                                                   \
+    test.run();                                                                \
+    ASSERT_EQ(V0, test.Dst<uint64_t>()) << TestString << " value is " << V0;   \
+    reset();                                                                   \
+  } while (0)
+
+  for (uint64_t Value : {0x0ull, 0x1ull, 0x7FFFFFFFull, 0x80000000ull,
+                         0xFFFFFFFFull, 0x0123456789ABCDEFull, ~0x0ull}) {
+    TestMovq(xmm0, xmm1, Value);
+    TestMovq(xmm1, xmm2, Value);
+    TestMovq(xmm2, xmm3, Value);
+    TestMovq(xmm3, xmm4, Value);
+    TestMovq(xmm4, xmm5, Value);
+    TestMovq(xmm5, xmm6, Value);
+    TestMovq(xmm6, xmm7, Value);
+    TestMovq(xmm7, xmm0, Value);
+  }
+
+#undef TestMovq
+}
+
+TEST_F(AssemblerX8632Test, MovupsXmmAddr) {
+  // Loads a Dqword pattern from memory with movups and checks the register.
+#define CheckMovupsLoad(Dst)                                                   \
+  do {                                                                         \
+    static constexpr char Label[] = "(" #Dst ")";                              \
+    const Dqword Pattern(1.0f, -1.0, std::numeric_limits<float>::quiet_NaN(),  \
+                         std::numeric_limits<float>::infinity());              \
+    const uint32_t Slot = allocateDqword();                                    \
+    /* Emit the load being tested. */                                          \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(Slot));             \
+    /* Run with Pattern stored at Slot. */                                     \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(Slot, Pattern);                                           \
+    test.run();                                                                \
+    ASSERT_EQ(Pattern, test.Dst<Dqword>()) << Label;                           \
+    reset();                                                                   \
+  } while (0)
+
+  CheckMovupsLoad(xmm0);
+  CheckMovupsLoad(xmm1);
+  CheckMovupsLoad(xmm2);
+  CheckMovupsLoad(xmm3);
+  CheckMovupsLoad(xmm4);
+  CheckMovupsLoad(xmm5);
+  CheckMovupsLoad(xmm6);
+  CheckMovupsLoad(xmm7);
+
+#undef CheckMovupsLoad
+}
+
+TEST_F(AssemblerX8632Test, MovupsAddrXmm) {
+  // movups to memory must overwrite a zeroed Dqword with the register value.
+#define CheckMovupsStore(Src)                                                  \
+  do {                                                                         \
+    static constexpr char Label[] = "(" #Src ")";                              \
+    const Dqword Pattern(1.0f, -1.0, std::numeric_limits<float>::quiet_NaN(),  \
+                         std::numeric_limits<float>::infinity());              \
+    const Dqword Zeros(0.0, 0.0, 0.0, 0.0);                                    \
+    const uint32_t SrcSlot = allocateDqword();                                 \
+    const uint32_t DstSlot = allocateDqword();                                 \
+    /* Load from SrcSlot, then store to DstSlot (the form under test). */      \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(SrcSlot));          \
+    __ movups(dwordAddress(DstSlot), XmmRegister::Encoded_Reg_##Src);          \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(SrcSlot, Pattern);                                        \
+    test.setDqwordTo(DstSlot, Zeros);                                          \
+    test.run();                                                                \
+    ASSERT_EQ(Pattern, test.contentsOfDqword(DstSlot)) << Label;               \
+    reset();                                                                   \
+  } while (0)
+
+  CheckMovupsStore(xmm0);
+  CheckMovupsStore(xmm1);
+  CheckMovupsStore(xmm2);
+  CheckMovupsStore(xmm3);
+  CheckMovupsStore(xmm4);
+  CheckMovupsStore(xmm5);
+  CheckMovupsStore(xmm6);
+  CheckMovupsStore(xmm7);
+
+#undef CheckMovupsStore
+}
+
+TEST_F(AssemblerX8632Test, MovupsXmmXmm) {
+  // movups reg, reg must replace the zeroed Dst with the pattern held in Src.
+#define CheckMovupsCopy(Dst, Src)                                              \
+  do {                                                                         \
+    static constexpr char Label[] = "(" #Dst ", " #Src ")";                    \
+    const Dqword Pattern(1.0f, -1.0, std::numeric_limits<float>::quiet_NaN(),  \
+                         std::numeric_limits<float>::infinity());              \
+    const Dqword Zeros(0.0, 0.0, 0.0, 0.0);                                    \
+    const uint32_t SrcSlot = allocateDqword();                                 \
+    const uint32_t DstSlot = allocateDqword();                                 \
+    /* Preload both registers, then emit the copy under test. */               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(SrcSlot));          \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(DstSlot));          \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, XmmRegister::Encoded_Reg_##Src); \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(SrcSlot, Pattern);                                        \
+    test.setDqwordTo(DstSlot, Zeros);                                          \
+    test.run();                                                                \
+    ASSERT_EQ(Pattern, test.Dst<Dqword>()) << Label;                           \
+    reset();                                                                   \
+  } while (0)
+
+  CheckMovupsCopy(xmm0, xmm1);
+  CheckMovupsCopy(xmm1, xmm2);
+  CheckMovupsCopy(xmm2, xmm3);
+  CheckMovupsCopy(xmm3, xmm4);
+  CheckMovupsCopy(xmm4, xmm5);
+  CheckMovupsCopy(xmm5, xmm6);
+  CheckMovupsCopy(xmm6, xmm7);
+  CheckMovupsCopy(xmm7, xmm0);
+
+#undef CheckMovupsCopy
+}
+
+TEST_F(AssemblerX8632Test, MovapsXmmXmm) {
+  // movaps reg, reg must replace the zeroed Dst with the pattern held in Src.
+#define CheckMovapsCopy(Dst, Src)                                              \
+  do {                                                                         \
+    static constexpr char Label[] = "(" #Dst ", " #Src ")";                    \
+    const Dqword Pattern(1.0f, -1.0, std::numeric_limits<float>::quiet_NaN(),  \
+                         std::numeric_limits<float>::infinity());              \
+    const Dqword Zeros(0.0, 0.0, 0.0, 0.0);                                    \
+    const uint32_t SrcSlot = allocateDqword();                                 \
+    const uint32_t DstSlot = allocateDqword();                                 \
+    /* Preload both registers with movups, then emit the movaps copy. */       \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(SrcSlot));          \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(DstSlot));          \
+    __ movaps(XmmRegister::Encoded_Reg_##Dst, XmmRegister::Encoded_Reg_##Src); \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(SrcSlot, Pattern);                                        \
+    test.setDqwordTo(DstSlot, Zeros);                                          \
+    test.run();                                                                \
+    ASSERT_EQ(Pattern, test.Dst<Dqword>()) << Label;                           \
+    reset();                                                                   \
+  } while (0)
+
+  CheckMovapsCopy(xmm0, xmm1);
+  CheckMovapsCopy(xmm1, xmm2);
+  CheckMovapsCopy(xmm2, xmm3);
+  CheckMovapsCopy(xmm3, xmm4);
+  CheckMovapsCopy(xmm4, xmm5);
+  CheckMovapsCopy(xmm5, xmm6);
+  CheckMovapsCopy(xmm6, xmm7);
+  CheckMovapsCopy(xmm7, xmm0);
+
+#undef CheckMovapsCopy
+}
+
+TEST_F(AssemblerX8632Test, Movhlps_Movlhps) {
+  // Expected results: movhlps replaces Dst's first qword with Src's second
+  // qword; movlhps replaces Dst's second qword with Src's first qword.
+#define CheckHalfMove(Dst, Src, Inst, Expect)                                  \
+  do {                                                                         \
+    static constexpr char Label[] = "(" #Dst ", " #Src ", " #Inst ")";         \
+    const uint32_t DstSlot = allocateDqword();                                 \
+    const Dqword DstBits(uint64_t(0xAAAAAAAABBBBBBBBull),                      \
+                         uint64_t(0xCCCCCCCCDDDDDDDDull));                     \
+    const uint32_t SrcSlot = allocateDqword();                                 \
+    const Dqword SrcBits(uint64_t(0xEEEEEEEEFFFFFFFFull),                      \
+                         uint64_t(0x9999999988888888ull));                     \
+    /* Preload Dst and Src, then emit Inst. */                                 \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(DstSlot));          \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(SrcSlot));          \
+    __ Inst(XmmRegister::Encoded_Reg_##Dst, XmmRegister::Encoded_Reg_##Src);   \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(DstSlot, DstBits);                                        \
+    test.setDqwordTo(SrcSlot, SrcBits);                                        \
+    test.run();                                                                \
+    ASSERT_EQ(Dqword Expect, test.Dst<Dqword>()) << Label;                     \
+    reset();                                                                   \
+  } while (0)
+
+#define CheckBothHalfMoves(Dst, Src)                                           \
+  do {                                                                         \
+    CheckHalfMove(Dst, Src, movhlps, (uint64_t(0x9999999988888888ull),         \
+                                      uint64_t(0xCCCCCCCCDDDDDDDDull)));       \
+    CheckHalfMove(Dst, Src, movlhps, (uint64_t(0xAAAAAAAABBBBBBBBull),         \
+                                      uint64_t(0xEEEEEEEEFFFFFFFFull)));       \
+  } while (0)
+
+  CheckBothHalfMoves(xmm0, xmm1);
+  CheckBothHalfMoves(xmm1, xmm2);
+  CheckBothHalfMoves(xmm2, xmm3);
+  CheckBothHalfMoves(xmm3, xmm4);
+  CheckBothHalfMoves(xmm4, xmm5);
+  CheckBothHalfMoves(xmm5, xmm6);
+  CheckBothHalfMoves(xmm6, xmm7);
+  CheckBothHalfMoves(xmm7, xmm0);
+
+#undef CheckBothHalfMoves
+#undef CheckHalfMove
+}
+
+TEST_F(AssemblerX8632Test, Movmsk) {
+  // movmskps and movmskpd are each checked against one fixed operand/mask.
+#define CheckMovmskCase(GPR, Src, Value1, Expected, Inst)                      \
+  do {                                                                         \
+    static constexpr char Label[] =                                            \
+        "(" #GPR ", " #Src ", " #Value1 ", " #Expected ", " #Inst ")";         \
+    const uint32_t Slot = allocateDqword();                                    \
+    const Dqword Lanes Value1;                                                 \
+    /* Load the operand into Src, then emit Inst(GPR, Src). */                 \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(Slot));             \
+    __ Inst(GPRRegister::Encoded_Reg_##GPR, XmmRegister::Encoded_Reg_##Src);   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(Slot, Lanes);                                             \
+    test.run();                                                                \
+    ASSERT_EQ(Expected, test.GPR()) << Label;                                  \
+    reset();                                                                   \
+  } while (0)
+
+#define CheckMovmsk(GPR, Src)                                                  \
+  do {                                                                         \
+    CheckMovmskCase(GPR, Src, (-1.0, 1.0, -1.0, 1.0), 0x05ul, movmskps);       \
+    CheckMovmskCase(GPR, Src, (1.0, -1.0), 0x02ul, movmskpd);                  \
+  } while (0)
+
+  CheckMovmsk(eax, xmm0);
+  CheckMovmsk(ebx, xmm1);
+  CheckMovmsk(ecx, xmm2);
+  CheckMovmsk(edx, xmm3);
+  CheckMovmsk(esi, xmm4);
+  CheckMovmsk(edi, xmm5);
+  CheckMovmsk(eax, xmm6);
+  CheckMovmsk(ebx, xmm7);
+
+#undef CheckMovmskCase
+#undef CheckMovmsk
+}
+
+TEST_F(AssemblerX8632Test, Pmovsxdq) {
+  // Dst must hold the first two I32 lanes of Src, each widened to a qword.
+#define CheckPmovsxdqCase(Dst, Src, Value1)                                    \
+  do {                                                                         \
+    static constexpr char Label[] = "(" #Dst ", " #Src ", " #Value1 ")";       \
+    const uint32_t SrcSlot = allocateDqword();                                 \
+    const Dqword Source Value1;                                                \
+    const uint32_t DstSlot = allocateDqword();                                 \
+    const Dqword Cleared(uint64_t(0), uint64_t(0));                            \
+    /* Expected: I32[0] and I32[1] of Source, each converted to 64 bits. */    \
+    const Dqword Widened(uint64_t(Source.I32[0]), uint64_t(Source.I32[1]));    \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(SrcSlot));          \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(DstSlot));          \
+    __ pmovsxdq(XmmRegister::Encoded_Reg_##Dst,                                \
+                XmmRegister::Encoded_Reg_##Src);                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(SrcSlot, Source);                                         \
+    test.setDqwordTo(DstSlot, Cleared);                                        \
+    test.run();                                                                \
+    ASSERT_EQ(Widened, test.Dst<Dqword>()) << Label;                           \
+    reset();                                                                   \
+  } while (0)
+
+#define CheckPmovsxdq(Dst, Src)                                                \
+  do {                                                                         \
+    CheckPmovsxdqCase(Dst, Src, (uint64_t(0x700000007FFFFFFFull),              \
+                                 uint64_t(0xAAAAAAAAEEEEEEEEull)));            \
+    CheckPmovsxdqCase(Dst, Src, (uint64_t(0x800000007FFFFFFFull),              \
+                                 uint64_t(0xAAAAAAAAEEEEEEEEull)));            \
+    CheckPmovsxdqCase(Dst, Src, (uint64_t(0x70000000FFFFFFFFull),              \
+                                 uint64_t(0xAAAAAAAAEEEEEEEEull)));            \
+    CheckPmovsxdqCase(Dst, Src, (uint64_t(0x80000000FFFFFFFFull),              \
+                                 uint64_t(0xAAAAAAAAEEEEEEEEull)));            \
+  } while (0)
+
+  CheckPmovsxdq(xmm0, xmm1);
+  CheckPmovsxdq(xmm1, xmm2);
+  CheckPmovsxdq(xmm2, xmm3);
+  CheckPmovsxdq(xmm3, xmm4);
+  CheckPmovsxdq(xmm4, xmm5);
+  CheckPmovsxdq(xmm5, xmm6);
+  CheckPmovsxdq(xmm6, xmm7);
+  CheckPmovsxdq(xmm7, xmm0);
+
+#undef CheckPmovsxdq
+#undef CheckPmovsxdqCase
+}
+
+TEST_F(AssemblerX8632Test, CmovRegReg) {
+  // Each case loads Value0 into Src0 and Dest and Value1 into Src1, compares
+  // Src0 against Src1, and then conditionally moves Src1 into Dest. Dest must
+  // end up as Value1 when IsTrue is set and stay Value0 otherwise. Every
+  // condition code is exercised twice, once taken and once not taken; the
+  // cases are grouped in complementary pairs (o/no, b/ae, ..., le/g).
+#define TestCmovRegReg(C, Src0, Value0, Src1, Value1, Dest, IsTrue)            \
+  do {                                                                         \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Src0, Immediate(Value0));   \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Src1, Immediate(Value1));   \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Dest, Immediate(Value0));   \
+    __ cmp(IceType_i32, GPRRegister::Encoded_Reg_##Src0,                       \
+           GPRRegister::Encoded_Reg_##Src1);                                   \
+    __ cmov(IceType_i32, Cond::Br_##C, GPRRegister::Encoded_Reg_##Dest,        \
+            GPRRegister::Encoded_Reg_##Src1);                                  \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    ASSERT_EQ((IsTrue) ? (Value1) : (Value0), test.Dest())                     \
+        << "(" #C ", " #Src0 ", " #Value0 ", " #Src1 ", " #Value1 ", " #Dest   \
+           ", " #IsTrue ")";                                                   \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+  TestCmovRegReg(o, eax, 0x80000000u, ebx, 0x1u, ecx, 1u);
+  TestCmovRegReg(o, eax, 0x1u, ebx, 0x10000000u, ecx, 0u);
+  TestCmovRegReg(no, ebx, 0x1u, ecx, 0x10000000u, edx, 1u);
+  TestCmovRegReg(no, ebx, 0x80000000u, ecx, 0x1u, edx, 0u);
+
+  TestCmovRegReg(b, ecx, 0x1, edx, 0x80000000u, eax, 1u);
+  TestCmovRegReg(b, ecx, 0x80000000u, edx, 0x1u, eax, 0u);
+  TestCmovRegReg(ae, edx, 0x80000000u, edi, 0x1u, ebx, 1u);
+  TestCmovRegReg(ae, edx, 0x1u, edi, 0x80000000u, ebx, 0u);
+
+  TestCmovRegReg(e, edi, 0x1u, esi, 0x1u, ecx, 1u);
+  TestCmovRegReg(e, edi, 0x1u, esi, 0x11111u, ecx, 0u);
+  TestCmovRegReg(ne, esi, 0x80000000u, eax, 0x1u, edx, 1u);
+  TestCmovRegReg(ne, esi, 0x1u, eax, 0x1u, edx, 0u);
+
+  TestCmovRegReg(be, eax, 0x1u, ebx, 0x80000000u, eax, 1u);
+  TestCmovRegReg(be, eax, 0x80000000u, ebx, 0x1u, eax, 0u);
+  TestCmovRegReg(a, ebx, 0x80000000u, ecx, 0x1u, ebx, 1u);
+  TestCmovRegReg(a, ebx, 0x1u, ecx, 0x80000000u, ebx, 0u);
+
+  TestCmovRegReg(s, ecx, 0x1u, edx, 0x80000000u, ecx, 1u);
+  TestCmovRegReg(s, ecx, 0x80000000u, edx, 0x1u, ecx, 0u);
+  TestCmovRegReg(ns, edx, 0x80000000u, edi, 0x1u, ecx, 1u);
+  TestCmovRegReg(ns, edx, 0x1u, edi, 0x80000000u, ecx, 0u);
+
+  TestCmovRegReg(p, edi, 0x80000000u, esi, 0x1u, edx, 1u);
+  TestCmovRegReg(p, edi, 0x1u, esi, 0x80000000u, edx, 0u);
+  TestCmovRegReg(np, esi, 0x1u, edi, 0x80000000u, eax, 1u);
+  TestCmovRegReg(np, esi, 0x80000000u, edi, 0x1u, eax, 0u);
+
+  TestCmovRegReg(l, edi, 0x80000000u, eax, 0x1u, ebx, 1u);
+  TestCmovRegReg(l, edi, 0x1u, eax, 0x80000000u, ebx, 0u);
+  TestCmovRegReg(ge, eax, 0x1u, ebx, 0x80000000u, ecx, 1u);
+  TestCmovRegReg(ge, eax, 0x80000000u, ebx, 0x1u, ecx, 0u);
+
+  TestCmovRegReg(le, ebx, 0x80000000u, ecx, 0x1u, edx, 1u);
+  TestCmovRegReg(le, ebx, 0x1u, ecx, 0x80000000u, edx, 0u);
+  TestCmovRegReg(g, ecx, 0x1u, edx, 0x80000000u, eax, 1u);
+  TestCmovRegReg(g, ecx, 0x80000000u, edx, 0x1u, eax, 0u);
+
+#undef TestCmovRegReg
+}
+
+TEST_F(AssemblerX8632Test, CmovRegAddr) {
+  // Memory-operand variant: Value1 lives in a dword that serves both as the
+  // right-hand side of the cmp and as the cmov source, while Src0 and Dest
+  // start as Value0. Dest must end up as Value1 when IsTrue is set and stay
+  // Value0 otherwise. Every condition code is exercised twice, once taken and
+  // once not taken, grouped in complementary pairs (o/no, b/ae, ..., le/g).
+#define TestCmovRegAddr(C, Src0, Value0, Value1, Dest, IsTrue)                 \
+  do {                                                                         \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = Value1;                                                \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Src0, Immediate(Value0));   \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Dest, Immediate(Value0));   \
+    __ cmp(IceType_i32, GPRRegister::Encoded_Reg_##Src0, dwordAddress(T0));    \
+    __ cmov(IceType_i32, Cond::Br_##C, GPRRegister::Encoded_Reg_##Dest,        \
+            dwordAddress(T0));                                                 \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+    ASSERT_EQ((IsTrue) ? (Value1) : (Value0), test.Dest())                     \
+        << "(" #C ", " #Src0 ", " #Value0 ", " #Value1 ", " #Dest ", " #IsTrue \
+           ")";                                                                \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+  TestCmovRegAddr(o, eax, 0x80000000u, 0x1u, ecx, 1u);
+  TestCmovRegAddr(o, eax, 0x1u, 0x10000000u, ecx, 0u);
+  TestCmovRegAddr(no, ebx, 0x1u, 0x10000000u, edx, 1u);
+  TestCmovRegAddr(no, ebx, 0x80000000u, 0x1u, edx, 0u);
+
+  TestCmovRegAddr(b, ecx, 0x1, 0x80000000u, eax, 1u);
+  TestCmovRegAddr(b, ecx, 0x80000000u, 0x1u, eax, 0u);
+  TestCmovRegAddr(ae, edx, 0x80000000u, 0x1u, ebx, 1u);
+  TestCmovRegAddr(ae, edx, 0x1u, 0x80000000u, ebx, 0u);
+
+  TestCmovRegAddr(e, edi, 0x1u, 0x1u, ecx, 1u);
+  TestCmovRegAddr(e, edi, 0x1u, 0x11111u, ecx, 0u);
+  TestCmovRegAddr(ne, esi, 0x80000000u, 0x1u, edx, 1u);
+  TestCmovRegAddr(ne, esi, 0x1u, 0x1u, edx, 0u);
+
+  TestCmovRegAddr(be, eax, 0x1u, 0x80000000u, eax, 1u);
+  TestCmovRegAddr(be, eax, 0x80000000u, 0x1u, eax, 0u);
+  TestCmovRegAddr(a, ebx, 0x80000000u, 0x1u, ebx, 1u);
+  TestCmovRegAddr(a, ebx, 0x1u, 0x80000000u, ebx, 0u);
+
+  TestCmovRegAddr(s, ecx, 0x1u, 0x80000000u, ecx, 1u);
+  TestCmovRegAddr(s, ecx, 0x80000000u, 0x1u, ecx, 0u);
+  TestCmovRegAddr(ns, edx, 0x80000000u, 0x1u, ecx, 1u);
+  TestCmovRegAddr(ns, edx, 0x1u, 0x80000000u, ecx, 0u);
+
+  TestCmovRegAddr(p, edi, 0x80000000u, 0x1u, edx, 1u);
+  TestCmovRegAddr(p, edi, 0x1u, 0x80000000u, edx, 0u);
+  TestCmovRegAddr(np, esi, 0x1u, 0x80000000u, eax, 1u);
+  TestCmovRegAddr(np, esi, 0x80000000u, 0x1u, eax, 0u);
+
+  TestCmovRegAddr(l, edi, 0x80000000u, 0x1u, ebx, 1u);
+  TestCmovRegAddr(l, edi, 0x1u, 0x80000000u, ebx, 0u);
+  TestCmovRegAddr(ge, eax, 0x1u, 0x80000000u, ecx, 1u);
+  TestCmovRegAddr(ge, eax, 0x80000000u, 0x1u, ecx, 0u);
+
+  TestCmovRegAddr(le, ebx, 0x80000000u, 0x1u, edx, 1u);
+  TestCmovRegAddr(le, ebx, 0x1u, 0x80000000u, edx, 0u);
+  TestCmovRegAddr(g, ecx, 0x1u, 0x80000000u, eax, 1u);
+  TestCmovRegAddr(g, ecx, 0x80000000u, 0x1u, eax, 0u);
+
+#undef TestCmovRegAddr
+}
+
+} // end of anonymous namespace
+} // end of namespace Test
+} // end of namespace X8632
+} // end of namespace Ice
diff --git a/unittest/AssemblerX8632/GPRArith.cpp b/unittest/AssemblerX8632/GPRArith.cpp
new file mode 100644
index 0000000..6ca27ac
--- /dev/null
+++ b/unittest/AssemblerX8632/GPRArith.cpp
@@ -0,0 +1,1884 @@
+//===- subzero/unittest/AssemblerX8632/GPRArith.cpp -----------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AssemblerX8632/TestUtil.h"
+
+namespace Ice {
+namespace X8632 {
+namespace Test {
+namespace {
+
+TEST_F(AssemblerX8632LowLevelTest, PushalPopal) {
+  // pushal/popal are invalid in x86-64, so they cannot be executed by a test;
+  // only the bytes emitted for them are checked.
+  constexpr uint8_t PushalOpcode = 0x60;
+  constexpr uint8_t PopalOpcode = 0x61;
+  constexpr size_t ByteCount = 2;
+
+  __ pushal();
+  __ popal();
+
+  ASSERT_EQ(ByteCount, codeBytesSize());
+
+  verifyBytes<ByteCount>(codeBytes(), PushalOpcode, PopalOpcode);
+}
+
+TEST_F(AssemblerX8632Test, PopAddr) {
+  // V0 seeds the dword so the test can tell that popl really overwrote it.
+  constexpr uint32_t V0 = 0xEFAB;
+  const uint32_t T0 = allocateDword();
+  const auto Eax = GPRRegister::Encoded_Reg_eax;
+
+  __ mov(IceType_i32, Eax, Immediate(0xC0FFEE));
+  __ pushl(Eax);
+  __ popl(dwordAddress(T0));
+
+  AssembledTest test = assemble();
+  test.setDwordTo(T0, V0);
+  test.run();
+  ASSERT_EQ(0xC0FFEEul, test.contentsOfDword(T0));
+}
+
+TEST_F(AssemblerX8632Test, SetCC) {
+#define TestSetCC(C, Src0, Value0, Src1, Value1, Dest, IsTrue)                 \
+  do {                                                                         \
+    const uint32_t T0 = allocateDword();                                       \
+    constexpr uint32_t V0 = 0xF00F00;                                          \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Src0, Immediate(Value0));   \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Src1, Immediate(Value1));   \
+    __ cmp(IceType_i32, GPRRegister::Encoded_Reg_##Src0,                       \
+           GPRRegister::Encoded_Reg_##Src1);                                   \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Dest, Immediate(0));        \
+    __ setcc(Cond::Br_##C,                                                     \
+             RegX8632::getEncodedByteReg(GPRRegister::Encoded_Reg_##Dest));    \
+    __ setcc(Cond::Br_##C, dwordAddress(T0));                                  \
+    /* T0 is seeded with V0 so the bytes setcc leaves alone are known. */      \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+                                                                               \
+    test.run();                                                                \
+    /* Register form and memory form must both agree with IsTrue. */           \
+    EXPECT_EQ(IsTrue, test.Dest())                                             \
+        << "(" #C ", " #Src0 ", " #Value0 ", " #Src1 ", " #Value1 ", " #Dest   \
+           ", " #IsTrue ")";                                                   \
+    EXPECT_EQ((0xF00F00 | IsTrue), test.contentsOfDword(T0))                   \
+        << "(" #C ", " #Src0 ", " #Value0 ", " #Src1 ", " #Value1 ", " #Dest   \
+           ", " #IsTrue ")";                                                   \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+  // Each condition below is run once expecting 1 and once expecting 0.
+  TestSetCC(o, eax, 0x80000000u, ebx, 0x1u, ecx, 1u);
+  TestSetCC(o, eax, 0x1u, ebx, 0x10000000u, ecx, 0u);
+
+  TestSetCC(no, ebx, 0x1u, ecx, 0x10000000u, edx, 1u);
+  TestSetCC(no, ebx, 0x80000000u, ecx, 0x1u, edx, 0u);
+
+  TestSetCC(b, ecx, 0x1, edx, 0x80000000u, eax, 1u);
+  TestSetCC(b, ecx, 0x80000000u, edx, 0x1u, eax, 0u);
+
+  TestSetCC(ae, edx, 0x80000000u, edi, 0x1u, ebx, 1u);
+  TestSetCC(ae, edx, 0x1u, edi, 0x80000000u, ebx, 0u);
+
+  TestSetCC(e, edi, 0x1u, esi, 0x1u, ecx, 1u);
+  TestSetCC(e, edi, 0x1u, esi, 0x11111u, ecx, 0u);
+
+  TestSetCC(ne, esi, 0x80000000u, eax, 0x1u, edx, 1u);
+  TestSetCC(ne, esi, 0x1u, eax, 0x1u, edx, 0u);
+
+  TestSetCC(be, eax, 0x1u, ebx, 0x80000000u, eax, 1u);
+  TestSetCC(be, eax, 0x80000000u, ebx, 0x1u, eax, 0u);
+
+  TestSetCC(a, ebx, 0x80000000u, ecx, 0x1u, ebx, 1u);
+  TestSetCC(a, ebx, 0x1u, ecx, 0x80000000u, ebx, 0u);
+
+  TestSetCC(s, ecx, 0x1u, edx, 0x80000000u, ecx, 1u);
+  TestSetCC(s, ecx, 0x80000000u, edx, 0x1u, ecx, 0u);
+
+  TestSetCC(ns, edx, 0x80000000u, edi, 0x1u, ecx, 1u);
+  TestSetCC(ns, edx, 0x1u, edi, 0x80000000u, ecx, 0u);
+
+  TestSetCC(p, edi, 0x80000000u, esi, 0x1u, edx, 1u);
+  TestSetCC(p, edi, 0x1u, esi, 0x80000000u, edx, 0u);
+
+  TestSetCC(np, esi, 0x1u, edi, 0x80000000u, eax, 1u);
+  TestSetCC(np, esi, 0x80000000u, edi, 0x1u, eax, 0u);
+
+  TestSetCC(l, edi, 0x80000000u, eax, 0x1u, ebx, 1u);
+  TestSetCC(l, edi, 0x1u, eax, 0x80000000u, ebx, 0u);
+
+  TestSetCC(ge, eax, 0x1u, ebx, 0x80000000u, ecx, 1u);
+  TestSetCC(ge, eax, 0x80000000u, ebx, 0x1u, ecx, 0u);
+
+  TestSetCC(le, ebx, 0x80000000u, ecx, 0x1u, edx, 1u);
+  TestSetCC(le, ebx, 0x1u, ecx, 0x80000000u, edx, 0u);
+
+#undef TestSetCC
+}
+
+TEST_F(AssemblerX8632Test, Lea) {
+#define TestLeaBaseDisp(Base, BaseValue, Disp, Dst)                            \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Base ", " #BaseValue ", " #Dst ")";                               \
+    if (GPRRegister::Encoded_Reg_##Base != GPRRegister::Encoded_Reg_esp &&     \
+        GPRRegister::Encoded_Reg_##Base != GPRRegister::Encoded_Reg_ebp) {     \
+      __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Base,                     \
+             Immediate(BaseValue));                                            \
+    }                                                                          \
+    __ lea(IceType_i32, GPRRegister::Encoded_Reg_##Dst,                        \
+           Address(GPRRegister::Encoded_Reg_##Base, Disp));                    \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    ASSERT_EQ(test.Base() + (Disp), test.Dst()) << TestString << " with Disp " \
+                                                << Disp;                       \
+    reset();                                                                   \
+  } while (0)
+  // Index * scale + Disp; expected values assume Traits::TIMES_n == log2(n).
+#define TestLeaIndex32bitDisp(Index, IndexValue, Disp, Dst0, Dst1, Dst2, Dst3) \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Index ", " #IndexValue ", " #Dst0 ", " #Dst1 ", " #Dst2           \
+        ", " #Dst3 ")";                                                        \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Index,                      \
+           Immediate(IndexValue));                                             \
+    __ lea(IceType_i32, GPRRegister::Encoded_Reg_##Dst0,                       \
+           Address(GPRRegister::Encoded_Reg_##Index, Traits::TIMES_1, Disp));  \
+    __ lea(IceType_i32, GPRRegister::Encoded_Reg_##Dst1,                       \
+           Address(GPRRegister::Encoded_Reg_##Index, Traits::TIMES_2, Disp));  \
+    __ lea(IceType_i32, GPRRegister::Encoded_Reg_##Dst2,                       \
+           Address(GPRRegister::Encoded_Reg_##Index, Traits::TIMES_4, Disp));  \
+    __ lea(IceType_i32, GPRRegister::Encoded_Reg_##Dst3,                       \
+           Address(GPRRegister::Encoded_Reg_##Index, Traits::TIMES_8, Disp));  \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    ASSERT_EQ((test.Index() << Traits::TIMES_1) + (Disp), test.Dst0())         \
+        << TestString << " " << Disp;                                          \
+    ASSERT_EQ((test.Index() << Traits::TIMES_2) + (Disp), test.Dst1())         \
+        << TestString << " " << Disp;                                          \
+    ASSERT_EQ((test.Index() << Traits::TIMES_4) + (Disp), test.Dst2())         \
+        << TestString << " " << Disp;                                          \
+    ASSERT_EQ((test.Index() << Traits::TIMES_8) + (Disp), test.Dst3())         \
+        << TestString << " " << Disp;                                          \
+    reset();                                                                   \
+  } while (0)
+  // Base + Index * scale + Disp. esp/ebp Base and ebp Index are left as-is.
+#define TestLeaBaseIndexDisp(Base, BaseValue, Index, IndexValue, Disp, Dst0,   \
+                             Dst1, Dst2, Dst3)                                 \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Base ", " #BaseValue ", " #Index ", " #IndexValue ", " #Dst0      \
+        ", " #Dst1 ", " #Dst2 ", " #Dst3 ")";                                  \
+    if (GPRRegister::Encoded_Reg_##Base != GPRRegister::Encoded_Reg_esp &&     \
+        GPRRegister::Encoded_Reg_##Base != GPRRegister::Encoded_Reg_ebp) {     \
+      __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Base,                     \
+             Immediate(BaseValue));                                            \
+    }                                                                          \
+    /* An ebp index is not loaded; its current value is used instead. */       \
+    if (GPRRegister::Encoded_Reg_##Index != GPRRegister::Encoded_Reg_ebp) {    \
+      __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Index,                    \
+             Immediate(IndexValue));                                           \
+    }                                                                          \
+    __ lea(IceType_i32, GPRRegister::Encoded_Reg_##Dst0,                       \
+           Address(GPRRegister::Encoded_Reg_##Base,                            \
+                   GPRRegister::Encoded_Reg_##Index, Traits::TIMES_1, Disp));  \
+    __ lea(IceType_i32, GPRRegister::Encoded_Reg_##Dst1,                       \
+           Address(GPRRegister::Encoded_Reg_##Base,                            \
+                   GPRRegister::Encoded_Reg_##Index, Traits::TIMES_2, Disp));  \
+    __ lea(IceType_i32, GPRRegister::Encoded_Reg_##Dst2,                       \
+           Address(GPRRegister::Encoded_Reg_##Base,                            \
+                   GPRRegister::Encoded_Reg_##Index, Traits::TIMES_4, Disp));  \
+    __ lea(IceType_i32, GPRRegister::Encoded_Reg_##Dst3,                       \
+           Address(GPRRegister::Encoded_Reg_##Base,                            \
+                   GPRRegister::Encoded_Reg_##Index, Traits::TIMES_8, Disp));  \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    uint32_t ExpectedIndexValue = test.Index();                                \
+    if (GPRRegister::Encoded_Reg_##Index == GPRRegister::Encoded_Reg_esp) {    \
+      ExpectedIndexValue = 0;                                                  \
+    }                                                                          \
+    ASSERT_EQ(test.Base() + (ExpectedIndexValue << Traits::TIMES_1) + (Disp),  \
+              test.Dst0())                                                     \
+        << TestString << " " << Disp;                                          \
+    ASSERT_EQ(test.Base() + (ExpectedIndexValue << Traits::TIMES_2) + (Disp),  \
+              test.Dst1())                                                     \
+        << TestString << " " << Disp;                                          \
+    ASSERT_EQ(test.Base() + (ExpectedIndexValue << Traits::TIMES_4) + (Disp),  \
+              test.Dst2())                                                     \
+        << TestString << " " << Disp;                                          \
+    ASSERT_EQ(test.Base() + (ExpectedIndexValue << Traits::TIMES_8) + (Disp),  \
+              test.Dst3())                                                     \
+        << TestString << " " << Disp;                                          \
+    reset();                                                                   \
+  } while (0)
+
+  for (const int32_t Disp :
+       {0x00, 0x06, -0x06, 0x0600, -0x6000, 0x6000000, -0x6000000}) {
+    TestLeaBaseDisp(eax, 0x10000Fu, Disp, ebx);
+    TestLeaBaseDisp(ebx, 0x20000Fu, Disp, ecx);
+    TestLeaBaseDisp(ecx, 0x30000Fu, Disp, edx);
+    TestLeaBaseDisp(edx, 0x40000Fu, Disp, esi);
+    TestLeaBaseDisp(esi, 0x50000Fu, Disp, edi);
+    TestLeaBaseDisp(edi, 0x60000Fu, Disp, eax);
+    TestLeaBaseDisp(esp, 0x11000Fu, Disp, eax);
+    TestLeaBaseDisp(ebp, 0x22000Fu, Disp, ecx);
+  }
+
+  // esp is not a valid index register.
+  // ebp is not valid in this addressing mode (rm = 0).
+  for (const int32_t Disp :
+       {0x00, 0x06, -0x06, 0x0600, -0x6000, 0x6000000, -0x6000000}) {
+    TestLeaIndex32bitDisp(eax, 0x2000u, Disp, ebx, ecx, edx, esi);
+    TestLeaIndex32bitDisp(ebx, 0x4000u, Disp, ecx, edx, esi, edi);
+    TestLeaIndex32bitDisp(ecx, 0x6000u, Disp, edx, esi, edi, eax);
+    TestLeaIndex32bitDisp(edx, 0x8000u, Disp, esi, edi, eax, ebx);
+    TestLeaIndex32bitDisp(esi, 0xA000u, Disp, edi, eax, ebx, ecx);
+    TestLeaIndex32bitDisp(edi, 0xC000u, Disp, eax, ebx, ecx, edx);
+  }
+
+  for (const int32_t Disp :
+       {0x00, 0x06, -0x06, 0x0600, -0x6000, 0x6000000, -0x6000000}) {
+    TestLeaBaseIndexDisp(eax, 0x100000u, ebx, 0x600u, Disp, ecx, edx, esi, edi);
+    TestLeaBaseIndexDisp(ebx, 0x200000u, ecx, 0x500u, Disp, edx, esi, edi, eax);
+    TestLeaBaseIndexDisp(ecx, 0x300000u, edx, 0x400u, Disp, esi, edi, eax, ebx);
+    TestLeaBaseIndexDisp(edx, 0x400000u, esi, 0x300u, Disp, edi, eax, ebx, ecx);
+    TestLeaBaseIndexDisp(esi, 0x500000u, edi, 0x200u, Disp, eax, ebx, ecx, edx);
+    TestLeaBaseIndexDisp(edi, 0x600000u, eax, 0x100u, Disp, ebx, ecx, edx, esi);
+
+    /* Initial values are not loaded into an esp/ebp Base or an ebp Index. */
+    TestLeaBaseIndexDisp(esp, 0, ebx, 0x6000u, Disp, ecx, edx, esi, edi);
+    TestLeaBaseIndexDisp(esp, 0, ecx, 0x5000u, Disp, edx, esi, edi, eax);
+    TestLeaBaseIndexDisp(esp, 0, edx, 0x4000u, Disp, esi, edi, eax, ebx);
+    TestLeaBaseIndexDisp(esp, 0, esi, 0x3000u, Disp, edi, eax, ebx, ecx);
+    TestLeaBaseIndexDisp(esp, 0, edi, 0x2000u, Disp, eax, ebx, ecx, edx);
+    TestLeaBaseIndexDisp(esp, 0, eax, 0x1000u, Disp, ebx, ecx, edx, esi);
+
+    TestLeaBaseIndexDisp(ebp, 0, ebx, 0x6000u, Disp, ecx, edx, esi, edi);
+    TestLeaBaseIndexDisp(ebp, 0, ecx, 0x5000u, Disp, edx, esi, edi, eax);
+    TestLeaBaseIndexDisp(ebp, 0, edx, 0x4000u, Disp, esi, edi, eax, ebx);
+    TestLeaBaseIndexDisp(ebp, 0, esi, 0x3000u, Disp, edi, eax, ebx, ecx);
+    TestLeaBaseIndexDisp(ebp, 0, edi, 0x2000u, Disp, eax, ebx, ecx, edx);
+    TestLeaBaseIndexDisp(ebp, 0, eax, 0x1000u, Disp, ebx, ecx, edx, esi);
+
+    TestLeaBaseIndexDisp(eax, 0x1000000u, ebp, 0, Disp, ecx, edx, esi, edi);
+    TestLeaBaseIndexDisp(ebx, 0x2000000u, ebp, 0, Disp, edx, esi, edi, eax);
+    TestLeaBaseIndexDisp(ecx, 0x3000000u, ebp, 0, Disp, esi, edi, eax, ebx);
+    TestLeaBaseIndexDisp(edx, 0x4000000u, ebp, 0, Disp, edi, eax, ebx, ecx);
+    TestLeaBaseIndexDisp(esi, 0x5000000u, ebp, 0, Disp, eax, ebx, ecx, edx);
+    TestLeaBaseIndexDisp(edi, 0x6000000u, ebp, 0, Disp, ebx, ecx, edx, esi);
+
+    TestLeaBaseIndexDisp(esp, 0, ebp, 0, Disp, ebx, ecx, edx, esi);
+  }
+
+// Absolute addressing mode is tested in the Low Level tests. The encoding used
+// by the assembler has different meanings in x86-32 and x86-64.
+#undef TestLeaBaseIndexDisp
+#undef TestLeaIndex32bitDisp
+#undef TestLeaBaseDisp
+}
+
+TEST_F(AssemblerX8632LowLevelTest, LeaAbsolute) {
+#define TestLeaAbsolute(Dst, Value)                                            \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Value ")";             \
+    __ lea(IceType_i32, GPRRegister::Encoded_Reg_##Dst,                        \
+           Address(Address::ABSOLUTE, Value));                                 \
+    static constexpr uint32_t ByteCount = 6;                                   \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    static constexpr uint8_t Opcode = 0x8D;                                    \
+    static constexpr uint8_t ModRM =                                           \
+        /*mod=*/0x00 | /*reg*/ (GPRRegister::Encoded_Reg_##Dst << 3) |         \
+        /*rm*/ GPRRegister::Encoded_Reg_ebp;                                   \
+    verifyBytes<ByteCount>(codeBytes(), Opcode, ModRM, (Value)&0xFF,           \
+                           (Value >> 8) & 0xFF, (Value >> 16) & 0xFF,          \
+                           (Value >> 24) & 0xFF);                              \
+    reset();                                                                   \
+  } while (0)
+  // mod=00 with rm=ebp selects the disp32-only (absolute) form being checked.
+  TestLeaAbsolute(eax, 0x11BEEF22);
+  TestLeaAbsolute(ebx, 0x33BEEF44);
+  TestLeaAbsolute(ecx, 0x55BEEF66);
+  TestLeaAbsolute(edx, 0x77BEEF88);
+  TestLeaAbsolute(esi, 0x99BEEFAA);
+  TestLeaAbsolute(edi, 0xBBBEEFBB);
+
+#undef TestLeaAbsolute
+}
+
+TEST_F(AssemblerX8632Test, Test) {
+  static constexpr uint32_t Mask8 = 0xFF;
+  static constexpr uint32_t Mask16 = 0xFFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+
+#define TestImplRegReg(Dst, Value0, Src, Value1, Size)                         \
+  do {                                                                         \
+    static constexpr bool NearJump = true;                                     \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Size ")";           \
+    static constexpr uint32_t ValueIfTrue = 0xBEEFFEEB;                        \
+    static constexpr uint32_t ValueIfFalse = 0x11111111;                       \
+    /* Dst ends as ValueIfTrue only when the test result is non-zero. */       \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                    \
+           Immediate(Value0));                                                 \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Src,                    \
+           Immediate(Value1));                                                 \
+    __ test(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                   \
+            GPRRegister::Encoded_Reg_##Src);                                   \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Dst,                        \
+           Immediate(ValueIfFalse));                                           \
+    Label Done;                                                                \
+    __ j(Cond::Br_e, &Done, NearJump);                                         \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Dst,                        \
+           Immediate(ValueIfTrue));                                            \
+    __ bind(&Done);                                                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(((Value0)&Mask##Size) & ((Value1)&Mask##Size) ? ValueIfTrue      \
+                                                            : ValueIfFalse,    \
+              test.Dst())                                                      \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplRegImm(Dst, Value0, Imm, Size)                                 \
+  do {                                                                         \
+    static constexpr bool NearJump = true;                                     \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Imm ", " #Size ")";                        \
+    static constexpr uint32_t ValueIfTrue = 0xBEEFFEEB;                        \
+    static constexpr uint32_t ValueIfFalse = 0x11111111;                       \
+    /* Dst ends as ValueIfTrue only when the test result is non-zero. */       \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                    \
+           Immediate(Value0));                                                 \
+    __ test(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                   \
+            Immediate((Imm)&Mask##Size));                                      \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Dst,                        \
+           Immediate(ValueIfFalse));                                           \
+    Label Done;                                                                \
+    __ j(Cond::Br_e, &Done, NearJump);                                         \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Dst,                        \
+           Immediate(ValueIfTrue));                                            \
+    __ bind(&Done);                                                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(((Value0)&Mask##Size) & ((Imm)&Mask##Size) ? ValueIfTrue         \
+                                                         : ValueIfFalse,       \
+              test.Dst())                                                      \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplAddrReg(Value0, Src, Value1, Size)                             \
+  do {                                                                         \
+    static constexpr bool NearJump = true;                                     \
+    static constexpr char TestString[] =                                       \
+        "(Addr, " #Value0 ", " #Src ", " #Value1 ", " #Size ")";               \
+    static constexpr uint32_t ValueIfTrue = 0xBEEFFEEB;                        \
+    static constexpr uint32_t ValueIfFalse = 0x11111111;                       \
+    const uint32_t T0 = allocateDword();                                       \
+    /* T0 ends as ValueIfTrue only when the test result is non-zero. */        \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Src,                    \
+           Immediate(Value1));                                                 \
+    __ test(IceType_i##Size, dwordAddress(T0),                                 \
+            GPRRegister::Encoded_Reg_##Src);                                   \
+    __ mov(IceType_i32, dwordAddress(T0), Immediate(ValueIfFalse));            \
+    Label Done;                                                                \
+    __ j(Cond::Br_e, &Done, NearJump);                                         \
+    __ mov(IceType_i32, dwordAddress(T0), Immediate(ValueIfTrue));             \
+    __ bind(&Done);                                                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, uint32_t(Value0));                                     \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(((Value0)&Mask##Size) & ((Value1)&Mask##Size) ? ValueIfTrue      \
+                                                            : ValueIfFalse,    \
+              test.contentsOfDword(T0))                                        \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplAddrImm(Value0, Value1, Size)                                  \
+  do {                                                                         \
+    static constexpr bool NearJump = true;                                     \
+    static constexpr char TestString[] =                                       \
+        "(Addr, " #Value0 ", " #Value1 ", " #Size ")";                         \
+    static constexpr uint32_t ValueIfTrue = 0xBEEFFEEB;                        \
+    static constexpr uint32_t ValueIfFalse = 0x11111111;                       \
+    const uint32_t T0 = allocateDword();                                       \
+    /* T0 ends as ValueIfTrue only when the test result is non-zero. */        \
+    __ test(IceType_i##Size, dwordAddress(T0),                                 \
+            Immediate((Value1)&Mask##Size));                                   \
+    __ mov(IceType_i32, dwordAddress(T0), Immediate(ValueIfFalse));            \
+    Label Done;                                                                \
+    __ j(Cond::Br_e, &Done, NearJump);                                         \
+    __ mov(IceType_i32, dwordAddress(T0), Immediate(ValueIfTrue));             \
+    __ bind(&Done);                                                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, uint32_t(Value0));                                     \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(((Value0)&Mask##Size) & ((Value1)&Mask##Size) ? ValueIfTrue      \
+                                                            : ValueIfFalse,    \
+              test.contentsOfDword(T0))                                        \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+  // Runs all four operand forms on the same (Value0, Value1) pair.
+#define TestImplValues(Dst, Value0, Src, Value1, Size)                         \
+  do {                                                                         \
+    TestImplRegReg(Dst, Value0, Src, Value1, Size);                            \
+    TestImplRegImm(Dst, Value0, Value1, Size);                                 \
+    TestImplAddrReg(Value0, Src, Value1, Size);                                \
+    TestImplAddrImm(Value0, Value1, Size);                                     \
+  } while (0)
+
+#define TestImplSize(Dst, Src, Size)                                           \
+  do {                                                                         \
+    TestImplValues(Dst, 0xF0F12101, Src, 0x00000000, Size);                    \
+    TestImplValues(Dst, 0xF0000000, Src, 0xF0000000, Size);                    \
+    TestImplValues(Dst, 0x0F00000F, Src, 0xF00000F0, Size);                    \
+  } while (0)
+  // Repeats the value pairs for 8-, 16- and 32-bit operand sizes.
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplSize(Dst, Src, 8);                                                 \
+    TestImplSize(Dst, Src, 16);                                                \
+    TestImplSize(Dst, Src, 32);                                                \
+  } while (0)
+
+  TestImpl(eax, ebx);
+  TestImpl(ebx, ecx);
+  TestImpl(ecx, edx);
+  TestImpl(edx, esi);
+  TestImpl(esi, edi);
+  TestImpl(edi, eax);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplValues
+#undef TestImplAddrImm
+#undef TestImplAddrReg
+#undef TestImplRegImm
+#undef TestImplRegReg
+}
+
+// No mull/div because x86.
+// No shift because x86.
+TEST_F(AssemblerX8632Test, Arith_most) {
+  static constexpr uint32_t Mask8 = 0xFF;
+  static constexpr uint32_t Mask16 = 0xFFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+
+#define TestImplRegReg(Inst, Dst, Value0, Src, Value1, Type, Size, Op)         \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Value0 ", " #Src ", " #Value1                \
+        ", " #Type #Size "_t, " #Op ")";                                       \
+                                                                               \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                    \
+           Immediate(Value0));                                                 \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Src,                    \
+           Immediate(Value1));                                                 \
+    __ Inst(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                   \
+            GPRRegister::Encoded_Reg_##Src);                                   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Mask##Size &static_cast<uint32_t>(                               \
+                  static_cast<Type##Size##_t>((Value0)&Mask##Size)             \
+                      Op static_cast<Type##Size##_t>((Value1)&Mask##Size)),    \
+              Mask##Size &test.Dst())                                          \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplRegAddr(Inst, Dst, Value0, Value1, Type, Size, Op)             \
+  do { /* Checks `Inst Dst, [mem]`: register destination, memory source. */    \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Value0 ", Addr, " #Value1 ", " #Type #Size   \
+        "_t, " #Op ")";                                                        \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = Value1;                                                \
+    /* Emitted code: Dst = Value0; [T0] = Value1; Inst Dst, [T0]. */           \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                    \
+           Immediate(Value0));                                                 \
+    __ mov(IceType_i##Size, dwordAddress(T0), Immediate(Value1));              \
+    __ Inst(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                   \
+            dwordAddress(T0));                                                 \
+    /* The dword slot is also preset to V0 (== Value1) before the run. */      \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+    /* Host-side reference: Value0 Op Value1, masked with Mask##Size. */       \
+    ASSERT_EQ(Mask##Size &static_cast<uint32_t>(                               \
+                  static_cast<Type##Size##_t>((Value0)&Mask##Size)             \
+                      Op static_cast<Type##Size##_t>((Value1)&Mask##Size)),    \
+              Mask##Size &test.Dst())                                          \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplRegImm(Inst, Dst, Value0, Imm, Type, Size, Op)                 \
+  do { /* Checks `Inst Dst, imm`: register destination, immediate source. */   \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Value0 ", Imm(" #Imm "), " #Type #Size       \
+        "_t, " #Op ")";                                                        \
+    /* Emitted code: Dst = Value0; Inst Dst, (Imm & Mask##Size). */            \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                    \
+           Immediate(Value0));                                                 \
+    __ Inst(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                   \
+            Immediate((Imm)&Mask##Size));                                      \
+    /* Assemble the emitted instructions and run them. */                      \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    /* Host-side reference: Value0 Op Imm, masked with Mask##Size. */          \
+    ASSERT_EQ(Mask##Size &static_cast<uint32_t>(                               \
+                  static_cast<Type##Size##_t>((Value0)&Mask##Size)             \
+                      Op static_cast<Type##Size##_t>((Imm)&Mask##Size)),       \
+              Mask##Size &test.Dst())                                          \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplAddrReg(Inst, Value0, Src, Value1, Type, Size, Op)             \
+  do { /* Checks `Inst [mem], Src`: memory destination, register source. */    \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", Addr, " #Value0 ", " #Src ", " #Value1 ", " #Type #Size   \
+        "_t, " #Op ")";                                                        \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = Value0;                                                \
+    /* Emitted code: Src = Value1; Inst [T0], Src. */                          \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Src,                    \
+           Immediate(Value1));                                                 \
+    __ Inst(IceType_i##Size, dwordAddress(T0),                                 \
+            GPRRegister::Encoded_Reg_##Src);                                   \
+    /* The dword slot is preset to V0 (== Value0) before the run. */           \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+    /* The result is read back from the slot, not from a register. */          \
+    ASSERT_EQ(Mask##Size &static_cast<uint32_t>(                               \
+                  static_cast<Type##Size##_t>((Value0)&Mask##Size)             \
+                      Op static_cast<Type##Size##_t>((Value1)&Mask##Size)),    \
+              Mask##Size &test.contentsOfDword(T0))                            \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplAddrImm(Inst, Value0, Imm, Type, Size, Op)                     \
+  do { /* Checks `Inst [mem], imm`: memory destination, immediate source. */   \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", Addr, " #Value0 ", Imm, " #Imm ", " #Type #Size           \
+        "_t, " #Op ")";                                                        \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = Value0;                                                \
+    /* Emitted code: Inst [T0], (Imm & Mask##Size). */                         \
+    __ Inst(IceType_i##Size, dwordAddress(T0), Immediate((Imm)&Mask##Size));   \
+    /* The dword slot is preset to V0 (== Value0) before the run. */           \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+    /* Host-side reference: Value0 Op Imm, masked with Mask##Size. */          \
+    ASSERT_EQ(Mask##Size &static_cast<uint32_t>(                               \
+                  static_cast<Type##Size##_t>((Value0)&Mask##Size)             \
+                      Op static_cast<Type##Size##_t>((Imm)&Mask##Size)),       \
+              Mask##Size &test.contentsOfDword(T0))                            \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplOp(Inst, Dst, Value0, Src, Value1, Type, Size, Op)             \
+  do { /* Runs Inst through all five operand forms with the same inputs. */    \
+    TestImplRegReg(Inst, Dst, Value0, Src, Value1, Type, Size, Op);            \
+    TestImplRegAddr(Inst, Dst, Value0, Value1, Type, Size, Op);                \
+    TestImplRegImm(Inst, Dst, Value0, Value1, Type, Size, Op);                 \
+    TestImplAddrReg(Inst, Value0, Src, Value1, Type, Size, Op);                \
+    TestImplAddrImm(Inst, Value0, Value1, Type, Size, Op);                     \
+  } while (0)
+
+#define TestImplValues(Dst, Value0, Src, Value1, Size)                         \
+  do { /* Pairs each instruction with its C++ operator, for int and uint. */   \
+    TestImplOp(And, Dst, Value0, Src, Value1, int, Size, &);                   \
+    TestImplOp(And, Dst, Value0, Src, Value1, uint, Size, &);                  \
+    TestImplOp(Or, Dst, Value0, Src, Value1, int, Size, | );                   \
+    TestImplOp(Or, Dst, Value0, Src, Value1, uint, Size, | );                  \
+    TestImplOp(Xor, Dst, Value0, Src, Value1, int, Size, ^);                   \
+    TestImplOp(Xor, Dst, Value0, Src, Value1, uint, Size, ^);                  \
+    TestImplOp(add, Dst, Value0, Src, Value1, int, Size, +);                   \
+    TestImplOp(add, Dst, Value0, Src, Value1, uint, Size, +);                  \
+    TestImplOp(sub, Dst, Value0, Src, Value1, int, Size, -);                   \
+    TestImplOp(sub, Dst, Value0, Src, Value1, uint, Size, -);                  \
+  } while (0)
+
+#define TestImplSize(Dst, Src, Size)                                           \
+  do { /* Runs every instruction over four fixed operand pairs. */             \
+    TestImplValues(Dst, 0xF0F12101, Src, 0x00000000, Size);                    \
+    TestImplValues(Dst, 0xF0000000, Src, 0xF0000000, Size);                    \
+    TestImplValues(Dst, 0x0F00000F, Src, 0xF0000070, Size);                    \
+    TestImplValues(Dst, 0x0F00F00F, Src, 0xF000F070, Size);                    \
+  } while (0)
+
+#define TestImpl(Dst, Src)                                                     \
+  do { /* 8-bit cases need both encodings <= 3; 16/32-bit always run. */       \
+    if (GPRRegister::Encoded_Reg_##Src <= 3 &&                                 \
+        GPRRegister::Encoded_Reg_##Dst <= 3) {                                 \
+      TestImplSize(Dst, Src, 8);                                               \
+    }                                                                          \
+    TestImplSize(Dst, Src, 16);                                                \
+    TestImplSize(Dst, Src, 32);                                                \
+  } while (0)
+
+  TestImpl(eax, ebx);
+  TestImpl(ebx, ecx);
+  TestImpl(ecx, edx);
+  TestImpl(edx, esi);
+  TestImpl(esi, edi);
+  TestImpl(edi, eax);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplValues
+#undef TestImplOp
+#undef TestImplAddrImm
+#undef TestImplAddrReg
+#undef TestImplRegImm
+#undef TestImplRegAddr
+#undef TestImplRegReg
+}
+
+TEST_F(AssemblerX8632Test, Arith_BorrowNCarry) {
+  const uint32_t Mask8 = 0x000000FF;
+  const uint32_t Mask16 = 0x0000FFFF;
+  const uint32_t Mask32 = 0xFFFFFFFF;
+  // ResultMaskN covers 2*N bits: the low and high halves of one operand.
+  const uint64_t ResultMask8 = 0x000000000000FFFFull;
+  const uint64_t ResultMask16 = 0x00000000FFFFFFFFull;
+  const uint64_t ResultMask32 = 0xFFFFFFFFFFFFFFFFull;
+  // Inst0 handles the low half; Inst1 (adc/sbb) must fold in its carry/borrow.
+#define TestImplRegReg(Inst0, Inst1, Dst0, Dst1, Value0, Src0, Src1, Value1,   \
+                       Op, Size)                                               \
+  do { /* Low/high halves live in Dst0/Dst1 and Src0/Src1 (registers). */      \
+    static_assert(Size == 8 || Size == 16 || Size == 32,                       \
+                  "Invalid size " #Size);                                      \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst0 ", " #Inst1 ", " #Dst0 ", " #Dst1 ", " #Value0 ", " #Src0   \
+        ", " #Src1 ", " #Value1 ", " #Op ", " #Size ")";                       \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst0,                   \
+           Immediate(uint64_t(Value0) & Mask##Size));                          \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst1,                   \
+           Immediate((uint64_t(Value0) >> Size) & Mask##Size));                \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Src0,                   \
+           Immediate(uint64_t(Value1) & Mask##Size));                          \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Src1,                   \
+           Immediate((uint64_t(Value1) >> Size) & Mask##Size));                \
+    __ Inst0(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst0,                 \
+             GPRRegister::Encoded_Reg_##Src0);                                 \
+    __ Inst1(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst1,                 \
+             GPRRegister::Encoded_Reg_##Src1);                                 \
+    /* Assemble the emitted instructions and run them. */                      \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    /* Reference: the same Op on the double-width values, split in two. */     \
+    static constexpr uint64_t Result =                                         \
+        (uint64_t(Value0) & ResultMask##Size)Op(uint64_t(Value1) &             \
+                                                ResultMask##Size);             \
+    static constexpr uint32_t Expected0 = Result & Mask##Size;                 \
+    static constexpr uint32_t Expected1 = (Result >> Size) & Mask##Size;       \
+    ASSERT_EQ(Expected0, test.Dst0()) << TestString << ": 0";                  \
+    ASSERT_EQ(Expected1, test.Dst1()) << TestString << ": 1";                  \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplRegAddr(Inst0, Inst1, Dst0, Dst1, Value0, Value1, Op, Size)    \
+  do { /* Second operand's halves are read from two dword slots, T0 and T1. */ \
+    static_assert(Size == 8 || Size == 16 || Size == 32,                       \
+                  "Invalid size " #Size);                                      \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst0 ", " #Inst1 ", " #Dst0 ", " #Dst1 ", " #Value0              \
+        ", Addr, " #Value1 ", " #Op ", " #Size ")";                            \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = uint64_t(Value1) & Mask##Size;                         \
+    const uint32_t T1 = allocateDword();                                       \
+    const uint32_t V1 = (uint64_t(Value1) >> Size) & Mask##Size;               \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst0,                   \
+           Immediate(uint64_t(Value0) & Mask##Size));                          \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst1,                   \
+           Immediate((uint64_t(Value0) >> Size) & Mask##Size));                \
+    __ Inst0(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst0,                 \
+             dwordAddress(T0));                                                \
+    __ Inst1(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst1,                 \
+             dwordAddress(T1));                                                \
+    /* Slots are filled with V0/V1 after assembling, before the run. */        \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.setDwordTo(T1, V1);                                                   \
+    test.run();                                                                \
+    /* Reference: the same Op on the double-width values, split in two. */     \
+    static constexpr uint64_t Result =                                         \
+        (uint64_t(Value0) & ResultMask##Size)Op(uint64_t(Value1) &             \
+                                                ResultMask##Size);             \
+    static constexpr uint32_t Expected0 = Result & Mask##Size;                 \
+    static constexpr uint32_t Expected1 = (Result >> Size) & Mask##Size;       \
+    ASSERT_EQ(Expected0, test.Dst0()) << TestString << ": 0";                  \
+    ASSERT_EQ(Expected1, test.Dst1()) << TestString << ": 1";                  \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplRegImm(Inst0, Inst1, Dst0, Dst1, Value0, Imm, Op, Size)        \
+  do { /* Second operand's halves are passed as two immediates. */             \
+    static_assert(Size == 8 || Size == 16 || Size == 32,                       \
+                  "Invalid size " #Size);                                      \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst0 ", " #Inst1 ", " #Dst0 ", " #Dst1 ", " #Value0              \
+        ", Imm(" #Imm "), " #Op ", " #Size ")";                                \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst0,                   \
+           Immediate(uint64_t(Value0) & Mask##Size));                          \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst1,                   \
+           Immediate((uint64_t(Value0) >> Size) & Mask##Size));                \
+    __ Inst0(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst0,                 \
+             Immediate(uint64_t(Imm) & Mask##Size));                           \
+    __ Inst1(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst1,                 \
+             Immediate((uint64_t(Imm) >> Size) & Mask##Size));                 \
+    /* Assemble the emitted instructions and run them. */                      \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    /* Reference: the same Op on the double-width values, split in two. */     \
+    static constexpr uint64_t Result =                                         \
+        (uint64_t(Value0) & ResultMask##Size)Op(uint64_t(Imm) &                \
+                                                ResultMask##Size);             \
+    static constexpr uint32_t Expected0 = Result & Mask##Size;                 \
+    static constexpr uint32_t Expected1 = (Result >> Size) & Mask##Size;       \
+    ASSERT_EQ(Expected0, test.Dst0()) << TestString << ": 0";                  \
+    ASSERT_EQ(Expected1, test.Dst1()) << TestString << ": 1";                  \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplAddrReg(Inst0, Inst1, Value0, Src0, Src1, Value1, Op, Size)    \
+  do { /* Destination halves are dword slots T0/T1; sources are registers. */  \
+    static_assert(Size == 8 || Size == 16 || Size == 32,                       \
+                  "Invalid size " #Size);                                      \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst0 ", " #Inst1 ", Addr, " #Value0 ", " #Src0 ", " #Src1        \
+        ", " #Value1 ", " #Op ", " #Size ")";                                  \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = uint64_t(Value0) & Mask##Size;                         \
+    const uint32_t T1 = allocateDword();                                       \
+    const uint32_t V1 = (uint64_t(Value0) >> Size) & Mask##Size;               \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Src0,                   \
+           Immediate(uint64_t(Value1) & Mask##Size));                          \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Src1,                   \
+           Immediate((uint64_t(Value1) >> Size) & Mask##Size));                \
+    __ Inst0(IceType_i##Size, dwordAddress(T0),                                \
+             GPRRegister::Encoded_Reg_##Src0);                                 \
+    __ Inst1(IceType_i##Size, dwordAddress(T1),                                \
+             GPRRegister::Encoded_Reg_##Src1);                                 \
+    /* Slots are filled with V0/V1 after assembling, before the run. */        \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.setDwordTo(T1, V1);                                                   \
+    test.run();                                                                \
+    /* Reference: the same Op on the double-width values, split in two. */     \
+    static constexpr uint64_t Result =                                         \
+        (uint64_t(Value0) & ResultMask##Size)Op(uint64_t(Value1) &             \
+                                                ResultMask##Size);             \
+    static constexpr uint32_t Expected0 = Result & Mask##Size;                 \
+    static constexpr uint32_t Expected1 = (Result >> Size) & Mask##Size;       \
+    ASSERT_EQ(Expected0, test.contentsOfDword(T0)) << TestString << ": 0";     \
+    ASSERT_EQ(Expected1, test.contentsOfDword(T1)) << TestString << ": 1";     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplAddrImm(Inst0, Inst1, Value0, Imm, Op, Size)                   \
+  do { /* Destination halves are dword slots T0/T1; sources are immediates. */ \
+    static_assert(Size == 8 || Size == 16 || Size == 32,                       \
+                  "Invalid size " #Size);                                      \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst0 ", " #Inst1 ", Addr, " #Value0 ", Imm(" #Imm "), " #Op      \
+        ", " #Size ")";                                                        \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = uint64_t(Value0) & Mask##Size;                         \
+    const uint32_t T1 = allocateDword();                                       \
+    const uint32_t V1 = (uint64_t(Value0) >> Size) & Mask##Size;               \
+    __ Inst0(IceType_i##Size, dwordAddress(T0),                                \
+             Immediate(uint64_t(Imm) & Mask##Size));                           \
+    __ Inst1(IceType_i##Size, dwordAddress(T1),                                \
+             Immediate((uint64_t(Imm) >> Size) & Mask##Size));                 \
+    /* Slots are filled with V0/V1 after assembling, before the run. */        \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.setDwordTo(T1, V1);                                                   \
+    test.run();                                                                \
+    /* Reference: the same Op on the double-width values, split in two. */     \
+    static constexpr uint64_t Result =                                         \
+        (uint64_t(Value0) & ResultMask##Size)Op(uint64_t(Imm) &                \
+                                                ResultMask##Size);             \
+    static constexpr uint32_t Expected0 = Result & Mask##Size;                 \
+    static constexpr uint32_t Expected1 = (Result >> Size) & Mask##Size;       \
+    ASSERT_EQ(Expected0, test.contentsOfDword(T0)) << TestString << ": 0";     \
+    ASSERT_EQ(Expected1, test.contentsOfDword(T1)) << TestString << ": 1";     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplOp(Inst0, Inst1, Dst0, Dst1, Value0, Src0, Src1, Value1, Op,   \
+                   Size)                                                       \
+  do { /* Runs the instruction pair through all five operand forms. */         \
+    TestImplRegReg(Inst0, Inst1, Dst0, Dst1, Value0, Src0, Src1, Value1, Op,   \
+                   Size);                                                      \
+    TestImplRegAddr(Inst0, Inst1, Dst0, Dst1, Value0, Value1, Op, Size);       \
+    TestImplRegImm(Inst0, Inst1, Dst0, Dst1, Value0, Value1, Op, Size);        \
+    TestImplAddrReg(Inst0, Inst1, Value0, Src0, Src1, Value1, Op, Size);       \
+    TestImplAddrImm(Inst0, Inst1, Value0, Value1, Op, Size);                   \
+  } while (0)
+
+#define TestImplValues(Dst0, Dst1, Value0, Src0, Src1, Value1, Size)           \
+  do { /* add/adc is checked against +, sub/sbb against -. */                  \
+    TestImplOp(add, adc, Dst0, Dst1, Value0, Src0, Src1, Value1, +, Size);     \
+    TestImplOp(sub, sbb, Dst0, Dst1, Value0, Src0, Src1, Value1, -, Size);     \
+  } while (0)
+
+#define TestImplSize(Dst0, Dst1, Src0, Src1, Size)                             \
+  do { /* One 64-bit operand pair; each Size uses its low 2*Size bits. */      \
+    TestImplValues(Dst0, Dst1, 0xFFFFFFFFFFFFFF00ull, Src0, Src1,              \
+                   0xFFFFFFFF0000017Full, Size);                               \
+  } while (0)
+
+#define TestImpl(Dst0, Dst1, Src0, Src1)                                       \
+  do { /* 8-bit cases need all four encodings <= 3; 16/32-bit always run. */   \
+    if (GPRRegister::Encoded_Reg_##Dst0 <= 3 &&                                \
+        GPRRegister::Encoded_Reg_##Dst1 <= 3 &&                                \
+        GPRRegister::Encoded_Reg_##Src0 <= 3 &&                                \
+        GPRRegister::Encoded_Reg_##Src1 <= 3) {                                \
+      TestImplSize(Dst0, Dst1, Src0, Src1, 8);                                 \
+    }                                                                          \
+    TestImplSize(Dst0, Dst1, Src0, Src1, 16);                                  \
+    TestImplSize(Dst0, Dst1, Src0, Src1, 32);                                  \
+  } while (0)
+
+  TestImpl(eax, ebx, ecx, edx);
+  TestImpl(ebx, ecx, edx, esi);
+  TestImpl(ecx, edx, esi, edi);
+  TestImpl(edx, esi, edi, eax);
+  TestImpl(esi, edi, eax, ebx);
+  TestImpl(edi, eax, ebx, ecx);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplValues
+#undef TestImplOp
+#undef TestImplAddrImm
+#undef TestImplAddrReg
+#undef TestImplRegImm
+#undef TestImplRegAddr
+#undef TestImplRegReg
+}
+
+TEST_F(AssemblerX8632LowLevelTest, Cbw_Cwd_Cdq) {
+#define ExpectEncoding(Mnemonic, NumBytes, ...)                                \
+  do { /* Emit one instruction; check its length, then its exact bytes. */     \
+    __ Mnemonic();                                                             \
+    ASSERT_EQ(NumBytes, codeBytesSize()) << #Mnemonic;                         \
+    verifyBytes<NumBytes>(codeBytes(), __VA_ARGS__);                           \
+    reset();                                                                   \
+  } while (0)
+  // Expected bytes: cbw = 66 98, cwd = 66 99, cdq = 99 (no 0x66 byte).
+  ExpectEncoding(cbw, 2u, 0x66, 0x98);
+  ExpectEncoding(cwd, 2u, 0x66, 0x99);
+  ExpectEncoding(cdq, 1u, 0x99);
+
+#undef ExpectEncoding
+}
+
+TEST_F(AssemblerX8632Test, SingleOperandMul) {
+  static constexpr uint32_t Mask8 = 0x000000FF;
+  static constexpr uint32_t Mask16 = 0x0000FFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+  // One explicit operand; eax is loaded with the other, eax/edx checked.
+#define TestImplReg(Inst, Value0, Src, Value1, Type, Size)                     \
+  do { /* Checks `Inst Src` with eax = Value0 and Src = Value1. */             \
+    static_assert(GPRRegister::Encoded_Reg_eax !=                              \
+                      GPRRegister::Encoded_Reg_##Src,                          \
+                  "eax can not be src1.");                                     \
+                                                                               \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Value0 ", " #Src ", " #Value1 ", " #Type ", " #Size    \
+        ")";                                                                   \
+    static constexpr Type##64_t OperandEax =                                   \
+        static_cast<Type##Size##_t>((Value0)&Mask##Size);                      \
+    static constexpr Type##64_t OperandOther =                                 \
+        static_cast<Type##Size##_t>((Value1)&Mask##Size);                      \
+    static constexpr uint32_t ExpectedEax =                                    \
+        Mask##Size & (OperandEax * OperandOther);                              \
+    static constexpr uint32_t ExpectedEdx =                                    \
+        Mask##Size & ((OperandEax * OperandOther) >> Size);                    \
+    /* Expected: low Size bits of the product in eax, the next in edx. */      \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_eax,                      \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Src,                    \
+           Immediate((Value1)&Mask##Size));                                    \
+    __ Inst(IceType_i##Size, GPRRegister::Encoded_Reg_##Src);                  \
+    /* Byte form: move the high result byte to dl and keep only al in ax. */   \
+    if (Size == 8) {                                                           \
+      /* mov %ah, %dl */                                                       \
+      __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_edx,                    \
+             GPRRegister::Encoded_Reg_esp);                                    \
+      __ And(IceType_i16, GPRRegister::Encoded_Reg_eax, Immediate(0x00FF));    \
+      if (GPRRegister::Encoded_Reg_##Src == GPRRegister::Encoded_Reg_esi) {    \
+        /* src == dh; clear dx's upper 8 bits. */                              \
+        __ And(IceType_i16, GPRRegister::Encoded_Reg_edx, Immediate(0x00FF));  \
+      }                                                                        \
+    }                                                                          \
+    /* Assemble the emitted instructions and run them. */                      \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(ExpectedEax, test.eax()) << TestString;                          \
+    ASSERT_EQ(ExpectedEdx, test.edx()) << TestString;                          \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplAddr(Inst, Value0, Value1, Type, Size)                         \
+  do { /* Checks `Inst [mem]` with eax = Value0 and the slot = Value1. */      \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Value0 ", Addr, " #Value1 ", " #Type ", " #Size ")";   \
+    const uint32_t T0 = allocateDword(); /* allocated on every run */          \
+    static constexpr uint32_t V0 = Value1;                                     \
+    static constexpr Type##64_t OperandEax =                                   \
+        static_cast<Type##Size##_t>((Value0)&Mask##Size);                      \
+    static constexpr Type##64_t OperandOther =                                 \
+        static_cast<Type##Size##_t>((Value1)&Mask##Size);                      \
+    static constexpr uint32_t ExpectedEax =                                    \
+        Mask##Size & (OperandEax * OperandOther);                              \
+    static constexpr uint32_t ExpectedEdx =                                    \
+        Mask##Size & ((OperandEax * OperandOther) >> Size);                    \
+    /* Expected: low Size bits of the product in eax, the next in edx. */      \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_eax,                      \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ Inst(IceType_i##Size, dwordAddress(T0));                                \
+    /* Byte form: move the high result byte to dl and keep only al in ax. */   \
+    if (Size == 8) {                                                           \
+      /* mov %ah, %dl */                                                       \
+      __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_edx,                    \
+             GPRRegister::Encoded_Reg_esp);                                    \
+      __ And(IceType_i16, GPRRegister::Encoded_Reg_eax, Immediate(0x00FF));    \
+    }                                                                          \
+    /* The slot is filled with V0 after assembling, before the run. */         \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(ExpectedEax, test.eax()) << TestString;                          \
+    ASSERT_EQ(ExpectedEdx, test.edx()) << TestString;                          \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplOp(Inst, Value0, Src, Value1, Type, Size)                      \
+  do { /* Register form, then memory form. */                                  \
+    TestImplReg(Inst, Value0, Src, Value1, Type, Size);                        \
+    TestImplAddr(Inst, Value0, Value1, Type, Size);                            \
+  } while (0)
+
+#define TestImplValue(Value0, Src, Value1, Size)                               \
+  do { /* mul uses unsigned reference math, imul signed. */                    \
+    TestImplOp(mul, Value0, Src, Value1, uint, Size);                          \
+    TestImplOp(imul, Value0, Src, Value1, int, Size);                          \
+  } while (0)
+
+#define TestImplSize(Src, Size)                                                \
+  do { /* Sign mixes: (+,+), (+,-), (-,+), (-,-). */                           \
+    TestImplValue(10, Src, 1, Size);                                           \
+    TestImplValue(10, Src, -1, Size);                                          \
+    TestImplValue(-10, Src, 37, Size);                                         \
+    TestImplValue(-10, Src, -15, Size);                                        \
+  } while (0)
+
+#define TestImpl(Src)                                                          \
+  do { /* All three operand sizes for the given source register. */            \
+    TestImplSize(Src, 8);                                                      \
+    TestImplSize(Src, 16);                                                     \
+    TestImplSize(Src, 32);                                                     \
+  } while (0)
+  // eax is never a source: it always holds the first operand.
+  TestImpl(ebx);
+  TestImpl(ecx);
+  TestImpl(edx);
+  TestImpl(esi);
+  TestImpl(edi);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplValue
+#undef TestImplOp
+#undef TestImplAddr
+#undef TestImplReg
+}
+
+// Exercises the two-operand imul forms (reg/reg, reg/imm and reg/addr) with
+// 16- and 32-bit operands. Every case loads the operands, multiplies, and
+// expects Dst to hold the product truncated to Size bits. Only the 16- and
+// 32-bit sizes are instantiated (two-operand imul has no 8-bit form), so no
+// 8-bit mask and no ah/dl fix-ups are needed here.
+TEST_F(AssemblerX8632Test, TwoOperandImul) {
+  static constexpr uint32_t Mask16 = 0x0000FFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+
+// Dst <- Value0, Src <- Value1, then "imul Src, Dst"; checks Dst afterwards.
+#define TestImplRegReg(Dst, Value0, Src, Value1, Size)                         \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Size ")";           \
+    static constexpr int64_t Operand0 =                                        \
+        static_cast<int##Size##_t>((Value0)&Mask##Size);                       \
+    static constexpr int64_t Operand1 =                                        \
+        static_cast<int##Size##_t>((Value1)&Mask##Size);                       \
+    static constexpr uint32_t Expected = Mask##Size & (Operand0 * Operand1);   \
+                                                                               \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                    \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Src,                    \
+           Immediate((Value1)&Mask##Size));                                    \
+    __ imul(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                   \
+            GPRRegister::Encoded_Reg_##Src);                                   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst()) << TestString;                             \
+    reset();                                                                   \
+  } while (0)
+
+// Dst <- Value0, then "imul $Imm, Dst"; checks Dst afterwards.
+#define TestImplRegImm(Dst, Value0, Imm, Size)                                 \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Imm(" #Imm "), " #Size ")";                   \
+    static constexpr int64_t Operand0 =                                        \
+        static_cast<int##Size##_t>((Value0)&Mask##Size);                       \
+    static constexpr int64_t Operand1 =                                        \
+        static_cast<int##Size##_t>((Imm)&Mask##Size);                          \
+    static constexpr uint32_t Expected = Mask##Size & (Operand0 * Operand1);   \
+                                                                               \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                    \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ imul(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst, Immediate(Imm));  \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst()) << TestString;                             \
+    reset();                                                                   \
+  } while (0)
+
+// Dst <- Value0, a freshly allocated dword <- Value1, then "imul (dword),
+// Dst"; checks Dst afterwards. The dword is written after assembling, before
+// the code is run.
+#define TestImplRegAddr(Dst, Value0, Value1, Size)                             \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr," #Value1 ", " #Size ")";                \
+    static constexpr int64_t Operand0 =                                        \
+        static_cast<int##Size##_t>((Value0)&Mask##Size);                       \
+    static constexpr int64_t Operand1 =                                        \
+        static_cast<int##Size##_t>((Value1)&Mask##Size);                       \
+    static constexpr uint32_t Expected = Mask##Size & (Operand0 * Operand1);   \
+    const uint32_t T0 = allocateDword();                                       \
+                                                                               \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                    \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ imul(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                   \
+            dwordAddress(T0));                                                 \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, static_cast<uint32_t>(Operand1));                      \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst()) << TestString;                             \
+    reset();                                                                   \
+  } while (0)
+
+// Runs all three operand forms for one (Value0, Value1) pair.
+#define TestImplValue(Dst, Value0, Src, Value1, Size)                          \
+  do {                                                                         \
+    TestImplRegReg(Dst, Value0, Src, Value1, Size);                            \
+    TestImplRegImm(Dst, Value0, Value1, Size);                                 \
+    TestImplRegAddr(Dst, Value0, Value1, Size);                                \
+  } while (0)
+
+// Value pairs: a trivial product, a negative times a large positive value, and
+// a negative times a negative value.
+#define TestImplSize(Dst, Src, Size)                                           \
+  do {                                                                         \
+    TestImplValue(Dst, 1, Src, 1, Size);                                       \
+    TestImplValue(Dst, -10, Src, 0x4050AA20, Size);                            \
+    TestImplValue(Dst, -2, Src, -55, Size);                                    \
+  } while (0)
+
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplSize(Dst, Src, 16);                                                \
+    TestImplSize(Dst, Src, 32);                                                \
+  } while (0)
+
+  TestImpl(eax, ebx);
+  TestImpl(ebx, ecx);
+  TestImpl(ecx, edx);
+  TestImpl(edx, esi);
+  TestImpl(esi, edi);
+  TestImpl(edi, eax);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplValue
+#undef TestImplRegAddr
+#undef TestImplRegImm
+#undef TestImplRegReg
+}
+
+// Exercises div/idiv with 8-, 16- and 32-bit register and memory divisors.
+// The dividend is twice as wide as the divisor: its low half is loaded into
+// al/ax/eax and its high half into ah (8-bit) or dx/edx (16-/32-bit). After
+// the division the quotient is checked in eax and the remainder in edx; the
+// 8-bit variants first move ah into dl and clear ah so the same checks apply.
+TEST_F(AssemblerX8632Test, Div) {
+  static constexpr uint32_t Mask8 = 0x000000FF;
+  static constexpr uint32_t Mask16 = 0x0000FFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+
+  // Masks for the double-width dividend, i.e., 2 * Size bits each. The 8-bit
+  // divide takes its 16-bit dividend from ax, hence 0xFFFF for Size == 8.
+  static constexpr uint64_t Operand0Mask8 = 0x000000000000FFFFull;
+  static constexpr uint64_t Operand0Mask16 = 0x00000000FFFFFFFFull;
+  static constexpr uint64_t Operand0Mask32 = 0xFFFFFFFFFFFFFFFFull;
+
+  // Host types wide enough to hold the double-width dividend.
+  using Operand0Type_int8 = int16_t;
+  using Operand0Type_uint8 = uint16_t;
+  using Operand0Type_int16 = int32_t;
+  using Operand0Type_uint16 = uint32_t;
+  using Operand0Type_int32 = int64_t;
+  using Operand0Type_uint32 = uint64_t;
+
+// Divides Value0 by Value1, with the divisor held in register Src. Src must be
+// neither eax nor edx because those registers hold the dividend.
+#define TestImplReg(Inst, Value0, Src, Value1, Type, Size)                     \
+  do {                                                                         \
+    static_assert(GPRRegister::Encoded_Reg_eax !=                              \
+                      GPRRegister::Encoded_Reg_##Src,                          \
+                  "eax can not be src1.");                                     \
+    static_assert(GPRRegister::Encoded_Reg_edx !=                              \
+                      GPRRegister::Encoded_Reg_##Src,                          \
+                  "edx can not be src1.");                                     \
+                                                                               \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Value0 ", " #Src ", " #Value1 ", " #Type ", " #Size    \
+        ")";                                                                   \
+    static constexpr Operand0Type_##Type##Size Operand0 =                      \
+        static_cast<Type##64_t>(Value0) & Operand0Mask##Size;                  \
+    static constexpr Type##Size##_t Operand0Lo = Operand0 & Mask##Size;        \
+    static constexpr Type##Size##_t Operand0Hi =                               \
+        (Operand0 >> Size) & Mask##Size;                                       \
+    static constexpr Type##Size##_t Operand1 =                                 \
+        static_cast<Type##Size##_t>(Value1) & Mask##Size;                      \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_eax,                      \
+           Immediate(Operand0Lo));                                             \
+    if (Size == 8) {                                                           \
+      /* mov Operand0Hi, %ah */                                                \
+      __ mov(IceType_i8, GPRRegister::Encoded_Reg_esp, Immediate(Operand0Hi)); \
+    } else {                                                                   \
+      __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_edx,                    \
+             Immediate(Operand0Hi));                                           \
+    }                                                                          \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Src,                    \
+           Immediate(Operand1));                                               \
+    __ Inst(IceType_i##Size, GPRRegister::Encoded_Reg_##Src);                  \
+    if (Size == 8) {                                                           \
+      /* mov %ah, %dl */                                                       \
+      __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_edx,                    \
+             GPRRegister::Encoded_Reg_esp);                                    \
+      __ And(IceType_i16, GPRRegister::Encoded_Reg_eax, Immediate(0x00FF));    \
+      if (GPRRegister::Encoded_Reg_##Src == GPRRegister::Encoded_Reg_esi) {    \
+        /* src == dh; clear dx's upper 8 bits. */                              \
+        __ And(IceType_i16, GPRRegister::Encoded_Reg_edx, Immediate(0x00FF));  \
+      }                                                                        \
+    }                                                                          \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    static constexpr uint32_t Quotient = (Operand0 / Operand1) & Mask##Size;   \
+    static constexpr uint32_t Remainder = (Operand0 % Operand1) & Mask##Size;  \
+    EXPECT_EQ(Quotient, test.eax()) << TestString;                             \
+    EXPECT_EQ(Remainder, test.edx()) << TestString;                            \
+    reset();                                                                   \
+  } while (0)
+
+// Same as TestImplReg, but the divisor lives in a freshly allocated dword that
+// is written after assembling, before the code is run.
+#define TestImplAddr(Inst, Value0, Value1, Type, Size)                         \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Value0 ", Addr, " #Value1 ", " #Type ", " #Size ")";   \
+    static constexpr Operand0Type_##Type##Size Operand0 =                      \
+        static_cast<Type##64_t>(Value0) & Operand0Mask##Size;                  \
+    static constexpr Type##Size##_t Operand0Lo = Operand0 & Mask##Size;        \
+    static constexpr Type##Size##_t Operand0Hi =                               \
+        (Operand0 >> Size) & Mask##Size;                                       \
+    const uint32_t T0 = allocateDword();                                       \
+    static constexpr Type##Size##_t V0 =                                       \
+        static_cast<Type##Size##_t>(Value1) & Mask##Size;                      \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_eax,                      \
+           Immediate(Operand0Lo));                                             \
+    if (Size == 8) {                                                           \
+      /* mov Operand0Hi, %ah */                                                \
+      __ mov(IceType_i8, GPRRegister::Encoded_Reg_esp, Immediate(Operand0Hi)); \
+    } else {                                                                   \
+      __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_edx,                    \
+             Immediate(Operand0Hi));                                           \
+    }                                                                          \
+    __ Inst(IceType_i##Size, dwordAddress(T0));                                \
+    if (Size == 8) {                                                           \
+      /* mov %ah, %dl */                                                       \
+      __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_edx,                    \
+             GPRRegister::Encoded_Reg_esp);                                    \
+      __ And(IceType_i16, GPRRegister::Encoded_Reg_eax, Immediate(0x00FF));    \
+    }                                                                          \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, static_cast<uint32_t>(V0));                            \
+    test.run();                                                                \
+                                                                               \
+    static constexpr uint32_t Quotient = (Operand0 / V0) & Mask##Size;         \
+    static constexpr uint32_t Remainder = (Operand0 % V0) & Mask##Size;        \
+    EXPECT_EQ(Quotient, test.eax()) << TestString;                             \
+    EXPECT_EQ(Remainder, test.edx()) << TestString;                            \
+    reset();                                                                   \
+  } while (0)
+
+// Runs both the register and the memory divisor forms.
+#define TestImplOp(Inst, Value0, Src, Value1, Type, Size)                      \
+  do {                                                                         \
+    TestImplReg(Inst, Value0, Src, Value1, Type, Size);                        \
+    TestImplAddr(Inst, Value0, Value1, Type, Size);                            \
+  } while (0)
+
+// Runs the unsigned (div) and the signed (idiv) instruction on the same values.
+#define TestImplValue(Value0, Src, Value1, Size)                               \
+  do {                                                                         \
+    TestImplOp(div, Value0, Src, Value1, uint, Size);                          \
+    TestImplOp(idiv, Value0, Src, Value1, int, Size);                          \
+  } while (0)
+
+#define TestImplSize(Src, Size)                                                \
+  do {                                                                         \
+    TestImplValue(10, Src, 1, Size);                                           \
+    TestImplValue(10, Src, -1, Size);                                          \
+  } while (0)
+
+#define TestImpl(Src)                                                          \
+  do {                                                                         \
+    TestImplSize(Src, 8);                                                      \
+    TestImplSize(Src, 16);                                                     \
+    TestImplSize(Src, 32);                                                     \
+  } while (0)
+
+  TestImpl(ebx);
+  TestImpl(ecx);
+  TestImpl(esi);
+  TestImpl(edi);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplValue
+#undef TestImplOp
+#undef TestImplAddr
+#undef TestImplReg
+}
+
+// The one-byte inc/dec register encodings became the REX prefixes in x86-64,
+// so these instructions are not executable there. They are therefore tested
+// with the low-level test infrastructure, which checks the emitted bytes
+// instead of running the code.
+TEST_F(AssemblerX8632LowLevelTest, Incl_Decl_Reg) {
+// Emits Inst for register Dst and verifies that exactly one byte was produced:
+// BaseOpcode or'ed with Dst's register encoding.
+#define TestImpl(Inst, Dst, BaseOpcode)                                        \
+  do {                                                                         \
+    __ Inst(GPRRegister::Encoded_Reg_##Dst);                                   \
+    static constexpr uint8_t ByteCount = 1;                                    \
+    ASSERT_EQ(ByteCount, codeBytesSize());                                     \
+    verifyBytes<ByteCount>(codeBytes(),                                        \
+                           BaseOpcode | GPRRegister::Encoded_Reg_##Dst);       \
+    reset();                                                                   \
+  } while (0)
+
+#define TestInc(Dst)                                                           \
+  do {                                                                         \
+    constexpr uint8_t InclOpcode = 0x40;                                       \
+    TestImpl(incl, Dst, InclOpcode);                                           \
+  } while (0)
+
+#define TestDec(Dst)                                                           \
+  do {                                                                         \
+    constexpr uint8_t DeclOpcode = 0x48;                                       \
+    TestImpl(decl, Dst, DeclOpcode);                                           \
+  } while (0)
+
+  // Cover each of the eight general purpose registers exactly once.
+  TestInc(eax);
+  TestInc(ecx);
+  TestInc(edx);
+  TestInc(ebx);
+  TestInc(esp);
+  TestInc(ebp);
+  TestInc(esi);
+  TestInc(edi);
+
+  TestDec(eax);
+  TestDec(ecx);
+  TestDec(edx);
+  TestDec(ebx);
+  TestDec(esp);
+  TestDec(ebp);
+  TestDec(esi);
+  TestDec(edi);
+
+#undef TestInc
+#undef TestDec
+#undef TestImpl
+}
+
+// Checks incl/decl with a memory operand: a dword is seeded with Value0, the
+// instruction is executed on that dword, and the dword must afterwards hold
+// Value0 plus one (incl) or Value0 minus one (decl).
+TEST_F(AssemblerX8632Test, Incl_Decl_Addr) {
+#define TestImpl(Inst, Value0)                                                 \
+  do {                                                                         \
+    /* True for incl, false for decl. */                                       \
+    const bool Increments =                                                    \
+        std::string::npos != std::string(#Inst).find("incl");                  \
+    const uint32_t Slot = allocateDword();                                     \
+    const uint32_t Initial = Value0;                                           \
+                                                                               \
+    __ Inst(dwordAddress(Slot));                                               \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(Slot, Initial);                                            \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Value0 + (Increments ? 1 : -1)),           \
+              test.contentsOfDword(Slot));                                     \
+    reset();                                                                   \
+  } while (0)
+
+  TestImpl(incl, 230);
+
+  TestImpl(decl, 30);
+
+#undef TestImpl
+}
+
+TEST_F(AssemblerX8632Test, Shifts) {
+  static constexpr uint32_t Mask8 = 0x000000FF;
+  static constexpr uint32_t Mask16 = 0x0000FFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+
+#define TestImplRegImm(Inst, Dst, Value0, Imm, Op, Type, Size)                 \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Value0 ", Imm(" #Imm "), " #Op ", " #Type    \
+        ", " #Size ")";                                                        \
+    const bool IsRol = std::string(#Inst).find("rol") != std::string::npos;    \
+    const uint##Size##_t Expected =                                            \
+        Mask##Size & (static_cast<Type##Size##_t>(Value0) Op(Imm) |            \
+                      (!IsRol ? 0 : (Value0) >> (Size - Imm)));                \
+                                                                               \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                    \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ Inst(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                   \
+            Immediate((Imm)&Mask##Size));                                      \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Expected), test.Dst()) << TestString;      \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplRegRegImm(Inst, Dst, Value0, Src, Value1, Count, Op0, Op1,     \
+                          Type, Size)                                          \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Value0 ", " #Src ", " #Value1                \
+        ", Imm(" #Count "), " #Op0 ", " #Op1 ", " #Type ", " #Size ")";        \
+    const uint##Size##_t Expected =                                            \
+        Mask##Size & (static_cast<Type##Size##_t>(Value0) Op0(Count) |         \
+                      (static_cast<Type##64_t>(Value1) Op1(Size - Count)));    \
+                                                                               \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                    \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Src,                    \
+           Immediate((Value1)&Mask##Size));                                    \
+    __ Inst(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                   \
+            GPRRegister::Encoded_Reg_##Src, Immediate(Count));                 \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Expected), test.Dst()) << TestString;      \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplRegCl(Inst, Dst, Value0, Count, Op, Type, Size)                \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Value0 ", " #Count ", " #Op ", " #Type       \
+        ", " #Size ")";                                                        \
+    const bool IsRol = std::string(#Inst).find("rol") != std::string::npos;    \
+    const uint##Size##_t Expected =                                            \
+        Mask##Size & (static_cast<Type##Size##_t>(Value0) Op(Count) |          \
+                      (!IsRol ? 0 : Value0 >> (Size - Count)));                \
+                                                                               \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                    \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ mov(IceType_i8, GPRRegister::Encoded_Reg_ecx,                           \
+           Immediate((Count)&Mask##Size));                                     \
+    __ Inst(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                   \
+            GPRRegister::Encoded_Reg_ecx);                                     \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Expected), test.Dst()) << TestString;      \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplRegRegCl(Inst, Dst, Value0, Src, Value1, Count, Op0, Op1,      \
+                         Type, Size)                                           \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Count    \
+        ", " #Op0 ", " #Op1 ", " #Type ", " #Size ")";                         \
+    const uint##Size##_t Expected =                                            \
+        Mask##Size & (static_cast<Type##Size##_t>(Value0) Op0(Count) |         \
+                      (static_cast<Type##64_t>(Value1) Op1(Size - Count)));    \
+                                                                               \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                    \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Src,                    \
+           Immediate((Value1)&Mask##Size));                                    \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_ecx,                      \
+           Immediate((Count)&0x7F));                                           \
+    __ Inst(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                   \
+            GPRRegister::Encoded_Reg_##Src);                                   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Expected), test.Dst()) << TestString;      \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplAddrCl(Inst, Value0, Count, Op, Type, Size)                    \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", Addr, " #Value0 ", " #Count ", " #Op ", " #Type           \
+        ", " #Size ")";                                                        \
+    const bool IsRol = std::string(#Inst).find("rol") != std::string::npos;    \
+    const uint##Size##_t Expected =                                            \
+        Mask##Size & (static_cast<Type##Size##_t>(Value0) Op(Count) |          \
+                      (!IsRol ? 0 : Value0 >> (Size - Count)));                \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = Value0;                                                \
+                                                                               \
+    __ mov(IceType_i8, GPRRegister::Encoded_Reg_ecx,                           \
+           Immediate((Count)&Mask##Size));                                     \
+    __ Inst(IceType_i##Size, dwordAddress(T0), GPRRegister::Encoded_Reg_ecx);  \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Expected),                                 \
+              Mask##Size &test.contentsOfDword(T0))                            \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplAddrRegCl(Inst, Value0, Src, Value1, Count, Op0, Op1, Type,    \
+                          Size)                                                \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", Addr, " #Value0 ", " #Src ", " #Value1 ", " #Count        \
+        ", " #Op0 ", " #Op1 ", " #Type ", " #Size ")";                         \
+    const uint##Size##_t Expected =                                            \
+        Mask##Size & (static_cast<Type##Size##_t>(Value0) Op0(Count) |         \
+                      (static_cast<Type##64_t>(Value1) Op1(Size - Count)));    \
+    const uint32_t T0 = allocateDword();                                       \
+                                                                               \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Src,                    \
+           Immediate((Value1)&Mask##Size));                                    \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_ecx,                      \
+           Immediate((Count)&0x7F));                                           \
+    __ Inst(IceType_i##Size, dwordAddress(T0),                                 \
+            GPRRegister::Encoded_Reg_##Src);                                   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, static_cast<uint32_t>(Value0));                        \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Expected), test.contentsOfDword(T0))       \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplOp(Inst, Dst, Value0, Count, Op, Type, Size)                   \
+  do {                                                                         \
+    static_assert(GPRRegister::Encoded_Reg_##Dst !=                            \
+                      GPRRegister::Encoded_Reg_ecx,                            \
+                  "ecx should not be specified as Dst");                       \
+    TestImplRegImm(Inst, Dst, Value0, Count, Op, Type, Size);                  \
+    TestImplRegImm(Inst, ecx, Value0, Count, Op, Type, Size);                  \
+    TestImplRegCl(Inst, Dst, Value0, Count, Op, Type, Size);                   \
+    TestImplAddrCl(Inst, Value0, Count, Op, Type, Size);                       \
+  } while (0)
+
+#define TestImplThreeOperandOp(Inst, Dst, Value0, Src, Value1, Count, Op0,     \
+                               Op1, Type, Size)                                \
+  do {                                                                         \
+    static_assert(GPRRegister::Encoded_Reg_##Dst !=                            \
+                      GPRRegister::Encoded_Reg_ecx,                            \
+                  "ecx should not be specified as Dst");                       \
+    static_assert(GPRRegister::Encoded_Reg_##Src !=                            \
+                      GPRRegister::Encoded_Reg_ecx,                            \
+                  "ecx should not be specified as Src");                       \
+    TestImplRegRegImm(Inst, Dst, Value0, Src, Value1, Count, Op0, Op1, Type,   \
+                      Size);                                                   \
+    TestImplRegRegCl(Inst, Dst, Value0, Src, Value1, Count, Op0, Op1, Type,    \
+                     Size);                                                    \
+    TestImplAddrRegCl(Inst, Value0, Src, Value1, Count, Op0, Op1, Type, Size); \
+  } while (0)
+
+#define TestImplValue(Dst, Value0, Count, Size)                                \
+  do {                                                                         \
+    TestImplOp(rol, Dst, Value0, Count, <<, uint, Size);                       \
+    TestImplOp(shl, Dst, Value0, Count, <<, uint, Size);                       \
+    TestImplOp(shr, Dst, Value0, Count, >>, uint, Size);                       \
+    TestImplOp(sar, Dst, Value0, Count, >>, int, Size);                        \
+  } while (0)
+
+#define TestImplThreeOperandValue(Dst, Value0, Src, Value1, Count, Size)       \
+  do {                                                                         \
+    TestImplThreeOperandOp(shld, Dst, Value0, Src, Value1, Count, <<, >>,      \
+                           uint, Size);                                        \
+    TestImplThreeOperandOp(shrd, Dst, Value0, Src, Value1, Count, >>, <<,      \
+                           uint, Size);                                        \
+  } while (0)
+
+#define TestImplSize(Dst, Size)                                                \
+  do {                                                                         \
+    TestImplValue(Dst, 0x8F, 3, Size);                                         \
+    TestImplValue(Dst, 0x8FFF, 7, Size);                                       \
+    TestImplValue(Dst, 0x8FFFF, 7, Size);                                      \
+  } while (0)
+
+#define TestImplThreeOperandSize(Dst, Src, Size)                               \
+  do {                                                                         \
+    TestImplThreeOperandValue(Dst, 0xFFF3, Src, 0xA000, 8, Size);              \
+  } while (0)
+
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    if (GPRRegister::Encoded_Reg_##Dst < 4) {                                  \
+      TestImplSize(Dst, 8);                                                    \
+    }                                                                          \
+    TestImplSize(Dst, 16);                                                     \
+    TestImplThreeOperandSize(Dst, Src, 16);                                    \
+    TestImplSize(Dst, 32);                                                     \
+    TestImplThreeOperandSize(Dst, Src, 32);                                    \
+  } while (0)
+
+  TestImpl(eax, ebx);
+  TestImpl(ebx, edx);
+  TestImpl(edx, esi);
+  TestImpl(esi, edi);
+  TestImpl(edi, eax);
+
+#undef TestImpl
+#undef TestImplThreeOperandSize
+#undef TestImplSize
+#undef TestImplValue
+#undef TestImplThreeOperandValue
+#undef TestImplOp
+#undef TestImplThreeOperandOp
+#undef TestImplAddrCl
+#undef TestImplRegRegCl
+#undef TestImplRegCl
+#undef TestImplRegRegImm
+#undef TestImplRegImm
+}
+
+// Exercises neg on a scratch dword and on each general-purpose register for
+// 8-, 16- and 32-bit operand sizes, expecting the two's complement of Input.
+TEST_F(AssemblerX8632Test, Neg) {
+  // Mask<N> keeps the low N bits; the macros pick one through Mask##Bits.
+  static constexpr uint32_t Mask8 = 0x000000ff;
+  static constexpr uint32_t Mask16 = 0x0000ffff;
+  static constexpr uint32_t Mask32 = 0xffffffff;
+  // Value negated by every case; narrower cases use only its low bits.
+  static constexpr int32_t Input = 0xFF00A543;
+
+// Loads the Bits-wide Input into Reg, negates it, then copies the result into
+// eax and masks eax so only the Bits-wide result takes part in the comparison.
+// NOTE(review): the 8-bit cases also run with esi and edi, whereas other tests
+// in this patch guard byte-sized cases with `Encoded_Reg_<reg> < 4` -- confirm
+// this is intended.
+#define TestCaseReg(Reg, Bits)                                                 \
+  do {                                                                         \
+    using SizedInt = int##Bits##_t;                                            \
+    const uint32_t Operand = static_cast<SizedInt>(Input) & Mask##Bits;        \
+    const uint32_t Negated = 1 + (~static_cast<SizedInt>(Input) & Mask##Bits); \
+                                                                               \
+    __ mov(IceType_i##Bits, GPRRegister::Encoded_Reg_##Reg,                    \
+           Immediate(Operand));                                                \
+    __ neg(IceType_i##Bits, GPRRegister::Encoded_Reg_##Reg);                   \
+    __ mov(IceType_i##Bits, GPRRegister::Encoded_Reg_eax,                      \
+           GPRRegister::Encoded_Reg_##Reg);                                    \
+    __ And(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(Mask##Bits));  \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Negated, test.eax()) << "(" #Reg ", " #Bits ")";                 \
+    reset();                                                                   \
+  } while (0)
+
+// Seeds a scratch dword with the masked Input, negates it in place with a
+// Bits-wide neg, and compares the whole dword afterwards.
+#define TestCaseAddr(Bits)                                                     \
+  do {                                                                         \
+    using SizedInt = int##Bits##_t;                                            \
+    const uint32_t Negated = 1 + (~static_cast<SizedInt>(Input) & Mask##Bits); \
+    const uint32_t Slot = allocateDword();                                     \
+    __ neg(IceType_i##Bits, dwordAddress(Slot));                               \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(Slot, Input & Mask##Bits);                                 \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Negated, test.contentsOfDword(Slot)) << "(Addr, " #Bits ")";     \
+    reset();                                                                   \
+  } while (0)
+
+// Runs the memory case first and then every register case for one size.
+#define TestSize(Bits)                                                         \
+  do {                                                                         \
+    TestCaseAddr(Bits);                                                        \
+    TestCaseReg(eax, Bits);                                                    \
+    TestCaseReg(ebx, Bits);                                                    \
+    TestCaseReg(ecx, Bits);                                                    \
+    TestCaseReg(edx, Bits);                                                    \
+    TestCaseReg(esi, Bits);                                                    \
+    TestCaseReg(edi, Bits);                                                    \
+  } while (0)
+
+  TestSize(8);
+  TestSize(16);
+  TestSize(32);
+
+#undef TestSize
+#undef TestCaseAddr
+#undef TestCaseReg
+}
+
+// Loads a fixed 32-bit pattern into each register under test, applies notl,
+// and expects the register to hold the bitwise complement of that pattern.
+TEST_F(AssemblerX8632Test, Not) {
+  static constexpr uint32_t Input = 0xFF00A543;
+  static constexpr uint32_t Inverted = ~Input;
+
+#define TestNotl(Reg)                                                          \
+  do {                                                                         \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Reg, Immediate(Input));     \
+    __ notl(GPRRegister::Encoded_Reg_##Reg);                                   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Inverted, test.Reg()) << "(" #Reg ")";                           \
+    reset();                                                                   \
+  } while (0)
+
+  TestNotl(eax);
+  TestNotl(ebx);
+  TestNotl(ecx);
+  TestNotl(edx);
+  TestNotl(esi);
+  TestNotl(edi);
+
+#undef TestNotl
+}
+
+// Loads 0xFF00A543 into each register under test, applies a 32-bit bswap, and
+// expects the byte-reversed pattern 0x43A500FF.
+TEST_F(AssemblerX8632Test, Bswap) {
+  static constexpr uint32_t Input = 0xFF00A543;
+  static constexpr uint32_t Swapped = 0x43A500FF;
+
+#define TestBswap(Reg)                                                         \
+  do {                                                                         \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Reg, Immediate(Input));     \
+    __ bswap(IceType_i32, GPRRegister::Encoded_Reg_##Reg);                     \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Swapped, test.Reg()) << "(" #Reg ")";                            \
+    reset();                                                                   \
+  } while (0)
+
+  TestBswap(eax);
+  TestBswap(ebx);
+  TestBswap(ecx);
+  TestBswap(edx);
+  TestBswap(esi);
+  TestBswap(edi);
+
+#undef TestBswap
+}
+
+// Runs bt with a value register and a bit-index register, captures the
+// Br_b condition into al with setcc, and expects eax (masked to its low byte)
+// to be 1 exactly when the indexed bit of the value is set.
+TEST_F(AssemblerX8632Test, Bt) {
+#define TestCase(Base, BaseValue, Index, IndexValue)                           \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Base ", " #BaseValue ", " #Index ", " #IndexValue ")";            \
+    static constexpr uint32_t SelectedBit = 1u << (IndexValue);                \
+    static constexpr uint32_t Expected = ((BaseValue)&SelectedBit) ? 1u : 0u;  \
+                                                                               \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Base,                       \
+           Immediate(BaseValue));                                              \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Index,                      \
+           Immediate(IndexValue));                                             \
+    __ bt(GPRRegister::Encoded_Reg_##Base, GPRRegister::Encoded_Reg_##Index);  \
+    __ setcc(Cond::Br_b, ByteRegister::Encoded_Reg_al);                        \
+    __ And(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(0xFFu));       \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.eax()) << TestString;                             \
+    reset();                                                                   \
+  } while (0)
+
+  TestCase(eax, 0x08000000, ebx, 27u);
+  TestCase(ebx, 0x08000000, ecx, 23u);
+  TestCase(ecx, 0x00000000, edx, 1u);
+  TestCase(edx, 0x08000300, esi, 9u);
+  TestCase(esi, 0x08000300, edi, 10u);
+  TestCase(edi, 0x7FFFEFFF, eax, 13u);
+
+#undef TestCase
+}
+
+// Compile-time reference results for a Bits-wide bit scan of Value: bsf is the
+// index of the lowest set bit and bsr the index of the highest set bit among
+// the low Bits bits of Value. Both equal NoBitSet when none of those bits is
+// set.
+template <uint32_t Value, uint32_t Bits> class BitScanHelper {
+  BitScanHelper() = delete;
+
+public:
+  static_assert(Bits == 16 || Bits == 32, "Bits must be 16 or 32");
+  using ValueType =
+      typename std::conditional<Bits == 16, uint16_t, uint32_t>::type;
+
+private:
+  // A Bits-wide scan only observes the low Bits bits of Value. Scanning the
+  // unmasked value would let a 16-bit bsf report an index >= 16, and would let
+  // a 16-bit bsr step below index 0 when only the upper half has bits set.
+  static constexpr uint32_t Operand = Value & static_cast<ValueType>(-1);
+
+  // Steps from Index towards higher (Forward) or lower indices until a set
+  // bit of Operand is found. A non-zero Operand always has a set bit in
+  // [0, Bits), so the walk started at 0 or Bits - 1 stays inside that range.
+  static constexpr ValueType BitIndex(bool Forward, ValueType Index) {
+    return (Operand == 0)
+               ? BitScanHelper<Value, Bits>::NoBitSet
+               : ((Operand & (1u << Index))
+                      ? Index
+                      : BitIndex(Forward, (Forward ? Index + 1 : Index - 1)));
+  }
+
+public:
+  // Sentinel meaning "no bit is set in the scanned operand".
+  static constexpr ValueType NoBitSet = static_cast<ValueType>(-1);
+  static constexpr ValueType bsf = BitIndex(/*Forward*/ true, /*Index=*/0);
+  static constexpr ValueType bsr =
+      BitIndex(/*Forward*/ false, /*Index=*/Bits - 1);
+};
+
+// Exercises bsf and bsr with both register and memory source operands, for
+// 16- and 32-bit operand sizes, and compares against BitScanHelper.
+TEST_F(AssemblerX8632Test, BitScanOperations) {
+// Register-source form: Src is loaded with Value1 and scanned into Dst. The
+// Br_e condition after the scan is stored in ZeroFlag and must be 1 exactly
+// when BitScanHelper reports NoBitSet; Dst is only checked when a bit was
+// found.
+#define TestImplRegReg(Inst, Dst, Src, Value1, Size)                           \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Src ", " #Value1 ", " #Size ")";             \
+    static constexpr uint32_t Expected = BitScanHelper<Value1, Size>::Inst;    \
+    const uint32_t ZeroFlag = allocateDword();                                 \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Src,                    \
+           Immediate(Value1));                                                 \
+    __ Inst(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                   \
+            GPRRegister::Encoded_Reg_##Src);                                   \
+    __ setcc(Cond::Br_e, dwordAddress(ZeroFlag));                              \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(ZeroFlag, 0u);                                             \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ((Expected == BitScanHelper<Value1, Size>::NoBitSet),             \
+              test.contentsOfDword(ZeroFlag))                                  \
+        << TestString;                                                         \
+    if ((Expected != BitScanHelper<Value1, Size>::NoBitSet)) {                 \
+      ASSERT_EQ(Expected, test.Dst()) << TestString;                           \
+    }                                                                          \
+    reset();                                                                   \
+  } while (0)
+
+// Memory-source form: the scratch dword T0 is seeded with Value1 and scanned
+// into Dst; the checks mirror the register-source form.
+#define TestImplRegAddr(Inst, Dst, Value1, Size)                               \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", Addr, " #Value1 ", " #Size ")";                 \
+    static constexpr uint32_t Expected = BitScanHelper<Value1, Size>::Inst;    \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t ZeroFlag = allocateDword();                                 \
+    __ Inst(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst,                   \
+            dwordAddress(T0));                                                 \
+    __ setcc(Cond::Br_e, dwordAddress(ZeroFlag));                              \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, Value1);                                               \
+    test.setDwordTo(ZeroFlag, 0u);                                             \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ((Expected == BitScanHelper<Value1, Size>::NoBitSet),             \
+              test.contentsOfDword(ZeroFlag))                                  \
+        << TestString;                                                         \
+    if (Expected != BitScanHelper<Value1, Size>::NoBitSet) {                   \
+      ASSERT_EQ(Expected, test.Dst()) << TestString;                           \
+    }                                                                          \
+    reset();                                                                   \
+  } while (0)
+
+// Covers both instructions in both operand forms. The last case must use bsr:
+// repeating bsf here would leave the memory-source form of bsr untested.
+#define TestImplSize(Dst, Src, Value1, Size)                                   \
+  do {                                                                         \
+    TestImplRegReg(bsf, Dst, Src, Value1, Size);                               \
+    TestImplRegAddr(bsf, Dst, Value1, Size);                                   \
+    TestImplRegReg(bsr, Dst, Src, Value1, Size);                               \
+    TestImplRegAddr(bsr, Dst, Value1, Size);                                   \
+  } while (0)
+
+#define TestImplValue(Dst, Src, Value1)                                        \
+  do {                                                                         \
+    TestImplSize(Dst, Src, Value1, 16);                                        \
+    TestImplSize(Dst, Src, Value1, 32);                                        \
+  } while (0)
+
+// Values with bits at both ends, no bits at all, and bits only in the middle.
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplValue(Dst, Src, 0x80000001);                                       \
+    TestImplValue(Dst, Src, 0x00000000);                                       \
+    TestImplValue(Dst, Src, 0x80001000);                                       \
+    TestImplValue(Dst, Src, 0x00FFFF00);                                       \
+  } while (0)
+
+  TestImpl(eax, ebx);
+  TestImpl(ebx, ecx);
+  TestImpl(ecx, edx);
+  TestImpl(edx, esi);
+  TestImpl(esi, edi);
+  TestImpl(edi, eax);
+
+#undef TestImpl
+#undef TestImplValue
+#undef TestImplSize
+#undef TestImplRegAddr
+#undef TestImplRegReg
+}
+
+} // end of anonymous namespace
+} // end of namespace Test
+} // end of namespace X8632
+} // end of namespace Ice
diff --git a/unittest/AssemblerX8632/Locked.cpp b/unittest/AssemblerX8632/Locked.cpp
new file mode 100644
index 0000000..82c1e14
--- /dev/null
+++ b/unittest/AssemblerX8632/Locked.cpp
@@ -0,0 +1,409 @@
+//===- subzero/unittest/AssemblerX8632/Locked.cpp -------------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AssemblerX8632/TestUtil.h"
+
+namespace Ice {
+namespace X8632 {
+namespace Test {
+namespace {
+
+// mfence must assemble to exactly the three bytes 0F AE F0.
+TEST_F(AssemblerX8632LowLevelTest, Mfence) {
+  static constexpr uint8_t ExpectedSize = 3;
+
+  __ mfence();
+
+  ASSERT_EQ(ExpectedSize, codeBytesSize());
+  verifyBytes<ExpectedSize>(codeBytes(), 0x0F, 0xAE, 0xF0);
+}
+
+// lock must assemble to the single byte F0.
+TEST_F(AssemblerX8632LowLevelTest, Lock) {
+  static constexpr uint8_t ExpectedSize = 1;
+
+  __ lock();
+
+  ASSERT_EQ(ExpectedSize, codeBytesSize());
+  verifyBytes<ExpectedSize>(codeBytes(), 0xF0);
+}
+
+// Exchanges a register with a scratch dword using 8-, 16- and 32-bit xchg and
+// checks that each side ends up holding the other side's original value.
+TEST_F(AssemblerX8632Test, Xchg) {
+  // Mask<N> keeps the low N bits; the macro picks one through Mask##Bits.
+  static constexpr uint32_t Mask8 = 0x000000FF;
+  static constexpr uint32_t Mask16 = 0x0000FFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+
+// Seeds the scratch dword with the masked MemValue and Reg with RegValue, runs
+// a Bits-wide xchg, then masks Reg so only the Bits-wide result is compared.
+#define TestSwapAddrReg(MemValue, Reg, RegValue, Bits)                         \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #MemValue ", " #Reg ", " #RegValue ", " #Bits ")";                 \
+    const uint32_t InMemory = (MemValue)&Mask##Bits;                           \
+    const uint32_t InReg = (RegValue)&Mask##Bits;                              \
+    const uint32_t Slot = allocateDword();                                     \
+                                                                               \
+    __ mov(IceType_i##Bits, GPRRegister::Encoded_Reg_##Reg,                    \
+           Immediate(RegValue));                                               \
+    __ xchg(IceType_i##Bits, dwordAddress(Slot),                               \
+            GPRRegister::Encoded_Reg_##Reg);                                   \
+    __ And(IceType_i32, GPRRegister::Encoded_Reg_##Reg,                        \
+           Immediate(Mask##Bits));                                             \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(Slot, InMemory);                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(InMemory, test.Reg()) << TestString;                             \
+    ASSERT_EQ(InReg, test.contentsOfDword(Slot)) << TestString;                \
+    reset();                                                                   \
+  } while (0)
+
+// Runs every operand size for one register; the 8-bit case is limited to
+// registers whose encoding is below 4.
+#define TestSwap(Reg)                                                          \
+  do {                                                                         \
+    if (GPRRegister::Encoded_Reg_##Reg < 4) {                                  \
+      TestSwapAddrReg(0xa2b34567, Reg, 0x0507ddee, 8);                         \
+    }                                                                          \
+    TestSwapAddrReg(0xa2b34567, Reg, 0x0507ddee, 16);                          \
+    TestSwapAddrReg(0xa2b34567, Reg, 0x0507ddee, 32);                          \
+  } while (0)
+
+  TestSwap(eax);
+  TestSwap(ebx);
+  TestSwap(ecx);
+  TestSwap(edx);
+  TestSwap(esi);
+  TestSwap(edi);
+
+#undef TestSwap
+#undef TestSwapAddrReg
+}
+
+// Runs xadd between a register and a scratch dword for 8-, 16- and 32-bit
+// operands, with and without the lock request, and checks that the register
+// receives the old memory value while memory receives the masked sum.
+TEST_F(AssemblerX8632Test, Xadd) {
+  static constexpr bool NotLocked = false;
+  static constexpr bool Locked = true;
+
+  // Mask<N> keeps the low N bits; the macro picks one through Mask##Size.
+  static constexpr uint32_t Mask8 = 0x000000FF;
+  static constexpr uint32_t Mask16 = 0x0000FFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+
+// TestString includes LockedOrNot so that a failure identifies whether the
+// locked or the unlocked variant of an otherwise identical case broke.
+#define TestImplAddrReg(Value0, Dst1, Value1, LockedOrNot, Size)               \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Value0 ", " #Dst1 ", " #Value1 ", " #LockedOrNot ", " #Size ")";  \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = (Value0)&Mask##Size;                                   \
+    const uint32_t V1 = (Value1)&Mask##Size;                                   \
+                                                                               \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Dst1,                   \
+           Immediate(Value1));                                                 \
+    __ xadd(IceType_i##Size, dwordAddress(T0),                                 \
+            GPRRegister::Encoded_Reg_##Dst1, LockedOrNot);                     \
+    __ And(IceType_i32, GPRRegister::Encoded_Reg_##Dst1,                       \
+           Immediate(Mask##Size));                                             \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(V0, test.Dst1()) << TestString;                                  \
+    ASSERT_EQ(Mask##Size &(V1 + V0), test.contentsOfDword(T0)) << TestString;  \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplSize(Dst1, Size)                                               \
+  do {                                                                         \
+    TestImplAddrReg(0xa2b34567, Dst1, 0x0507ddee, NotLocked, Size);            \
+    TestImplAddrReg(0xa2b34567, Dst1, 0x0507ddee, Locked, Size);               \
+  } while (0)
+
+// The 8-bit case is limited to registers whose encoding is below 4.
+#define TestImpl(Dst1)                                                         \
+  do {                                                                         \
+    if (GPRRegister::Encoded_Reg_##Dst1 < 4) {                                 \
+      TestImplSize(Dst1, 8);                                                   \
+    }                                                                          \
+    TestImplSize(Dst1, 16);                                                    \
+    TestImplSize(Dst1, 32);                                                    \
+  } while (0)
+
+  TestImpl(eax);
+  TestImpl(ebx);
+  TestImpl(ecx);
+  TestImpl(edx);
+  TestImpl(esi);
+  TestImpl(edi);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplAddrReg
+}
+
+// Pins the exact bytes xadd emits for an absolute memory operand at 0x1FF00
+// with esi as the register operand, for 8-, 16- and 32-bit operand sizes, and
+// checks that requesting the locked form adds a single 0xF0 byte.
+TEST_F(AssemblerX8632LowLevelTest, Xadd) {
+  static constexpr bool NotLocked = false;
+  static constexpr bool Locked = true;
+
+  // Expected encoding lengths; each locked form is one byte longer.
+  static constexpr uint8_t PlainSize8 = 7;
+  static constexpr uint8_t LockedSize8 = 1 + PlainSize8;
+  static constexpr uint8_t PlainSize16 = 8;
+  static constexpr uint8_t LockedSize16 = 1 + PlainSize16;
+  static constexpr uint8_t PlainSize32 = 7;
+  static constexpr uint8_t LockedSize32 = 1 + PlainSize32;
+
+  // 8-bit operand, unlocked then locked.
+  __ xadd(IceType_i8, Address::Absolute(0x1FF00),
+          GPRRegister::Encoded_Reg_esi, NotLocked);
+  ASSERT_EQ(PlainSize8, codeBytesSize());
+  verifyBytes<PlainSize8>(codeBytes(), 0x0F, 0xC0, 0x35, 0x00, 0xFF, 0x01,
+                          0x00);
+  reset();
+
+  __ xadd(IceType_i8, Address::Absolute(0x1FF00),
+          GPRRegister::Encoded_Reg_esi, Locked);
+  ASSERT_EQ(LockedSize8, codeBytesSize());
+  verifyBytes<LockedSize8>(codeBytes(), 0xF0, 0x0F, 0xC0, 0x35, 0x00, 0xFF,
+                           0x01, 0x00);
+  reset();
+
+  // 16-bit operand: the encoding starts with 0x66, and in the locked form the
+  // 0xF0 byte is expected right after it.
+  __ xadd(IceType_i16, Address::Absolute(0x1FF00),
+          GPRRegister::Encoded_Reg_esi, NotLocked);
+  ASSERT_EQ(PlainSize16, codeBytesSize());
+  verifyBytes<PlainSize16>(codeBytes(), 0x66, 0x0F, 0xC1, 0x35, 0x00, 0xFF,
+                           0x01, 0x00);
+  reset();
+
+  __ xadd(IceType_i16, Address::Absolute(0x1FF00),
+          GPRRegister::Encoded_Reg_esi, Locked);
+  ASSERT_EQ(LockedSize16, codeBytesSize());
+  verifyBytes<LockedSize16>(codeBytes(), 0x66, 0xF0, 0x0F, 0xC1, 0x35, 0x00,
+                            0xFF, 0x01, 0x00);
+  reset();
+
+  // 32-bit operand, unlocked then locked.
+  __ xadd(IceType_i32, Address::Absolute(0x1FF00),
+          GPRRegister::Encoded_Reg_esi, NotLocked);
+  ASSERT_EQ(PlainSize32, codeBytesSize());
+  verifyBytes<PlainSize32>(codeBytes(), 0x0F, 0xC1, 0x35, 0x00, 0xFF, 0x01,
+                           0x00);
+  reset();
+
+  __ xadd(IceType_i32, Address::Absolute(0x1FF00),
+          GPRRegister::Encoded_Reg_esi, Locked);
+  ASSERT_EQ(LockedSize32, codeBytesSize());
+  verifyBytes<LockedSize32>(codeBytes(), 0xF0, 0x0F, 0xC1, 0x35, 0x00, 0xFF,
+                            0x01, 0x00);
+  reset();
+}
+
+TEST_F(AssemblerX8632Test, Cmpxchg8b) {
+  static constexpr bool NotLocked = false;
+  static constexpr bool Locked = true;
+  // TestImpl: cmpxchg8b on ValueMem with edx:eax = Value0, ecx:ebx = Value1.
+#define TestImpl(Value0, Value1, ValueMem, LockedOrNot)                        \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Value0 ", " #Value1 ", " #ValueMem ", " #LockedOrNot ")";         \
+    const uint32_t T0 = allocateQword();                                       \
+    static constexpr uint64_t V0 = ValueMem;                                   \
+    const uint32_t ZeroFlag = allocateDword();                                 \
+    /* Comparand goes in edx:eax, replacement in ecx:ebx. */                   \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax,                          \
+           Immediate(uint64_t(Value0) & 0xFFFFFFFF));                          \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_edx,                          \
+           Immediate(uint64_t(Value0) >> 32));                                 \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_ebx,                          \
+           Immediate(uint64_t(Value1) & 0xFFFFFFFF));                          \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_ecx,                          \
+           Immediate(uint64_t(Value1) >> 32));                                 \
+    __ cmpxchg8b(dwordAddress(T0), LockedOrNot);                               \
+    __ setcc(Cond::Br_e, dwordAddress(ZeroFlag));                              \
+    /* ZeroFlag starts at 0xFF; setcc must overwrite it with 0 or 1. */        \
+    AssembledTest test = assemble();                                           \
+    test.setQwordTo(T0, V0);                                                   \
+    test.setDwordTo(ZeroFlag, uint32_t(0xFF));                                 \
+    test.run();                                                                \
+    /* On a mismatch memory must be untouched and edx:eax receive it. */       \
+    if (V0 == (Value0)) {                                                      \
+      ASSERT_EQ(uint64_t(Value1), test.contentsOfQword(T0)) << TestString;     \
+      ASSERT_EQ(1u, test.contentsOfDword(ZeroFlag)) << TestString;             \
+    } else {                                                                   \
+      ASSERT_EQ(V0, test.contentsOfQword(T0)) << TestString;                   \
+      ASSERT_EQ(uint64_t(ValueMem) & 0xFFFFFFFF, test.eax()) << TestString;    \
+      ASSERT_EQ(uint64_t(ValueMem) >> 32, test.edx()) << TestString;           \
+      ASSERT_EQ(0u, test.contentsOfDword(ZeroFlag)) << TestString;             \
+    }                                                                          \
+    reset();                                                                   \
+  } while (0)
+
+  TestImpl(0x98987676543210ull, 0x1, 0x98987676543210ull, NotLocked);
+  TestImpl(0x98987676543210ull, 0x1, 0x98987676543210ull, Locked);
+  TestImpl(0x98987676543210ull, 0x1, 0x98987676543211ull, NotLocked);
+  TestImpl(0x98987676543210ull, 0x1, 0x98987676543211ull, Locked);
+
+#undef TestImpl
+}
+
+TEST_F(AssemblerX8632LowLevelTest, Cmpxchg8b) {
+  static constexpr bool NotLocked = false;
+  static constexpr bool Locked = true;
+
+  // Unlocked form: expects 0F C7 0D followed by the 32-bit address 0x1FF00.
+  __ cmpxchg8b(Address::Absolute(0x1FF00), NotLocked);
+  static constexpr uint8_t ByteCountNotLocked = 7;
+  ASSERT_EQ(ByteCountNotLocked, codeBytesSize());
+  ASSERT_TRUE(verifyBytes<ByteCountNotLocked>(
+      codeBytes(), 0x0F, 0xC7, 0x0D, 0x00, 0xFF, 0x01, 0x00));
+  reset();
+  // Locked form: the same bytes preceded by a single 0xF0 lock prefix.
+  __ cmpxchg8b(Address::Absolute(0x1FF00), Locked);
+  static constexpr uint8_t ByteCountLocked = 1 + ByteCountNotLocked;
+  ASSERT_EQ(ByteCountLocked, codeBytesSize());
+  ASSERT_TRUE(verifyBytes<ByteCountLocked>(
+      codeBytes(), 0xF0, 0x0F, 0xC7, 0x0D, 0x00, 0xFF, 0x01, 0x00));
+  reset();
+}
+
+TEST_F(AssemblerX8632Test, Cmpxchg) {
+  static constexpr bool NotLocked = false;
+  static constexpr bool Locked = true;
+
+  static constexpr uint32_t Mask8 = 0x000000FF;
+  static constexpr uint32_t Mask16 = 0x0000FFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+  // TestImplAddrReg: cmpxchg [T0], Src with eax = Value0 and [T0] = ValueMem.
+#define TestImplAddrReg(Value0, Src, Value1, ValueMem, LockedOrNot, Size)      \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Value0 ", " #Src ", " #Value1 ", " #ValueMem ", " #LockedOrNot    \
+        ", " #Size ")";                                                        \
+    const uint32_t T0 = allocateDword();                                       \
+    static constexpr uint32_t V0 = (ValueMem)&Mask##Size;                      \
+    const uint32_t ZeroFlag = allocateDword();                                 \
+    /* eax is the implicit comparand; Src supplies the replacement. */         \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_eax,                      \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Src,                    \
+           Immediate((Value1)&Mask##Size));                                    \
+    __ cmpxchg(IceType_i##Size, dwordAddress(T0),                              \
+               GPRRegister::Encoded_Reg_##Src, LockedOrNot);                   \
+    __ setcc(Cond::Br_e, dwordAddress(ZeroFlag));                              \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.setDwordTo(ZeroFlag, uint32_t(0xFF));                                 \
+    test.run();                                                                \
+    if (V0 == (Mask##Size & (Value0))) {                                       \
+      ASSERT_EQ(uint32_t((Value1)&Mask##Size), test.contentsOfDword(T0))       \
+          << TestString;                                                       \
+      ASSERT_EQ(1u, test.contentsOfDword(ZeroFlag)) << TestString;             \
+    } else {                                                                   \
+      ASSERT_EQ(V0, test.contentsOfDword(T0)) << TestString;                   \
+      ASSERT_EQ(uint32_t((ValueMem)&Mask##Size), test.eax()) << TestString;    \
+      ASSERT_EQ(0u, test.contentsOfDword(ZeroFlag)) << TestString;             \
+    }                                                                          \
+    reset();                                                                   \
+  } while (0)
+  // TestImplValue: all sizes; the 8-bit case only for Src encodings below 4.
+#define TestImplValue(Value0, Src, Value1, ValueMem, LockedOrNot)              \
+  do {                                                                         \
+    if (GPRRegister::Encoded_Reg_##Src < 4) {                                  \
+      TestImplAddrReg(Value0, Src, Value1, ValueMem, LockedOrNot, 8);          \
+    }                                                                          \
+    TestImplAddrReg(Value0, Src, Value1, ValueMem, LockedOrNot, 16);           \
+    TestImplAddrReg(Value0, Src, Value1, ValueMem, LockedOrNot, 32);           \
+  } while (0)
+  // TestImpl: one matching and one mismatching comparand for a register.
+#define TestImpl(Src, LockedOrNot)                                             \
+  do {                                                                         \
+    TestImplValue(0xFFFFFFFF, Src, 0x1, 0xFFFFFFFF, LockedOrNot);              \
+    TestImplValue(0x0FFF0F0F, Src, 0x1, 0xFFFFFFFF, LockedOrNot);              \
+  } while (0)
+  // Every source register is covered both with and without the lock prefix.
+  TestImpl(ebx, Locked);
+  TestImpl(ebx, NotLocked);
+  TestImpl(ecx, Locked);
+  TestImpl(ecx, NotLocked);
+  TestImpl(edx, Locked);
+  TestImpl(edx, NotLocked);
+  TestImpl(esi, Locked);
+  TestImpl(esi, NotLocked);
+  TestImpl(edi, Locked);
+  TestImpl(edi, NotLocked);
+
+#undef TestImpl
+#undef TestImplValue
+#undef TestImplAddrReg
+}
+
+TEST_F(AssemblerX8632LowLevelTest, Cmpxchg) {
+  static constexpr bool NotLocked = false;
+  static constexpr bool Locked = true;
+  // For every operand size the Locked encoding must equal the NotLocked one
+  // plus a 0xF0 lock prefix (placed after the 0x66 prefix for 16-bit).
+  {
+    __ cmpxchg(IceType_i8, Address::Absolute(0x1FF00),
+               GPRRegister::Encoded_Reg_esi, NotLocked);
+    static constexpr uint8_t ByteCountNotLocked8 = 7;
+    ASSERT_EQ(ByteCountNotLocked8, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountNotLocked8>(
+        codeBytes(), 0x0F, 0xB0, 0x35, 0x00, 0xFF, 0x01, 0x00));
+    reset();
+
+    __ cmpxchg(IceType_i8, Address::Absolute(0x1FF00),
+               GPRRegister::Encoded_Reg_esi, Locked);
+    static constexpr uint8_t ByteCountLocked8 = 1 + ByteCountNotLocked8;
+    ASSERT_EQ(ByteCountLocked8, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountLocked8>(
+        codeBytes(), 0xF0, 0x0F, 0xB0, 0x35, 0x00, 0xFF, 0x01, 0x00));
+    reset();
+  }
+
+  {
+    __ cmpxchg(IceType_i16, Address::Absolute(0x1FF00),
+               GPRRegister::Encoded_Reg_esi, NotLocked);
+    static constexpr uint8_t ByteCountNotLocked16 = 8;
+    ASSERT_EQ(ByteCountNotLocked16, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountNotLocked16>(
+        codeBytes(), 0x66, 0x0F, 0xB1, 0x35, 0x00, 0xFF, 0x01, 0x00));
+    reset();
+
+    __ cmpxchg(IceType_i16, Address::Absolute(0x1FF00),
+               GPRRegister::Encoded_Reg_esi, Locked);
+    static constexpr uint8_t ByteCountLocked16 = 1 + ByteCountNotLocked16;
+    ASSERT_EQ(ByteCountLocked16, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountLocked16>(
+        codeBytes(), 0x66, 0xF0, 0x0F, 0xB1, 0x35, 0x00, 0xFF, 0x01, 0x00));
+    reset();
+  }
+
+  {
+    __ cmpxchg(IceType_i32, Address::Absolute(0x1FF00),
+               GPRRegister::Encoded_Reg_esi, NotLocked);
+    static constexpr uint8_t ByteCountNotLocked32 = 7;
+    ASSERT_EQ(ByteCountNotLocked32, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountNotLocked32>(
+        codeBytes(), 0x0F, 0xB1, 0x35, 0x00, 0xFF, 0x01, 0x00));
+    reset();
+
+    __ cmpxchg(IceType_i32, Address::Absolute(0x1FF00),
+               GPRRegister::Encoded_Reg_esi, Locked);
+    static constexpr uint8_t ByteCountLocked32 = 1 + ByteCountNotLocked32;
+    ASSERT_EQ(ByteCountLocked32, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountLocked32>(
+        codeBytes(), 0xF0, 0x0F, 0xB1, 0x35, 0x00, 0xFF, 0x01, 0x00));
+    reset();
+  }
+}
+
+} // end of anonymous namespace
+} // end of namespace Test
+} // end of namespace X8632
+} // end of namespace Ice
diff --git a/unittest/AssemblerX8632/LowLevel.cpp b/unittest/AssemblerX8632/LowLevel.cpp
new file mode 100644
index 0000000..7593b4c
--- /dev/null
+++ b/unittest/AssemblerX8632/LowLevel.cpp
@@ -0,0 +1,710 @@
+//===- subzero/unittest/AssemblerX8632/LowLevel.cpp -----------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AssemblerX8632/TestUtil.h"
+
+namespace Ice {
+namespace X8632 {
+namespace Test {
+namespace {
+
+TEST_F(AssemblerX8632LowLevelTest, Ret) {
+  // A bare ret must assemble to exactly one byte, 0xC3.
+  constexpr size_t ByteCount = 1;
+
+  __ ret();
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), 0xc3));
+}
+
+TEST_F(AssemblerX8632LowLevelTest, RetImm) {
+  // ret 0x20 must be 0xC2 followed by the immediate as two bytes, low first.
+  constexpr size_t ByteCount = 3;
+
+  __ ret(Immediate(0x20));
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), 0xC2, 0x20, 0x00));
+}
+
+TEST_F(AssemblerX8632LowLevelTest, CallImm4) {
+  // Expects opcode 0xE8 followed by four zero bytes for call(Immediate(4)).
+  __ call(Immediate(4));
+  constexpr size_t ByteCount = 5;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  ASSERT_TRUE(
+      verifyBytes<ByteCount>(codeBytes(), 0xe8, 0x00, 0x00, 0x00, 0x00));
+}
+
+TEST_F(AssemblerX8632LowLevelTest, PopRegs) {
+  // Each popl must encode as the single byte 0x58 | <register encoding>.
+  __ popl(GPRRegister::Encoded_Reg_eax);
+  __ popl(GPRRegister::Encoded_Reg_ebx);
+  __ popl(GPRRegister::Encoded_Reg_ecx);
+  __ popl(GPRRegister::Encoded_Reg_edx);
+  __ popl(GPRRegister::Encoded_Reg_edi);
+  __ popl(GPRRegister::Encoded_Reg_esi);
+  __ popl(GPRRegister::Encoded_Reg_ebp);
+  constexpr size_t ByteCount = 7;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  constexpr uint8_t PopOpcode = 0x58;
+  ASSERT_TRUE(verifyBytes<ByteCount>(
+      codeBytes(), PopOpcode | GPRRegister::Encoded_Reg_eax,
+      PopOpcode | GPRRegister::Encoded_Reg_ebx,
+      PopOpcode | GPRRegister::Encoded_Reg_ecx,
+      PopOpcode | GPRRegister::Encoded_Reg_edx,
+      PopOpcode | GPRRegister::Encoded_Reg_edi,
+      PopOpcode | GPRRegister::Encoded_Reg_esi,
+      PopOpcode | GPRRegister::Encoded_Reg_ebp));
+}
+
+TEST_F(AssemblerX8632LowLevelTest, PushRegs) {
+  // Each pushl must encode as the single byte 0x50 | <register encoding>.
+  __ pushl(GPRRegister::Encoded_Reg_eax);
+  __ pushl(GPRRegister::Encoded_Reg_ebx);
+  __ pushl(GPRRegister::Encoded_Reg_ecx);
+  __ pushl(GPRRegister::Encoded_Reg_edx);
+  __ pushl(GPRRegister::Encoded_Reg_edi);
+  __ pushl(GPRRegister::Encoded_Reg_esi);
+  __ pushl(GPRRegister::Encoded_Reg_ebp);
+  constexpr size_t ByteCount = 7;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  constexpr uint8_t PushOpcode = 0x50;
+  ASSERT_TRUE(verifyBytes<ByteCount>(
+      codeBytes(), PushOpcode | GPRRegister::Encoded_Reg_eax,
+      PushOpcode | GPRRegister::Encoded_Reg_ebx,
+      PushOpcode | GPRRegister::Encoded_Reg_ecx,
+      PushOpcode | GPRRegister::Encoded_Reg_edx,
+      PushOpcode | GPRRegister::Encoded_Reg_edi,
+      PushOpcode | GPRRegister::Encoded_Reg_esi,
+      PushOpcode | GPRRegister::Encoded_Reg_ebp));
+}
+
+TEST_F(AssemblerX8632LowLevelTest, MovRegisterZero) {
+  // Each mov r32, 0 must be 0xB8 | <register encoding> plus four zero bytes.
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(0x00));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ebx, Immediate(0x00));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ecx, Immediate(0x00));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edx, Immediate(0x00));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edi, Immediate(0x00));
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_esi, Immediate(0x00));
+  constexpr size_t MovReg32BitImmBytes = 5;
+  constexpr size_t ByteCount = 6 * MovReg32BitImmBytes;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  constexpr uint8_t MovOpcode = 0xb8;
+  ASSERT_TRUE(verifyBytes<ByteCount>(
+      codeBytes(),
+      MovOpcode | GPRRegister::Encoded_Reg_eax, 0x00, 0x00, 0x00, 0x00,
+      MovOpcode | GPRRegister::Encoded_Reg_ebx, 0x00, 0x00, 0x00, 0x00,
+      MovOpcode | GPRRegister::Encoded_Reg_ecx, 0x00, 0x00, 0x00, 0x00,
+      MovOpcode | GPRRegister::Encoded_Reg_edx, 0x00, 0x00, 0x00, 0x00,
+      MovOpcode | GPRRegister::Encoded_Reg_edi, 0x00, 0x00, 0x00, 0x00,
+      MovOpcode | GPRRegister::Encoded_Reg_esi, 0x00, 0x00, 0x00, 0x00));
+}
+
+TEST_F(AssemblerX8632LowLevelTest, Cmp) {
+#define TestRegReg(Inst, Dst, Src, OpType, ByteCountUntyped, ...)              \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Src ", " #OpType ", " #ByteCountUntyped      \
+        ",  " #__VA_ARGS__ ")";                                                \
+    static constexpr uint8_t ByteCount = ByteCountUntyped;                     \
+    __ Inst(IceType_##OpType, GPRRegister::Encoded_Reg_##Dst,                  \
+            GPRRegister::Encoded_Reg_##Src);                                   \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), __VA_ARGS__))              \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestRegImm(Inst, Dst, Imm, OpType, ByteCountUntyped, ...)              \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Imm ", " #OpType ", " #ByteCountUntyped      \
+        ",  " #__VA_ARGS__ ")";                                                \
+    static constexpr uint8_t ByteCount = ByteCountUntyped;                     \
+    __ Inst(IceType_##OpType, GPRRegister::Encoded_Reg_##Dst, Immediate(Imm)); \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), __VA_ARGS__))              \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestRegAbsoluteAddr(Inst, Dst, Disp, OpType, ByteCountUntyped, ...)    \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Disp ", " #OpType ", " #ByteCountUntyped     \
+        ",  " #__VA_ARGS__ ")";                                                \
+    static constexpr uint8_t ByteCount = ByteCountUntyped;                     \
+    __ Inst(IceType_##OpType, GPRRegister::Encoded_Reg_##Dst,                  \
+            Address(Address::ABSOLUTE, Disp));                                 \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), __VA_ARGS__))              \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestRegAddrBase(Inst, Dst, Base, Disp, OpType, ByteCountUntyped, ...)  \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Base ", " #Disp ", " #OpType                 \
+        ", " #ByteCountUntyped ",  " #__VA_ARGS__ ")";                         \
+    static constexpr uint8_t ByteCount = ByteCountUntyped;                     \
+    __ Inst(IceType_##OpType, GPRRegister::Encoded_Reg_##Dst,                  \
+            Address(GPRRegister::Encoded_Reg_##Base, Disp));                   \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), __VA_ARGS__))              \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestRegAddrScaledIndex(Inst, Dst, Index, Scale, Disp, OpType,          \
+                               ByteCountUntyped, ...)                          \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Index ", " #Scale ", " #Disp ", " #OpType    \
+        ", " #ByteCountUntyped ",  " #__VA_ARGS__ ")";                         \
+    static constexpr uint8_t ByteCount = ByteCountUntyped;                     \
+    __ Inst(IceType_##OpType, GPRRegister::Encoded_Reg_##Dst,                  \
+            Address(GPRRegister::Encoded_Reg_##Index, Traits::TIMES_##Scale,   \
+                    Disp));                                                    \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), __VA_ARGS__))              \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestRegAddrBaseScaledIndex(Inst, Dst, Base, Index, Scale, Disp,        \
+                                   OpType, ByteCountUntyped, ...)              \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Base ", " #Index ", " #Scale ", " #Disp      \
+        ", " #OpType ", " #ByteCountUntyped ",  " #__VA_ARGS__ ")";            \
+    static constexpr uint8_t ByteCount = ByteCountUntyped;                     \
+    __ Inst(IceType_##OpType, GPRRegister::Encoded_Reg_##Dst,                  \
+            Address(GPRRegister::Encoded_Reg_##Base,                           \
+                    GPRRegister::Encoded_Reg_##Index, Traits::TIMES_##Scale,   \
+                    Disp));                                                    \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), __VA_ARGS__))              \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestAddrBaseScaledIndexImm(Inst, Base, Index, Scale, Disp, Imm,        \
+                                   OpType, ByteCountUntyped, ...)              \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Base ", " #Index ", " #Scale ", " #Disp ", " #Imm      \
+        ", " #OpType ", " #ByteCountUntyped ",  " #__VA_ARGS__ ")";            \
+    static constexpr uint8_t ByteCount = ByteCountUntyped;                     \
+    __ Inst(IceType_##OpType, Address(GPRRegister::Encoded_Reg_##Base,         \
+                                      GPRRegister::Encoded_Reg_##Index,        \
+                                      Traits::TIMES_##Scale, Disp),            \
+            Immediate(Imm));                                                   \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), __VA_ARGS__))              \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestAddrBaseScaledIndexReg(Inst, Base, Index, Scale, Disp, Src,        \
+                                   OpType, ByteCountUntyped, ...)              \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Base ", " #Index ", " #Scale ", " #Disp ", " #Src      \
+        ", " #OpType ", " #ByteCountUntyped ",  " #__VA_ARGS__ ")";            \
+    static constexpr uint8_t ByteCount = ByteCountUntyped;                     \
+    __ Inst(IceType_##OpType, Address(GPRRegister::Encoded_Reg_##Base,         \
+                                      GPRRegister::Encoded_Reg_##Index,        \
+                                      Traits::TIMES_##Scale, Disp),            \
+            GPRRegister::Encoded_Reg_##Src);                                   \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), __VA_ARGS__))              \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+  /* cmp GPR, GPR */
+  TestRegReg(cmp, eax, ecx, i32, 2, 0x3B, 0xC1);
+  TestRegReg(cmp, ecx, edx, i32, 2, 0x3B, 0xCA);
+  TestRegReg(cmp, edx, ebx, i32, 2, 0x3B, 0xD3);
+  TestRegReg(cmp, ebx, esp, i32, 2, 0x3B, 0xDC);
+  TestRegReg(cmp, esp, ebp, i32, 2, 0x3B, 0xE5);
+  TestRegReg(cmp, ebp, esi, i32, 2, 0x3B, 0xEE);
+  TestRegReg(cmp, esi, edi, i32, 2, 0x3B, 0xF7);
+  TestRegReg(cmp, edi, eax, i32, 2, 0x3B, 0xF8);
+
+  TestRegReg(cmp, eax, ecx, i16, 3, 0x66, 0x3B, 0xC1);
+  TestRegReg(cmp, ecx, edx, i16, 3, 0x66, 0x3B, 0xCA);
+  TestRegReg(cmp, edx, ebx, i16, 3, 0x66, 0x3B, 0xD3);
+  TestRegReg(cmp, ebx, esp, i16, 3, 0x66, 0x3B, 0xDC);
+  TestRegReg(cmp, esp, ebp, i16, 3, 0x66, 0x3B, 0xE5);
+  TestRegReg(cmp, ebp, esi, i16, 3, 0x66, 0x3B, 0xEE);
+  TestRegReg(cmp, esi, edi, i16, 3, 0x66, 0x3B, 0xF7);
+  TestRegReg(cmp, edi, eax, i16, 3, 0x66, 0x3B, 0xF8);
+
+  TestRegReg(cmp, eax, ecx, i8, 2, 0x3A, 0xC1);
+  TestRegReg(cmp, ecx, edx, i8, 2, 0x3A, 0xCA);
+  TestRegReg(cmp, edx, ebx, i8, 2, 0x3A, 0xD3);
+  TestRegReg(cmp, ebx, esp, i8, 2, 0x3A, 0xDC);
+  TestRegReg(cmp, esp, ebp, i8, 2, 0x3A, 0xE5);
+  TestRegReg(cmp, ebp, esi, i8, 2, 0x3A, 0xEE);
+  TestRegReg(cmp, esi, edi, i8, 2, 0x3A, 0xF7);
+  TestRegReg(cmp, edi, eax, i8, 2, 0x3A, 0xF8);
+
+  /* cmp GPR, Imm8 */
+  TestRegImm(cmp, eax, 5, i32, 3, 0x83, 0xF8, 0x05);
+  TestRegImm(cmp, ecx, 5, i32, 3, 0x83, 0xF9, 0x05);
+  TestRegImm(cmp, edx, 5, i32, 3, 0x83, 0xFA, 0x05);
+  TestRegImm(cmp, ebx, 5, i32, 3, 0x83, 0xFB, 0x05);
+  TestRegImm(cmp, esp, 5, i32, 3, 0x83, 0xFC, 0x05);
+  TestRegImm(cmp, ebp, 5, i32, 3, 0x83, 0xFD, 0x05);
+  TestRegImm(cmp, esi, 5, i32, 3, 0x83, 0xFE, 0x05);
+  TestRegImm(cmp, edi, 5, i32, 3, 0x83, 0xFF, 0x05);
+
+  TestRegImm(cmp, eax, 5, i16, 4, 0x66, 0x83, 0xF8, 0x05);
+  TestRegImm(cmp, ecx, 5, i16, 4, 0x66, 0x83, 0xF9, 0x05);
+  TestRegImm(cmp, edx, 5, i16, 4, 0x66, 0x83, 0xFA, 0x05);
+  TestRegImm(cmp, ebx, 5, i16, 4, 0x66, 0x83, 0xFB, 0x05);
+  TestRegImm(cmp, esp, 5, i16, 4, 0x66, 0x83, 0xFC, 0x05);
+  TestRegImm(cmp, ebp, 5, i16, 4, 0x66, 0x83, 0xFD, 0x05);
+  TestRegImm(cmp, esi, 5, i16, 4, 0x66, 0x83, 0xFE, 0x05);
+  TestRegImm(cmp, edi, 5, i16, 4, 0x66, 0x83, 0xFF, 0x05);
+
+  TestRegImm(cmp, eax, 5, i8, 2, 0x3C, 0x05);
+  TestRegImm(cmp, ecx, 5, i8, 3, 0x80, 0xF9, 0x05);
+  TestRegImm(cmp, edx, 5, i8, 3, 0x80, 0xFA, 0x05);
+  TestRegImm(cmp, ebx, 5, i8, 3, 0x80, 0xFB, 0x05);
+  TestRegImm(cmp, esp, 5, i8, 3, 0x80, 0xFC, 0x05);
+  TestRegImm(cmp, ebp, 5, i8, 3, 0x80, 0xFD, 0x05);
+  TestRegImm(cmp, esi, 5, i8, 3, 0x80, 0xFE, 0x05);
+  TestRegImm(cmp, edi, 5, i8, 3, 0x80, 0xFF, 0x05);
+
+  /* cmp GPR, Imm16 */
+  TestRegImm(cmp, eax, 0x100, i32, 5, 0x3D, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, ecx, 0x100, i32, 6, 0x81, 0xF9, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, edx, 0x100, i32, 6, 0x81, 0xFA, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, ebx, 0x100, i32, 6, 0x81, 0xFB, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, esp, 0x100, i32, 6, 0x81, 0xFC, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, ebp, 0x100, i32, 6, 0x81, 0xFD, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, esi, 0x100, i32, 6, 0x81, 0xFE, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, edi, 0x100, i32, 6, 0x81, 0xFF, 0x00, 0x01, 0x00, 0x00);
+
+  TestRegImm(cmp, eax, 0x100, i16, 4, 0x66, 0x3D, 0x00, 0x01);
+  TestRegImm(cmp, ecx, 0x100, i16, 5, 0x66, 0x81, 0xF9, 0x00, 0x01);
+  TestRegImm(cmp, edx, 0x100, i16, 5, 0x66, 0x81, 0xFA, 0x00, 0x01);
+  TestRegImm(cmp, ebx, 0x100, i16, 5, 0x66, 0x81, 0xFB, 0x00, 0x01);
+  TestRegImm(cmp, esp, 0x100, i16, 5, 0x66, 0x81, 0xFC, 0x00, 0x01);
+  TestRegImm(cmp, ebp, 0x100, i16, 5, 0x66, 0x81, 0xFD, 0x00, 0x01);
+  TestRegImm(cmp, esi, 0x100, i16, 5, 0x66, 0x81, 0xFE, 0x00, 0x01);
+  TestRegImm(cmp, edi, 0x100, i16, 5, 0x66, 0x81, 0xFF, 0x00, 0x01);
+
+  /* cmp GPR, Absolute */
+  TestRegAbsoluteAddr(cmp, eax, 0xF00FBEEF, i32, 6, 0x3B, 0x05, 0xEF, 0xBE,
+                      0x0F, 0xF0);
+  TestRegAbsoluteAddr(cmp, eax, 0xF00FBEEF, i16, 7, 0x66, 0x3B, 0x05, 0xEF,
+                      0xBE, 0x0F, 0xF0);
+  TestRegAbsoluteAddr(cmp, eax, 0xF00FBEEF, i8, 6, 0x3A, 0x05, 0xEF, 0xBE, 0x0F,
+                      0xF0);
+
+  /* cmp GPR, 0(Base) */
+  TestRegAddrBase(cmp, eax, ecx, 0, i32, 2, 0x3B, 0x01);
+  TestRegAddrBase(cmp, ecx, edx, 0, i32, 2, 0x3B, 0x0A);
+  TestRegAddrBase(cmp, edx, ebx, 0, i32, 2, 0x3B, 0x13);
+  TestRegAddrBase(cmp, ebx, esp, 0, i32, 3, 0x3B, 0x1C, 0x24);
+  TestRegAddrBase(cmp, esp, ebp, 0, i32, 3, 0x3B, 0x65, 0x00);
+  TestRegAddrBase(cmp, ebp, esi, 0, i32, 2, 0x3B, 0x2E);
+  TestRegAddrBase(cmp, esi, edi, 0, i32, 2, 0x3B, 0x37);
+  TestRegAddrBase(cmp, edi, eax, 0, i32, 2, 0x3B, 0x38);
+
+  TestRegAddrBase(cmp, eax, ecx, 0, i16, 3, 0x66, 0x3B, 0x01);
+  TestRegAddrBase(cmp, ecx, edx, 0, i16, 3, 0x66, 0x3B, 0x0A);
+  TestRegAddrBase(cmp, edx, ebx, 0, i16, 3, 0x66, 0x3B, 0x13);
+  TestRegAddrBase(cmp, ebx, esp, 0, i16, 4, 0x66, 0x3B, 0x1C, 0x24);
+  TestRegAddrBase(cmp, esp, ebp, 0, i16, 4, 0x66, 0x3B, 0x65, 0x00);
+  TestRegAddrBase(cmp, ebp, esi, 0, i16, 3, 0x66, 0x3B, 0x2E);
+  TestRegAddrBase(cmp, esi, edi, 0, i16, 3, 0x66, 0x3B, 0x37);
+  TestRegAddrBase(cmp, edi, eax, 0, i16, 3, 0x66, 0x3B, 0x38);
+
+  TestRegAddrBase(cmp, eax, ecx, 0, i8, 2, 0x3A, 0x01);
+  TestRegAddrBase(cmp, ecx, edx, 0, i8, 2, 0x3A, 0x0A);
+  TestRegAddrBase(cmp, edx, ebx, 0, i8, 2, 0x3A, 0x13);
+  TestRegAddrBase(cmp, ebx, esp, 0, i8, 3, 0x3A, 0x1C, 0x24);
+  TestRegAddrBase(cmp, esp, ebp, 0, i8, 3, 0x3A, 0x65, 0x00);
+  TestRegAddrBase(cmp, ebp, esi, 0, i8, 2, 0x3A, 0x2E);
+  TestRegAddrBase(cmp, esi, edi, 0, i8, 2, 0x3A, 0x37);
+  TestRegAddrBase(cmp, edi, eax, 0, i8, 2, 0x3A, 0x38);
+
+  /* cmp GPR, Imm8(Base) */
+  TestRegAddrBase(cmp, eax, ecx, 0x40, i32, 3, 0x3B, 0x41, 0x40);
+  TestRegAddrBase(cmp, ecx, edx, 0x40, i32, 3, 0x3B, 0x4A, 0x40);
+  TestRegAddrBase(cmp, edx, ebx, 0x40, i32, 3, 0x3B, 0x53, 0x40);
+  TestRegAddrBase(cmp, ebx, esp, 0x40, i32, 4, 0x3B, 0x5C, 0x24, 0x40);
+  TestRegAddrBase(cmp, esp, ebp, 0x40, i32, 3, 0x3B, 0x65, 0x40);
+  TestRegAddrBase(cmp, ebp, esi, 0x40, i32, 3, 0x3B, 0x6E, 0x40);
+  TestRegAddrBase(cmp, esi, edi, 0x40, i32, 3, 0x3B, 0x77, 0x40);
+  TestRegAddrBase(cmp, edi, eax, 0x40, i32, 3, 0x3B, 0x78, 0x40);
+
+  TestRegAddrBase(cmp, eax, ecx, 0x40, i16, 4, 0x66, 0x3B, 0x41, 0x40);
+  TestRegAddrBase(cmp, ecx, edx, 0x40, i16, 4, 0x66, 0x3B, 0x4A, 0x40);
+  TestRegAddrBase(cmp, edx, ebx, 0x40, i16, 4, 0x66, 0x3B, 0x53, 0x40);
+  TestRegAddrBase(cmp, ebx, esp, 0x40, i16, 5, 0x66, 0x3B, 0x5C, 0x24, 0x40);
+  TestRegAddrBase(cmp, esp, ebp, 0x40, i16, 4, 0x66, 0x3B, 0x65, 0x40);
+  TestRegAddrBase(cmp, ebp, esi, 0x40, i16, 4, 0x66, 0x3B, 0x6E, 0x40);
+  TestRegAddrBase(cmp, esi, edi, 0x40, i16, 4, 0x66, 0x3B, 0x77, 0x40);
+  TestRegAddrBase(cmp, edi, eax, 0x40, i16, 4, 0x66, 0x3B, 0x78, 0x40);
+
+  TestRegAddrBase(cmp, eax, ecx, 0x40, i8, 3, 0x3A, 0x41, 0x40);
+  TestRegAddrBase(cmp, ecx, edx, 0x40, i8, 3, 0x3A, 0x4A, 0x40);
+  TestRegAddrBase(cmp, edx, ebx, 0x40, i8, 3, 0x3A, 0x53, 0x40);
+  TestRegAddrBase(cmp, ebx, esp, 0x40, i8, 4, 0x3A, 0x5C, 0x24, 0x40);
+  TestRegAddrBase(cmp, esp, ebp, 0x40, i8, 3, 0x3A, 0x65, 0x40);
+  TestRegAddrBase(cmp, ebp, esi, 0x40, i8, 3, 0x3A, 0x6E, 0x40);
+  TestRegAddrBase(cmp, esi, edi, 0x40, i8, 3, 0x3A, 0x77, 0x40);
+  TestRegAddrBase(cmp, edi, eax, 0x40, i8, 3, 0x3A, 0x78, 0x40);
+
+  /* cmp GPR, Imm32(Base) */
+  TestRegAddrBase(cmp, eax, ecx, 0xF0, i32, 6, 0x3B, 0x81, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, ecx, edx, 0xF0, i32, 6, 0x3B, 0x8A, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, edx, ebx, 0xF0, i32, 6, 0x3B, 0x93, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, ebx, esp, 0xF0, i32, 7, 0x3B, 0x9C, 0x24, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, esp, ebp, 0xF0, i32, 6, 0x3B, 0xA5, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, ebp, esi, 0xF0, i32, 6, 0x3B, 0xAE, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, esi, edi, 0xF0, i32, 6, 0x3B, 0xB7, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, edi, eax, 0xF0, i32, 6, 0x3B, 0xB8, 0xF0, 0x00, 0x00,
+                  0x00);
+
+  TestRegAddrBase(cmp, eax, ecx, 0xF0, i16, 7, 0x66, 0x3B, 0x81, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, ecx, edx, 0xF0, i16, 7, 0x66, 0x3B, 0x8A, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, edx, ebx, 0xF0, i16, 7, 0x66, 0x3B, 0x93, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, ebx, esp, 0xF0, i16, 8, 0x66, 0x3B, 0x9C, 0x24, 0xF0,
+                  0x00, 0x00, 0x00);
+  TestRegAddrBase(cmp, esp, ebp, 0xF0, i16, 7, 0x66, 0x3B, 0xa5, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, ebp, esi, 0xF0, i16, 7, 0x66, 0x3B, 0xaE, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, esi, edi, 0xF0, i16, 7, 0x66, 0x3B, 0xb7, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, edi, eax, 0xF0, i16, 7, 0x66, 0x3B, 0xb8, 0xF0, 0x00,
+                  0x00, 0x00);
+
+  TestRegAddrBase(cmp, eax, ecx, 0xF0, i8, 6, 0x3A, 0x81, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, ecx, edx, 0xF0, i8, 6, 0x3A, 0x8A, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, edx, ebx, 0xF0, i8, 6, 0x3A, 0x93, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, ebx, esp, 0xF0, i8, 7, 0x3A, 0x9C, 0x24, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, esp, ebp, 0xF0, i8, 6, 0x3A, 0xA5, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, ebp, esi, 0xF0, i8, 6, 0x3A, 0xAE, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, esi, edi, 0xF0, i8, 6, 0x3A, 0xB7, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, edi, eax, 0xF0, i8, 6, 0x3A, 0xB8, 0xF0, 0x00, 0x00,
+                  0x00);
+
+  /* cmp GPR, Imm(,Index,Scale) */
+  TestRegAddrScaledIndex(cmp, eax, ecx, 1, 0, i32, 7, 0x3B, 0x04, 0x0D, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ecx, edx, 2, 0, i32, 7, 0x3B, 0x0C, 0x55, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, edx, ebx, 4, 0, i32, 7, 0x3B, 0x14, 0x9D, 0x00,
+                         0x00, 0x00, 0x00);
+  // esp cannot be a scaled index.
+  TestRegAddrScaledIndex(cmp, esp, ebp, 8, 0, i32, 7, 0x3B, 0x24, 0xED, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ebp, esi, 1, 0, i32, 7, 0x3B, 0x2C, 0x35, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, esi, edi, 2, 0, i32, 7, 0x3B, 0x34, 0x7D, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, edi, eax, 4, 0, i32, 7, 0x3B, 0x3C, 0x85, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ebx, ecx, 8, 0, i32, 7, 0x3B, 0x1C, 0xCD, 0x00,
+                         0x00, 0x00, 0x00);
+
+  TestRegAddrScaledIndex(cmp, eax, ecx, 8, 0, i16, 8, 0x66, 0x3B, 0x04, 0xCD,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ecx, edx, 1, 0, i16, 8, 0x66, 0x3B, 0x0C, 0x15,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, edx, ebx, 2, 0, i16, 8, 0x66, 0x3B, 0x14, 0x5D,
+                         0x00, 0x00, 0x00, 0x00);
+  // esp cannot be a scaled index.
+  TestRegAddrScaledIndex(cmp, esp, ebp, 4, 0, i16, 8, 0x66, 0x3B, 0x24, 0xAD,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ebp, esi, 8, 0, i16, 8, 0x66, 0x3B, 0x2C, 0xF5,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, esi, edi, 1, 0, i16, 8, 0x66, 0x3B, 0x34, 0x3D,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, edi, eax, 2, 0, i16, 8, 0x66, 0x3B, 0x3C, 0x45,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ebx, ecx, 8, 0, i16, 8, 0x66, 0x3B, 0x1C, 0xCD,
+                         0x00, 0x00, 0x00, 0x00);
+
+  TestRegAddrScaledIndex(cmp, eax, ecx, 4, 0, i8, 7, 0x3A, 0x04, 0x8D, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ecx, edx, 8, 0, i8, 7, 0x3A, 0x0C, 0xD5, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, edx, ebx, 1, 0, i8, 7, 0x3A, 0x14, 0x1D, 0x00,
+                         0x00, 0x00, 0x00);
+  // esp cannot be a scaled index.
+  TestRegAddrScaledIndex(cmp, esp, ebp, 2, 0, i8, 7, 0x3A, 0x24, 0x6D, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ebp, esi, 4, 0, i8, 7, 0x3A, 0x2C, 0xB5, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, esi, edi, 8, 0, i8, 7, 0x3A, 0x34, 0xFD, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, edi, eax, 1, 0, i8, 7, 0x3A, 0x3C, 0x05, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ebx, ecx, 8, 0, i8, 7, 0x3a, 0x1C, 0xCD, 0x00,
+                         0x00, 0x00, 0x00);
+
+  /* cmp GPR, 0(Base,Index,Scale) */
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0, i32, 3, 0x3B, 0x04,
+                             0x11);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0, i32, 3, 0x3B, 0x0C,
+                             0x5A);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0, i32, 3, 0x3B, 0x1C,
+                             0xAC);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0, i32, 4, 0x3B, 0x64, 0xF5,
+                             0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0, i32, 3, 0x3B, 0x2C,
+                             0x3E);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0, i32, 3, 0x3B, 0x34,
+                             0x47);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0, i32, 3, 0x3B, 0x3C,
+                             0x98);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0, i32, 3, 0x3B, 0x1C,
+                             0xD1);
+
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0, i16, 4, 0x66, 0x3B, 0x04,
+                             0x11);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0, i16, 4, 0x66, 0x3B, 0x0C,
+                             0x5A);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0, i16, 4, 0x66, 0x3B, 0x1C,
+                             0xAC);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0, i16, 5, 0x66, 0x3B, 0x64,
+                             0xF5, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0, i16, 4, 0x66, 0x3B, 0x2C,
+                             0x3E);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0, i16, 4, 0x66, 0x3B, 0x34,
+                             0x47);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0, i16, 4, 0x66, 0x3B, 0x3C,
+                             0x98);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0, i16, 4, 0x66, 0x3B, 0x1C,
+                             0xD1);
+
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0, i8, 3, 0x3A, 0x04, 0x11);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0, i8, 3, 0x3A, 0x0C, 0x5A);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0, i8, 3, 0x3A, 0x1C, 0xAC);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0, i8, 4, 0x3A, 0x64, 0xF5,
+                             0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0, i8, 3, 0x3A, 0x2C, 0x3E);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0, i8, 3, 0x3A, 0x34, 0x47);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0, i8, 3, 0x3A, 0x3C, 0x98);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0, i8, 3, 0x3A, 0x1C, 0xD1);
+
+  /* cmp GPR, Imm8(Base,Index,Scale) */
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0x40, i32, 4, 0x3B, 0x44,
+                             0x11, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0x40, i32, 4, 0x3B, 0x4C,
+                             0x5A, 0x40);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0x40, i32, 4, 0x3B, 0x5C,
+                             0xAC, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0x40, i32, 4, 0x3B, 0x64,
+                             0xF5, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0x40, i32, 4, 0x3B, 0x6C,
+                             0x3E, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0x40, i32, 4, 0x3B, 0x74,
+                             0x47, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0x40, i32, 4, 0x3B, 0x7C,
+                             0x98, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0x40, i32, 4, 0x3B, 0x5C,
+                             0xD1, 0x40);
+
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0x40, i16, 5, 0x66, 0x3B,
+                             0x44, 0x11, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0x40, i16, 5, 0x66, 0x3B,
+                             0x4C, 0x5A, 0x40);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0x40, i16, 5, 0x66, 0x3B,
+                             0x5C, 0xAC, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0x40, i16, 5, 0x66, 0x3B,
+                             0x64, 0xF5, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0x40, i16, 5, 0x66, 0x3B,
+                             0x6C, 0x3E, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0x40, i16, 5, 0x66, 0x3B,
+                             0x74, 0x47, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0x40, i16, 5, 0x66, 0x3B,
+                             0x7C, 0x98, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0x40, i16, 5, 0x66, 0x3B,
+                             0x5C, 0xD1, 0x40);
+
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0x40, i8, 4, 0x3A, 0x44,
+                             0x11, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0x40, i8, 4, 0x3A, 0x4C,
+                             0x5A, 0x40);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0x40, i8, 4, 0x3A, 0x5C,
+                             0xAC, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0x40, i8, 4, 0x3A, 0x64,
+                             0xF5, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0x40, i8, 4, 0x3A, 0x6C,
+                             0x3E, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0x40, i8, 4, 0x3A, 0x74,
+                             0x47, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0x40, i8, 4, 0x3A, 0x7C,
+                             0x98, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0x40, i8, 4, 0x3A, 0x5C,
+                             0xD1, 0x40);
+
+  /* cmp GPR, Imm32(Base,Index,Scale) */
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0xF0, i32, 7, 0x3B, 0x84,
+                             0x11, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0xF0, i32, 7, 0x3B, 0x8C,
+                             0x5A, 0xF0, 0x00, 0x00, 0x00);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0xF0, i32, 7, 0x3B, 0x9C,
+                             0xAC, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0xF0, i32, 7, 0x3B, 0xA4,
+                             0xF5, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0xF0, i32, 7, 0x3B, 0xAC,
+                             0x3E, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0xF0, i32, 7, 0x3B, 0xB4,
+                             0x47, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0xF0, i32, 7, 0x3B, 0xBC,
+                             0x98, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0xF0, i32, 7, 0x3B, 0x9C,
+                             0xD1, 0xF0, 0x00, 0x00, 0x00);
+
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0xF0, i16, 8, 0x66, 0x3B,
+                             0x84, 0x11, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0xF0, i16, 8, 0x66, 0x3B,
+                             0x8C, 0x5A, 0xF0, 0x00, 0x00, 0x00);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0xF0, i16, 8, 0x66, 0x3B,
+                             0x9C, 0xAC, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0xF0, i16, 8, 0x66, 0x3B,
+                             0xA4, 0xF5, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0xF0, i16, 8, 0x66, 0x3B,
+                             0xAC, 0x3E, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0xF0, i16, 8, 0x66, 0x3B,
+                             0xB4, 0x47, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0xF0, i16, 8, 0x66, 0x3B,
+                             0xBC, 0x98, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0xF0, i16, 8, 0x66, 0x3B,
+                             0x9C, 0xD1, 0xF0, 0x00, 0x00, 0x00);
+
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0xF0, i8, 7, 0x3A, 0x84,
+                             0x11, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0xF0, i8, 7, 0x3A, 0x8C,
+                             0x5A, 0xF0, 0x00, 0x00, 0x00);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0xF0, i8, 7, 0x3A, 0x9C,
+                             0xAC, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0xF0, i8, 7, 0x3A, 0xA4,
+                             0xF5, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0xF0, i8, 7, 0x3A, 0xAC,
+                             0x3E, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0xF0, i8, 7, 0x3A, 0xB4,
+                             0x47, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0xF0, i8, 7, 0x3A, 0xBC,
+                             0x98, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0xF0, i8, 7, 0x3A, 0x9C,
+                             0xD1, 0xF0, 0x00, 0x00, 0x00);
+
+  /* cmp Addr, Imm */
+  // Note: at this point we trust the assembler knows how to encode addresses,
+  // so no more exhaustive addressing mode testing.
+  TestAddrBaseScaledIndexImm(cmp, eax, ecx, 1, 0xF0, 0x12, i32, 8, 0x83, 0xBC,
+                             0x08, 0xF0, 0x00, 0x00, 0x00, 0x12);
+  TestAddrBaseScaledIndexImm(cmp, ecx, edx, 1, 0xF0, 0xF0, i32, 11, 0x81, 0xBC,
+                             0x11, 0xF0, 0x00, 0x00, 0x00, 0xF0, 0x00, 0x00,
+                             0x00);
+
+  TestAddrBaseScaledIndexImm(cmp, eax, ecx, 1, 0xF0, 0x12, i16, 9, 0x66, 0x83,
+                             0xBC, 0x08, 0xF0, 0x00, 0x00, 0x00, 0x12);
+  TestAddrBaseScaledIndexImm(cmp, ecx, edx, 1, 0xF0, 0xF0, i16, 10, 0x66, 0x81,
+                             0xBC, 0x11, 0xF0, 0x00, 0x00, 0x00, 0xF0, 0x00);
+
+  TestAddrBaseScaledIndexImm(cmp, eax, ecx, 1, 0xF0, 0x12, i8, 8, 0x80, 0xBC,
+                             0x08, 0xF0, 0x00, 0x00, 0x00, 0x12);
+
+  /* cmp Addr, GPR */
+  TestAddrBaseScaledIndexReg(cmp, eax, ecx, 1, 0xF0, edx, i32, 7, 0x39, 0x94,
+                             0x08, 0xF0, 0x00, 0x00, 0x00);
+
+  TestAddrBaseScaledIndexReg(cmp, eax, ecx, 1, 0xF0, edx, i16, 8, 0x66, 0x39,
+                             0x94, 0x08, 0xF0, 0x00, 0x00, 0x00);
+
+  TestAddrBaseScaledIndexReg(cmp, eax, ecx, 1, 0xF0, edx, i8, 7, 0x38, 0x94,
+                             0x08, 0xF0, 0x00, 0x00, 0x00);
+
+#undef TestAddrBaseScaledIndexReg
+#undef TestAddrBaseScaledIndexImm
+#undef TestRegAddrBaseScaledIndex
+#undef TestRegAddrScaledIndex
+#undef TestRegAddrBase
+#undef TestRegAbsoluteAddr
+#undef TestRegImm
+#undef TestRegReg
+}
+
+TEST_F(AssemblerX8632Test, ScratchpadGettersAndSetters) {
+  const uint32_t D0 = allocateDword();
+  const uint32_t D1 = allocateDword();
+  const uint32_t D2 = allocateDword();
+  const uint32_t D3 = allocateDword();
+  AssembledTest Scratch = assemble();
+  Scratch.setDwordTo(D0, 0xBEEF0000u);
+  Scratch.setDwordTo(D1, 0xDEADu);
+  Scratch.setDwordTo(D2, 0x20406080u);
+  ASSERT_EQ(0xBEEF0000u, Scratch.contentsOfDword(D0));
+  ASSERT_EQ(0xDEADu, Scratch.contentsOfDword(D1));
+  ASSERT_EQ(0x20406080u, Scratch.contentsOfDword(D2));
+  ASSERT_EQ(0xDEADBEEF0000ull, Scratch.contentsOfQword(D0));
+  ASSERT_EQ(0x204060800000DEADull, Scratch.contentsOfQword(D1));
+  // A qword store at D1 is visible as dwords D1 (low) and D2 (high).
+  Scratch.setQwordTo(D1, 0x1234567890ABCDEFull);
+  ASSERT_EQ(0x1234567890ABCDEFull, Scratch.contentsOfQword(D1));
+  Scratch.setDwordTo(D0, 0xBEEF0000u);
+  ASSERT_EQ(0x90ABCDEFull, Scratch.contentsOfDword(D1));
+  ASSERT_EQ(0x12345678ull, Scratch.contentsOfDword(D2));
+  // Floating point payloads go through the same dword/qword accessors.
+  Scratch.setDwordTo(D0, 1.0f);
+  ASSERT_FLOAT_EQ(1.0f, Scratch.contentsOfDword<float>(D0));
+  Scratch.setQwordTo(D0, 3.14);
+  ASSERT_DOUBLE_EQ(3.14, Scratch.contentsOfQword<double>(D0));
+  // A dqword written at D0 covers the four dwords D0..D3, in that order.
+  Scratch.setDqwordTo(D0, Dqword(1.0f, 2.0f, 3.0f, 4.0f));
+  ASSERT_EQ(Dqword(1.0f, 2.0f, 3.0f, 4.0f), Scratch.contentsOfDqword(D0));
+  EXPECT_FLOAT_EQ(1.0f, Scratch.contentsOfDword<float>(D0));
+  EXPECT_FLOAT_EQ(2.0f, Scratch.contentsOfDword<float>(D1));
+  EXPECT_FLOAT_EQ(3.0f, Scratch.contentsOfDword<float>(D2));
+  EXPECT_FLOAT_EQ(4.0f, Scratch.contentsOfDword<float>(D3));
+}
+
+} // end of anonymous namespace
+} // end of namespace Test
+} // end of namespace X8632
+} // end of namespace Ice
diff --git a/unittest/AssemblerX8632/Other.cpp b/unittest/AssemblerX8632/Other.cpp
new file mode 100644
index 0000000..0e8276c
--- /dev/null
+++ b/unittest/AssemblerX8632/Other.cpp
@@ -0,0 +1,84 @@
+//===- subzero/unittest/AssemblerX8632/Other.cpp --------------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AssemblerX8632/TestUtil.h"
+
+namespace Ice {
+namespace X8632 {
+namespace Test {
+namespace {
+
+TEST_F(AssemblerX8632LowLevelTest, Nop) {
+  // nop(Size) must emit exactly Size bytes with the encoding given per case.
+#define TestImpl(Size, ...)                                                    \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Size ", " #__VA_ARGS__ ")";      \
+    __ nop(Size);                                                              \
+    ASSERT_EQ(Size##u, codeBytesSize()) << TestString;                         \
+    ASSERT_TRUE(verifyBytes<Size>(codeBytes(), __VA_ARGS__)) << TestString;    \
+    reset();                                                                   \
+  } while (0)
+
+  TestImpl(1, 0x90);
+  TestImpl(2, 0x66, 0x90);
+  TestImpl(3, 0x0F, 0x1F, 0x00);
+  TestImpl(4, 0x0F, 0x1F, 0x40, 0x00);
+  TestImpl(5, 0x0F, 0x1F, 0x44, 0x00, 0x00);
+  TestImpl(6, 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00);
+  TestImpl(7, 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00);
+  TestImpl(8, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00);
+#undef TestImpl
+}
+
+TEST_F(AssemblerX8632LowLevelTest, Int3) {
+  // int3 must encode as the single byte 0xCC.
+  __ int3();
+  ASSERT_EQ(1u, codeBytesSize());
+  ASSERT_TRUE(verifyBytes<1>(codeBytes(), 0xCC));
+}
+
+TEST_F(AssemblerX8632LowLevelTest, Hlt) {
+  // hlt must encode as the single byte 0xF4.
+  __ hlt();
+  ASSERT_EQ(1u, codeBytesSize());
+  ASSERT_TRUE(verifyBytes<1>(codeBytes(), 0xF4));
+}
+
+TEST_F(AssemblerX8632LowLevelTest, Ud2) {
+  // ud2 must encode as the two bytes 0x0F 0x0B.
+  __ ud2();
+  ASSERT_EQ(2u, codeBytesSize());
+  ASSERT_TRUE(verifyBytes<2>(codeBytes(), 0x0F, 0x0B));
+}
+
+TEST_F(AssemblerX8632LowLevelTest, EmitSegmentOverride) {
+  // emitSegmentOverride(Prefix) must emit Prefix verbatim, as a single byte.
+#define TestImpl(Prefix)                                                       \
+  do {                                                                         \
+    __ emitSegmentOverride(Prefix);                                            \
+    ASSERT_EQ(1u, codeBytesSize()) << Prefix;                                  \
+    ASSERT_TRUE(verifyBytes<1>(codeBytes(), Prefix)) << Prefix;                \
+    reset();                                                                   \
+  } while (0)
+
+  TestImpl(0x26);
+  TestImpl(0x2E);
+  TestImpl(0x36);
+  TestImpl(0x3E);
+  TestImpl(0x64);
+  TestImpl(0x65);
+  TestImpl(0x66);
+  TestImpl(0x67);
+
+#undef TestImpl
+}
+
+} // end of anonymous namespace
+} // end of namespace Test
+} // end of namespace X8632
+} // end of namespace Ice
diff --git a/unittest/AssemblerX8632/TestUtil.h b/unittest/AssemblerX8632/TestUtil.h
new file mode 100644
index 0000000..190a5dd
--- /dev/null
+++ b/unittest/AssemblerX8632/TestUtil.h
@@ -0,0 +1,848 @@
+//===- subzero/unittest/AssemblerX8632/TestUtil.h ---------------*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility classes for testing the X8632 Assembler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ASSEMBLERX8632_TESTUTIL_H_
+#define ASSEMBLERX8632_TESTUTIL_H_
+
+#include "IceAssemblerX8632.h"
+
+#include "gtest/gtest.h"
+
+#include <cassert>
+#include <sys/mman.h>
+
+namespace Ice {
+namespace X8632 {
+namespace Test {
+
+// Base fixture: owns the AssemblerX8632 under test and exposes its output.
+class AssemblerX8632TestBase : public ::testing::Test {
+protected:
+  using Traits = AssemblerX8632::Traits;
+  using Address = Traits::Address;
+  using ByteRegister = Traits::ByteRegister;
+  using Cond = Traits::Cond;
+  using GPRRegister = Traits::GPRRegister;
+  using XmmRegister = Traits::XmmRegister;
+  using X87STRegister = Traits::X87STRegister;
+
+  AssemblerX8632TestBase() { reset(); }
+  // Replaces the current assembler, and anything it emitted, with a new one.
+  void reset() { Assembler.reset(new AssemblerX8632()); }
+
+  AssemblerX8632 *assembler() const { return Assembler.get(); }
+
+  // Size of, and pointer to, the bytes in the assembler's buffer view.
+  size_t codeBytesSize() const { return Assembler->getBufferView().size(); }
+  const uint8_t *codeBytes() const {
+    return reinterpret_cast<const uint8_t *>(Assembler->getBufferView().data());
+  }
+
+private:
+  std::unique_ptr<AssemblerX8632> Assembler;
+};
+
+// __ is a helper macro that expands to "(this->assembler())->". Inside a test
+// fixture member it lets test cases emit X8632 assembly instructions with
+//
+//   __ mov(GPRRegister::Reg_Eax, 1);
+//   __ ret();
+//
+// and so on. The idea of having this was borrowed from Dart's unit tests.
+#define __ (this->assembler())->
+
+// AssemblerX8632LowLevelTest verifies that the "basic" instructions the tests
+// rely on are encoded correctly. Instead of executing the assembled code,
+// these tests check that the assembled bytes match the expected encoding.
+class AssemblerX8632LowLevelTest : public AssemblerX8632TestBase {
+protected:
+  // verifyBytes<N>(Buffer, Bytes...) compares the first N bytes of Buffer with
+  // Bytes, reporting every mismatch via EXPECT_EQ; it returns true iff all of
+  // them match. This overload ends the recursion once N bytes were consumed.
+  template <int N, int I> static bool verifyBytes(const uint8_t *) {
+    static_assert(I == N, "Invalid template instantiation.");
+    return true;
+  }
+
+  template <int N, int I = 0, typename... Rest>
+  static bool verifyBytes(const uint8_t *Buffer, uint8_t Byte, Rest... Tail) {
+    static_assert(I < N, "Invalid template instantiation.");
+    EXPECT_EQ(Byte, Buffer[I]) << "Byte " << (I + 1) << " of " << N;
+    const bool Matches = Buffer[I] == Byte;
+    return verifyBytes<N, I + 1>(Buffer, Tail...) && Matches;
+  }
+};
+
+// After these tests we should have a sane environment; we know the following
+// work:
+//
+//  (*) zeroing eax, ebx, ecx, edx, edi, and esi;
+//  (*) call $4 instruction (used for ip materialization);
+//  (*) register push and pop;
+//  (*) cmp reg, reg; and
+//  (*) returning from functions.
+//
+// We can now dive into testing each emitting method in AssemblerX8632. Each
+// test will emit some instructions for performing the test. The assembled
+// instructions will operate in a "safe" environment. All x86-32 registers are
+// spilled to the program stack, and the registers are then zeroed out, with the
+// exception of %esp and %ebp.
+//
+// The jitted code and the unittest code will share the same stack. Therefore,
+// test harnesses need to ensure they do not leave anything they pushed on the
+// stack.
+//
+// %ebp is initialized with a pointer for rIP-based addressing. This pointer is
+// used for position-independent access to a scratchpad area for use in tests.
+// This mechanism is used because the test framework needs to generate addresses
+// that work on both x86-32 and x86-64 hosts, but are encodable using our x86-32
+// assembler. This is made possible because the encoding for
+//
+//    pushq %rax (x86-64 only)
+//
+// is the same as the one for
+//
+//    pushl %eax (x86-32 only; not encodable in x86-64)
+//
+// Likewise, the encodings for
+//
+//    movl offset(%ebp), %reg (32-bit only)
+//    movl <src>, offset(%ebp) (32-bit only)
+//
+// and
+//
+//    movl offset(%rbp), %reg (64-bit only)
+//    movl <src>, offset(%rbp) (64-bit only)
+//
+// are also the same.
+//
+// We use a call instruction in order to generate a natural sized address on the
+// stack. Said address is then removed from the stack with a pop %rBP, which can
+// then be used to address memory safely in either x86-32 or x86-64, as long as
+// the test code does not perform any arithmetic operation that writes to %rBP.
+// This PC materialization technique is very common in x86-32 PIC.
+//
+// %rBP is used to provide the tests with a scratchpad area that can safely and
+// portably be written to and read from. This scratchpad area is also used to
+// store the "final" values in eax, ebx, ecx, edx, esi, and edi, allowing the
+// harnesses access to 6 "return values" instead of the usual single return
+// value supported by C++.
+//
+// The jitted code will look like the following:
+//
+// test:
+//       push %eax
+//       push %ebx
+//       push %ecx
+//       push %edx
+//       push %edi
+//       push %esi
+//       push %ebp
+//       call test$materialize_ip
+// test$materialize_ip:                           <<------- %eBP will point here
+//       pop  %ebp
+//       mov  $0, %eax
+//       mov  $0, %ebx
+//       mov  $0, %ecx
+//       mov  $0, %edx
+//       mov  $0, %edi
+//       mov  $0, %esi
+//
+//       << test code goes here >>
+//
+//       mov %eax, { 0 + $ScratchpadOffset}(%ebp)
+//       mov %ebx, { 4 + $ScratchpadOffset}(%ebp)
+//       mov %ecx, { 8 + $ScratchpadOffset}(%ebp)
+//       mov %edx, {12 + $ScratchpadOffset}(%ebp)
+//       mov %edi, {16 + $ScratchpadOffset}(%ebp)
+//       mov %esi, {20 + $ScratchpadOffset}(%ebp)
+//       mov %ebp, {24 + $ScratchpadOffset}(%ebp)
+//       mov %esp, {28 + $ScratchpadOffset}(%ebp)
+//       movups %xmm0, {32 + $ScratchpadOffset}(%ebp)
+//       movups %xmm1, {48 + $ScratchpadOffset}(%ebp)
+//       movups %xmm2, {64 + $ScratchpadOffset}(%ebp)
+//       movups %xmm3, {80 + $ScratchpadOffset}(%ebp)
+//       movups %xmm4, {96 + $ScratchpadOffset}(%ebp)
+//       movups %xmm5, {112 + $ScratchpadOffset}(%ebp)
+//       movups %xmm6, {128 + $ScratchpadOffset}(%ebp)
+//       movups %xmm7, {144 + $ScratchpadOffset}(%ebp)
+//
+//       pop %ebp
+//       pop %esi
+//       pop %edi
+//       pop %edx
+//       pop %ecx
+//       pop %ebx
+//       pop %eax
+//       ret
+//
+//      << ... >>
+//
+// scratchpad:                              <<------- accessed via $Offset(%ebp)
+//
+//      << test scratch area >>
+//
+// TODO(jpp): test the
+//
+//    mov %reg, $Offset(%ebp)
+//    movups %xmm, $Offset(%ebp)
+//
+// encodings using the low level assembler test ensuring that the register
+// values can be written to the scratchpad area.
+class AssemblerX8632Test : public AssemblerX8632TestBase {
+protected:
+  // Dqword is used to represent 128-bit data types. The Dqword's contents are
+  // the same as the contents read from memory. Tests can then use the union
+  // members to verify the tests' outputs.
+  //
+  // NOTE: We want sizeof(Dqword) == sizeof(uint64_t) * 2. In other words, we
+  // want Dqword's contents to be **exactly** what the memory contents were so
+  // that we can do, e.g.,
+  //
+  // ...
+  // float Ret[4];
+  // // populate Ret
+  // return *reinterpret_cast<Dqword *>(&Ret);
+  //
+  // While an ugly hack, return statements of this kind are used extensively in
+  // the PackedArith class (see below).
+  union Dqword {
+    template <typename T0, typename T1, typename T2, typename T3,
+              typename = typename std::enable_if<
+                  std::is_floating_point<T0>::value>::type>
+    Dqword(T0 F0, T1 F1, T2 F2, T3 F3) {
+      F32[0] = F0;
+      F32[1] = F1;
+      F32[2] = F2;
+      F32[3] = F3;
+    }
+
+    template <typename T>
+    Dqword(typename std::enable_if<std::is_same<T, int32_t>::value, T>::type I0,
+           T I1, T I2, T I3) {
+      I32[0] = I0;
+      I32[1] = I1;
+      I32[2] = I2;
+      I32[3] = I3;
+    }
+
+    template <typename T>
+    Dqword(typename std::enable_if<std::is_same<T, uint64_t>::value, T>::type
+               U64_0,
+           T U64_1) {
+      U64[0] = U64_0;
+      U64[1] = U64_1;
+    }
+
+    template <typename T>
+    Dqword(typename std::enable_if<std::is_same<T, double>::value, T>::type D0,
+           T D1) {
+      F64[0] = D0;
+      F64[1] = D1;
+    }
+
+    bool operator==(const Dqword &Rhs) const {
+      return std::memcmp(this, &Rhs, sizeof(*this)) == 0;
+    }
+
+    double F64[2];
+    uint64_t U64[2];
+    int64_t I64[2];
+
+    float F32[4];
+    uint32_t U32[4];
+    int32_t I32[4];
+
+    uint16_t U16[8];
+    int16_t I16[8];
+
+    uint8_t U8[16];
+    int8_t I8[16];
+
+  private:
+    Dqword() = delete;
+  };
+
+  // As stated, we want this condition to hold, so we assert.
+  static_assert(sizeof(Dqword) == 2 * sizeof(uint64_t),
+                "Dqword has the wrong size.");
+
+  // PackedArith is an interface provider for Dqwords. PackedArith's C argument
+  // is the underlying Dqword's type, which is then used so that we can define
+  // operators in terms of C++ operators on the underlying elements' type.
+  template <typename C> class PackedArith {
+  public:
+    static constexpr uint32_t N = sizeof(Dqword) / sizeof(C);
+    static_assert(N * sizeof(C) == sizeof(Dqword),
+                  "Invalid template paramenter.");
+    static_assert((N & 1) == 0, "N should be divisible by 2");
+
+    // Lane-wise fp comparisons: a result lane is all ones when the relation
+    // holds and zero otherwise. Operands are compared as Container values.
+#define DefinePackedComparisonOperator(Op)                                     \
+  template <typename Container = C, int Size = N>                              \
+  typename std::enable_if<std::is_floating_point<Container>::value,            \
+                          Dqword>::type                                        \
+  operator Op(const Dqword &Rhs) const {                                       \
+    using ElemType =                                                           \
+        typename std::conditional<std::is_same<float, Container>::value,       \
+                                  int32_t, int64_t>::type;                     \
+    static_assert(sizeof(ElemType) == sizeof(Container),                       \
+                  "Check ElemType definition.");                               \
+    const Container *const RhsPtr =                                            \
+        reinterpret_cast<const Container *const>(&Rhs);                        \
+    const Container *const LhsPtr =                                            \
+        reinterpret_cast<const Container *const>(&Lhs);                        \
+    ElemType Ret[N];                                                           \
+    for (uint32_t i = 0; i < N; ++i) {                                         \
+      Ret[i] = (LhsPtr[i] Op RhsPtr[i]) ? -1 : 0;                              \
+    }                                                                          \
+    return *reinterpret_cast<Dqword *>(&Ret);                                  \
+  }
+    DefinePackedComparisonOperator(< );
+    DefinePackedComparisonOperator(<= );
+    DefinePackedComparisonOperator(> );
+    DefinePackedComparisonOperator(>= );
+    DefinePackedComparisonOperator(== );
+    DefinePackedComparisonOperator(!= );
+#undef DefinePackedComparisonOperator
+
+#define DefinePackedOrdUnordComparisonOperator(Op, Ordered)                    \
+  template <typename Container = C, int Size = N>                              \
+  typename std::enable_if<std::is_floating_point<Container>::value,            \
+                          Dqword>::type                                        \
+  Op(const Dqword &Rhs) const {                                                \
+    using ElemType =                                                           \
+        typename std::conditional<std::is_same<float, Container>::value,       \
+                                  int32_t, int64_t>::type;                     \
+    static_assert(sizeof(ElemType) == sizeof(Container),                       \
+                  "Check ElemType definition.");                               \
+    const Container *const RhsPtr =                                            \
+        reinterpret_cast<const Container *const>(&Rhs);                        \
+    const Container *const LhsPtr =                                            \
+        reinterpret_cast<const Container *const>(&Lhs);                        \
+    ElemType Ret[N];                                                           \
+    for (uint32_t i = 0; i < N; ++i) {                                         \
+      Ret[i] = (!(LhsPtr[i] == LhsPtr[i]) || !(RhsPtr[i] == RhsPtr[i])) !=     \
+                       (Ordered)                                               \
+                   ? -1                                                        \
+                   : 0;                                                        \
+    }                                                                          \
+    return *reinterpret_cast<Dqword *>(&Ret);                                  \
+  }
+    // ord sets a lane when neither operand lane is NaN; unord when either is.
+    DefinePackedOrdUnordComparisonOperator(ord, true);
+    DefinePackedOrdUnordComparisonOperator(unord, false);
+#undef DefinePackedOrdUnordComparisonOperator
+
+#define DefinePackedArithOperator(Op, RhsIndexChanges, NeedsInt)               \
+  template <typename Container = C, int Size = N>                              \
+  Dqword operator Op(const Dqword &Rhs) const {                                \
+    using ElemTypeForFp = typename std::conditional<                           \
+        !(NeedsInt), Container,                                                \
+        typename std::conditional<                                             \
+            std::is_same<Container, float>::value, uint32_t,                   \
+            typename std::conditional<std::is_same<Container, double>::value,  \
+                                      uint64_t, void>::type>::type>::type;     \
+    using ElemType =                                                           \
+        typename std::conditional<std::is_integral<Container>::value,          \
+                                  Container, ElemTypeForFp>::type;             \
+    static_assert(!std::is_same<void, ElemType>::value,                        \
+                  "Check ElemType definition.");                               \
+    const ElemType *const RhsPtr =                                             \
+        reinterpret_cast<const ElemType *const>(&Rhs);                         \
+    const ElemType *const LhsPtr =                                             \
+        reinterpret_cast<const ElemType *const>(&Lhs);                         \
+    ElemType Ret[N];                                                           \
+    for (uint32_t i = 0; i < N; ++i) {                                         \
+      Ret[i] = LhsPtr[i] Op RhsPtr[(RhsIndexChanges) ? i : 0];                 \
+    }                                                                          \
+    return *reinterpret_cast<Dqword *>(&Ret);                                  \
+  }
+    // Shifts use Rhs lane 0 as the count for every lane; other ops pair lanes.
+    DefinePackedArithOperator(>>, false, true);
+    DefinePackedArithOperator(<<, false, true);
+    DefinePackedArithOperator(+, true, false);
+    DefinePackedArithOperator(-, true, false);
+    DefinePackedArithOperator(/, true, false);
+    DefinePackedArithOperator(&, true, true);
+    DefinePackedArithOperator(|, true, true);
+    DefinePackedArithOperator (^, true, true);
+
+#undef DefinePackedArithOperator
+
+#define DefinePackedArithShiftImm(Op)                                          \
+  template <typename Container = C, int Size = N>                              \
+  Dqword operator Op(uint8_t imm) const {                                      \
+    const Container *const LhsPtr =                                            \
+        reinterpret_cast<const Container *const>(&Lhs);                        \
+    Container Ret[N];                                                          \
+    for (uint32_t i = 0; i < N; ++i) {                                         \
+      Ret[i] = LhsPtr[i] Op imm;                                               \
+    }                                                                          \
+    return *reinterpret_cast<Dqword *>(&Ret);                                  \
+  }
+    // Shifts every Container lane by the same immediate count.
+    DefinePackedArithShiftImm(>> );
+    DefinePackedArithShiftImm(<< );
+
+#undef DefinePackedArithShiftImm
+
+    // Lane-wise product for signed integer lanes narrower than 64 bits and for
+    // floating point lanes; every result lane is as wide as Container.
+    template <typename Container = C, int Size = N>
+    typename std::enable_if<std::is_signed<Container>::value ||
+                                std::is_floating_point<Container>::value,
+                            Dqword>::type
+    operator*(const Dqword &Rhs) const {
+      static_assert((std::is_integral<Container>::value &&
+                     sizeof(Container) < sizeof(uint64_t)) ||
+                        std::is_floating_point<Container>::value,
+                    "* is only defined for i(8|16|32), and fp types.");
+
+      const auto *const LhsLanes = reinterpret_cast<const Container *>(&Lhs);
+      const auto *const RhsLanes = reinterpret_cast<const Container *>(&Rhs);
+      Container Products[Size];
+      for (uint32_t Lane = 0; Lane < Size; ++Lane) {
+        Products[Lane] = LhsLanes[Lane] * RhsLanes[Lane];
+      }
+      return *reinterpret_cast<Dqword *>(&Products);
+    }
+
+    // Widening product for unsigned integer lanes: only even-indexed lanes are
+    // multiplied, and each product is stored in a lane twice as wide.
+    template <typename Container = C, int Size = N,
+              typename = typename std::enable_if<
+                  !std::is_signed<Container>::value>::type>
+    Dqword operator*(const Dqword &Rhs) const {
+      static_assert(std::is_integral<Container>::value &&
+                        sizeof(Container) < sizeof(uint64_t),
+                    "* is only defined for ui(8|16|32)");
+      using NextType = typename std::conditional<
+          sizeof(Container) == 1, uint16_t,
+          typename std::conditional<sizeof(Container) == 2, uint32_t,
+                                    uint64_t>::type>::type;
+      static_assert(sizeof(Container) * 2 == sizeof(NextType),
+                    "Unexpected size");
+
+      const auto *const LhsLanes = reinterpret_cast<const Container *>(&Lhs);
+      const auto *const RhsLanes = reinterpret_cast<const Container *>(&Rhs);
+      NextType Wide[Size / 2];
+      for (uint32_t Lane = 0; Lane < Size; Lane += 2) {
+        Wide[Lane / 2] = static_cast<NextType>(LhsLanes[Lane]) *
+                         static_cast<NextType>(RhsLanes[Lane]);
+      }
+      return *reinterpret_cast<Dqword *>(&Wide);
+    }
+
+    // Lane-wise bitwise complement; the result is wrapped in a PackedArith.
+    template <typename Container = C, int Size = N>
+    PackedArith<Container> operator~() const {
+      const auto *const Lanes = reinterpret_cast<const Container *>(&Lhs);
+      Container Flipped[Size];
+      for (uint32_t Lane = 0; Lane < Size; ++Lane) {
+        Flipped[Lane] = ~Lanes[Lane];
+      }
+      return PackedArith<Container>(*reinterpret_cast<Dqword *>(&Flipped));
+    }
+
+    // Generates maxps/maxpd/minps/minpd, applying std::max/std::min per lane.
+#define MinMaxOperations(Name, Suffix)                                         \
+  template <typename Container = C, int Size = N>                              \
+  Dqword Name##Suffix(const Dqword &Rhs) const {                               \
+    static_assert(std::is_floating_point<Container>::value,                    \
+                  #Name #Suffix " is only available for fp.");                 \
+    const Container *const RhsPtr =                                            \
+        reinterpret_cast<const Container *const>(&Rhs);                        \
+    const Container *const LhsPtr =                                            \
+        reinterpret_cast<const Container *const>(&Lhs);                        \
+    Container Ret[Size];                                                       \
+    for (uint32_t i = 0; i < Size; ++i) {                                      \
+      Ret[i] = std::Name(LhsPtr[i], RhsPtr[i]);                                \
+    }                                                                          \
+    return *reinterpret_cast<Dqword *>(&Ret);                                  \
+  }
+    MinMaxOperations(max, ps);
+    MinMaxOperations(max, pd);
+    MinMaxOperations(min, ps);
+    MinMaxOperations(min, pd);
+#undef MinMaxOperations
+
+    // Lane-wise select: a lane is taken from Rhs when the corresponding Mask
+    // lane is negative, and from this object's own Dqword otherwise.
+    template <typename Container = C, int Size = N>
+    Dqword blendWith(const Dqword &Rhs, const Dqword &Mask) const {
+      using MaskType = typename std::conditional<
+          sizeof(Container) == 1, int8_t,
+          typename std::conditional<sizeof(Container) == 2, int16_t,
+                                    int32_t>::type>::type;
+      static_assert(sizeof(MaskType) == sizeof(Container),
+                    "MaskType has the wrong size.");
+
+      const auto *const IfClear = reinterpret_cast<const Container *>(&Lhs);
+      const auto *const IfSet = reinterpret_cast<const Container *>(&Rhs);
+      const auto *const Selector = reinterpret_cast<const MaskType *>(&Mask);
+      Container Blended[Size];
+      for (int Lane = 0; Lane < Size; ++Lane) {
+        Blended[Lane] = (Selector[Lane] < 0) ? IfSet[Lane] : IfClear[Lane];
+      }
+      return *reinterpret_cast<Dqword *>(&Blended);
+    }
+
+  private:
+    // The AssemblerX8632Test class needs to be a friend so that it can create
+    // PackedArith objects (see below.)
+    friend class AssemblerX8632Test;
+
+    explicit PackedArith(const Dqword &MyLhs) : Lhs(MyLhs) {}
+
+    // Lhs can't be a & because operator~ returns a temporary object that needs
+    // access to its own Dqword.
+    const Dqword Lhs;
+  };
+
+  // Named constructor for PackedArith objects.
+  template <typename C> static PackedArith<C> packedAs(const Dqword &D) {
+    return PackedArith<C>(D);
+  }
+
+  AssemblerX8632Test() { reset(); }
+
+  void reset() {
+    AssemblerX8632TestBase::reset();
+
+    // The first ScratchpadSlots dwords of the scratchpad are set aside for the
+    // register state that is saved after the jitted code runs.
+    NumAllocatedDwords = AssembledTest::ScratchpadSlots;
+    NeedsEpilogue = true;
+    addPrologue();
+  }
+
+  // AssembledTest is a wrapper around a PROT_EXEC mmap'ed buffer. This buffer
+  // contains both the test code as well as prologue/epilogue, and the
+  // scratchpad area that tests may use -- all tests use this scratchpad area
+  // for storing the processor's registers after the tests executed. This class
+  // also exposes helper methods for reading the register state after test
+  // execution, as well as for reading the scratchpad area.
+  class AssembledTest {
+    AssembledTest() = delete;
+    AssembledTest(const AssembledTest &) = delete;
+    AssembledTest &operator=(const AssembledTest &) = delete;
+
+  public:
+    static constexpr uint32_t MaximumCodeSize = 1 << 20;
+    static constexpr uint32_t EaxSlot = 0;
+    static constexpr uint32_t EbxSlot = 1;
+    static constexpr uint32_t EcxSlot = 2;
+    static constexpr uint32_t EdxSlot = 3;
+    static constexpr uint32_t EdiSlot = 4;
+    static constexpr uint32_t EsiSlot = 5;
+    static constexpr uint32_t EbpSlot = 6;
+    static constexpr uint32_t EspSlot = 7;
+    // save 4 dwords for each xmm registers.
+    static constexpr uint32_t Xmm0Slot = 8;
+    static constexpr uint32_t Xmm1Slot = 12;
+    static constexpr uint32_t Xmm2Slot = 16;
+    static constexpr uint32_t Xmm3Slot = 20;
+    static constexpr uint32_t Xmm4Slot = 24;
+    static constexpr uint32_t Xmm5Slot = 28;
+    static constexpr uint32_t Xmm6Slot = 32;
+    static constexpr uint32_t Xmm7Slot = 36;
+    static constexpr uint32_t ScratchpadSlots = 40;
+
+    AssembledTest(const uint8_t *Data, const size_t MySize,
+                  const size_t ExtraStorageDwords)
+        : Size(MaximumCodeSize + 4 * ExtraStorageDwords) {
+      // EXPECT_LT needs a named local here -- presumably because it binds its
+      // operands by reference, which odr-uses the constexpr MaximumCodeSize.
+      uint32_t MaxCodeSize = MaximumCodeSize;
+      EXPECT_LT(MySize, MaxCodeSize);
+      assert(MySize < MaximumCodeSize);
+      ExecutableData = mmap(nullptr, Size, PROT_WRITE | PROT_READ | PROT_EXEC,
+                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+      EXPECT_NE(MAP_FAILED, ExecutableData) << strerror(errno);
+      assert(MAP_FAILED != ExecutableData);
+      std::memcpy(ExecutableData, Data, MySize);
+    }
+
+    // We allow AssembledTest to be moved so that we can return objects of
+    // this type.
+    AssembledTest(AssembledTest &&Buffer)
+        : ExecutableData(Buffer.ExecutableData), Size(Buffer.Size) {
+      Buffer.ExecutableData = nullptr;
+      Buffer.Size = 0;
+    }
+
+    AssembledTest &operator=(AssembledTest &&Buffer) {
+      // Old adopts our current mapping and munmaps it on return (was leaked).
+      AssembledTest Old(static_cast<AssembledTest &&>(*this));
+      ExecutableData = Buffer.ExecutableData;
+      Size = Buffer.Size;
+      Buffer.ExecutableData = nullptr;
+      Buffer.Size = 0;
+      return *this;
+    }
+
+    ~AssembledTest() {
+      if (ExecutableData != nullptr)
+        munmap(ExecutableData, Size);
+    }
+
+    void run() const { reinterpret_cast<void (*)()>(ExecutableData)(); }
+
+    uint32_t eax() const { return contentsOfDword(EaxSlot); }
+
+    uint32_t ebx() const { return contentsOfDword(EbxSlot); }
+
+    uint32_t ecx() const { return contentsOfDword(EcxSlot); }
+
+    uint32_t edx() const { return contentsOfDword(EdxSlot); }
+
+    uint32_t edi() const { return contentsOfDword(EdiSlot); }
+
+    uint32_t esi() const { return contentsOfDword(EsiSlot); }
+
+    uint32_t ebp() const { return contentsOfDword(EbpSlot); }
+
+    uint32_t esp() const { return contentsOfDword(EspSlot); }
+
+    template <typename T> T xmm0() const {
+      return xmm<T>(Xmm0Slot);
+    }
+
+    template <typename T> T xmm1() const {
+      return xmm<T>(Xmm1Slot);
+    }
+
+    template <typename T> T xmm2() const {
+      return xmm<T>(Xmm2Slot);
+    }
+
+    template <typename T> T xmm3() const {
+      return xmm<T>(Xmm3Slot);
+    }
+
+    template <typename T> T xmm4() const {
+      return xmm<T>(Xmm4Slot);
+    }
+
+    template <typename T> T xmm5() const {
+      return xmm<T>(Xmm5Slot);
+    }
+
+    template <typename T> T xmm6() const {
+      return xmm<T>(Xmm6Slot);
+    }
+
+    template <typename T> T xmm7() const {
+      return xmm<T>(Xmm7Slot);
+    }
+
+    // Reads one dword of the scratchpad area. Dword should be an id handed out
+    // by AssemblerX8632Test::allocateDword(); other values are considered
+    // invalid and are not guaranteed to work if the implementation changes.
+    // T can be any type as wide as uint32_t; the bits are reinterpreted as T.
+    template <typename T = uint32_t, typename = typename std::enable_if<
+                                         sizeof(T) == sizeof(uint32_t)>::type>
+    T contentsOfDword(uint32_t Dword) const {
+      auto *const Base = static_cast<uint8_t *>(ExecutableData);
+      return *reinterpret_cast<T *>(Base + dwordOffset(Dword));
+    }
+
+    template <typename T = uint64_t, typename = typename std::enable_if<
+                                         sizeof(T) == sizeof(uint64_t)>::type>
+    T contentsOfQword(uint32_t InitialDword) const {
+      auto *const Base = static_cast<uint8_t *>(ExecutableData);
+      return *reinterpret_cast<T *>(Base + dwordOffset(InitialDword));
+    }
+
+    Dqword contentsOfDqword(uint32_t InitialDword) const {
+      // The four dwords starting at InitialDword are read as a single Dqword.
+      auto *const Base = static_cast<uint8_t *>(ExecutableData);
+      return *reinterpret_cast<Dqword *>(Base + dwordOffset(InitialDword));
+    }
+
+    template <typename T = uint32_t, typename = typename std::enable_if<
+                                         sizeof(T) == sizeof(uint32_t)>::type>
+    void setDwordTo(uint32_t Dword, T value) {
+      auto *const Base = static_cast<uint8_t *>(ExecutableData);
+      *reinterpret_cast<uint32_t *>(Base + dwordOffset(Dword)) =
+          *reinterpret_cast<uint32_t *>(&value);
+    }
+
+    template <typename T = uint64_t, typename = typename std::enable_if<
+                                         sizeof(T) == sizeof(uint64_t)>::type>
+    void setQwordTo(uint32_t InitialDword, T value) {
+      auto *const Base = static_cast<uint8_t *>(ExecutableData);
+      *reinterpret_cast<uint64_t *>(Base + dwordOffset(InitialDword)) =
+          *reinterpret_cast<uint64_t *>(&value);
+    }
+
+    void setDqwordTo(uint32_t InitialDword, const Dqword &qdword) {
+      setQwordTo(InitialDword, qdword.U64[0]);
+      setQwordTo(InitialDword + 2, qdword.U64[1]);
+    }
+
+  private:
+    template <typename T>
+    typename std::enable_if<std::is_same<T, Dqword>::value, Dqword>::type
+    xmm(uint8_t Slot) const {
+      return contentsOfDqword(Slot);
+    }
+
+    template <typename T>
+    typename std::enable_if<!std::is_same<T, Dqword>::value, T>::type
+    xmm(uint8_t Slot) const {
+      // Both branches are compiled for every T, hence the width-matched types.
+      constexpr bool IsQword = sizeof(T) == sizeof(uint64_t);
+      using QwordT = typename std::conditional<IsQword, T, uint64_t>::type;
+      using DwordT = typename std::conditional<IsQword, uint32_t, T>::type;
+      if (!IsQword)
+        return contentsOfDword<DwordT>(Slot);
+      return contentsOfQword<QwordT>(Slot);
+    }
+
+    static uint32_t dwordOffset(uint32_t Index) {
+      return MaximumCodeSize + (Index * 4);
+    }
+
+    void *ExecutableData = nullptr;
+    size_t Size;
+  };
+
+  // assemble creates an AssembledTest holding the jitted code. The first call
+  // also appends the epilogue to the jitted code, which is the reason why this
+  // method is not const qualified.
+  AssembledTest assemble() {
+    if (NeedsEpilogue) {
+      // Runs once per reset(); later calls find NeedsEpilogue already cleared.
+      addEpilogue();
+      NeedsEpilogue = false;
+    }
+    return AssembledTest(codeBytes(), codeBytesSize(), NumAllocatedDwords);
+  }
+
+  // Allocates a new dword slot in the test's scratchpad area.
+  uint32_t allocateDword() { return NumAllocatedDwords++; }
+
+  // Reserves two consecutive dword slots in the scratchpad; returns the first.
+  uint32_t allocateQword() {
+    const uint32_t InitialDword = NumAllocatedDwords;
+    NumAllocatedDwords += 2;
+    return InitialDword;
+  }
+
+  // Reserves four consecutive dword slots in the scratchpad; returns the first.
+  uint32_t allocateDqword() {
+    const uint32_t InitialDword = NumAllocatedDwords;
+    NumAllocatedDwords += 4;
+    return InitialDword;
+  }
+
+  Address dwordAddress(uint32_t Dword) {
+    return Address(GPRRegister::Encoded_Reg_ebp, dwordDisp(Dword));
+  }
+
+private:
+  // e??SlotAddress returns an AssemblerX8632::Traits::Address that can be used
+  // to encode an address operand for accessing the slot of the specified
+  // register. These are all private because the test code being jitted should
+  // not tamper with these slots. Besides, during the test execution these
+  // slots' contents are undefined and should not be accessed.
+  Address eaxSlotAddress() { return dwordAddress(AssembledTest::EaxSlot); }
+  Address ebxSlotAddress() { return dwordAddress(AssembledTest::EbxSlot); }
+  Address ecxSlotAddress() { return dwordAddress(AssembledTest::EcxSlot); }
+  Address edxSlotAddress() { return dwordAddress(AssembledTest::EdxSlot); }
+  Address ediSlotAddress() { return dwordAddress(AssembledTest::EdiSlot); }
+  Address esiSlotAddress() { return dwordAddress(AssembledTest::EsiSlot); }
+  Address ebpSlotAddress() { return dwordAddress(AssembledTest::EbpSlot); }
+  Address espSlotAddress() { return dwordAddress(AssembledTest::EspSlot); }
+  Address xmm0SlotAddress() { return dwordAddress(AssembledTest::Xmm0Slot); }
+  Address xmm1SlotAddress() { return dwordAddress(AssembledTest::Xmm1Slot); }
+  Address xmm2SlotAddress() { return dwordAddress(AssembledTest::Xmm2Slot); }
+  Address xmm3SlotAddress() { return dwordAddress(AssembledTest::Xmm3Slot); }
+  Address xmm4SlotAddress() { return dwordAddress(AssembledTest::Xmm4Slot); }
+  Address xmm5SlotAddress() { return dwordAddress(AssembledTest::Xmm5Slot); }
+  Address xmm6SlotAddress() { return dwordAddress(AssembledTest::Xmm6Slot); }
+  Address xmm7SlotAddress() { return dwordAddress(AssembledTest::Xmm7Slot); }
+
+  // Returns the displacement to use with ebp for reaching the specified Dword
+  // of the scratchpad area. It discounts the pushes and the call that
+  // addPrologue() emits ahead of the popl that materializes the IP in ebp.
+  uint32_t dwordDisp(uint32_t Dword) const {
+    EXPECT_LT(Dword, NumAllocatedDwords);
+    assert(Dword < NumAllocatedDwords);
+    static constexpr uint32_t NumPushes = 7;
+    static constexpr uint32_t PushBytes = 1;
+    static constexpr uint32_t CallImmBytes = 5;
+    static constexpr uint32_t Skipped = NumPushes * PushBytes + CallImmBytes;
+    return AssembledTest::MaximumCodeSize + (Dword * 4) - Skipped;
+  }
+
+  void addPrologue() {
+    __ pushl(GPRRegister::Encoded_Reg_eax);
+    __ pushl(GPRRegister::Encoded_Reg_ebx);
+    __ pushl(GPRRegister::Encoded_Reg_ecx);
+    __ pushl(GPRRegister::Encoded_Reg_edx);
+    __ pushl(GPRRegister::Encoded_Reg_edi);
+    __ pushl(GPRRegister::Encoded_Reg_esi);
+    __ pushl(GPRRegister::Encoded_Reg_ebp);
+    // call + popl materialize the IP in ebp, the base used by dwordAddress().
+    __ call(Immediate(4));
+    __ popl(GPRRegister::Encoded_Reg_ebp);
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(0x00));
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_ebx, Immediate(0x00));
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_ecx, Immediate(0x00));
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_edx, Immediate(0x00));
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_edi, Immediate(0x00));
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_esi, Immediate(0x00));
+  }
+
+  void addEpilogue() {
+    __ mov(IceType_i32, eaxSlotAddress(), GPRRegister::Encoded_Reg_eax);
+    __ mov(IceType_i32, ebxSlotAddress(), GPRRegister::Encoded_Reg_ebx);
+    __ mov(IceType_i32, ecxSlotAddress(), GPRRegister::Encoded_Reg_ecx);
+    __ mov(IceType_i32, edxSlotAddress(), GPRRegister::Encoded_Reg_edx);
+    __ mov(IceType_i32, ediSlotAddress(), GPRRegister::Encoded_Reg_edi);
+    __ mov(IceType_i32, esiSlotAddress(), GPRRegister::Encoded_Reg_esi);
+    __ mov(IceType_i32, ebpSlotAddress(), GPRRegister::Encoded_Reg_ebp);
+    __ mov(IceType_i32, espSlotAddress(), GPRRegister::Encoded_Reg_esp);
+    __ movups(xmm0SlotAddress(), XmmRegister::Encoded_Reg_xmm0);
+    __ movups(xmm1SlotAddress(), XmmRegister::Encoded_Reg_xmm1);
+    __ movups(xmm2SlotAddress(), XmmRegister::Encoded_Reg_xmm2);
+    __ movups(xmm3SlotAddress(), XmmRegister::Encoded_Reg_xmm3);
+    __ movups(xmm4SlotAddress(), XmmRegister::Encoded_Reg_xmm4);
+    __ movups(xmm5SlotAddress(), XmmRegister::Encoded_Reg_xmm5);
+    __ movups(xmm6SlotAddress(), XmmRegister::Encoded_Reg_xmm6);
+    __ movups(xmm7SlotAddress(), XmmRegister::Encoded_Reg_xmm7);
+    // Restore the registers pushed by addPrologue(), in reverse order.
+    __ popl(GPRRegister::Encoded_Reg_ebp);
+    __ popl(GPRRegister::Encoded_Reg_esi);
+    __ popl(GPRRegister::Encoded_Reg_edi);
+    __ popl(GPRRegister::Encoded_Reg_edx);
+    __ popl(GPRRegister::Encoded_Reg_ecx);
+    __ popl(GPRRegister::Encoded_Reg_ebx);
+    __ popl(GPRRegister::Encoded_Reg_eax);
+
+    __ ret();
+  }
+
+  bool NeedsEpilogue;
+  uint32_t NumAllocatedDwords;
+};
+
+} // end of namespace Test
+} // end of namespace X8632
+} // end of namespace Ice
+
+#endif // ASSEMBLERX8632_TESTUTIL_H_
diff --git a/unittest/AssemblerX8632/X87.cpp b/unittest/AssemblerX8632/X87.cpp
new file mode 100644
index 0000000..d3f38a1
--- /dev/null
+++ b/unittest/AssemblerX8632/X87.cpp
@@ -0,0 +1,267 @@
+//===- subzero/unittest/AssemblerX8632/X87.cpp ----------------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AssemblerX8632/TestUtil.h"
+
+namespace Ice {
+namespace X8632 {
+namespace Test {
+namespace {
+
+TEST_F(AssemblerX8632LowLevelTest, Fld) {
+  // Expected: D9 /0 with a disp8 (mod=1), then DD /0 with a disp32 (mod=2).
+  __ fld(IceType_f32, Address(GPRRegister::Encoded_Reg_ebp, 1));
+  __ fld(IceType_f64, Address(GPRRegister::Encoded_Reg_ebp, 0x10000));
+
+  constexpr size_t ByteCount = 9;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  constexpr uint8_t OpcodeF32 = 0xd9;
+  constexpr uint8_t ModRMDisp8 = (/*mod*/ 1 << 6) | (/*reg*/ 0 << 3) |
+                                 (/*rm*/ GPRRegister::Encoded_Reg_ebp);
+  constexpr uint8_t OpcodeF64 = 0xdd;
+  constexpr uint8_t ModRMDisp32 = (/*mod*/ 2 << 6) | (/*reg*/ 0 << 3) |
+                                  (/*rm*/ GPRRegister::Encoded_Reg_ebp);
+  verifyBytes<ByteCount>(codeBytes(), OpcodeF32, ModRMDisp8, 0x01, OpcodeF64,
+                         ModRMDisp32, 0x00, 0x00, 0x01, 0x00);
+}
+
+TEST_F(AssemblerX8632LowLevelTest, FstpAddr) {
+  // Expected: D9 /3 with a disp8 (mod=1), then DD /3 with a disp32 (mod=2).
+  __ fstp(IceType_f32, Address(GPRRegister::Encoded_Reg_ebp, 1));
+  __ fstp(IceType_f64, Address(GPRRegister::Encoded_Reg_ebp, 0x10000));
+
+  constexpr size_t ByteCount = 9;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  constexpr uint8_t Fstp32Opcode = 0xd9;
+  constexpr uint8_t Fstp32ModRM = (/*mod*/ 1 << 6) | (/*reg*/ 3 << 3) |
+                                  (/*rm*/ GPRRegister::Encoded_Reg_ebp);
+  constexpr uint8_t Fstp64Opcode = 0xdd;
+  constexpr uint8_t Fstp64ModRM = (/*mod*/ 2 << 6) | (/*reg*/ 3 << 3) |
+                                  (/*rm*/ GPRRegister::Encoded_Reg_ebp);
+  verifyBytes<ByteCount>(codeBytes(), Fstp32Opcode, Fstp32ModRM, 0x01,
+                         Fstp64Opcode, Fstp64ModRM, 0x00, 0x00, 0x01, 0x00);
+}
+
+TEST_F(AssemblerX8632LowLevelTest, Fincstp) {
+  __ fincstp();
+
+  // fincstp takes no operands; the expected encoding is the byte pair D9 F7.
+  constexpr size_t ByteCount = 2;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  verifyBytes<ByteCount>(codeBytes(), 0xd9, 0xf7);
+}
+
+TEST_F(AssemblerX8632LowLevelTest, FnstcwAddr) {
+  __ fnstcw(Address(GPRRegister::Encoded_Reg_ebp, 0x12345));
+  // Expected: D9 /7 with ebp as base and a little-endian disp32 (mod=2).
+  constexpr size_t ByteCount = 6;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  constexpr uint8_t FnstcwOpcode = 0xd9;
+  constexpr uint8_t FnstcwModRM = (/*mod*/ 2 << 6) | (/*reg*/ 7 << 3) |
+                                  (/*rm*/ GPRRegister::Encoded_Reg_ebp);
+  verifyBytes<ByteCount>(codeBytes(), FnstcwOpcode, FnstcwModRM, 0x45, 0x23,
+                         0x01, 0x00);
+}
+
+TEST_F(AssemblerX8632LowLevelTest, FldcwAddr) {
+  __ fldcw(Address(GPRRegister::Encoded_Reg_ebp, 0x12345));
+  // Expected: D9 /5 with ebp as base and a little-endian disp32 (mod=2).
+  constexpr size_t ByteCount = 6;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  constexpr uint8_t FldcwOpcode = 0xd9;
+  constexpr uint8_t FldcwModRM = (/*mod*/ 2 << 6) | (/*reg*/ 5 << 3) |
+                                 (/*rm*/ GPRRegister::Encoded_Reg_ebp);
+  verifyBytes<ByteCount>(codeBytes(), FldcwOpcode, FldcwModRM, 0x45, 0x23,
+                         0x01, 0x00);
+}
+
+TEST_F(AssemblerX8632Test, FstpSt) {
+#define TestFstpSt(Size, MemorySize, Type)                                     \
+  do {                                                                         \
+    const uint32_t T1 = allocate##MemorySize();                                \
+    const Type OldValue1 = -1.0f;                                              \
+    const uint32_t T2 = allocate##MemorySize();                                \
+    const Type OldValue2 = -2.0f;                                              \
+    const uint32_t T3 = allocate##MemorySize();                                \
+    const Type OldValue3 = -3.0f;                                              \
+    const uint32_t T4 = allocate##MemorySize();                                \
+    const Type OldValue4 = -4.0f;                                              \
+    const uint32_t T5 = allocate##MemorySize();                                \
+    const Type OldValue5 = -5.0f;                                              \
+    const uint32_t T6 = allocate##MemorySize();                                \
+    const Type OldValue6 = -6.0f;                                              \
+    const uint32_t T7 = allocate##MemorySize();                                \
+    const Type OldValue7 = -7.0f;                                              \
+                                                                               \
+    const uint32_t N7 = allocate##MemorySize();                                \
+    constexpr Type NewValue7 = 777.77f;                                        \
+    const uint32_t N6 = allocate##MemorySize();                                \
+    constexpr Type NewValue6 = 666.66f;                                        \
+    const uint32_t N5 = allocate##MemorySize();                                \
+    constexpr Type NewValue5 = 555.55f;                                        \
+    const uint32_t N4 = allocate##MemorySize();                                \
+    constexpr Type NewValue4 = 444.44f;                                        \
+    const uint32_t N3 = allocate##MemorySize();                                \
+    constexpr Type NewValue3 = 333.33f;                                        \
+    const uint32_t N2 = allocate##MemorySize();                                \
+    constexpr Type NewValue2 = 222.22f;                                        \
+    const uint32_t N1 = allocate##MemorySize();                                \
+    constexpr Type NewValue1 = 111.11f;                                        \
+                                                                               \
+    __ fincstp();                                                              \
+    __ fincstp();                                                              \
+    __ fincstp();                                                              \
+    __ fincstp();                                                              \
+    __ fincstp();                                                              \
+    __ fincstp();                                                              \
+    __ fincstp();                                                              \
+                                                                               \
+    __ fld(IceType_f##Size, dwordAddress(N7));                                 \
+    __ fstp(X87STRegister::Encoded_X87ST_7);                                   \
+    __ fld(IceType_f##Size, dwordAddress(N6));                                 \
+    __ fstp(X87STRegister::Encoded_X87ST_6);                                   \
+    __ fld(IceType_f##Size, dwordAddress(N5));                                 \
+    __ fstp(X87STRegister::Encoded_X87ST_5);                                   \
+    __ fld(IceType_f##Size, dwordAddress(N4));                                 \
+    __ fstp(X87STRegister::Encoded_X87ST_4);                                   \
+    __ fld(IceType_f##Size, dwordAddress(N3));                                 \
+    __ fstp(X87STRegister::Encoded_X87ST_3);                                   \
+    __ fld(IceType_f##Size, dwordAddress(N2));                                 \
+    __ fstp(X87STRegister::Encoded_X87ST_2);                                   \
+    __ fld(IceType_f##Size, dwordAddress(N1));                                 \
+    __ fstp(X87STRegister::Encoded_X87ST_1);                                   \
+                                                                               \
+    __ fstp(IceType_f##Size, dwordAddress(T1));                                \
+    __ fstp(IceType_f##Size, dwordAddress(T2));                                \
+    __ fstp(IceType_f##Size, dwordAddress(T3));                                \
+    __ fstp(IceType_f##Size, dwordAddress(T4));                                \
+    __ fstp(IceType_f##Size, dwordAddress(T5));                                \
+    __ fstp(IceType_f##Size, dwordAddress(T6));                                \
+    __ fstp(IceType_f##Size, dwordAddress(T7));                                \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.set##MemorySize##To(T1, OldValue1);                                   \
+    test.set##MemorySize##To(N1, NewValue1);                                   \
+    test.set##MemorySize##To(T2, OldValue2);                                   \
+    test.set##MemorySize##To(N2, NewValue2);                                   \
+    test.set##MemorySize##To(T3, OldValue3);                                   \
+    test.set##MemorySize##To(N3, NewValue3);                                   \
+    test.set##MemorySize##To(T4, OldValue4);                                   \
+    test.set##MemorySize##To(N4, NewValue4);                                   \
+    test.set##MemorySize##To(T5, OldValue5);                                   \
+    test.set##MemorySize##To(N5, NewValue5);                                   \
+    test.set##MemorySize##To(T6, OldValue6);                                   \
+    test.set##MemorySize##To(N6, NewValue6);                                   \
+    test.set##MemorySize##To(T7, OldValue7);                                   \
+    test.set##MemorySize##To(N7, NewValue7);                                   \
+                                                                               \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_FLOAT_EQ(NewValue1, test.contentsOf##MemorySize<Type>(T1))          \
+        << "(" #Size ", " #MemorySize ", " #Type ")";                          \
+    ASSERT_FLOAT_EQ(NewValue1, test.contentsOf##MemorySize<Type>(N1))          \
+        << "(" #Size ", " #MemorySize ", " #Type ")";                          \
+    ASSERT_FLOAT_EQ(NewValue2, test.contentsOf##MemorySize<Type>(T2))          \
+        << "(" #Size ", " #MemorySize ", " #Type ")";                          \
+    ASSERT_FLOAT_EQ(NewValue2, test.contentsOf##MemorySize<Type>(N2))          \
+        << "(" #Size ", " #MemorySize ", " #Type ")";                          \
+    ASSERT_FLOAT_EQ(NewValue3, test.contentsOf##MemorySize<Type>(T3))          \
+        << "(" #Size ", " #MemorySize ", " #Type ")";                          \
+    ASSERT_FLOAT_EQ(NewValue3, test.contentsOf##MemorySize<Type>(N3))          \
+        << "(" #Size ", " #MemorySize ", " #Type ")";                          \
+    ASSERT_FLOAT_EQ(NewValue4, test.contentsOf##MemorySize<Type>(T4))          \
+        << "(" #Size ", " #MemorySize ", " #Type ")";                          \
+    ASSERT_FLOAT_EQ(NewValue4, test.contentsOf##MemorySize<Type>(N4))          \
+        << "(" #Size ", " #MemorySize ", " #Type ")";                          \
+    ASSERT_FLOAT_EQ(NewValue5, test.contentsOf##MemorySize<Type>(T5))          \
+        << "(" #Size ", " #MemorySize ", " #Type ")";                          \
+    ASSERT_FLOAT_EQ(NewValue5, test.contentsOf##MemorySize<Type>(N5))          \
+        << "(" #Size ", " #MemorySize ", " #Type ")";                          \
+    ASSERT_FLOAT_EQ(NewValue6, test.contentsOf##MemorySize<Type>(T6))          \
+        << "(" #Size ", " #MemorySize ", " #Type ")";                          \
+    ASSERT_FLOAT_EQ(NewValue6, test.contentsOf##MemorySize<Type>(N6))          \
+        << "(" #Size ", " #MemorySize ", " #Type ")";                          \
+    ASSERT_FLOAT_EQ(NewValue7, test.contentsOf##MemorySize<Type>(T7))          \
+        << "(" #Size ", " #MemorySize ", " #Type ")";                          \
+    ASSERT_FLOAT_EQ(NewValue7, test.contentsOf##MemorySize<Type>(N7))          \
+        << "(" #Size ", " #MemorySize ", " #Type ")";                          \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+  TestFstpSt(32, Dword, float);
+  TestFstpSt(64, Qword, double);
+
+#undef TestFstpSt
+}
+
+TEST_F(AssemblerX8632Test, Fild) { // fild, then read back via fstp.
+#define TestFild(OperandType, Size, MemorySize, FpType, IntType)               \
+  do {                                                                         \
+    const uint32_t T0 = allocate##MemorySize();                                \
+    constexpr IntType V0 = 0x1234;                                             \
+    /* Code under test: fild from T0, then fstp stores back into T0. */        \
+    __ fild##OperandType(dwordAddress(T0));                                    \
+    __ fstp(IceType_f##Size, dwordAddress(T0));                                \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    /* Seed T0 with the integer input, then execute the emitted code. */       \
+    test.set##MemorySize##To(T0, V0);                                          \
+    test.run();                                                                \
+    /* fstp overwrote T0, so it must now hold V0 converted to FpType. */       \
+    ASSERT_FLOAT_EQ(static_cast<FpType>(V0),                                   \
+                    test.contentsOf##MemorySize<FpType>(T0))                   \
+        << "(" #OperandType ", " #Size ", " #MemorySize ", " #FpType           \
+           ", " #IntType ")";                                                  \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+  // Exercise filds with f32/Dword/uint32_t and fildl with f64/Qword/uint64_t.
+  TestFild(s, 32, Dword, float, uint32_t);
+  TestFild(l, 64, Qword, double, uint64_t);
+#undef TestFild
+}
+
+TEST_F(AssemblerX8632Test, Fistp) { // fild from T0, fistp into T1.
+#define TestFistp(OperandType, Size, MemorySize, FpType, IntType)              \
+  do {                                                                         \
+    const uint32_t T0 = allocate##MemorySize();                                \
+    constexpr IntType V0 = 0x1234;                                             \
+    const uint32_t T1 = allocate##MemorySize();                                \
+    constexpr IntType V1 = 0xFFFF;                                             \
+    /* Code under test: fild loads from T0 and fistp stores into T1. */        \
+    __ fild##OperandType(dwordAddress(T0));                                    \
+    __ fistp##OperandType(dwordAddress(T1));                                   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    /* T1 starts out as V1 (!= V0) so the store by fistp is observable. */     \
+    test.set##MemorySize##To(T0, V0);                                          \
+    test.set##MemorySize##To(T1, V1);                                          \
+    test.run();                                                                \
+    /* The source T0 must be unchanged and T1 must now contain V0. */          \
+    ASSERT_EQ(static_cast<IntType>(V0),                                        \
+              test.contentsOf##MemorySize<IntType>(T0))                        \
+        << "(" #OperandType ", " #Size ", " #MemorySize ", " #FpType           \
+           ", " #IntType ")";                                                  \
+    ASSERT_EQ(static_cast<IntType>(V0),                                        \
+              test.contentsOf##MemorySize<IntType>(T1))                        \
+        << "(" #OperandType ", " #Size ", " #MemorySize ", " #FpType           \
+           ", " #IntType ")";                                                  \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+  // s = 32-bit/Dword, l = 64-bit/Qword; Size and FpType only label failures.
+  TestFistp(s, 32, Dword, float, uint32_t);
+  TestFistp(l, 64, Qword, double, uint64_t);
+#undef TestFistp
+}
+
+} // end of anonymous namespace
+} // end of namespace Test
+} // end of namespace X8632
+} // end of namespace Ice
diff --git a/unittest/AssemblerX8632/XmmArith.cpp b/unittest/AssemblerX8632/XmmArith.cpp
new file mode 100644
index 0000000..45ff3a9
--- /dev/null
+++ b/unittest/AssemblerX8632/XmmArith.cpp
@@ -0,0 +1,1816 @@
+//===- subzero/unittest/AssemblerX8632/XmmArith.cpp -----------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AssemblerX8632/TestUtil.h"
+
+namespace Ice {
+namespace X8632 {
+namespace Test {
+namespace {
+
+TEST_F(AssemblerX8632Test, ArithSS) { // addss, subss, mulss, divss.
+#define TestArithSSXmmXmm(FloatSize, Src, Value0, Dst, Value1, Inst, Op)       \
+  do {                                                                         \
+    static_assert(FloatSize == 32 || FloatSize == 64,                          \
+                  "Invalid fp size " #FloatSize);                              \
+    static constexpr char TestString[] =                                       \
+        "(" #FloatSize ", " #Src ", " #Value0 ", " #Dst ", " #Value1           \
+        ", " #Inst ", " #Op ")";                                               \
+    static constexpr bool IsDouble = FloatSize == 64;                          \
+    using Type = std::conditional<IsDouble, double, float>::type;              \
+    const uint32_t T0 = allocateQword();                                       \
+    const Type V0 = Value0;                                                    \
+    const uint32_t T1 = allocateQword();                                       \
+    const Type V1 = Value1;                                                    \
+    /* Dst is loaded from T0 (V0), Src from T1 (V1); then Inst Dst, Src. */    \
+    __ movss(IceType_f##FloatSize, XmmRegister::Encoded_Reg_##Dst,             \
+             dwordAddress(T0));                                                \
+    __ movss(IceType_f##FloatSize, XmmRegister::Encoded_Reg_##Src,             \
+             dwordAddress(T1));                                                \
+    __ Inst(IceType_f##FloatSize, XmmRegister::Encoded_Reg_##Dst,              \
+            XmmRegister::Encoded_Reg_##Src);                                   \
+    /* Both slots come from allocateQword(); IsDouble selects the setter. */   \
+    AssembledTest test = assemble();                                           \
+    if (IsDouble) {                                                            \
+      test.setQwordTo(T0, static_cast<double>(V0));                            \
+      test.setQwordTo(T1, static_cast<double>(V1));                            \
+    } else {                                                                   \
+      test.setDwordTo(T0, static_cast<float>(V0));                             \
+      test.setDwordTo(T1, static_cast<float>(V1));                             \
+    }                                                                          \
+                                                                               \
+    test.run();                                                                \
+    /* Dst must match V0 Op V1 as computed by the C++ expression. */           \
+    ASSERT_DOUBLE_EQ(V0 Op V1, test.Dst<Type>()) << TestString;                \
+    reset();                                                                   \
+  } while (0)
+
+#define TestArithSSXmmAddr(FloatSize, Value0, Dst, Value1, Inst, Op)           \
+  do {                                                                         \
+    static_assert(FloatSize == 32 || FloatSize == 64,                          \
+                  "Invalid fp size " #FloatSize);                              \
+    static constexpr char TestString[] =                                       \
+        "(" #FloatSize ", Addr, " #Value0 ", " #Dst ", " #Value1 ", " #Inst    \
+        ", " #Op ")";                                                          \
+    static constexpr bool IsDouble = FloatSize == 64;                          \
+    using Type = std::conditional<IsDouble, double, float>::type;              \
+    const uint32_t T0 = allocateQword();                                       \
+    const Type V0 = Value0;                                                    \
+    const uint32_t T1 = allocateQword();                                       \
+    const Type V1 = Value1;                                                    \
+    /* Dst is loaded from T0 (V0); Inst then takes V1 from memory at T1. */    \
+    __ movss(IceType_f##FloatSize, XmmRegister::Encoded_Reg_##Dst,             \
+             dwordAddress(T0));                                                \
+    __ Inst(IceType_f##FloatSize, XmmRegister::Encoded_Reg_##Dst,              \
+            dwordAddress(T1));                                                 \
+    /* Both slots come from allocateQword(); IsDouble selects the setter. */   \
+    AssembledTest test = assemble();                                           \
+    if (IsDouble) {                                                            \
+      test.setQwordTo(T0, static_cast<double>(V0));                            \
+      test.setQwordTo(T1, static_cast<double>(V1));                            \
+    } else {                                                                   \
+      test.setDwordTo(T0, static_cast<float>(V0));                             \
+      test.setDwordTo(T1, static_cast<float>(V1));                             \
+    }                                                                          \
+                                                                               \
+    test.run();                                                                \
+    /* Dst must match V0 Op V1 as computed by the C++ expression. */           \
+    ASSERT_DOUBLE_EQ(V0 Op V1, test.Dst<Type>()) << TestString;                \
+    reset();                                                                   \
+  } while (0)
+
+#define TestArithSS(FloatSize, Src, Dst0, Dst1)                                \
+  do { /* XmmXmm cases use Src and Dst0; XmmAddr cases use Dst1. */            \
+    TestArithSSXmmXmm(FloatSize, Src, 1.0, Dst0, 10.0, addss, +);              \
+    TestArithSSXmmAddr(FloatSize, 2.0, Dst1, 20.0, addss, +);                  \
+    TestArithSSXmmXmm(FloatSize, Src, 3.0, Dst0, 30.0, subss, -);              \
+    TestArithSSXmmAddr(FloatSize, 4.0, Dst1, 40.0, subss, -);                  \
+    TestArithSSXmmXmm(FloatSize, Src, 5.0, Dst0, 50.0, mulss, *);              \
+    TestArithSSXmmAddr(FloatSize, 6.0, Dst1, 60.0, mulss, *);                  \
+    TestArithSSXmmXmm(FloatSize, Src, 7.0, Dst0, 70.0, divss, / );             \
+    TestArithSSXmmAddr(FloatSize, 8.0, Dst1, 80.0, divss, / );                 \
+  } while (0)
+  // Rotate Src/Dst0/Dst1 through xmm0..xmm7, for f32 and then f64.
+  TestArithSS(32, xmm0, xmm1, xmm2);
+  TestArithSS(32, xmm1, xmm2, xmm3);
+  TestArithSS(32, xmm2, xmm3, xmm4);
+  TestArithSS(32, xmm3, xmm4, xmm5);
+  TestArithSS(32, xmm4, xmm5, xmm6);
+  TestArithSS(32, xmm5, xmm6, xmm7);
+  TestArithSS(32, xmm6, xmm7, xmm0);
+  TestArithSS(32, xmm7, xmm0, xmm1);
+
+  TestArithSS(64, xmm0, xmm1, xmm2);
+  TestArithSS(64, xmm1, xmm2, xmm3);
+  TestArithSS(64, xmm2, xmm3, xmm4);
+  TestArithSS(64, xmm3, xmm4, xmm5);
+  TestArithSS(64, xmm4, xmm5, xmm6);
+  TestArithSS(64, xmm5, xmm6, xmm7);
+  TestArithSS(64, xmm6, xmm7, xmm0);
+  TestArithSS(64, xmm7, xmm0, xmm1);
+
+#undef TestArithSS
+#undef TestArithSSXmmAddr
+#undef TestArithSSXmmXmm
+}
+
+TEST_F(AssemblerX8632Test, PArith) {
+#define TestPArithXmmXmm(Dst, Value0, Src, Value1, Inst, Op, Type, Size)       \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Inst ", " #Op       \
+        ", " #Type ", " #Size ")";                                             \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+                                                                               \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ Inst(IceType_i##Size, XmmRegister::Encoded_Reg_##Dst,                   \
+            XmmRegister::Encoded_Reg_##Src);                                   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<Type##Size##_t>(V0) Op V1, test.Dst<Dqword>())          \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestPArithXmmAddr(Dst, Value0, Value1, Inst, Op, Type, Size)           \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", " #Inst ", " #Op           \
+        ", " #Type ", " #Size ")";                                             \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+                                                                               \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ Inst(IceType_i##Size, XmmRegister::Encoded_Reg_##Dst,                   \
+            dwordAddress(T1));                                                 \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<Type##Size##_t>(V0) Op V1, test.Dst<Dqword>())          \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestPArithXmmImm(Dst, Value0, Imm, Inst, Op, Type, Size)               \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Imm ", " #Inst ", " #Op ", " #Type         \
+        ", " #Size ")";                                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ Inst(IceType_i##Size, XmmRegister::Encoded_Reg_##Dst, Immediate(Imm));  \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<Type##Size##_t>(V0) Op Imm, test.Dst<Dqword>())         \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestPAndnXmmXmm(Dst, Value0, Src, Value1, Type, Size)                  \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", pandn, " #Type         \
+        ", " #Size ")";                                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+                                                                               \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ pandn(IceType_i##Size, XmmRegister::Encoded_Reg_##Dst,                  \
+             XmmRegister::Encoded_Reg_##Src);                                  \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(~(packedAs<Type##Size##_t>(V0)) & V1, test.Dst<Dqword>())        \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestPAndnXmmAddr(Dst, Value0, Value1, Type, Size)                      \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", pandn, " #Type ", " #Size  \
+        ")";                                                                   \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+                                                                               \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ pandn(IceType_i##Size, XmmRegister::Encoded_Reg_##Dst,                  \
+             dwordAddress(T1));                                                \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ((~packedAs<Type##Size##_t>(V0)) & V1, test.Dst<Dqword>())        \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestPArithSize(Dst, Src, Size)                                         \
+  do {                                                                         \
+    static_assert(Size == 8 || Size == 16 || Size == 32, "Invalid size.");     \
+    if (Size != 8) {                                                           \
+      TestPArithXmmXmm(                                                        \
+          Dst,                                                                 \
+          (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),  \
+          Src, (uint64_t(3u), uint64_t(0u)), psra, >>, int, Size);             \
+      TestPArithXmmAddr(Dst, (uint64_t(0x8040201008040201ull),                 \
+                              uint64_t(0x8080404002020101ull)),                \
+                        (uint64_t(3u), uint64_t(0u)), psra, >>, int, Size);    \
+      TestPArithXmmImm(Dst, (uint64_t(0x8040201008040201ull),                  \
+                             uint64_t(0x8080404002020101ull)),                 \
+                       3u, psra, >>, int, Size);                               \
+      TestPArithXmmXmm(                                                        \
+          Dst,                                                                 \
+          (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),  \
+          Src, (uint64_t(3u), uint64_t(0u)), psrl, >>, uint, Size);            \
+      TestPArithXmmAddr(Dst, (uint64_t(0x8040201008040201ull),                 \
+                              uint64_t(0x8080404002020101ull)),                \
+                        (uint64_t(3u), uint64_t(0u)), psrl, >>, uint, Size);   \
+      TestPArithXmmImm(Dst, (uint64_t(0x8040201008040201ull),                  \
+                             uint64_t(0x8080404002020101ull)),                 \
+                       3u, psrl, >>, uint, Size);                              \
+      TestPArithXmmXmm(                                                        \
+          Dst,                                                                 \
+          (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),  \
+          Src, (uint64_t(3u), uint64_t(0u)), psll, <<, uint, Size);            \
+      TestPArithXmmAddr(Dst, (uint64_t(0x8040201008040201ull),                 \
+                              uint64_t(0x8080404002020101ull)),                \
+                        (uint64_t(3u), uint64_t(0u)), psll, <<, uint, Size);   \
+      TestPArithXmmImm(Dst, (uint64_t(0x8040201008040201ull),                  \
+                             uint64_t(0x8080404002020101ull)),                 \
+                       3u, psll, <<, uint, Size);                              \
+                                                                               \
+      TestPArithXmmXmm(Dst, (uint64_t(0x8040201008040201ull),                  \
+                             uint64_t(0x8080404002020101ull)),                 \
+                       Src, (uint64_t(0xFFFFFFFF00000000ull),                  \
+                             uint64_t(0x0123456789ABCDEull)),                  \
+                       pmull, *, int, Size);                                   \
+      TestPArithXmmAddr(                                                       \
+          Dst,                                                                 \
+          (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),  \
+          (uint64_t(0xFFFFFFFF00000000ull), uint64_t(0x0123456789ABCDEull)),   \
+          pmull, *, int, Size);                                                \
+      if (Size != 16) {                                                        \
+        TestPArithXmmXmm(Dst, (uint64_t(0x8040201008040201ull),                \
+                               uint64_t(0x8080404002020101ull)),               \
+                         Src, (uint64_t(0xFFFFFFFF00000000ull),                \
+                               uint64_t(0x0123456789ABCDEull)),                \
+                         pmuludq, *, uint, Size);                              \
+        TestPArithXmmAddr(                                                     \
+            Dst, (uint64_t(0x8040201008040201ull),                             \
+                  uint64_t(0x8080404002020101ull)),                            \
+            (uint64_t(0xFFFFFFFF00000000ull), uint64_t(0x0123456789ABCDEull)), \
+            pmuludq, *, uint, Size);                                           \
+      }                                                                        \
+    }                                                                          \
+    TestPArithXmmXmm(Dst, (uint64_t(0x8040201008040201ull),                    \
+                           uint64_t(0x8080404002020101ull)),                   \
+                     Src, (uint64_t(0xFFFFFFFF00000000ull),                    \
+                           uint64_t(0x0123456789ABCDEull)),                    \
+                     padd, +, int, Size);                                      \
+    TestPArithXmmAddr(                                                         \
+        Dst,                                                                   \
+        (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),    \
+        (uint64_t(0xFFFFFFFF00000000ull), uint64_t(0x0123456789ABCDEull)),     \
+        padd, +, int, Size);                                                   \
+    TestPArithXmmXmm(Dst, (uint64_t(0x8040201008040201ull),                    \
+                           uint64_t(0x8080404002020101ull)),                   \
+                     Src, (uint64_t(0xFFFFFFFF00000000ull),                    \
+                           uint64_t(0x0123456789ABCDEull)),                    \
+                     psub, -, int, Size);                                      \
+    TestPArithXmmAddr(                                                         \
+        Dst,                                                                   \
+        (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),    \
+        (uint64_t(0xFFFFFFFF00000000ull), uint64_t(0x0123456789ABCDEull)),     \
+        psub, -, int, Size);                                                   \
+    TestPArithXmmXmm(Dst, (uint64_t(0x8040201008040201ull),                    \
+                           uint64_t(0x8080404002020101ull)),                   \
+                     Src, (uint64_t(0xFFFFFFFF00000000ull),                    \
+                           uint64_t(0x0123456789ABCDEull)),                    \
+                     pand, &, int, Size);                                      \
+    TestPArithXmmAddr(                                                         \
+        Dst,                                                                   \
+        (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),    \
+        (uint64_t(0xFFFFFFFF00000000ull), uint64_t(0x0123456789ABCDEull)),     \
+        pand, &, int, Size);                                                   \
+                                                                               \
+    TestPAndnXmmXmm(Dst, (uint64_t(0x8040201008040201ull),                     \
+                          uint64_t(0x8080404002020101ull)),                    \
+                    Src, (uint64_t(0xFFFFFFFF00000000ull),                     \
+                          uint64_t(0x0123456789ABCDEull)),                     \
+                    int, Size);                                                \
+    TestPAndnXmmAddr(                                                          \
+        Dst,                                                                   \
+        (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),    \
+        (uint64_t(0xFFFFFFFF00000000ull), uint64_t(0x0123456789ABCDEull)),     \
+        int, Size);                                                            \
+                                                                               \
+    TestPArithXmmXmm(Dst, (uint64_t(0x8040201008040201ull),                    \
+                           uint64_t(0x8080404002020101ull)),                   \
+                     Src, (uint64_t(0xFFFFFFFF00000000ull),                    \
+                           uint64_t(0x0123456789ABCDEull)),                    \
+                     por, |, int, Size);                                       \
+    TestPArithXmmAddr(                                                         \
+        Dst,                                                                   \
+        (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),    \
+        (uint64_t(0xFFFFFFFF00000000ull), uint64_t(0x0123456789ABCDEull)),     \
+        por, |, int, Size);                                                    \
+    TestPArithXmmXmm(Dst, (uint64_t(0x8040201008040201ull),                    \
+                           uint64_t(0x8080404002020101ull)),                   \
+                     Src, (uint64_t(0xFFFFFFFF00000000ull),                    \
+                           uint64_t(0x0123456789ABCDEull)),                    \
+                     pxor, ^, int, Size);                                      \
+    TestPArithXmmAddr(                                                         \
+        Dst,                                                                   \
+        (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),    \
+        (uint64_t(0xFFFFFFFF00000000ull), uint64_t(0x0123456789ABCDEull)),     \
+        pxor, ^, int, Size);                                                   \
+  } while (0)
+
+#define TestPArith(Src, Dst)                                                   \
+  do {                                                                         \
+    TestPArithSize(Src, Dst, 8);                                               \
+    TestPArithSize(Src, Dst, 16);                                              \
+    TestPArithSize(Src, Dst, 32);                                              \
+  } while (0)
+
+  TestPArith(xmm0, xmm1);
+  TestPArith(xmm1, xmm2);
+  TestPArith(xmm2, xmm3);
+  TestPArith(xmm3, xmm4);
+  TestPArith(xmm4, xmm5);
+  TestPArith(xmm5, xmm6);
+  TestPArith(xmm6, xmm7);
+  TestPArith(xmm7, xmm0);
+
+#undef TestPArith
+#undef TestPArithSize
+#undef TestPAndnXmmAddr
+#undef TestPAndnXmmXmm
+#undef TestPArithXmmImm
+#undef TestPArithXmmAddr
+#undef TestPArithXmmXmm
+}
+
+// Exercises the packed floating-point arithmetic (addps, subps, mulps, divps),
+// bitwise (andps/andpd, orps/orpd, xorps/xorpd) and min/max (minps, maxps,
+// minpd, maxpd) instructions. Each case assembles a short program, runs it and
+// compares the destination xmm register with the value computed on the host
+// through packedAs<Type>.
+TEST_F(AssemblerX8632Test, ArithPS) {
+  // Register-register form of an instruction that takes an explicit IceType
+  // (always IceType_f32 here). Expected value: packedAs<Type>(V0) Op V1.
+#define TestArithPSXmmXmm(Dst, Value0, Src, Value1, Inst, Op, Type)            \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Inst ", " #Op       \
+        ", " #Type ")";                                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ Inst(IceType_f32, XmmRegister::Encoded_Reg_##Dst,                       \
+            XmmRegister::Encoded_Reg_##Src);                                   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<Type>(V0) Op V1, test.Dst<Dqword>()) << TestString;     \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+  // Register-register form of an instruction whose emitter takes no IceType
+  // argument (the and/or/xor ps and pd variants below).
+#define TestArithPSXmmXmmUntyped(Dst, Value0, Src, Value1, Inst, Op, Type)     \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Inst ", " #Op       \
+        ", " #Type ")";                                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ Inst(XmmRegister::Encoded_Reg_##Dst, XmmRegister::Encoded_Reg_##Src);   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<Type>(V0) Op V1, test.Dst<Dqword>()) << TestString;     \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+  // Register-memory form of an untyped instruction: the second operand is read
+  // straight from the dqword slot T1 instead of from a register.
+#define TestArithPSXmmAddrUntyped(Dst, Value0, Value1, Inst, Op, Type)         \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", " #Inst ", " #Op           \
+        ", " #Type ")";                                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ Inst(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));                 \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<Type>(V0) Op V1, test.Dst<Dqword>()) << TestString;     \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+  // Register-register min/max: the expected value comes from the packedAs<>
+  // member function that shares its name with the instruction under test.
+#define TestMinMaxPS(Dst, Value0, Src, Value1, Inst, Type)                     \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Inst ", " #Type     \
+        ")";                                                                   \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ Inst(XmmRegister::Encoded_Reg_##Dst, XmmRegister::Encoded_Reg_##Src);   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<Type>(V0).Inst(V1), test.Dst<Dqword>()) << TestString;  \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+  // Register-memory form of an instruction that takes an explicit IceType.
+#define TestArithPSXmmAddr(Dst, Value0, Value1, Inst, Op, Type)                \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", " #Inst ", " #Op           \
+        ", " #Type ")";                                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ Inst(IceType_f32, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));    \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<Type>(V0) Op V1, test.Dst<Dqword>()) << TestString;     \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+  // Runs every instruction/operand-form combination for one (Dst, Src) pair.
+  // orps/orpd are only exercised in their register-register form.
+#define TestArithPS(Dst, Src)                                                  \
+  do {                                                                         \
+    TestArithPSXmmXmm(Dst, (1.0, 100.0, -1000.0, 20.0), Src,                   \
+                      (0.55, 0.43, 0.23, 1.21), addps, +, float);              \
+    TestArithPSXmmAddr(Dst, (1.0, 100.0, -1000.0, 20.0),                       \
+                       (0.55, 0.43, 0.23, 1.21), addps, +, float);             \
+    TestArithPSXmmXmm(Dst, (1.0, 100.0, -1000.0, 20.0), Src,                   \
+                      (0.55, 0.43, 0.23, 1.21), subps, -, float);              \
+    TestArithPSXmmAddr(Dst, (1.0, 100.0, -1000.0, 20.0),                       \
+                       (0.55, 0.43, 0.23, 1.21), subps, -, float);             \
+    TestArithPSXmmXmm(Dst, (1.0, 100.0, -1000.0, 20.0), Src,                   \
+                      (0.55, 0.43, 0.23, 1.21), mulps, *, float);              \
+    TestArithPSXmmAddr(Dst, (1.0, 100.0, -1000.0, 20.0),                       \
+                       (0.55, 0.43, 0.23, 1.21), mulps, *, float);             \
+    TestArithPSXmmXmm(Dst, (1.0, 100.0, -1000.0, 20.0), Src,                   \
+                      (0.55, 0.43, 0.23, 1.21), divps, /, float);              \
+    TestArithPSXmmAddr(Dst, (1.0, 100.0, -1000.0, 20.0),                       \
+                       (0.55, 0.43, 0.23, 1.21), divps, /, float);             \
+    TestArithPSXmmXmmUntyped(Dst, (1.0, 100.0, -1000.0, 20.0), Src,            \
+                             (0.55, 0.43, 0.23, 1.21), andps, &, float);       \
+    TestArithPSXmmAddrUntyped(Dst, (1.0, 100.0, -1000.0, 20.0),                \
+                              (0.55, 0.43, 0.23, 1.21), andps, &, float);      \
+    TestArithPSXmmXmmUntyped(Dst, (1.0, -1000.0), Src, (0.55, 1.21), andpd, &, \
+                             double);                                          \
+    TestArithPSXmmAddrUntyped(Dst, (1.0, -1000.0), (0.55, 1.21), andpd, &,     \
+                              double);                                         \
+    TestArithPSXmmXmmUntyped(Dst, (1.0, 100.0, -1000.0, 20.0), Src,            \
+                             (0.55, 0.43, 0.23, 1.21), orps, |, float);        \
+    TestArithPSXmmXmmUntyped(Dst, (1.0, -1000.0), Src, (0.55, 1.21), orpd, |,  \
+                             double);                                          \
+    TestMinMaxPS(Dst, (1.0, 100.0, -1000.0, 20.0), Src,                        \
+                 (0.55, 0.43, 0.23, 1.21), minps, float);                      \
+    TestMinMaxPS(Dst, (1.0, 100.0, -1000.0, 20.0), Src,                        \
+                 (0.55, 0.43, 0.23, 1.21), maxps, float);                      \
+    TestMinMaxPS(Dst, (1.0, -1000.0), Src, (0.55, 1.21), minpd, double);       \
+    TestMinMaxPS(Dst, (1.0, -1000.0), Src, (0.55, 1.21), maxpd, double);       \
+    TestArithPSXmmXmmUntyped(Dst, (1.0, 100.0, -1000.0, 20.0), Src,            \
+                             (0.55, 0.43, 0.23, 1.21), xorps, ^, float);       \
+    TestArithPSXmmAddrUntyped(Dst, (1.0, 100.0, -1000.0, 20.0),                \
+                              (0.55, 0.43, 0.23, 1.21), xorps, ^, float);      \
+    TestArithPSXmmXmmUntyped(Dst, (1.0, -1000.0), Src, (0.55, 1.21), xorpd, ^, \
+                             double);                                          \
+    TestArithPSXmmAddrUntyped(Dst, (1.0, -1000.0), (0.55, 1.21), xorpd, ^,     \
+                              double);                                         \
+  } while (0)
+
+  // Rotate through all eight xmm registers so each one is used both as the
+  // destination and as the source.
+  TestArithPS(xmm0, xmm1);
+  TestArithPS(xmm1, xmm2);
+  TestArithPS(xmm2, xmm3);
+  TestArithPS(xmm3, xmm4);
+  TestArithPS(xmm4, xmm5);
+  TestArithPS(xmm5, xmm6);
+  TestArithPS(xmm6, xmm7);
+  TestArithPS(xmm7, xmm0);
+
+  // Undefine every helper (in reverse definition order) so none of them leaks
+  // into the tests that follow.
+#undef TestArithPS
+#undef TestArithPSXmmAddr
+#undef TestMinMaxPS
+#undef TestArithPSXmmAddrUntyped
+#undef TestArithPSXmmXmmUntyped
+#undef TestArithPSXmmXmm
+}
+
+// Checks the variable-blend instructions (blendvps with f32 operands, pblendvb
+// with i8 operands) in register-register and register-memory form. The mask is
+// always loaded into xmm0 before the instruction under test is emitted.
+TEST_F(AssemblerX8632Test, Blending) {
+  using f32 = float;
+  using i8 = uint8_t;
+
+  // Register source: load mask, destination and source, blend, then compare
+  // the destination register with packedAs<Type>(...).blendWith(...).
+#define TestBlendingXmmXmm(Dst, Value0, Src, Value1, M /*ask*/, Inst, Type)    \
+  do {                                                                         \
+    static constexpr char Label[] =                                            \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #M ", " #Inst        \
+        ", " #Type ")";                                                        \
+    const uint32_t DstMem = allocateDqword();                                  \
+    const uint32_t SrcMem = allocateDqword();                                  \
+    const uint32_t MaskMem = allocateDqword();                                 \
+    const Dqword DstInit Value0;                                               \
+    const Dqword SrcInit Value1;                                               \
+    const Dqword MaskInit M;                                                   \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_xmm0, dwordAddress(MaskMem));           \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(DstMem));           \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(SrcMem));           \
+    __ Inst(IceType_##Type, XmmRegister::Encoded_Reg_##Dst,                    \
+            XmmRegister::Encoded_Reg_##Src);                                   \
+                                                                               \
+    AssembledTest Code = assemble();                                           \
+    Code.setDqwordTo(DstMem, DstInit);                                         \
+    Code.setDqwordTo(SrcMem, SrcInit);                                         \
+    Code.setDqwordTo(MaskMem, MaskInit);                                       \
+    Code.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<Type>(DstInit).blendWith(SrcInit, MaskInit),            \
+              Code.Dst<Dqword>())                                              \
+        << Label;                                                              \
+    reset();                                                                   \
+  } while (0)
+
+  // Memory source: same as above, except the second operand is read directly
+  // from its dqword slot rather than from a register.
+#define TestBlendingXmmAddr(Dst, Value0, Value1, M /*ask*/, Inst, Type)        \
+  do {                                                                         \
+    static constexpr char Label[] =                                            \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", " #M ", " #Inst ", " #Type \
+        ")";                                                                   \
+    const uint32_t DstMem = allocateDqword();                                  \
+    const uint32_t SrcMem = allocateDqword();                                  \
+    const uint32_t MaskMem = allocateDqword();                                 \
+    const Dqword DstInit Value0;                                               \
+    const Dqword SrcInit Value1;                                               \
+    const Dqword MaskInit M;                                                   \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_xmm0, dwordAddress(MaskMem));           \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(DstMem));           \
+    __ Inst(IceType_##Type, XmmRegister::Encoded_Reg_##Dst,                    \
+            dwordAddress(SrcMem));                                             \
+                                                                               \
+    AssembledTest Code = assemble();                                           \
+    Code.setDqwordTo(DstMem, DstInit);                                         \
+    Code.setDqwordTo(SrcMem, SrcInit);                                         \
+    Code.setDqwordTo(MaskMem, MaskInit);                                       \
+    Code.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<Type>(DstInit).blendWith(SrcInit, MaskInit),            \
+              Code.Dst<Dqword>())                                              \
+        << Label;                                                              \
+    reset();                                                                   \
+  } while (0)
+
+  // Runs both operand forms of blendvps and pblendvb for one register pair.
+#define TestBlending(Src, Dst)                                                 \
+  do {                                                                         \
+    TestBlendingXmmXmm(                                                        \
+        Dst, (1.0, 2.0, 1.0, 2.0), Src, (-1.0, -2.0, -1.0, -2.0),              \
+        (uint64_t(0x8000000000000000ull), uint64_t(0x0000000080000000ull)),    \
+        blendvps, f32);                                                        \
+    TestBlendingXmmAddr(                                                       \
+        Dst, (1.0, 2.0, 1.0, 2.0), (-1.0, -2.0, -1.0, -2.0),                   \
+        (uint64_t(0x8000000000000000ull), uint64_t(0x0000000080000000ull)),    \
+        blendvps, f32);                                                        \
+    TestBlendingXmmXmm(                                                        \
+        Dst,                                                                   \
+        (uint64_t(0xFFFFFFFFFFFFFFFFull), uint64_t(0xBBBBBBBBBBBBBBBBull)),    \
+        Src,                                                                   \
+        (uint64_t(0xAAAAAAAAAAAAAAAAull), uint64_t(0xEEEEEEEEEEEEEEEEull)),    \
+        (uint64_t(0x8000000000000080ull), uint64_t(0x8080808000000000ull)),    \
+        pblendvb, i8);                                                         \
+    TestBlendingXmmAddr(                                                       \
+        Dst,                                                                   \
+        (uint64_t(0xFFFFFFFFFFFFFFFFull), uint64_t(0xBBBBBBBBBBBBBBBBull)),    \
+        (uint64_t(0xAAAAAAAAAAAAAAAAull), uint64_t(0xEEEEEEEEEEEEEEEEull)),    \
+        (uint64_t(0x8000000000000080ull), uint64_t(0x8080808000000000ull)),    \
+        pblendvb, i8);                                                         \
+  } while (0)
+
+  /* xmm0 is reserved for the implicit mask, so it is never used as Src/Dst. */
+  TestBlending(xmm1, xmm2);
+  TestBlending(xmm2, xmm3);
+  TestBlending(xmm3, xmm4);
+  TestBlending(xmm4, xmm5);
+  TestBlending(xmm5, xmm6);
+  TestBlending(xmm6, xmm7);
+  TestBlending(xmm7, xmm1);
+
+#undef TestBlending
+#undef TestBlendingXmmAddr
+#undef TestBlendingXmmXmm
+}
+
+// Exercises cmpps in register-register and register-memory form. The result
+// left in the destination register is compared with the value computed on the
+// host through packedAs<float>.
+TEST_F(AssemblerX8632Test, Cmpps) {
+  // Register-register compare of two identical vectors using predicate C; the
+  // expected value is packedAs<float>(V0) Op V1.
+#define TestCmppsXmmXmm(Dst, Src, C, Op)                                       \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Src ", " #C ", " #Op ")";                               \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0(-1.0, 1.0, 3.14, 1024.5);                                  \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1(-1.0, 1.0, 3.14, 1024.5);                                  \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ cmpps(XmmRegister::Encoded_Reg_##Dst, XmmRegister::Encoded_Reg_##Src,   \
+             Cond::Cmpps_##C);                                                 \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<float>(V0) Op V1, test.Dst<Dqword>()) << TestString;    \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+  // Register-memory variant of TestCmppsXmmXmm: the second operand is read
+  // from the dqword slot T1.
+#define TestCmppsXmmAddr(Dst, C, Op)                                           \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", Addr, " #C ", " #Op ")";  \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0(-1.0, 1.0, 3.14, 1024.5);                                  \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1(-1.0, 1.0, 3.14, 1024.5);                                  \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ cmpps(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1),                 \
+             Cond::Cmpps_##C);                                                 \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<float>(V0) Op V1, test.Dst<Dqword>()) << TestString;    \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+  // Register-register compare with quiet NaNs placed so that the four lanes
+  // cover number/number, number/NaN, NaN/number and NaN/NaN. The expected value
+  // comes from the packedAs<float> member named after the predicate C.
+#define TestCmppsOrdUnordXmmXmm(Dst, Src, C)                                   \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Src ", " #C ")";       \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0(1.0, 1.0, std::numeric_limits<float>::quiet_NaN(),         \
+                    std::numeric_limits<float>::quiet_NaN());                  \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1(1.0, std::numeric_limits<float>::quiet_NaN(), 1.0,         \
+                    std::numeric_limits<float>::quiet_NaN());                  \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ cmpps(XmmRegister::Encoded_Reg_##Dst, XmmRegister::Encoded_Reg_##Src,   \
+             Cond::Cmpps_##C);                                                 \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<float>(V0).C(V1), test.Dst<Dqword>()) << TestString;    \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+  // Register-memory variant of TestCmppsOrdUnordXmmXmm.
+#define TestCmppsOrdUnordXmmAddr(Dst, C)                                       \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", Addr, " #C ")";           \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0(1.0, 1.0, std::numeric_limits<float>::quiet_NaN(),         \
+                    std::numeric_limits<float>::quiet_NaN());                  \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1(1.0, std::numeric_limits<float>::quiet_NaN(), 1.0,         \
+                    std::numeric_limits<float>::quiet_NaN());                  \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ cmpps(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1),                 \
+             Cond::Cmpps_##C);                                                 \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<float>(V0).C(V1), test.Dst<Dqword>()) << TestString;    \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+  // NOTE(review): only the eq and unord predicates are exercised here. The
+  // repeated eq invocations look like placeholders for the remaining cmpps
+  // predicates (presumably lt, le, neq, nlt, nle and ord) -- TODO confirm the
+  // matching Cond:: values and packedAs<> operators exist, then extend.
+#define TestCmpps(Dst, Src)                                                    \
+  do {                                                                         \
+    TestCmppsXmmXmm(Dst, Src, eq, == );                                        \
+    TestCmppsXmmAddr(Dst, eq, == );                                            \
+    TestCmppsXmmXmm(Dst, Src, eq, == );                                        \
+    TestCmppsXmmAddr(Dst, eq, == );                                            \
+    TestCmppsXmmXmm(Dst, Src, eq, == );                                        \
+    TestCmppsXmmAddr(Dst, eq, == );                                            \
+    TestCmppsOrdUnordXmmXmm(Dst, Src, unord);                                  \
+    TestCmppsOrdUnordXmmAddr(Dst, unord);                                      \
+    TestCmppsXmmXmm(Dst, Src, eq, == );                                        \
+    TestCmppsXmmAddr(Dst, eq, == );                                            \
+    TestCmppsXmmXmm(Dst, Src, eq, == );                                        \
+    TestCmppsXmmAddr(Dst, eq, == );                                            \
+    TestCmppsXmmXmm(Dst, Src, eq, == );                                        \
+    TestCmppsXmmAddr(Dst, eq, == );                                            \
+    TestCmppsOrdUnordXmmXmm(Dst, Src, unord);                                  \
+    TestCmppsOrdUnordXmmAddr(Dst, unord);                                      \
+  } while (0)
+
+  // Rotate through all eight xmm registers so each one is used both as the
+  // destination and as the source.
+  TestCmpps(xmm0, xmm1);
+  TestCmpps(xmm1, xmm2);
+  TestCmpps(xmm2, xmm3);
+  TestCmpps(xmm3, xmm4);
+  TestCmpps(xmm4, xmm5);
+  TestCmpps(xmm5, xmm6);
+  TestCmpps(xmm6, xmm7);
+  TestCmpps(xmm7, xmm0);
+
+#undef TestCmpps
+#undef TestCmppsOrdUnordXmmAddr
+#undef TestCmppsOrdUnordXmmXmm
+#undef TestCmppsXmmAddr
+#undef TestCmppsXmmXmm
+}
+
+TEST_F(AssemblerX8632Test, Sqrtps_Rsqrtps_Reciprocalps_Sqrtpd) {
+#define TestImplSingle(Dst, Inst, Expect)                                      \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Inst ")";              \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0(1.0, 4.0, 20.0, 3.14);                                     \
+    /* Emit: load V0 into Dst, then call Inst with Dst as its only operand. */ \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ Inst(XmmRegister::Encoded_Reg_##Dst);                                   \
+    /* Run with V0 stored at T0 and compare Dst against Dqword Expect. */      \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.run();                                                                \
+    ASSERT_EQ(Dqword Expect, test.Dst<Dqword>()) << TestString;                \
+    reset();                                                                   \
+  } while (0)
+  // Applies sqrtps, rsqrtps, reciprocalps and sqrtpd to Dst in turn.
+#define TestImpl(Dst)                                                          \
+  do {                                                                         \
+    TestImplSingle(Dst, sqrtps, (uint64_t(0x400000003F800000ull),              \
+                                 uint64_t(0x3FE2D10B408F1BBDull)));            \
+    TestImplSingle(Dst, rsqrtps, (uint64_t(0x3EFFF0003F7FF000ull),             \
+                                  uint64_t(0x3F1078003E64F000ull)));           \
+    TestImplSingle(Dst, reciprocalps, (uint64_t(0x3E7FF0003F7FF000ull),        \
+                                       uint64_t(0x3EA310003D4CC000ull)));      \
+    /* sqrtpd is fed the same four-element V0 as the ps instructions. */       \
+    TestImplSingle(Dst, sqrtpd, (uint64_t(0x4036A09E9365F5F3ull),              \
+                                 uint64_t(0x401C42FAE40282A8ull)));            \
+  } while (0)
+  // Repeat for each of xmm0..xmm7 as the destination register.
+  TestImpl(xmm0);
+  TestImpl(xmm1);
+  TestImpl(xmm2);
+  TestImpl(xmm3);
+  TestImpl(xmm4);
+  TestImpl(xmm5);
+  TestImpl(xmm6);
+  TestImpl(xmm7);
+
+#undef TestImpl
+#undef TestImplSingle
+}
+
+TEST_F(AssemblerX8632Test, Unpck) { // unpck{l,h}p{s,d}, xmm/xmm operands
+  const Dqword V0(uint64_t(0xAAAAAAAABBBBBBBBull),
+                  uint64_t(0xCCCCCCCCDDDDDDDDull));
+  const Dqword V1(uint64_t(0xEEEEEEEEFFFFFFFFull),
+                  uint64_t(0x9999999988888888ull));
+  // Expected Dst after "Inst Dst, Src" with Dst = V0 and Src = V1.
+  const Dqword unpcklpsExpected(uint64_t(0xFFFFFFFFBBBBBBBBull),
+                                uint64_t(0xEEEEEEEEAAAAAAAAull));
+  const Dqword unpcklpdExpected(uint64_t(0xAAAAAAAABBBBBBBBull),
+                                uint64_t(0xEEEEEEEEFFFFFFFFull));
+  const Dqword unpckhpsExpected(uint64_t(0x88888888DDDDDDDDull),
+                                uint64_t(0x99999999CCCCCCCCull));
+  const Dqword unpckhpdExpected(uint64_t(0xCCCCCCCCDDDDDDDDull),
+                                uint64_t(0x9999999988888888ull));
+  // One case: Dst <- V0, Src <- V1, run Inst, compare Dst with Inst##Expected.
+#define TestImplSingle(Dst, Src, Inst)                                         \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")";    \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+    /* Emit: fill both registers from memory, then the instruction. */         \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ Inst(XmmRegister::Encoded_Reg_##Dst, XmmRegister::Encoded_Reg_##Src);   \
+    /* Assemble, seed the two memory operands and execute. */                  \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Inst##Expected, test.Dst<Dqword>()) << TestString;               \
+    reset();                                                                   \
+  } while (0)
+  // Runs all four unpack variants for one (Dst, Src) pair.
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplSingle(Dst, Src, unpcklps);                                        \
+    TestImplSingle(Dst, Src, unpcklpd);                                        \
+    TestImplSingle(Dst, Src, unpckhps);                                        \
+    TestImplSingle(Dst, Src, unpckhpd);                                        \
+  } while (0)
+  // Pair each register with its successor, wrapping from xmm7 back to xmm0.
+  TestImpl(xmm0, xmm1);
+  TestImpl(xmm1, xmm2);
+  TestImpl(xmm2, xmm3);
+  TestImpl(xmm3, xmm4);
+  TestImpl(xmm4, xmm5);
+  TestImpl(xmm5, xmm6);
+  TestImpl(xmm6, xmm7);
+  TestImpl(xmm7, xmm0);
+
+#undef TestImpl
+#undef TestImplSingle
+}
+
+TEST_F(AssemblerX8632Test, Shufp) { // pshufd and shufps with an immediate
+  const Dqword V0(uint64_t(0x1111111122222222ull),
+                  uint64_t(0x5555555577777777ull));
+  const Dqword V1(uint64_t(0xAAAAAAAABBBBBBBBull),
+                  uint64_t(0xCCCCCCCCDDDDDDDDull));
+  // Dst is seeded with V0 and Src (register or memory) with V1 in every case.
+  const uint8_t pshufdImm = 0x63;
+  const Dqword pshufdExpected(uint64_t(0xBBBBBBBBCCCCCCCCull),
+                              uint64_t(0xAAAAAAAADDDDDDDDull));
+
+  const uint8_t shufpsImm = 0xf9;
+  const Dqword shufpsExpected(uint64_t(0x7777777711111111ull),
+                              uint64_t(0xCCCCCCCCCCCCCCCCull));
+  // Typed form, register source: Inst(IceType_f32, Dst, Src, Immediate).
+#define TestImplSingleXmmXmm(Dst, Src, Inst)                                   \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")";    \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ Inst(IceType_f32, XmmRegister::Encoded_Reg_##Dst,                       \
+            XmmRegister::Encoded_Reg_##Src, Immediate(Inst##Imm));             \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Inst##Expected, test.Dst<Dqword>()) << TestString;               \
+    reset();                                                                   \
+  } while (0)
+  // Typed form, memory source: Src is replaced by dwordAddress(T1).
+#define TestImplSingleXmmAddr(Dst, Inst)                                       \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", Addr, " #Inst ")";        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ Inst(IceType_f32, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1),     \
+            Immediate(Inst##Imm));                                             \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Inst##Expected, test.Dst<Dqword>()) << TestString;               \
+    reset();                                                                   \
+  } while (0)
+  // NOTE(review): unused below, and no *UntypedExpected constant is declared.
+#define TestImplSingleXmmXmmUntyped(Dst, Src, Inst)                            \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Src ", " #Inst ", Untyped)";                            \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ Inst(XmmRegister::Encoded_Reg_##Dst, XmmRegister::Encoded_Reg_##Src,    \
+            Immediate(Inst##Imm));                                             \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Inst##UntypedExpected, test.Dst<Dqword>()) << TestString;        \
+    reset();                                                                   \
+  } while (0)
+  // Typed register and memory forms of pshufd and shufps for one pair.
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplSingleXmmXmm(Dst, Src, pshufd);                                    \
+    TestImplSingleXmmAddr(Dst, pshufd);                                        \
+    TestImplSingleXmmXmm(Dst, Src, shufps);                                    \
+    TestImplSingleXmmAddr(Dst, shufps);                                        \
+  } while (0)
+  // Pair each register with its successor, wrapping from xmm7 back to xmm0.
+  TestImpl(xmm0, xmm1);
+  TestImpl(xmm1, xmm2);
+  TestImpl(xmm2, xmm3);
+  TestImpl(xmm3, xmm4);
+  TestImpl(xmm4, xmm5);
+  TestImpl(xmm5, xmm6);
+  TestImpl(xmm6, xmm7);
+  TestImpl(xmm7, xmm0);
+
+#undef TestImpl
+#undef TestImplSingleXmmXmmUntyped
+#undef TestImplSingleXmmAddr
+#undef TestImplSingleXmmXmm
+}
+
+TEST_F(AssemblerX8632Test, Cvt) { // cvt* helpers, Size = 32 and 64
+  const Dqword dq2ps32DstValue(-1.0f, -1.0f, -1.0f, -1.0f);
+  const Dqword dq2ps32SrcValue(-5, 3, 100, 200);
+  const Dqword dq2ps32Expected(-5.0f, 3.0f, 100.0, 200.0);
+  // Naming: <Inst><Size>{DstValue,SrcValue,Expected}; the macros paste these.
+  const Dqword dq2ps64DstValue(0.0f, 0.0f, -1.0f, -1.0f);
+  const Dqword dq2ps64SrcValue(-5, 3, 100, 200);
+  const Dqword dq2ps64Expected(-5.0f, 3.0f, 100.0, 200.0);
+
+  const Dqword tps2dq32DstValue(-1.0f, -1.0f, -1.0f, -1.0f);
+  const Dqword tps2dq32SrcValue(-5.0f, 3.0f, 100.0, 200.0);
+  const Dqword tps2dq32Expected(-5, 3, 100, 200);
+
+  const Dqword tps2dq64DstValue(-1.0f, -1.0f, -1.0f, -1.0f);
+  const Dqword tps2dq64SrcValue(-5.0f, 3.0f, 100.0, 200.0);
+  const Dqword tps2dq64Expected(-5, 3, 100, 200);
+
+  const Dqword si2ss32DstValue(-1.0f, -1.0f, -1.0f, -1.0f);
+  const int32_t si2ss32SrcValue = 5;
+  const Dqword si2ss32Expected(5.0f, -1.0f, -1.0f, -1.0f);
+
+  const Dqword si2ss64DstValue(-1.0, -1.0);
+  const int32_t si2ss64SrcValue = 5;
+  const Dqword si2ss64Expected(5.0, -1.0);
+
+  const int32_t tss2si32DstValue = 0xF00F0FF0;
+  const Dqword tss2si32SrcValue(-5.0f, -1.0f, -1.0f, -1.0f);
+  const int32_t tss2si32Expected = -5;
+
+  const int32_t tss2si64DstValue = 0xF00F0FF0;
+  const Dqword tss2si64SrcValue(-5.0, -1.0);
+  const int32_t tss2si64Expected = -5;
+
+  const Dqword float2float32DstValue(-1.0, -1.0);
+  const Dqword float2float32SrcValue(-5.0, 3, 100, 200);
+  const Dqword float2float32Expected(-5.0, -1.0);
+
+  const Dqword float2float64DstValue(-1.0, -1.0, -1.0, -1.0);
+  const Dqword float2float64SrcValue(-5.0, 3.0);
+  const Dqword float2float64Expected(-5.0, -1.0, -1.0, -1.0);
+  // XMM <- XMM: both registers are preloaded from Dqword-sized memory slots.
+#define TestImplPXmmXmm(Dst, Src, Inst, Size)                                  \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Src ", cvt" #Inst ", f" #Size ")";                      \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ cvt##Inst(IceType_f##Size, XmmRegister::Encoded_Reg_##Dst,              \
+                 XmmRegister::Encoded_Reg_##Src);                              \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, Inst##Size##DstValue);                                \
+    test.setDqwordTo(T1, Inst##Size##SrcValue);                                \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Inst##Size##Expected, test.Dst<Dqword>()) << TestString;         \
+    reset();                                                                   \
+  } while (0)
+  // XMM <- GPR: the integer source is materialized with a mov immediate.
+#define TestImplSXmmReg(Dst, GPR, Inst, Size)                                  \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #GPR ", cvt" #Inst ", f" #Size ")";                      \
+    const uint32_t T0 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##GPR,                        \
+           Immediate(Inst##Size##SrcValue));                                   \
+    __ cvt##Inst(IceType_f##Size, XmmRegister::Encoded_Reg_##Dst,              \
+                 GPRRegister::Encoded_Reg_##GPR);                              \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, Inst##Size##DstValue);                                \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Inst##Size##Expected, test.Dst<Dqword>()) << TestString;         \
+    reset();                                                                   \
+  } while (0)
+  // GPR <- XMM: the GPR starts as DstValue and is compared with Expected.
+#define TestImplSRegXmm(GPR, Src, Inst, Size)                                  \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #GPR ", " #Src ", cvt" #Inst ", f" #Size ")";                      \
+    const uint32_t T0 = allocateDqword();                                      \
+                                                                               \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##GPR,                        \
+           Immediate(Inst##Size##DstValue));                                   \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T0));               \
+    __ cvt##Inst(IceType_f##Size, GPRRegister::Encoded_Reg_##GPR,              \
+                 XmmRegister::Encoded_Reg_##Src);                              \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, Inst##Size##SrcValue);                                \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Inst##Size##Expected), test.GPR())         \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+  // XMM <- memory: like TestImplPXmmXmm, but the source is dwordAddress(T1).
+#define TestImplPXmmAddr(Dst, Inst, Size)                                      \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", Addr, cvt" #Inst ", f" #Size ")";                          \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ cvt##Inst(IceType_f##Size, XmmRegister::Encoded_Reg_##Dst,              \
+                 dwordAddress(T1));                                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, Inst##Size##DstValue);                                \
+    test.setDqwordTo(T1, Inst##Size##SrcValue);                                \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Inst##Size##Expected, test.Dst<Dqword>()) << TestString;         \
+    reset();                                                                   \
+  } while (0)
+  // XMM <- memory: source lives in a Dword slot (allocateDword / setDwordTo).
+#define TestImplSXmmAddr(Dst, Inst, Size)                                      \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", Addr, cvt" #Inst ", f" #Size ")";                          \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDword();                                       \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ cvt##Inst(IceType_f##Size, XmmRegister::Encoded_Reg_##Dst,              \
+                 dwordAddress(T1));                                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, Inst##Size##DstValue);                                \
+    test.setDwordTo(T1, Inst##Size##SrcValue);                                 \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Inst##Size##Expected, test.Dst<Dqword>()) << TestString;         \
+    reset();                                                                   \
+  } while (0)
+  // GPR <- memory: the source Dqword is read through dwordAddress(T0).
+#define TestImplSRegAddr(GPR, Inst, Size)                                      \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #GPR ", Addr, cvt" #Inst ", f" #Size ")";                          \
+    const uint32_t T0 = allocateDqword();                                      \
+                                                                               \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##GPR,                        \
+           Immediate(Inst##Size##DstValue));                                   \
+    __ cvt##Inst(IceType_f##Size, GPRRegister::Encoded_Reg_##GPR,              \
+                 dwordAddress(T0));                                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, Inst##Size##SrcValue);                                \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Inst##Size##Expected), test.GPR())         \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+  // All conversions for one Size; PXmmAddr cases use Src as the destination.
+#define TestImplSize(Dst, Src, GPR, Size)                                      \
+  do {                                                                         \
+    TestImplPXmmXmm(Dst, Src, dq2ps, Size);                                    \
+    TestImplPXmmAddr(Src, dq2ps, Size);                                        \
+    TestImplPXmmXmm(Dst, Src, tps2dq, Size);                                   \
+    TestImplPXmmAddr(Src, tps2dq, Size);                                       \
+    TestImplSXmmReg(Dst, GPR, si2ss, Size);                                    \
+    TestImplSXmmAddr(Dst, si2ss, Size);                                        \
+    TestImplSRegXmm(GPR, Src, tss2si, Size);                                   \
+    TestImplSRegAddr(GPR, tss2si, Size);                                       \
+    TestImplPXmmXmm(Dst, Src, float2float, Size);                              \
+    TestImplPXmmAddr(Src, float2float, Size);                                  \
+  } while (0)
+  // Both sizes (32 and 64) for one (Dst, Src, GPR) triple.
+#define TestImpl(Dst, Src, GPR)                                                \
+  do {                                                                         \
+    TestImplSize(Dst, Src, GPR, 32);                                           \
+    TestImplSize(Dst, Src, GPR, 64);                                           \
+  } while (0)
+  // GPR operands cycle through eax, ebx, ecx, edx, esi, edi and then repeat.
+  TestImpl(xmm0, xmm1, eax);
+  TestImpl(xmm1, xmm2, ebx);
+  TestImpl(xmm2, xmm3, ecx);
+  TestImpl(xmm3, xmm4, edx);
+  TestImpl(xmm4, xmm5, esi);
+  TestImpl(xmm5, xmm6, edi);
+  TestImpl(xmm6, xmm7, eax);
+  TestImpl(xmm7, xmm0, ebx);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplSRegAddr
+#undef TestImplSXmmAddr
+#undef TestImplPXmmAddr
+#undef TestImplSRegXmm
+#undef TestImplSXmmReg
+#undef TestImplPXmmXmm
+}
+
+TEST_F(AssemblerX8632Test, Ucomiss) { // ucomiss with f32 and f64 operands
+  static constexpr float qnan32 = std::numeric_limits<float>::quiet_NaN();
+  static constexpr double qnan64 = std::numeric_limits<double>::quiet_NaN();
+  // Element 0 of each vector is overwritten per case; the rest stay NaN.
+  Dqword test32DstValue(0.0, qnan32, qnan32, qnan32);
+  Dqword test32SrcValue(0.0, qnan32, qnan32, qnan32);
+
+  Dqword test64DstValue(0.0, qnan64);
+  Dqword test64SrcValue(0.0, qnan64);
+  // eax ends as ImmIfTrue only if neither the BParity nor BOther jump fires.
+#define TestImplXmmXmm(Dst, Value0, Src, Value1, Size, CompType, BParity,      \
+                       BOther)                                                 \
+  do {                                                                         \
+    static constexpr char NearBranch = AssemblerX8632::kNearJump;              \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Size ", " #CompType \
+        ", " #BParity ", " #BOther ")";                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    test##Size##DstValue.F##Size[0] = Value0;                                  \
+    const uint32_t T1 = allocateDqword();                                      \
+    test##Size##SrcValue.F##Size[0] = Value1;                                  \
+    const uint32_t ImmIfTrue = 0xBEEF;                                         \
+    const uint32_t ImmIfFalse = 0xC0FFE;                                       \
+    /* Emit: eax = ImmIfFalse, compare, skip the ImmIfTrue store on a jump. */ \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(ImmIfFalse));  \
+    __ ucomiss(IceType_f##Size, XmmRegister::Encoded_Reg_##Dst,                \
+               XmmRegister::Encoded_Reg_##Src);                                \
+    Label Done;                                                                \
+    __ j(Cond::Br_##BParity, &Done, NearBranch);                               \
+    __ j(Cond::Br_##BOther, &Done, NearBranch);                                \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(ImmIfTrue));   \
+    __ bind(&Done);                                                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, test##Size##DstValue);                                \
+    test.setDqwordTo(T1, test##Size##SrcValue);                                \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(ImmIfTrue, test.eax()) << TestString;                            \
+    reset();                                                                   \
+  } while (0)
+  // Same check with the second operand read from memory at dwordAddress(T1).
+#define TestImplXmmAddr(Dst, Value0, Value1, Size, CompType, BParity, BOther)  \
+  do {                                                                         \
+    static constexpr char NearBranch = AssemblerX8632::kNearJump;              \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", " #Size ", " #CompType     \
+        ", " #BParity ", " #BOther ")";                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    test##Size##DstValue.F##Size[0] = Value0;                                  \
+    const uint32_t T1 = allocateDqword();                                      \
+    test##Size##SrcValue.F##Size[0] = Value1;                                  \
+    const uint32_t ImmIfTrue = 0xBEEF;                                         \
+    const uint32_t ImmIfFalse = 0xC0FFE;                                       \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(ImmIfFalse));  \
+    __ ucomiss(IceType_f##Size, XmmRegister::Encoded_Reg_##Dst,                \
+               dwordAddress(T1));                                              \
+    Label Done;                                                                \
+    __ j(Cond::Br_##BParity, &Done, NearBranch);                               \
+    __ j(Cond::Br_##BOther, &Done, NearBranch);                                \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(ImmIfTrue));   \
+    __ bind(&Done);                                                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, test##Size##DstValue);                                \
+    test.setDqwordTo(T1, test##Size##SrcValue);                                \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(ImmIfTrue, test.eax()) << TestString;                            \
+    reset();                                                                   \
+  } while (0)
+  // Runs the register form and the memory form of one comparison case.
+#define TestImplCond(Dst, Value0, Src, Value1, Size, CompType, BParity,        \
+                     BOther)                                                   \
+  do {                                                                         \
+    TestImplXmmXmm(Dst, Value0, Src, Value1, Size, CompType, BParity, BOther); \
+    TestImplXmmAddr(Dst, Value0, Value1, Size, CompType, BParity, BOther);     \
+  } while (0)
+  // CompType is only echoed in TestString; the two branch conditions decide.
+#define TestImplSize(Dst, Src, Size)                                           \
+  do {                                                                         \
+    TestImplCond(Dst, 1.0, Src, 1.0, Size, isEq, p, ne);                       \
+    TestImplCond(Dst, 1.0, Src, 2.0, Size, isNe, p, e);                        \
+    TestImplCond(Dst, 1.0, Src, 2.0, Size, isLe, p, a);                        \
+    TestImplCond(Dst, 1.0, Src, 1.0, Size, isLe, p, a);                        \
+    TestImplCond(Dst, 1.0, Src, 2.0, Size, isLt, p, ae);                       \
+    TestImplCond(Dst, 2.0, Src, 1.0, Size, isGe, p, b);                        \
+    TestImplCond(Dst, 1.0, Src, 1.0, Size, isGe, p, b);                        \
+    TestImplCond(Dst, 2.0, Src, 1.0, Size, isGt, p, be);                       \
+    TestImplCond(Dst, qnan##Size, Src, 1.0, Size, isUnord, np, o);             \
+    TestImplCond(Dst, 1.0, Src, qnan##Size, Size, isUnord, np, s);             \
+    TestImplCond(Dst, qnan##Size, Src, qnan##Size, Size, isUnord, np, s);      \
+  } while (0)
+  // Both operand sizes; Size selects IceType_f##Size and the F##Size field.
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplSize(Dst, Src, 32);                                                \
+    TestImplSize(Dst, Src, 64);                                                \
+  } while (0)
+  // Pair each register with its successor, wrapping from xmm7 back to xmm0.
+  TestImpl(xmm0, xmm1);
+  TestImpl(xmm1, xmm2);
+  TestImpl(xmm2, xmm3);
+  TestImpl(xmm3, xmm4);
+  TestImpl(xmm4, xmm5);
+  TestImpl(xmm5, xmm6);
+  TestImpl(xmm6, xmm7);
+  TestImpl(xmm7, xmm0);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplCond
+#undef TestImplXmmAddr
+#undef TestImplXmmXmm
+}
+
+TEST_F(AssemblerX8632Test, Sqrtss) { // Scalar sqrt; only lane 0 may change.
+  Dqword test32SrcValue(-100.0, -100.0, -100.0, -100.0);
+  Dqword test32DstValue(-1.0, -1.0, -1.0, -1.0);
+  // The Src templates are mutable: each case overwrites lane 0 with its input.
+  Dqword test64SrcValue(-100.0, -100.0);
+  Dqword test64DstValue(-1.0, -1.0);
+  // Register form: Src is loaded from T0 and Dst from T1 before the sqrt.
+#define TestSqrtssXmmXmm(Dst, Src, Value1, Result, Size)                       \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Src ", " #Value1 ", " #Result ", " #Size ")";           \
+    const uint32_t T0 = allocateDqword();                                      \
+    test##Size##SrcValue.F##Size[0] = Value1;                                  \
+    const uint32_t T1 = allocateDqword();                                      \
+    /* Emit: load Src (T0) and Dst (T1), then sqrt lane 0 of Src into Dst. */  \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));               \
+    __ sqrtss(IceType_f##Size, XmmRegister::Encoded_Reg_##Dst,                 \
+              XmmRegister::Encoded_Reg_##Src);                                 \
+    /* Fill the operand slots and execute the generated code. */               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, test##Size##SrcValue);                                \
+    test.setDqwordTo(T1, test##Size##DstValue);                                \
+    test.run();                                                                \
+    /* Dst must keep its other lanes; lane 0 must equal Result. */             \
+    Dqword Expected = test##Size##DstValue;                                    \
+    Expected.F##Size[0] = Result;                                              \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+  // Memory form: same check, but the source operand is read from slot T0.
+#define TestSqrtssXmmAddr(Dst, Value1, Result, Size)                           \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", Addr, " #Value1 ", " #Result ", " #Size ")";               \
+    const uint32_t T0 = allocateDqword();                                      \
+    test##Size##SrcValue.F##Size[0] = Value1;                                  \
+    const uint32_t T1 = allocateDqword();                                      \
+    /* Emit: load Dst (T1), then sqrt lane 0 of memory at T0 into Dst. */      \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));               \
+    __ sqrtss(IceType_f##Size, XmmRegister::Encoded_Reg_##Dst,                 \
+              dwordAddress(T0));                                               \
+    /* Fill the operand slots and execute the generated code. */               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, test##Size##SrcValue);                                \
+    test.setDqwordTo(T1, test##Size##DstValue);                                \
+    test.run();                                                                \
+    /* Dst must keep its other lanes; lane 0 must equal Result. */             \
+    Dqword Expected = test##Size##DstValue;                                    \
+    Expected.F##Size[0] = Result;                                              \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+  // Runs both operand forms over three perfect squares for one element size.
+#define TestSqrtssSize(Dst, Src, Size)                                         \
+  do {                                                                         \
+    TestSqrtssXmmXmm(Dst, Src, 4.0, 2.0, Size);                                \
+    TestSqrtssXmmAddr(Dst, 4.0, 2.0, Size);                                    \
+    TestSqrtssXmmXmm(Dst, Src, 9.0, 3.0, Size);                                \
+    TestSqrtssXmmAddr(Dst, 9.0, 3.0, Size);                                    \
+    TestSqrtssXmmXmm(Dst, Src, 100.0, 10.0, Size);                             \
+    TestSqrtssXmmAddr(Dst, 100.0, 10.0, Size);                                 \
+  } while (0)
+  // Covers both element sizes; Size is pasted into IceType_f<Size>.
+#define TestSqrtss(Dst, Src)                                                   \
+  do {                                                                         \
+    TestSqrtssSize(Dst, Src, 32);                                              \
+    TestSqrtssSize(Dst, Src, 64);                                              \
+  } while (0)
+  // xmm0-xmm7 each appear once as Dst and once as Src.
+  TestSqrtss(xmm0, xmm1);
+  TestSqrtss(xmm1, xmm2);
+  TestSqrtss(xmm2, xmm3);
+  TestSqrtss(xmm3, xmm4);
+  TestSqrtss(xmm4, xmm5);
+  TestSqrtss(xmm5, xmm6);
+  TestSqrtss(xmm6, xmm7);
+  TestSqrtss(xmm7, xmm0);
+
+#undef TestSqrtss
+#undef TestSqrtssSize
+#undef TestSqrtssXmmAddr
+#undef TestSqrtssXmmXmm
+}
+
+TEST_F(AssemblerX8632Test, Insertps) { // Checks insertps for reg and mem Src.
+#define TestInsertpsXmmXmmImm(Dst, Value0, Src, Value1, Imm, Expected)         \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Imm ", " #Expected  \
+        ")";                                                                   \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+    /* Emit: load Dst (T0) and Src (T1), then insertps Dst, Src, Imm. */       \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ insertps(IceType_v4f32, XmmRegister::Encoded_Reg_##Dst,                 \
+                XmmRegister::Encoded_Reg_##Src, Immediate(Imm));               \
+    /* Fill the operand slots and execute the generated code. */               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+    /* Expected is a parenthesized Dqword constructor argument list. */        \
+    ASSERT_EQ(Dqword Expected, test.Dst<Dqword>()) << TestString;              \
+    reset();                                                                   \
+  } while (0)
+  // Memory form: the source operand is read from slot T1, not a register.
+#define TestInsertpsXmmAddrImm(Dst, Value0, Value1, Imm, Expected)             \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", " #Imm ", " #Expected ")"; \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+    /* Emit: load Dst (T0), then insertps Dst, [T1], Imm. */                   \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ insertps(IceType_v4f32, XmmRegister::Encoded_Reg_##Dst,                 \
+                dwordAddress(T1), Immediate(Imm));                             \
+    /* Fill the operand slots and execute the generated code. */               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+    /* Expected is a parenthesized Dqword constructor argument list. */        \
+    ASSERT_EQ(Dqword Expected, test.Dst<Dqword>()) << TestString;              \
+    reset();                                                                   \
+  } while (0)
+  // Reg form expects 0xDDDDDDDD inserted; mem form expects 0xBBBBBBBB.
+#define TestInsertps(Dst, Src)                                                 \
+  do {                                                                         \
+    TestInsertpsXmmXmmImm(                                                     \
+        Dst, (uint64_t(-1), uint64_t(-1)), Src,                                \
+        (uint64_t(0xAAAAAAAABBBBBBBBull), uint64_t(0xCCCCCCCCDDDDDDDDull)),    \
+        0x99,                                                                  \
+        (uint64_t(0xDDDDDDDD00000000ull), uint64_t(0x00000000FFFFFFFFull)));   \
+    TestInsertpsXmmAddrImm(                                                    \
+        Dst, (uint64_t(-1), uint64_t(-1)),                                     \
+        (uint64_t(0xAAAAAAAABBBBBBBBull), uint64_t(0xCCCCCCCCDDDDDDDDull)),    \
+        0x99,                                                                  \
+        (uint64_t(0xBBBBBBBB00000000ull), uint64_t(0x00000000FFFFFFFFull)));   \
+    TestInsertpsXmmXmmImm(                                                     \
+        Dst, (uint64_t(-1), uint64_t(-1)), Src,                                \
+        (uint64_t(0xAAAAAAAABBBBBBBBull), uint64_t(0xCCCCCCCCDDDDDDDDull)),    \
+        0x9D,                                                                  \
+        (uint64_t(0xDDDDDDDD00000000ull), uint64_t(0x0000000000000000ull)));   \
+    TestInsertpsXmmAddrImm(                                                    \
+        Dst, (uint64_t(-1), uint64_t(-1)),                                     \
+        (uint64_t(0xAAAAAAAABBBBBBBBull), uint64_t(0xCCCCCCCCDDDDDDDDull)),    \
+        0x9D,                                                                  \
+        (uint64_t(0xBBBBBBBB00000000ull), uint64_t(0x0000000000000000ull)));   \
+  } while (0)
+  // xmm0-xmm7 each appear once as Dst and once as Src.
+  TestInsertps(xmm0, xmm1);
+  TestInsertps(xmm1, xmm2);
+  TestInsertps(xmm2, xmm3);
+  TestInsertps(xmm3, xmm4);
+  TestInsertps(xmm4, xmm5);
+  TestInsertps(xmm5, xmm6);
+  TestInsertps(xmm6, xmm7);
+  TestInsertps(xmm7, xmm0);
+
+#undef TestInsertps
+#undef TestInsertpsXmmAddrImm
+#undef TestInsertpsXmmXmmImm
+}
+
+TEST_F(AssemblerX8632Test, Pinsr) { // Checks pinsr for GPR and mem sources.
+  static constexpr uint8_t Mask32 = 0x03;
+  static constexpr uint8_t Mask16 = 0x07;
+  static constexpr uint8_t Mask8 = 0x0F;
+  // Mask<Size> limits the immediate to a valid lane index for that lane width.
+#define TestPinsrXmmGPRImm(Dst, Value0, GPR, Value1, Imm, Size)                \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #GPR ", " #Value1 ", " #Imm ", " #Size ")"; \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    /* Emit: load Dst (T0), set GPR = Value1, then pinsr Dst, GPR, Imm. */     \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##GPR, Immediate(Value1));    \
+    __ pinsr(IceType_i##Size, XmmRegister::Encoded_Reg_##Dst,                  \
+             GPRRegister::Encoded_Reg_##GPR, Immediate(Imm));                  \
+    /* Fill the operand slot and execute the generated code. */                \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.run();                                                                \
+    /* Only the selected lane of Dst should change, to Value1. */              \
+    constexpr uint8_t sel = (Imm)&Mask##Size;                                  \
+    Dqword Expected = V0;                                                      \
+    Expected.U##Size[sel] = Value1;                                            \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+  // Memory form: the inserted value is read from the dword slot T1.
+#define TestPinsrXmmAddrImm(Dst, Value0, Value1, Imm, Size)                    \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", " #Imm ", " #Size ")";     \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDword();                                       \
+    const uint32_t V1 = Value1;                                                \
+    /* Emit: load Dst (T0), then pinsr Dst, [T1], Imm. */                      \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ pinsr(IceType_i##Size, XmmRegister::Encoded_Reg_##Dst,                  \
+             dwordAddress(T1), Immediate(Imm));                                \
+    /* Fill the operand slots and execute the generated code. */               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDwordTo(T1, V1);                                                   \
+    test.run();                                                                \
+    /* Only the selected lane of Dst should change, to Value1. */              \
+    constexpr uint8_t sel = (Imm)&Mask##Size;                                  \
+    Dqword Expected = V0;                                                      \
+    Expected.U##Size[sel] = Value1;                                            \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+  // Runs both operand forms against one fixed initial Dst value.
+#define TestPinsrSize(Dst, GPR, Value1, Imm, Size)                             \
+  do {                                                                         \
+    TestPinsrXmmGPRImm(Dst, (uint64_t(0xAAAAAAAABBBBBBBBull),                  \
+                             uint64_t(0xFFFFFFFFDDDDDDDDull)),                 \
+                       GPR, Value1, Imm, Size);                                \
+    TestPinsrXmmAddrImm(Dst, (uint64_t(0xAAAAAAAABBBBBBBBull),                 \
+                              uint64_t(0xFFFFFFFFDDDDDDDDull)),                \
+                        Value1, Imm, Size);                                    \
+  } while (0)
+  // Lane 3 is targeted at each width; Value1 fits the lane being written.
+#define TestPinsr(Dst, GPR)                                                    \
+  do {                                                                         \
+    TestPinsrSize(Dst, GPR, 0xEE, 0x03, 8);                                    \
+    TestPinsrSize(Dst, GPR, 0xFFEE, 0x03, 16);                                 \
+    TestPinsrSize(Dst, GPR, 0xC0FFEE, 0x03, 32);                               \
+  } while (0)
+  // Pair each xmm destination with a GPR source.
+  TestPinsr(xmm0, eax);
+  TestPinsr(xmm1, ebx);
+  TestPinsr(xmm2, ecx);
+  TestPinsr(xmm3, edx);
+  TestPinsr(xmm4, esi);
+  TestPinsr(xmm5, edi);
+  TestPinsr(xmm6, eax);
+  TestPinsr(xmm7, ebx);
+
+#undef TestPinsr
+#undef TestPinsrSize
+#undef TestPinsrXmmAddrImm
+#undef TestPinsrXmmGPRImm
+}
+
+TEST_F(AssemblerX8632Test, Pextr) { // Checks pextr from an xmm lane to a GPR.
+  static constexpr uint8_t Mask32 = 0x03;
+  static constexpr uint8_t Mask16 = 0x07;
+  static constexpr uint8_t Mask8 = 0x0F;
+  // Mask<Size> limits the immediate to a valid lane index for that lane width.
+#define TestPextrGPRXmmImm(GPR, Src, Value1, Imm, Size)                        \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #GPR ", " #Src ", " #Value1 ", " #Imm ", " #Size ")";              \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value1;                                                    \
+    /* Emit: load Src (T0), then pextr GPR, Src, Imm. */                       \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T0));               \
+    __ pextr(IceType_i##Size, GPRRegister::Encoded_Reg_##GPR,                  \
+             XmmRegister::Encoded_Reg_##Src, Immediate(Imm));                  \
+    /* Fill the operand slot and execute the generated code. */                \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.run();                                                                \
+    /* GPR must equal the selected lane of the source vector. */               \
+    constexpr uint8_t sel = (Imm)&Mask##Size;                                  \
+    ASSERT_EQ(V0.U##Size[sel], test.GPR()) << TestString;                      \
+    reset();                                                                   \
+  } while (0)
+  // NOTE: Value1 is accepted but never used; the source vector is fixed.
+#define TestPextrSize(GPR, Src, Value1, Imm, Size)                             \
+  do {                                                                         \
+    TestPextrGPRXmmImm(GPR, Src, (uint64_t(0xAAAAAAAABBBBBBBBull),             \
+                                  uint64_t(0xFFFFFFFFDDDDDDDDull)),            \
+                       Imm, Size);                                             \
+  } while (0)
+  // Extract lane 3 at each lane width (the first literal is ignored).
+#define TestPextr(GPR, Src)                                                    \
+  do {                                                                         \
+    TestPextrSize(GPR, Src, 0xEE, 0x03, 8);                                    \
+    TestPextrSize(GPR, Src, 0xFFEE, 0x03, 16);                                 \
+    TestPextrSize(GPR, Src, 0xC0FFEE, 0x03, 32);                               \
+  } while (0)
+  // Pair each GPR destination with an xmm source.
+  TestPextr(eax, xmm0);
+  TestPextr(ebx, xmm1);
+  TestPextr(ecx, xmm2);
+  TestPextr(edx, xmm3);
+  TestPextr(esi, xmm4);
+  TestPextr(edi, xmm5);
+  TestPextr(eax, xmm6);
+  TestPextr(ebx, xmm7);
+
+#undef TestPextr
+#undef TestPextrSize
+#undef TestPextrGPRXmmImm
+}
+
+TEST_F(AssemblerX8632Test, Pcmpeq_Pcmpgt) { // Lane-wise integer compares.
+#define TestPcmpXmmXmm(Dst, Value0, Src, Value1, Size, Inst, Op)               \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Size ", " #Op ")";  \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+    /* Emit: load Dst (T0) and Src (T1), then Inst Dst, Src. */                \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ Inst(IceType_i##Size, XmmRegister::Encoded_Reg_##Dst,                   \
+            XmmRegister::Encoded_Reg_##Src);                                   \
+    /* Fill the operand slots and execute the generated code. */               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+    /* Model the result: a lane is all ones where (Src Op Dst) holds. */       \
+    Dqword Expected(uint64_t(0), uint64_t(0));                                 \
+    static constexpr uint8_t ArraySize =                                       \
+        sizeof(Dqword) / sizeof(uint##Size##_t);                               \
+    for (uint8_t i = 0; i < ArraySize; ++i) {                                  \
+      Expected.I##Size[i] = (V1.I##Size[i] Op V0.I##Size[i]) ? -1 : 0;         \
+    }                                                                          \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+  // Memory form: the source operand is read from slot T1, not a register.
+#define TestPcmpXmmAddr(Dst, Value0, Value1, Size, Inst, Op)                   \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", " #Size ", " #Op ")";      \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+    /* Emit: load Dst (T0), then Inst Dst, [T1]. */                            \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ Inst(IceType_i##Size, XmmRegister::Encoded_Reg_##Dst,                   \
+            dwordAddress(T1));                                                 \
+    /* Fill the operand slots and execute the generated code. */               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+    /* Model the result: a lane is all ones where (Src Op Dst) holds. */       \
+    Dqword Expected(uint64_t(0), uint64_t(0));                                 \
+    static constexpr uint8_t ArraySize =                                       \
+        sizeof(Dqword) / sizeof(uint##Size##_t);                               \
+    for (uint8_t i = 0; i < ArraySize; ++i) {                                  \
+      Expected.I##Size[i] = (V1.I##Size[i] Op V0.I##Size[i]) ? -1 : 0;         \
+    }                                                                          \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+  // pcmpgt is modelled with '<' because Expected evaluates (Src Op Dst).
+#define TestPcmpValues(Dst, Value0, Src, Value1, Size)                         \
+  do {                                                                         \
+    TestPcmpXmmXmm(Dst, Value0, Src, Value1, Size, pcmpeq, == );               \
+    TestPcmpXmmAddr(Dst, Value0, Value1, Size, pcmpeq, == );                   \
+    TestPcmpXmmXmm(Dst, Value0, Src, Value1, Size, pcmpgt, < );                \
+    TestPcmpXmmAddr(Dst, Value0, Value1, Size, pcmpgt, < );                    \
+  } while (0)
+  // Two Dst/Src value pairs are compared at the requested lane width.
+#define TestPcmpSize(Dst, Src, Size)                                           \
+  do {                                                                         \
+    TestPcmpValues(Dst, (uint64_t(0x8888888888888888ull),                      \
+                         uint64_t(0x0000000000000000ull)),                     \
+                   Src, (uint64_t(0x0000008800008800ull),                      \
+                         uint64_t(0xFFFFFFFFFFFFFFFFull)),                     \
+                   Size);                                                      \
+    TestPcmpValues(Dst, (uint64_t(0x123567ABAB55DE01ull),                      \
+                         uint64_t(0x12345abcde12345Aull)),                     \
+                   Src, (uint64_t(0x0000008800008800ull),                      \
+                         uint64_t(0xAABBCCDD1234321Aull)),                     \
+                   Size);                                                      \
+  } while (0)
+  // Exercise the 8-, 16- and 32-bit lane widths on the given registers.
+#define TestPcmp(Dst, Src)                                                     \
+  do {                                                                         \
+    TestPcmpSize(Dst, Src, 8);                                                 \
+    TestPcmpSize(Dst, Src, 16);                                                \
+    TestPcmpSize(Dst, Src, 32);                                                \
+  } while (0)
+  // xmm0-xmm7 each appear once as Dst and once as Src.
+  TestPcmp(xmm0, xmm1);
+  TestPcmp(xmm1, xmm2);
+  TestPcmp(xmm2, xmm3);
+  TestPcmp(xmm3, xmm4);
+  TestPcmp(xmm4, xmm5);
+  TestPcmp(xmm5, xmm6);
+  TestPcmp(xmm6, xmm7);
+  TestPcmp(xmm7, xmm0);
+
+#undef TestPcmp
+#undef TestPcmpSize
+#undef TestPcmpValues
+#undef TestPcmpXmmAddr
+#undef TestPcmpXmmXmm
+}
+
+TEST_F(AssemblerX8632Test, Roundsd) { // Checks roundsd in four rounding modes.
+#define TestRoundsdXmmXmm(Dst, Src, Mode, Input, RN)                           \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Src ", " #Mode ", " #Input ", " #RN ")";                \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0(-3.0, -3.0);                                               \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1(double(Input), -123.4);                                    \
+    /* Emit: load Dst (T0) and Src (T1), then roundsd Dst, Src, Mode. */       \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ roundsd(XmmRegister::Encoded_Reg_##Dst, XmmRegister::Encoded_Reg_##Src, \
+               AssemblerX8632::k##Mode);                                       \
+    /* Fill the operand slots and execute the generated code. */               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+    /* First double becomes RN; the second keeps Dst's initial -3.0. */        \
+    const Dqword Expected(double(RN), -3.0);                                   \
+    EXPECT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+  // Inputs 5.49 and 5.51 sit on either side of the round-to-nearest midpoint.
+#define TestRoundsd(Dst, Src)                                                  \
+  do {                                                                         \
+    TestRoundsdXmmXmm(Dst, Src, RoundToNearest, 5.51, 6);                      \
+    TestRoundsdXmmXmm(Dst, Src, RoundToNearest, 5.49, 5);                      \
+    TestRoundsdXmmXmm(Dst, Src, RoundDown, 5.51, 5);                           \
+    TestRoundsdXmmXmm(Dst, Src, RoundUp, 5.49, 6);                             \
+    TestRoundsdXmmXmm(Dst, Src, RoundToZero, 5.49, 5);                         \
+    TestRoundsdXmmXmm(Dst, Src, RoundToZero, 5.51, 5);                         \
+  } while (0)
+  // xmm0-xmm7 each appear once as Dst and once as Src.
+  TestRoundsd(xmm0, xmm1);
+  TestRoundsd(xmm1, xmm2);
+  TestRoundsd(xmm2, xmm3);
+  TestRoundsd(xmm3, xmm4);
+  TestRoundsd(xmm4, xmm5);
+  TestRoundsd(xmm5, xmm6);
+  TestRoundsd(xmm6, xmm7);
+  TestRoundsd(xmm7, xmm0);
+
+#undef TestRoundsd
+#undef TestRoundsdXmmXmm
+}
+
+TEST_F(AssemblerX8632Test, Set1ps) { // Checks set1ps fills all four lanes.
+#define TestImpl(Xmm, Src, Imm)                                                \
+  do {                                                                         \
+    __ set1ps(XmmRegister::Encoded_Reg_##Xmm, GPRRegister::Encoded_Reg_##Src,  \
+              Immediate(Imm));                                                 \
+    /* No data slots are needed; the value comes from the immediate. */        \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    /* Every 32-bit element of Xmm must equal Imm. */                          \
+    const Dqword Expected((uint64_t(Imm) << 32) | uint32_t(Imm),               \
+                          (uint64_t(Imm) << 32) | uint32_t(Imm));              \
+    ASSERT_EQ(Expected, test.Xmm<Dqword>())                                    \
+        << "(" #Xmm ", " #Src ", " #Imm ")";                                   \
+    reset();                                                                   \
+  } while (0)
+  // NOTE(review): Src looks like a scratch GPR for set1ps -- confirm.
+  TestImpl(xmm0, ebx, 1);
+  TestImpl(xmm1, ecx, 2);
+  TestImpl(xmm2, edx, 3);
+  TestImpl(xmm3, esi, 4);
+  TestImpl(xmm4, edi, 5);
+  TestImpl(xmm5, eax, 6);
+  TestImpl(xmm6, ebx, 7);
+  TestImpl(xmm7, ecx, 8);
+
+#undef TestImpl
+}
+
+} // end of anonymous namespace
+} // end of namespace Test
+} // end of namespace X8632
+} // end of namespace Ice
diff --git a/unittest/AssemblerX8664/ControlFlow.cpp b/unittest/AssemblerX8664/ControlFlow.cpp
new file mode 100644
index 0000000..704f6fd
--- /dev/null
+++ b/unittest/AssemblerX8664/ControlFlow.cpp
@@ -0,0 +1,307 @@
+//===- subzero/unittest/AssemblerX8664/ControlFlow.cpp --------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AssemblerX8664/TestUtil.h"
+
+namespace Ice {
+namespace X8664 {
+namespace Test {
+namespace {
+
+TEST_F(AssemblerX8664Test, J) {
+// CheckJ loads Value0/Value1 into Src0/Src1, compares them and emits j<C>
+// around a mov that would replace Dest's 0xBEEF marker with 0xC0FFEE. TestImpl
+// picks operands for which condition C is expected to hold, so the marker must
+// survive.
+#define CheckJ(C, Near, Dest, Src0, Value0, Src1, Value1)                      \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #C ", " #Near ", " #Dest ", " #Src0 ", " #Value0 ", " #Src1        \
+        ", " #Value1 ")";                                                      \
+    const bool IsNear = AssemblerX8664::k##Near##Jump;                         \
+    Label Taken;                                                               \
+                                                                               \
+    __ mov(IceType_i32, Encoded_GPR_##Src0(), Immediate(Value0));              \
+    __ mov(IceType_i32, Encoded_GPR_##Src1(), Immediate(Value1));              \
+    __ mov(IceType_i32, Encoded_GPR_##Dest(), Immediate(0xBEEF));              \
+    __ cmp(IceType_i32, Encoded_GPR_##Src0(), Encoded_GPR_##Src1());           \
+    __ j(Cond::Br_##C, &Taken, IsNear);                                        \
+    __ mov(IceType_i32, Encoded_GPR_##Dest(), Immediate(0xC0FFEE));            \
+    __ bind(&Taken);                                                           \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    ASSERT_EQ(Value0, test.Src0()) << TestString;                              \
+    ASSERT_EQ(Value1, test.Src1()) << TestString;                              \
+    ASSERT_EQ(0xBEEFul, test.Dest()) << TestString;                            \
+    reset();                                                                   \
+  } while (0)
+
+// Exercises condition C with both the near and the far jump flag.
+#define CheckJBoth(C, Dst, Src0, Value0, Src1, Value1)                         \
+  do {                                                                         \
+    CheckJ(C, Near, Dst, Src0, Value0, Src1, Value1);                          \
+    CheckJ(C, Far, Dst, Src0, Value0, Src1, Value1);                           \
+  } while (0)
+
+// Runs every condition code for one (Dst, Src0, Src1) register triple, in the
+// order o, no, b, ae, e, ne, be, a, s, ns, p, np, l, ge, le, g.
+#define TestImpl(Dst, Src0, Src1)                                              \
+  do {                                                                         \
+    CheckJBoth(o, Dst, Src0, 0x80000000ul, Src1, 0x1ul);                       \
+    CheckJBoth(no, Dst, Src0, 0x1ul, Src1, 0x1ul);                             \
+    CheckJBoth(b, Dst, Src0, 0x1ul, Src1, 0x80000000ul);                       \
+    CheckJBoth(ae, Dst, Src0, 0x80000000ul, Src1, 0x1ul);                      \
+    CheckJBoth(e, Dst, Src0, 0x80000000ul, Src1, 0x80000000ul);                \
+    CheckJBoth(ne, Dst, Src0, 0x80000000ul, Src1, 0x1ul);                      \
+    CheckJBoth(be, Dst, Src0, 0x1ul, Src1, 0x80000000ul);                      \
+    CheckJBoth(a, Dst, Src0, 0x80000000ul, Src1, 0x1ul);                       \
+    CheckJBoth(s, Dst, Src0, 0x1ul, Src1, 0x80000000ul);                       \
+    CheckJBoth(ns, Dst, Src0, 0x80000000ul, Src1, 0x1ul);                      \
+    CheckJBoth(p, Dst, Src0, 0x80000000ul, Src1, 0x1ul);                       \
+    CheckJBoth(np, Dst, Src0, 0x1ul, Src1, 0x80000000ul);                      \
+    CheckJBoth(l, Dst, Src0, 0x80000000ul, Src1, 0x1ul);                       \
+    CheckJBoth(ge, Dst, Src0, 0x1ul, Src1, 0x80000000ul);                      \
+    CheckJBoth(le, Dst, Src0, 0x80000000ul, Src1, 0x1ul);                      \
+    CheckJBoth(g, Dst, Src0, 0x1ul, Src1, 0x80000000ul);                       \
+  } while (0)
+
+  TestImpl(r1, r2, r3);
+  TestImpl(r2, r3, r4);
+  TestImpl(r3, r4, r5);
+  TestImpl(r4, r5, r6);
+  TestImpl(r5, r6, r7);
+  TestImpl(r6, r7, r8);
+  TestImpl(r7, r8, r10);
+  TestImpl(r8, r10, r11);
+  TestImpl(r10, r11, r12);
+  TestImpl(r11, r12, r13);
+  TestImpl(r12, r13, r14);
+  TestImpl(r13, r14, r15);
+  TestImpl(r14, r15, r1);
+  TestImpl(r15, r1, r2);
+
+#undef TestImpl
+#undef CheckJBoth
+#undef CheckJ
+}
+
+TEST_F(AssemblerX8664Test, CallImm) {
+  __ call(Immediate(16)); // Presumably targets the mov past the hlt padding.
+  __ hlt();               // Twelve hlt instructions sit between the call and
+  __ hlt();               // the mov; NOTE(review): presumably the run aborts
+  __ hlt();               // if any of them is ever reached -- confirm.
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ hlt();
+  __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(0xf00f));
+  __ popl(GPRRegister::Encoded_Reg_ebx); // Pop what the call left on the stack.
+
+  AssembledTest test = assemble();
+
+  test.run();
+
+  EXPECT_EQ(0xF00Fu, test.eax()); // eax is only written by the mov above.
+}
+
+TEST_F(AssemblerX8664Test, CallReg) {
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    __ call(Immediate(16)); /* presumably lands at CallTargetAddr */           \
+    int CallTargetAddr = codeBytesSize() + 12; /* 12 bytes past the call */    \
+    __ popl(Encoded_GPR_##Dst()); /* Dst = return address of call(Src) */      \
+    __ pushl(Encoded_GPR_##Dst()); /* restore it so ret can use it */          \
+    __ ret();                                                                  \
+    for (int I = codeBytesSize(); I < CallTargetAddr; ++I) { /* hlt padding */ \
+      __ hlt();                                                                \
+    }                                                                          \
+    __ popl(Encoded_GPR_##Src()); /* Src = return address of call(16) */       \
+    __ call(Encoded_GPR_##Src()); /* runs the popl/pushl/ret stub */           \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+                                                                               \
+    test.run();                                                                \
+    /* Dst - Src covers the 12 bytes plus popl(Src) and call(Src). */          \
+    ASSERT_LE(15u, test.Dst() - test.Src()) << "(" #Dst ", " #Src ")";         \
+    reset();                                                                   \
+  } while (0)
+
+  TestImpl(r1, r2);
+  TestImpl(r2, r3);
+  TestImpl(r3, r4);
+  TestImpl(r4, r5);
+  TestImpl(r5, r6);
+  TestImpl(r6, r7);
+  TestImpl(r7, r8);
+  TestImpl(r8, r10);
+  TestImpl(r10, r11);
+  TestImpl(r11, r12);
+  TestImpl(r12, r13);
+  TestImpl(r13, r14);
+  TestImpl(r14, r15);
+  TestImpl(r15, r1);
+
+#undef TestImpl
+}
+
+TEST_F(AssemblerX8664Test, CallAddr) {
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    const uint32_t T0 = allocateQword();                                       \
+    const uint64_t V0 = 0xA0C0FFEEBEEFFEEFull;                                 \
+    __ call(Immediate(16)); /* presumably lands at CallTargetAddr */           \
+    int CallTargetAddr = codeBytesSize() + 12; /* 12 bytes past the call */    \
+    __ mov(IceType_i8, Encoded_GPR_##Dst##l(), Immediate(0xf4)); /* callee */  \
+    __ ret();                                                                  \
+    for (int I = codeBytesSize(); I < CallTargetAddr; ++I) {                   \
+      __ hlt();                                                                \
+    }                                                                          \
+    __ mov(IceType_i64, Encoded_GPR_##Dst##q(), dwordAddress(T0)); /* = V0 */  \
+    __ mov(IceType_i64, Encoded_GPR_##Src##q(), Encoded_GPR_rsp());            \
+    __ call(Address(Encoded_GPR_##Src##q(), 0)); /* call via memory */         \
+    __ popl(Encoded_GPR_##Src##q()); /* drop call(16)'s return addr */         \
+    /* Src was rsp: the indirect call reads call(16)'s return address. */      \
+    AssembledTest test = assemble();                                           \
+    test.setQwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+    /* The callee only rewrote Dst's low byte of V0 with 0xf4. */              \
+    ASSERT_EQ(0xA0C0FFEEBEEFFEF4ull, test.Dst##q()) << "(" #Dst ", " #Src ")"; \
+    reset();                                                                   \
+  } while (0)
+
+  TestImpl(r1, r2);
+  TestImpl(r2, r3);
+  TestImpl(r3, r4);
+  TestImpl(r4, r5);
+  TestImpl(r5, r6);
+  TestImpl(r6, r7);
+  TestImpl(r7, r8);
+  TestImpl(r8, r10);
+  TestImpl(r10, r11);
+  TestImpl(r11, r12);
+  TestImpl(r12, r13);
+  TestImpl(r13, r14);
+  TestImpl(r14, r15);
+  TestImpl(r15, r1);
+
+#undef TestImpl
+}
+
+TEST_F(AssemblerX8664Test, Jmp) {
+// TestImplReg uses jmp(Label), so jmp(Label) needs to be tested before it.
+#define TestImplAddr(Near)                                                     \
+  do {                                                                         \
+    Label ForwardJmp;                                                          \
+    Label BackwardJmp;                                                         \
+    Label Done;                                                                \
+    /* Expected path: ForwardJmp, then BackwardJmp, then Done. */              \
+    __ jmp(&ForwardJmp, AssemblerX8664::k##Near##Jump); /* hop 1: fwd */       \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ bind(&BackwardJmp);                                                     \
+    __ jmp(&Done, AssemblerX8664::k##Near##Jump); /* hop 3: fwd */             \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ bind(&ForwardJmp);                                                      \
+    __ jmp(&BackwardJmp, AssemblerX8664::k##Near##Jump); /* hop 2: back */     \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ bind(&Done);                                                            \
+  } while (0)
+
+#define TestImplReg(Dst)                                                       \
+  do {                                                                         \
+    __ call(Immediate(16)); /* presumably lands on the popl below */           \
+    Label Done;                                                                \
+    __ jmp(&Done, AssemblerX8664::kNearJump);                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ popl(Encoded_GPR_##Dst()); /* Dst = call's return address */            \
+    __ jmp(Encoded_GPR_##Dst()); /* back to the jmp(&Done) */                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ hlt();                                                                  \
+    __ bind(&Done);                                                            \
+    /* Path: call -> popl -> jmp(Dst) -> jmp(&Done) -> Done. */                \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+  TestImplAddr(Near);
+  TestImplAddr(Far);
+  // TestImplAddr only emits code; the first TestImplReg run executes it.
+  TestImplReg(r1);
+  TestImplReg(r2);
+  TestImplReg(r3);
+  TestImplReg(r4);
+  TestImplReg(r5);
+  TestImplReg(r6);
+  TestImplReg(r7);
+  TestImplReg(r8);
+  TestImplReg(r10);
+  TestImplReg(r11);
+  TestImplReg(r12);
+  TestImplReg(r13);
+  TestImplReg(r14);
+  TestImplReg(r15);
+
+#undef TestImplReg
+#undef TestImplAddr
+}
+
+} // end of anonymous namespace
+} // end of namespace Test
+} // end of namespace X8664
+} // end of namespace Ice
diff --git a/unittest/AssemblerX8664/DataMov.cpp b/unittest/AssemblerX8664/DataMov.cpp
new file mode 100644
index 0000000..0610b45
--- /dev/null
+++ b/unittest/AssemblerX8664/DataMov.cpp
@@ -0,0 +1,1314 @@
+//===- subzero/unittest/AssemblerX8664/DataMov.cpp ------------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AssemblerX8664/TestUtil.h"
+
+namespace Ice {
+namespace X8664 {
+namespace Test {
+namespace {
+
+TEST_F(AssemblerX8664Test, MovRegImm) {
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+  static constexpr uint32_t Mask16 = Mask32 >> 16;
+  static constexpr uint32_t Mask8 = Mask32 >> 24;
+
+#define CheckMovImm(R, Sfx, Bits)                                              \
+  do {                                                                         \
+    static constexpr char Tag[] = "(" #R ", " #Bits ")";                       \
+    static constexpr uint32_t Marker = 0xBEEFFEEB;                             \
+    static constexpr uint32_t Value = Mask##Bits & 0xABCD7645;                 \
+    __ mov(IceType_i32, Encoded_GPR_##R##q(), Immediate(Marker));              \
+    __ mov(IceType_i##Bits, Encoded_GPR_##R##Sfx(), Immediate(Value));         \
+    /* Only the low Bits bits of the pre-filled register may change. */        \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Value, test.R##Sfx()) << Tag;                                    \
+    ASSERT_EQ((Marker & ~Mask##Bits) | Value, test.R##d()) << Tag;             \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImpl(Reg)                                                          \
+  do {                                                                         \
+    CheckMovImm(Reg, l, 8);                                                    \
+    CheckMovImm(Reg, w, 16);                                                   \
+    CheckMovImm(Reg, d, 32);                                                   \
+    /* MovRegImm64 not implemented */                                          \
+  } while (0)
+
+  TestImpl(r1);
+  TestImpl(r2);
+  TestImpl(r3);
+  TestImpl(r4);
+  TestImpl(r5);
+  TestImpl(r6);
+  TestImpl(r7);
+  TestImpl(r8);
+  TestImpl(r10);
+  TestImpl(r11);
+  TestImpl(r12);
+  TestImpl(r13);
+  TestImpl(r14);
+  TestImpl(r15);
+
+#undef TestImpl
+#undef CheckMovImm
+}
+
+TEST_F(AssemblerX8664Test, MovMemImm) {
+  const uint32_t T0 = allocateDword();
+  constexpr uint32_t ExpectedT0 = 0x00111100ul;
+  const uint32_t T1 = allocateDword();
+  constexpr uint32_t ExpectedT1 = 0x00222200ul;
+  const uint32_t T2 = allocateDword();
+  constexpr uint32_t ExpectedT2 = 0x00333333ul; // Non-zero low byte on purpose.
+  const uint32_t T3 = allocateDword();
+  constexpr uint32_t ExpectedT3 = 0x00444400ul;
+
+  __ mov(IceType_i32, dwordAddress(T0), Immediate(ExpectedT0));
+  __ mov(IceType_i16, dwordAddress(T1), Immediate(ExpectedT1)); // low 16 bits
+  __ mov(IceType_i8, dwordAddress(T2), Immediate(ExpectedT2)); // low byte only
+  __ mov(IceType_i32, dwordAddress(T3), Immediate(ExpectedT3));
+
+  AssembledTest test = assemble();
+  test.run();
+  EXPECT_EQ(0ul, test.eax()); // Stores must leave the registers untouched.
+  EXPECT_EQ(0ul, test.ebx());
+  EXPECT_EQ(0ul, test.ecx());
+  EXPECT_EQ(0ul, test.edx());
+  EXPECT_EQ(0ul, test.edi());
+  EXPECT_EQ(0ul, test.esi());
+  EXPECT_EQ(ExpectedT0, test.contentsOfDword(T0));
+  EXPECT_EQ(ExpectedT1 & 0xFFFF, test.contentsOfDword(T1));
+  EXPECT_EQ(ExpectedT2 & 0xFF, test.contentsOfDword(T2));
+  EXPECT_EQ(ExpectedT3, test.contentsOfDword(T3));
+}
+
+TEST_F(AssemblerX8664Test, MovMemReg) {
+  static constexpr uint64_t Mask64 = ~uint64_t(0);
+  static constexpr uint64_t Mask32 = Mask64 >> 32;
+  static constexpr uint64_t Mask16 = Mask64 >> 48;
+  static constexpr uint64_t Mask8 = Mask64 >> 56;
+
+#define CheckStore(Reg, Bits)                                                  \
+  do {                                                                         \
+    static constexpr char Tag[] = "(" #Reg ", " #Bits ")";                     \
+    static constexpr uint64_t Marker = 0xD0DA33EEBEEFFEEB;                     \
+    static constexpr uint32_t Value = Mask##Bits & 0x1a4d567e;                 \
+    const uint32_t T0 = allocateQword();                                       \
+    /* Load Value into the register, then store its low Bits bits to T0. */    \
+    __ mov(IceType_i32, Encoded_GPR_##Reg(), Immediate(Value));                \
+    __ mov(IceType_i##Bits, dwordAddress(T0), Encoded_GPR_##Reg());            \
+    /* T0 is pre-filled with Marker so untouched bytes can be detected. */     \
+    AssembledTest test = assemble();                                           \
+    test.setQwordTo(T0, Marker);                                               \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ((Marker & ~Mask##Bits) | Value, test.contentsOfQword(T0))        \
+        << Tag;                                                                \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImpl(Src)                                                          \
+  do {                                                                         \
+    CheckStore(Src, 8);                                                        \
+    CheckStore(Src, 16);                                                       \
+    CheckStore(Src, 32);                                                       \
+    CheckStore(Src, 64);                                                       \
+  } while (0)
+
+  TestImpl(r1);
+  TestImpl(r2);
+  TestImpl(r3);
+  TestImpl(r4);
+  TestImpl(r5);
+  TestImpl(r6);
+  TestImpl(r7);
+  TestImpl(r8);
+  TestImpl(r10);
+  TestImpl(r11);
+  TestImpl(r12);
+  TestImpl(r13);
+  TestImpl(r14);
+  TestImpl(r15);
+
+#undef TestImpl
+#undef CheckStore
+}
+
+TEST_F(AssemblerX8664Test, MovRegReg) {
+  static constexpr uint64_t Mask8 = 0x00000000000000FFull;
+  static constexpr uint64_t Mask16 = 0x000000000000FFFFull;
+  static constexpr uint64_t Mask32 = 0x00000000FFFFFFFFull;
+  static constexpr uint64_t Mask64 = 0xFFFFFFFFFFFFFFFFull;
+  // MaskResultN: the Dst bits an N-bit mov is expected to overwrite.
+  static constexpr uint64_t MaskResult8 = 0x00000000000000FFull;
+  static constexpr uint64_t MaskResult16 = 0x000000000000FFFFull;
+  static constexpr uint64_t MaskResult32 = 0xFFFFFFFFFFFFFFFFull;
+  static constexpr uint64_t MaskResult64 = 0xFFFFFFFFFFFFFFFFull;
+
+#define TestRegReg(Dst, Src, Suffix, Size)                                     \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Src ", " #Suffix ", " #Size ")";                        \
+    const uint32_t T0 = allocateQword();                                       \
+    static constexpr uint64_t Value = 0xA4DD30Af86CCE321ull & Mask##Size;      \
+    const uint32_t T1 = allocateQword();                                       \
+    static constexpr uint64_t Marker = 0xC0FFEEA0BEEFFEEFull;                  \
+    /* Src <- Value, Dst <- Marker, then the sized mov under test. */          \
+    __ mov(IceType_i64, Encoded_GPR_##Src(), dwordAddress(T0));                \
+    __ mov(IceType_i64, Encoded_GPR_##Dst(), dwordAddress(T1));                \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst(), Encoded_GPR_##Src());         \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setQwordTo(T0, Value);                                                \
+    test.setQwordTo(T1, Marker);                                               \
+    test.run();                                                                \
+    /* 8/16-bit movs keep Dst's upper bits; 32-bit movs clear them. */         \
+    ASSERT_EQ((Marker & ~MaskResult##Size) | Value, test.Dst()) << TestString; \
+    ASSERT_EQ(Value, test.Dst##Suffix()) << TestString;                        \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestRegReg(Dst, Src, l, 8);                                                \
+    TestRegReg(Dst, Src, w, 16);                                               \
+    TestRegReg(Dst, Src, d, 32);                                               \
+    TestRegReg(Dst, Src, q, 64);                                               \
+  } while (0)
+
+  TestImpl(r1, r2);
+  TestImpl(r2, r3);
+  TestImpl(r3, r4);
+  TestImpl(r4, r5);
+  TestImpl(r5, r6);
+  TestImpl(r6, r7);
+  TestImpl(r7, r8);
+  TestImpl(r8, r10);
+  TestImpl(r10, r11);
+  TestImpl(r11, r12);
+  TestImpl(r12, r13);
+  TestImpl(r13, r14);
+  TestImpl(r14, r15);
+  TestImpl(r15, r1);
+
+#undef TestImpl
+#undef TestRegReg
+}
+
+TEST_F(AssemblerX8664Test, MovRegMem) {
+  static constexpr uint64_t Mask8 = 0x00000000000000FFull;
+  static constexpr uint64_t Mask16 = 0x000000000000FFFFull;
+  static constexpr uint64_t Mask32 = 0x00000000FFFFFFFFull;
+  static constexpr uint64_t Mask64 = 0xFFFFFFFFFFFFFFFFull;
+  // MaskResultN: the Dst bits expected to survive an N-bit load.
+  static constexpr uint64_t MaskResult8 = ~0x00000000000000FFull;
+  static constexpr uint64_t MaskResult16 = ~0x000000000000FFFFull;
+  static constexpr uint64_t MaskResult32 = ~0xFFFFFFFFFFFFFFFFull;
+  static constexpr uint64_t MaskResult64 = ~0xFFFFFFFFFFFFFFFFull;
+
+#define TestRegAddr(Dst, Suffix, Size)                                         \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", Addr, " #Suffix ", " #Size ")";                            \
+    const uint32_t T0 = allocateQword();                                       \
+    static constexpr uint64_t Value = 0xA4DD30Af86CCE321ull & Mask##Size;      \
+    const uint32_t T1 = allocateQword();                                       \
+    static constexpr uint64_t Marker = 0xC0FFEEA0BEEFFEEFull;                  \
+    /* Dst starts as Marker (T1); the sized load from T0 follows. */           \
+    __ mov(IceType_i64, Encoded_GPR_##Dst(), dwordAddress(T1));                \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst(), dwordAddress(T0));            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setQwordTo(T0, Value);                                                \
+    test.setQwordTo(T1, Marker);                                               \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ((Marker & MaskResult##Size) | Value, test.Dst()) << TestString;  \
+    ASSERT_EQ(Value, test.Dst##Suffix()) << TestString;                        \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImpl(Dst)                                                          \
+  do {                                                                         \
+    TestRegAddr(Dst, l, 8);                                                    \
+    TestRegAddr(Dst, w, 16);                                                   \
+    TestRegAddr(Dst, d, 32);                                                   \
+    TestRegAddr(Dst, q, 64);                                                   \
+  } while (0)
+
+  TestImpl(r1);
+  TestImpl(r2);
+  TestImpl(r3);
+  TestImpl(r4);
+  TestImpl(r5);
+  TestImpl(r6);
+  TestImpl(r7);
+  TestImpl(r8);
+  TestImpl(r10);
+  TestImpl(r11);
+  TestImpl(r12);
+  TestImpl(r13);
+  TestImpl(r14);
+  TestImpl(r15);
+
+#undef TestImpl
+#undef TestRegAddr
+}
+
+TEST_F(AssemblerX8664Test, Movzx) {
+  static constexpr uint64_t Mask8 = 0x000000FF;
+  static constexpr uint64_t Mask16 = 0x0000FFFF;
+  // The masks are 64-bit so that ~MaskN preserves the upper half of V0.
+#define TestImplRegReg(Dst, Src, Suffix, Size)                                 \
+  do {                                                                         \
+    const uint32_t T0 = allocateDqword();                                      \
+    static constexpr uint64_t V0 = 0xAAAAAAAAAAAAAAAAull;                      \
+    static constexpr uint32_t Value = (0xBEEF) & Mask##Size;                   \
+    __ mov(IceType_i64, Encoded_GPR_##Dst##q(), dwordAddress(T0)); /* = V0 */  \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src##Suffix(), Immediate(Value));    \
+    __ movzx(IceType_i##Size, Encoded_GPR_##Dst##d(),                          \
+             Encoded_GPR_##Src##Suffix());                                     \
+    AssembledTest test = assemble();                                           \
+    test.setQwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+    ASSERT_EQ(Value, test.Dst##q()) << "(" #Dst ", " #Src ", " #Size ")";      \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplRegAddr(Dst, Suffix, Size)                                     \
+  do {                                                                         \
+    const uint32_t T0 = allocateDqword();                                      \
+    static constexpr uint64_t V0 = 0xAAAAAAAAAAAAAAAAull;                      \
+    static constexpr uint32_t Value = (0xBEEF) & Mask##Size;                   \
+    __ movzx(IceType_i##Size, Encoded_GPR_##Dst##d(), dwordAddress(T0));       \
+    /* The source qword is V0 with only its low Size bits set to Value. */     \
+    AssembledTest test = assemble();                                           \
+    test.setQwordTo(T0, (V0 & ~Mask##Size) | Value);                           \
+    test.run();                                                                \
+    ASSERT_EQ(Value, test.Dst##q()) << "(" #Dst ", Addr, " #Size ")";          \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplRegReg(Dst, Src, l, 8);                                            \
+    TestImplRegAddr(Dst, l, 8);                                                \
+    TestImplRegReg(Dst, Src, w, 16);                                           \
+    TestImplRegAddr(Dst, w, 16);                                               \
+  } while (0)
+
+  TestImpl(r1, r2);
+  TestImpl(r2, r3);
+  TestImpl(r3, r4);
+  TestImpl(r4, r5);
+  TestImpl(r5, r6);
+  TestImpl(r6, r7);
+  TestImpl(r7, r8);
+  TestImpl(r8, r10);
+  TestImpl(r10, r11);
+  TestImpl(r11, r12);
+  TestImpl(r12, r13);
+  TestImpl(r13, r14);
+  TestImpl(r14, r15);
+  TestImpl(r15, r1);
+
+#undef TestImpl
+#undef TestImplRegAddr
+#undef TestImplRegReg
+}
+
+TEST_F(AssemblerX8664Test, Movsx) {
+  // Sign-extends 8-, 16- and 32-bit sources (register and memory operands)
+  // and checks the whole 64-bit destination register. Every source value is
+  // 0xC0BEBEEF truncated to the source width; its top bit is set at all three
+  // widths, so the expected result has every bit above the source width set.
+  static constexpr uint64_t Mask8 = 0x000000FF;
+  static constexpr uint64_t Mask16 = 0x0000FFFF;
+  static constexpr uint64_t Mask32 = 0xFFFFFFFF;
+
+  // Register source. Dst is first loaded with 0xAAAAAAAAAAAAAAAA from memory,
+  // Src receives the truncated value, and movsx writes Dst. Suffix selects
+  // the Src register accessor and must match Size (l = 8, w = 16, d = 32).
+#define TestImplRegReg(Dst, Src, Suffix, Size)                                 \
+  do {                                                                         \
+    const uint32_t T0 = allocateDqword();                                      \
+    static constexpr uint64_t V0 = 0xAAAAAAAAAAAAAAAAull;                      \
+    static constexpr uint64_t Value = (0xC0BEBEEF) & Mask##Size;               \
+    __ mov(IceType_i64, Encoded_GPR_##Dst##q(), dwordAddress(T0));             \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src##Suffix(), Immediate(Value));    \
+    __ movsx(IceType_i##Size, Encoded_GPR_##Dst##d(),                          \
+             Encoded_GPR_##Src##Suffix());                                     \
+    AssembledTest test = assemble();                                           \
+    test.setQwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+    ASSERT_EQ((uint64_t(-1) & ~Mask##Size) | Value, test.Dst##q())             \
+        << "(" #Dst ", " #Src ", " #Size ")";                                  \
+    reset();                                                                   \
+  } while (0)
+
+  // Memory source. The truncated value is stored at T0 and movsx reads it
+  // from there. Suffix is accepted for symmetry with TestImplRegReg but is
+  // not used by this macro.
+#define TestImplRegAddr(Dst, Suffix, Size)                                     \
+  do {                                                                         \
+    const uint32_t T0 = allocateDqword();                                      \
+    static constexpr uint64_t V0 = 0xC0BEBEEF & Mask##Size;                    \
+    static constexpr uint64_t Value = (0xC0BEBEEF) & Mask##Size;               \
+    __ movsx(IceType_i##Size, Encoded_GPR_##Dst##d(), dwordAddress(T0));       \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setQwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+    ASSERT_EQ((uint64_t(-1) & ~Mask##Size) | Value, test.Dst##q())             \
+        << "(" #Dst ", Addr, " #Size ")";                                      \
+    reset();                                                                   \
+  } while (0)
+
+  // Runs both operand forms at every supported source width.
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplRegReg(Dst, Src, l, 8);                                            \
+    TestImplRegAddr(Dst, l, 8);                                                \
+    TestImplRegReg(Dst, Src, w, 16);                                           \
+    TestImplRegAddr(Dst, w, 16);                                               \
+    TestImplRegReg(Dst, Src, d, 32);                                           \
+    TestImplRegAddr(Dst, d, 32);                                               \
+  } while (0)
+
+  TestImpl(r1, r2);
+  TestImpl(r2, r3);
+  TestImpl(r3, r4);
+  TestImpl(r4, r5);
+  TestImpl(r5, r6);
+  TestImpl(r6, r7);
+  TestImpl(r7, r8);
+  TestImpl(r8, r10);
+  TestImpl(r10, r11);
+  TestImpl(r11, r12);
+  TestImpl(r12, r13);
+  TestImpl(r13, r14);
+  TestImpl(r14, r15);
+  TestImpl(r15, r1);
+
+#undef TestImpl
+#undef TestImplRegAddr
+#undef TestImplRegReg
+}
+
+TEST_F(AssemblerX8664Test, Cmov) {
+  // Each case compares Src0 (holding Value0) against a second operand holding
+  // Value1, then conditionally moves that second operand into Dest, which
+  // starts out as Value0. IsTrue states whether condition C is expected to
+  // hold after the compare, i.e. whether Dest should end up as Value1.
+
+  // Second operand is the register Src1.
+#define TestRegReg(C, Dest, IsTrue, Src0, Value0, Src1, Value1)                \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #C ", " #Dest ", " #IsTrue ", " #Src0 ", " #Value0 ", " #Src1      \
+        ", " #Value1 ")";                                                      \
+    __ mov(IceType_i32, Encoded_GPR_##Src0(), Immediate(Value0));              \
+    __ mov(IceType_i32, Encoded_GPR_##Src1(), Immediate(Value1));              \
+    __ mov(IceType_i32, Encoded_GPR_##Dest(), Immediate(Value0));              \
+    __ cmp(IceType_i32, Encoded_GPR_##Src0(), Encoded_GPR_##Src1());           \
+    __ cmov(IceType_i32, Cond::Br_##C, Encoded_GPR_##Dest(),                   \
+            Encoded_GPR_##Src1());                                             \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    ASSERT_EQ((IsTrue) ? (Value1) : (Value0), test.Dest()) << TestString;      \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+  // Second operand is a dword in memory (T0) holding Value1.
+#define TestRegAddr(C, Dest, IsTrue, Src0, Value0, Value1)                     \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #C ", " #Dest ", " #IsTrue ", " #Src0 ", " #Value0                 \
+        ", Addr, " #Value1 ")";                                                \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = Value1;                                                \
+    __ mov(IceType_i32, Encoded_GPR_##Src0(), Immediate(Value0));              \
+    __ mov(IceType_i32, Encoded_GPR_##Dest(), Immediate(Value0));              \
+    __ cmp(IceType_i32, Encoded_GPR_##Src0(), dwordAddress(T0));               \
+    __ cmov(IceType_i32, Cond::Br_##C, Encoded_GPR_##Dest(),                   \
+            dwordAddress(T0));                                                 \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+    ASSERT_EQ((IsTrue) ? (Value1) : (Value0), test.Dest()) << TestString;      \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+  // Runs one (condition, operands) case through both operand forms.
+#define TestValue(C, Dest, IsTrue, Src0, Value0, Src1, Value1)                 \
+  do {                                                                         \
+    TestRegReg(C, Dest, IsTrue, Src0, Value0, Src1, Value1);                   \
+    TestRegAddr(C, Dest, IsTrue, Src0, Value0, Value1);                        \
+  } while (0)
+
+  // One taken and one not-taken case per condition code. The trailing `g`
+  // pair completes the set of sixteen conditions.
+#define TestImpl(Dest, Src0, Src1)                                             \
+  do {                                                                         \
+    TestValue(o, Dest, 1u, Src0, 0x80000000u, Src1, 0x1u);                     \
+    TestValue(o, Dest, 0u, Src0, 0x1u, Src1, 0x10000000u);                     \
+    TestValue(no, Dest, 1u, Src0, 0x1u, Src1, 0x10000000u);                    \
+    TestValue(no, Dest, 0u, Src0, 0x80000000u, Src1, 0x1u);                    \
+    TestValue(b, Dest, 1u, Src0, 0x1u, Src1, 0x80000000u);                     \
+    TestValue(b, Dest, 0u, Src0, 0x80000000u, Src1, 0x1u);                     \
+    TestValue(ae, Dest, 1u, Src0, 0x80000000u, Src1, 0x1u);                    \
+    TestValue(ae, Dest, 0u, Src0, 0x1u, Src1, 0x80000000u);                    \
+    TestValue(e, Dest, 1u, Src0, 0x1u, Src1, 0x1u);                            \
+    TestValue(e, Dest, 0u, Src0, 0x1u, Src1, 0x11111u);                        \
+    TestValue(ne, Dest, 1u, Src0, 0x80000000u, Src1, 0x1u);                    \
+    TestValue(ne, Dest, 0u, Src0, 0x1u, Src1, 0x1u);                           \
+    TestValue(be, Dest, 1u, Src0, 0x1u, Src1, 0x80000000u);                    \
+    TestValue(be, Dest, 0u, Src0, 0x80000000u, Src1, 0x1u);                    \
+    TestValue(a, Dest, 1u, Src0, 0x80000000u, Src1, 0x1u);                     \
+    TestValue(a, Dest, 0u, Src0, 0x1u, Src1, 0x80000000u);                     \
+    TestValue(s, Dest, 1u, Src0, 0x1u, Src1, 0x80000000u);                     \
+    TestValue(s, Dest, 0u, Src0, 0x80000000u, Src1, 0x1u);                     \
+    TestValue(ns, Dest, 1u, Src0, 0x80000000u, Src1, 0x1u);                    \
+    TestValue(ns, Dest, 0u, Src0, 0x1u, Src1, 0x80000000u);                    \
+    TestValue(p, Dest, 1u, Src0, 0x80000000u, Src1, 0x1u);                     \
+    TestValue(p, Dest, 0u, Src0, 0x1u, Src1, 0x80000000u);                     \
+    TestValue(np, Dest, 1u, Src0, 0x1u, Src1, 0x80000000u);                    \
+    TestValue(np, Dest, 0u, Src0, 0x80000000u, Src1, 0x1u);                    \
+    TestValue(l, Dest, 1u, Src0, 0x80000000u, Src1, 0x1u);                     \
+    TestValue(l, Dest, 0u, Src0, 0x1u, Src1, 0x80000000u);                     \
+    TestValue(ge, Dest, 1u, Src0, 0x1u, Src1, 0x80000000u);                    \
+    TestValue(ge, Dest, 0u, Src0, 0x80000000u, Src1, 0x1u);                    \
+    TestValue(le, Dest, 1u, Src0, 0x80000000u, Src1, 0x1u);                    \
+    TestValue(le, Dest, 0u, Src0, 0x1u, Src1, 0x80000000u);                    \
+    TestValue(g, Dest, 1u, Src0, 0x1u, Src1, 0x80000000u);                     \
+    TestValue(g, Dest, 0u, Src0, 0x80000000u, Src1, 0x1u);                     \
+  } while (0)
+
+  TestImpl(r1, r2, r3);
+
+#undef TestImpl
+#undef TestValue
+#undef TestRegAddr
+#undef TestRegReg
+}
+
+TEST_F(AssemblerX8664LowLevelTest, RepMovsb) {
+  // The emitted code must be exactly two bytes: the prefix byte 0xF3 followed
+  // by the opcode byte 0xA4.
+  static constexpr uint8_t ExpectedPrefix = 0xF3;
+  static constexpr uint8_t ExpectedOpcode = 0xA4;
+  static constexpr uint32_t ExpectedSize = 2;
+
+  __ rep_movsb();
+
+  ASSERT_EQ(ExpectedSize, codeBytesSize());
+  verifyBytes<ExpectedSize>(codeBytes(), ExpectedPrefix, ExpectedOpcode);
+}
+
+TEST_F(AssemblerX8664Test, MovssXmmAddr) {
+  // Loads a float (FloatLength == 32) or double (FloatLength == 64) from
+  // memory into an xmm register with movss and checks that the register then
+  // holds the value that was stored at that address.
+#define TestMovssXmmAddrFloatLength(FloatLength, Xmm, Value)                   \
+  do {                                                                         \
+    static_assert((FloatLength) == 32 || (FloatLength) == 64,                  \
+                  "Invalid fp length " #FloatLength);                          \
+    using Type = std::conditional<FloatLength == 32, float, double>::type;     \
+                                                                               \
+    static constexpr char TestString[] = "(" #FloatLength ", " #Xmm ")";       \
+    static constexpr bool IsDouble = std::is_same<Type, double>::value;        \
+    const uint32_t T0 = allocateQword();                                       \
+    const Type V0 = Value;                                                     \
+                                                                               \
+    __ movss(IceType_f##FloatLength, Encoded_Xmm_##Xmm(), dwordAddress(T0));   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    if (IsDouble) {                                                            \
+      test.setQwordTo(T0, static_cast<double>(V0));                            \
+    } else {                                                                   \
+      test.setDwordTo(T0, static_cast<float>(V0));                             \
+    }                                                                          \
+    test.run();                                                                \
+    ASSERT_DOUBLE_EQ(Value, test.Xmm<Type>()) << TestString << " value is "    \
+                                              << Value;                        \
+    reset();                                                                   \
+  } while (0)
+
+  // Runs the load test on every xmm register for a fixed set of values.
+#define TestMovssXmmAddr(FloatLength)                                          \
+  do {                                                                         \
+    using Type = std::conditional<FloatLength == 32, float, double>::type;     \
+    for (const Type Value : {0.0, -0.0, 1.0, -1.0, 3.14, 99999.9999}) {        \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm0, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm1, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm2, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm3, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm4, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm5, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm6, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm7, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm8, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm9, Value);                   \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm10, Value);                  \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm11, Value);                  \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm12, Value);                  \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm13, Value);                  \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm14, Value);                  \
+      TestMovssXmmAddrFloatLength(FloatLength, xmm15, Value);                  \
+    }                                                                          \
+  } while (0)
+
+  TestMovssXmmAddr(32);
+  TestMovssXmmAddr(64);
+
+#undef TestMovssXmmAddr
+#undef TestMovssXmmAddrFloatLength
+}
+
+TEST_F(AssemblerX8664Test, MovssAddrXmm) {
+  // Stores an xmm register to memory with movss. Xmm is loaded with Value from
+  // T0 and stored to T1, which the test pre-fills with a quiet NaN; T1 is then
+  // read back into Xmm, so the final check only passes if the store actually
+  // replaced the NaN with Value.
+#define TestMovssAddrXmmFloatLength(FloatLength, Xmm, Value)                   \
+  do {                                                                         \
+    static_assert((FloatLength) == 32 || (FloatLength) == 64,                  \
+                  "Invalid fp length " #FloatLength);                          \
+    using Type = std::conditional<FloatLength == 32, float, double>::type;     \
+                                                                               \
+    static constexpr char TestString[] = "(" #FloatLength ", " #Xmm ")";       \
+    static constexpr bool IsDouble = std::is_same<Type, double>::value;        \
+    const uint32_t T0 = allocateQword();                                       \
+    const Type V0 = Value;                                                     \
+    const uint32_t T1 = allocateQword();                                       \
+    static_assert(std::numeric_limits<Type>::has_quiet_NaN,                    \
+                  "f" #FloatLength " does not have quiet nan.");               \
+    const Type V1 = std::numeric_limits<Type>::quiet_NaN();                    \
+                                                                               \
+    __ movss(IceType_f##FloatLength, Encoded_Xmm_##Xmm(), dwordAddress(T0));   \
+    __ movss(IceType_f##FloatLength, dwordAddress(T1), Encoded_Xmm_##Xmm());   \
+    __ movss(IceType_f##FloatLength, Encoded_Xmm_##Xmm(), dwordAddress(T1));   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    if (IsDouble) {                                                            \
+      test.setQwordTo(T0, static_cast<double>(V0));                            \
+      test.setQwordTo(T1, static_cast<double>(V1));                            \
+    } else {                                                                   \
+      test.setDwordTo(T0, static_cast<float>(V0));                             \
+      test.setDwordTo(T1, static_cast<float>(V1));                             \
+    }                                                                          \
+    test.run();                                                                \
+    ASSERT_DOUBLE_EQ(Value, test.Xmm<Type>()) << TestString << " value is "    \
+                                              << Value;                        \
+    reset();                                                                   \
+  } while (0)
+
+  // Runs the store test on every xmm register for a fixed set of values.
+#define TestMovssAddrXmm(FloatLength)                                          \
+  do {                                                                         \
+    using Type = std::conditional<FloatLength == 32, float, double>::type;     \
+    for (const Type Value : {0.0, -0.0, 1.0, -1.0, 3.14, 99999.9999}) {        \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm0, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm1, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm2, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm3, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm4, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm5, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm6, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm7, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm8, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm9, Value);                   \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm10, Value);                  \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm11, Value);                  \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm12, Value);                  \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm13, Value);                  \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm14, Value);                  \
+      TestMovssAddrXmmFloatLength(FloatLength, xmm15, Value);                  \
+    }                                                                          \
+  } while (0)
+
+  TestMovssAddrXmm(32);
+  TestMovssAddrXmm(64);
+
+#undef TestMovssAddrXmm
+#undef TestMovssAddrXmmFloatLength
+}
+
+TEST_F(AssemblerX8664Test, MovssXmmXmm) {
+  // Register-to-register movss. Src is loaded with Value and Dst with a quiet
+  // NaN, then Src is moved into Dst; Dst must afterwards hold Value.
+#define TestMovssXmmXmmFloatLength(FloatLength, Src, Dst, Value)               \
+  do {                                                                         \
+    static_assert((FloatLength) == 32 || (FloatLength) == 64,                  \
+                  "Invalid fp length " #FloatLength);                          \
+    using Type = std::conditional<FloatLength == 32, float, double>::type;     \
+                                                                               \
+    static constexpr char TestString[] =                                       \
+        "(" #FloatLength ", " #Src ", " #Dst ")";                              \
+    static constexpr bool IsDouble = std::is_same<Type, double>::value;        \
+    const uint32_t T0 = allocateQword();                                       \
+    const Type V0 = Value;                                                     \
+    const uint32_t T1 = allocateQword();                                       \
+    static_assert(std::numeric_limits<Type>::has_quiet_NaN,                    \
+                  "f" #FloatLength " does not have quiet nan.");               \
+    const Type V1 = std::numeric_limits<Type>::quiet_NaN();                    \
+                                                                               \
+    __ movss(IceType_f##FloatLength, Encoded_Xmm_##Src(), dwordAddress(T0));   \
+    __ movss(IceType_f##FloatLength, Encoded_Xmm_##Dst(), dwordAddress(T1));   \
+    __ movss(IceType_f##FloatLength, Encoded_Xmm_##Dst(),                      \
+             Encoded_Xmm_##Src());                                             \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    if (IsDouble) {                                                            \
+      test.setQwordTo(T0, static_cast<double>(V0));                            \
+      test.setQwordTo(T1, static_cast<double>(V1));                            \
+    } else {                                                                   \
+      test.setDwordTo(T0, static_cast<float>(V0));                             \
+      test.setDwordTo(T1, static_cast<float>(V1));                             \
+    }                                                                          \
+    test.run();                                                                \
+    ASSERT_DOUBLE_EQ(Value, test.Dst<Type>()) << TestString << " value is "    \
+                                              << Value;                        \
+    reset();                                                                   \
+  } while (0)
+
+  // Walks a chain of (Src, Dst) pairs so every xmm register is used once as a
+  // source and once as a destination.
+#define TestMovssXmmXmm(FloatLength)                                           \
+  do {                                                                         \
+    using Type = std::conditional<FloatLength == 32, float, double>::type;     \
+    for (const Type Value : {0.0, -0.0, 1.0, -1.0, 3.14, 99999.9999}) {        \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm0, xmm1, Value);              \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm1, xmm2, Value);              \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm2, xmm3, Value);              \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm3, xmm4, Value);              \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm4, xmm5, Value);              \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm5, xmm6, Value);              \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm6, xmm7, Value);              \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm7, xmm8, Value);              \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm8, xmm9, Value);              \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm9, xmm10, Value);             \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm10, xmm11, Value);            \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm11, xmm12, Value);            \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm12, xmm13, Value);            \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm13, xmm14, Value);            \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm14, xmm15, Value);            \
+      TestMovssXmmXmmFloatLength(FloatLength, xmm15, xmm0, Value);             \
+    }                                                                          \
+  } while (0)
+
+  TestMovssXmmXmm(32);
+  TestMovssXmmXmm(64);
+
+#undef TestMovssXmmXmm
+#undef TestMovssXmmXmmFloatLength
+}
+
+TEST_F(AssemblerX8664Test, MovdToXmm) {
+  // movd into an xmm register from a 32-bit GPR or from a dword in memory.
+  // The destination is first loaded with a qword whose upper dword is all
+  // ones; afterwards the low 64 bits of the xmm register are expected to be
+  // exactly the 32-bit input.
+  static constexpr uint32_t Inputs[] = {0u, 1u, 0x7FFFFFFFu, 0x80000000u,
+                                        0xFFFFFFFFu};
+
+  // GPR source: Src receives Value through an immediate mov.
+#define TestMovdXmmReg(Src, Dst, Value)                                        \
+  do {                                                                         \
+    assert(((Value)&0xFFFFFFFF) == (Value));                                   \
+    static constexpr char TestString[] = "(" #Src ", " #Dst ")";               \
+    const uint64_t HighOnes = 0xFFFFFFFF00000000ull;                           \
+    const uint32_t Slot = allocateQword();                                     \
+                                                                               \
+    __ mov(IceType_i32, Encoded_GPR_##Src(), Immediate(Value));                \
+    __ movss(IceType_f64, Encoded_Xmm_##Dst(), dwordAddress(Slot));            \
+    __ movd(Encoded_Xmm_##Dst(), Encoded_GPR_##Src());                         \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setQwordTo(Slot, HighOnes);                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Value, test.Dst<uint64_t>()) << TestString << " value is "       \
+                                           << Value;                           \
+    reset();                                                                   \
+  } while (0)
+
+  // Memory source: Value is stored in its own slot and read by movd.
+#define TestMovdXmmAddr(Dst, Value)                                            \
+  do {                                                                         \
+    assert(((Value)&0xFFFFFFFF) == (Value));                                   \
+    static constexpr char TestString[] = "(" #Dst ", Addr)";                   \
+    const uint32_t ValueSlot = allocateQword();                                \
+    const uint32_t Input = Value;                                              \
+    const uint32_t FillSlot = allocateQword();                                 \
+    const uint64_t HighOnes = 0xFFFFFFFF00000000ull;                           \
+                                                                               \
+    __ movss(IceType_f64, Encoded_Xmm_##Dst(), dwordAddress(FillSlot));        \
+    __ movd(Encoded_Xmm_##Dst(), dwordAddress(ValueSlot));                     \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(ValueSlot, Input);                                         \
+    test.setQwordTo(FillSlot, HighOnes);                                       \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Value, test.Dst<uint64_t>()) << TestString << " value is "       \
+                                           << Value;                           \
+    reset();                                                                   \
+  } while (0)
+
+  // For one xmm destination: every GPR source, then the memory source, for
+  // each input value.
+#define TestMovd(Dst)                                                          \
+  do {                                                                         \
+    for (uint32_t Value : Inputs) {                                            \
+      TestMovdXmmReg(r1, Dst, Value);                                          \
+      TestMovdXmmReg(r2, Dst, Value);                                          \
+      TestMovdXmmReg(r3, Dst, Value);                                          \
+      TestMovdXmmReg(r4, Dst, Value);                                          \
+      TestMovdXmmReg(r5, Dst, Value);                                          \
+      TestMovdXmmReg(r6, Dst, Value);                                          \
+      TestMovdXmmReg(r7, Dst, Value);                                          \
+      TestMovdXmmReg(r8, Dst, Value);                                          \
+      TestMovdXmmReg(r10, Dst, Value);                                         \
+      TestMovdXmmReg(r11, Dst, Value);                                         \
+      TestMovdXmmReg(r12, Dst, Value);                                         \
+      TestMovdXmmReg(r13, Dst, Value);                                         \
+      TestMovdXmmReg(r14, Dst, Value);                                         \
+      TestMovdXmmReg(r15, Dst, Value);                                         \
+      TestMovdXmmAddr(Dst, Value);                                             \
+    }                                                                          \
+  } while (0)
+
+  TestMovd(xmm0);
+  TestMovd(xmm1);
+  TestMovd(xmm2);
+  TestMovd(xmm3);
+  TestMovd(xmm4);
+  TestMovd(xmm5);
+  TestMovd(xmm6);
+  TestMovd(xmm7);
+  TestMovd(xmm8);
+  TestMovd(xmm9);
+  TestMovd(xmm10);
+  TestMovd(xmm11);
+  TestMovd(xmm12);
+  TestMovd(xmm13);
+  TestMovd(xmm14);
+  TestMovd(xmm15);
+
+#undef TestMovd
+#undef TestMovdXmmAddr
+#undef TestMovdXmmReg
+}
+
+TEST_F(AssemblerX8664Test, MovdFromXmm) {
+  // movd out of an xmm register, into a 32-bit GPR or a dword in memory.
+  // In both macros T0 is qword-sized because the f64 load that fills the xmm
+  // source reads eight bytes starting at T0.
+
+  // GPR destination. The result is checked in the destination register
+  // itself; the full 64-bit register is compared, so the test also expects
+  // the bits above the 32-bit value to be zero.
+#define TestMovdRegXmm(Src, Dst, Value)                                        \
+  do {                                                                         \
+    assert(((Value)&0xFFFFFFFF) == (Value));                                   \
+    static constexpr char TestString[] = "(" #Src ", " #Dst ")";               \
+    const uint32_t T0 = allocateQword();                                       \
+    const uint32_t V0 = Value;                                                 \
+                                                                               \
+    __ movss(IceType_f64, Encoded_Xmm_##Src(), dwordAddress(T0));              \
+    __ movd(Encoded_GPR_##Dst(), Encoded_Xmm_##Src());                         \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+                                                                               \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Value, test.Dst##q()) << TestString << " value is " << Value;    \
+    reset();                                                                   \
+  } while (0)
+
+  // Memory destination. T1 starts as ~Value, so the final comparison fails
+  // unless movd actually wrote Value to it.
+#define TestMovdAddrXmm(Src, Value)                                            \
+  do {                                                                         \
+    assert(((Value)&0xFFFFFFFF) == (Value));                                   \
+    static constexpr char TestString[] = "(" #Src ", Addr)";                   \
+    const uint32_t T0 = allocateQword();                                       \
+    const uint32_t V0 = Value;                                                 \
+    const uint32_t T1 = allocateDword();                                       \
+    const uint32_t V1 = ~(Value);                                              \
+                                                                               \
+    __ movss(IceType_f64, Encoded_Xmm_##Src(), dwordAddress(T0));              \
+    __ movd(dwordAddress(T1), Encoded_Xmm_##Src());                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+                                                                               \
+    test.setDwordTo(T0, V0);                                                   \
+    test.setDwordTo(T1, V1);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Value, test.contentsOfDword(T1)) << TestString << " value is "   \
+                                               << Value;                       \
+    reset();                                                                   \
+  } while (0)
+
+  // For one xmm source: every GPR destination, then the memory destination,
+  // for each input value.
+#define TestMovd(Src)                                                          \
+  do {                                                                         \
+    for (uint32_t Value : {0u, 1u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFFu}) {   \
+      TestMovdRegXmm(Src, r1, Value);                                          \
+      TestMovdRegXmm(Src, r2, Value);                                          \
+      TestMovdRegXmm(Src, r3, Value);                                          \
+      TestMovdRegXmm(Src, r4, Value);                                          \
+      TestMovdRegXmm(Src, r5, Value);                                          \
+      TestMovdRegXmm(Src, r6, Value);                                          \
+      TestMovdRegXmm(Src, r7, Value);                                          \
+      TestMovdRegXmm(Src, r8, Value);                                          \
+      TestMovdRegXmm(Src, r10, Value);                                         \
+      TestMovdRegXmm(Src, r11, Value);                                         \
+      TestMovdRegXmm(Src, r12, Value);                                         \
+      TestMovdRegXmm(Src, r13, Value);                                         \
+      TestMovdRegXmm(Src, r14, Value);                                         \
+      TestMovdRegXmm(Src, r15, Value);                                         \
+      TestMovdAddrXmm(Src, Value);                                             \
+    }                                                                          \
+  } while (0)
+
+  TestMovd(xmm0);
+  TestMovd(xmm1);
+  TestMovd(xmm2);
+  TestMovd(xmm3);
+  TestMovd(xmm4);
+  TestMovd(xmm5);
+  TestMovd(xmm6);
+  TestMovd(xmm7);
+  TestMovd(xmm8);
+  TestMovd(xmm9);
+  TestMovd(xmm10);
+  TestMovd(xmm11);
+  TestMovd(xmm12);
+  TestMovd(xmm13);
+  TestMovd(xmm14);
+  TestMovd(xmm15);
+
+#undef TestMovdAddrXmm
+#undef TestMovdRegXmm
+#undef TestMovd
+}
+
+TEST_F(AssemblerX8664Test, MovqXmmAddr) { // movq xmm, [addr]
+#define TestMovq(Dst, Value)                                                   \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", Addr)";                   \
+    const uint32_t T0 = allocateQword();                                       \
+    const uint32_t T1 = allocateQword();                                       \
+    const uint64_t V0 = Value;                                                 \
+    const uint64_t V1 = ~(Value);                                              \
+    /* Dst is loaded from T1 (~Value) first, then movq reloads it from T0. */  \
+    __ movss(IceType_f64, Encoded_Xmm_##Dst(), dwordAddress(T1));              \
+    __ movq(Encoded_Xmm_##Dst(), dwordAddress(T0));                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+                                                                               \
+    test.setQwordTo(T1, V1);                                                   \
+    test.setQwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Value, test.Dst<uint64_t>()) << TestString << " value is "       \
+                                           << Value;                           \
+    reset();                                                                   \
+  } while (false)
+
+  for (uint32_t Value : {0u, 1u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFFu}) {
+    TestMovq(xmm0, Value);
+    TestMovq(xmm1, Value);
+    TestMovq(xmm2, Value);
+    TestMovq(xmm3, Value);
+    TestMovq(xmm4, Value);
+    TestMovq(xmm5, Value);
+    TestMovq(xmm6, Value);
+    TestMovq(xmm7, Value);
+    TestMovq(xmm8, Value);
+    TestMovq(xmm9, Value);
+    TestMovq(xmm10, Value);
+    TestMovq(xmm11, Value);
+    TestMovq(xmm12, Value);
+    TestMovq(xmm13, Value);
+    TestMovq(xmm14, Value);
+    TestMovq(xmm15, Value);
+  }
+
+#undef TestMovq
+}
+
+TEST_F(AssemblerX8664Test, MovqAddrXmm) { // movq [addr], xmm
+#define TestMovd(Dst, Value)                                                   \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", Addr)";                   \
+    const uint32_t T0 = allocateQword();                                       \
+    const uint64_t V0 = Value;                                                 \
+    const uint32_t T1 = allocateQword();                                       \
+    const uint64_t V1 = ~(Value);                                              \
+    /* Load the register from T0, then store it to T1 (the op under test). */  \
+    __ movq(Encoded_Xmm_##Dst(), dwordAddress(T0));                            \
+    __ movq(dwordAddress(T1), Encoded_Xmm_##Dst());                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+                                                                               \
+    test.setQwordTo(T0, V0);                                                   \
+    test.setQwordTo(T1, V1);                                                   \
+    test.run();                                                                \
+    /* Check the stored qword at T1, not the register the load filled. */      \
+    ASSERT_EQ(Value, test.contentsOfQword(T1)) << TestString << " value is "   \
+                                               << Value;                       \
+    reset();                                                                   \
+  } while (0)
+
+  for (uint32_t Value : {0u, 1u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFFu}) {
+    TestMovd(xmm0, Value);
+    TestMovd(xmm1, Value);
+    TestMovd(xmm2, Value);
+    TestMovd(xmm3, Value);
+    TestMovd(xmm4, Value);
+    TestMovd(xmm5, Value);
+    TestMovd(xmm6, Value);
+    TestMovd(xmm7, Value);
+    TestMovd(xmm8, Value);
+    TestMovd(xmm9, Value);
+    TestMovd(xmm10, Value);
+    TestMovd(xmm11, Value);
+    TestMovd(xmm12, Value);
+    TestMovd(xmm13, Value);
+    TestMovd(xmm14, Value);
+    TestMovd(xmm15, Value);
+  }
+
+#undef TestMovd
+}
+
+TEST_F(AssemblerX8664Test, MovqXmmXmm) { // movq xmm, xmm
+#define TestMovq(Src, Dst, Value)                                              \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Src ", " #Dst ")";               \
+    const uint32_t T0 = allocateQword();                                       \
+    const uint32_t T1 = allocateQword();                                       \
+    const uint64_t V0 = Value;                                                 \
+    const uint64_t V1 = ~(Value);                                              \
+    /* Fill Src from T0 and Dst from T1, then copy Src into Dst. */            \
+    __ movq(Encoded_Xmm_##Src(), dwordAddress(T0));                            \
+    __ movq(Encoded_Xmm_##Dst(), dwordAddress(T1));                            \
+    __ movq(Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());                         \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+                                                                               \
+    test.setQwordTo(T1, V1);                                                   \
+    test.setQwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Value, test.Dst<uint64_t>()) << TestString << " value is "       \
+                                           << Value;                           \
+    reset();                                                                   \
+  } while (false)
+
+  for (uint32_t Value : {0u, 1u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFFu}) {
+    TestMovq(xmm0, xmm1, Value);
+    TestMovq(xmm1, xmm2, Value);
+    TestMovq(xmm2, xmm3, Value);
+    TestMovq(xmm3, xmm4, Value);
+    TestMovq(xmm4, xmm5, Value);
+    TestMovq(xmm5, xmm6, Value);
+    TestMovq(xmm6, xmm7, Value);
+    TestMovq(xmm7, xmm8, Value);
+    TestMovq(xmm8, xmm9, Value);
+    TestMovq(xmm9, xmm10, Value);
+    TestMovq(xmm10, xmm11, Value);
+    TestMovq(xmm11, xmm12, Value);
+    TestMovq(xmm12, xmm13, Value);
+    TestMovq(xmm13, xmm14, Value);
+    TestMovq(xmm14, xmm15, Value);
+    TestMovq(xmm15, xmm0, Value);
+  }
+
+#undef TestMovq
+}
+
+TEST_F(AssemblerX8664Test, MovupsXmmAddr) { // movups xmm, [addr]
+#define TestMovupsLoad(Dst)                                                    \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ")";                         \
+    const Dqword V0(1.0f, -1.0, std::numeric_limits<float>::quiet_NaN(),       \
+                    std::numeric_limits<float>::infinity());                   \
+    const uint32_t T0 = allocateDqword();                                      \
+    /* Emit one movups that loads Dst from T0. */                              \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(V0, test.Dst<Dqword>()) << TestString;                           \
+    reset();                                                                   \
+  } while (false)
+
+  TestMovupsLoad(xmm0);
+  TestMovupsLoad(xmm1);
+  TestMovupsLoad(xmm2);
+  TestMovupsLoad(xmm3);
+  TestMovupsLoad(xmm4);
+  TestMovupsLoad(xmm5);
+  TestMovupsLoad(xmm6);
+  TestMovupsLoad(xmm7);
+  TestMovupsLoad(xmm8);
+  TestMovupsLoad(xmm9);
+  TestMovupsLoad(xmm10);
+  TestMovupsLoad(xmm11);
+  TestMovupsLoad(xmm12);
+  TestMovupsLoad(xmm13);
+  TestMovupsLoad(xmm14);
+  TestMovupsLoad(xmm15);
+
+#undef TestMovupsLoad
+}
+
+TEST_F(AssemblerX8664Test, MovupsAddrXmm) { // movups [addr], xmm
+#define TestMovupsStore(Src)                                                   \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Src ")";                         \
+    const Dqword V0(1.0f, -1.0, std::numeric_limits<float>::quiet_NaN(),       \
+                    std::numeric_limits<float>::infinity());                   \
+    const Dqword V1(0.0, 0.0, 0.0, 0.0);                                       \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+    /* Load Src from T0, then store Src to T1. */                              \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T0));                          \
+    __ movups(dwordAddress(T1), Encoded_Xmm_##Src());                          \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(V0, test.contentsOfDqword(T1)) << TestString;                    \
+    reset();                                                                   \
+  } while (false)
+
+  TestMovupsStore(xmm0);
+  TestMovupsStore(xmm1);
+  TestMovupsStore(xmm2);
+  TestMovupsStore(xmm3);
+  TestMovupsStore(xmm4);
+  TestMovupsStore(xmm5);
+  TestMovupsStore(xmm6);
+  TestMovupsStore(xmm7);
+  TestMovupsStore(xmm8);
+  TestMovupsStore(xmm9);
+  TestMovupsStore(xmm10);
+  TestMovupsStore(xmm11);
+  TestMovupsStore(xmm12);
+  TestMovupsStore(xmm13);
+  TestMovupsStore(xmm14);
+  TestMovupsStore(xmm15);
+
+#undef TestMovupsStore
+}
+
+TEST_F(AssemblerX8664Test, MovupsXmmXmm) { // movups xmm, xmm
+#define TestMovupsCopy(Dst, Src)                                               \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Src ")";               \
+    const Dqword V0(1.0f, -1.0, std::numeric_limits<float>::quiet_NaN(),       \
+                    std::numeric_limits<float>::infinity());                   \
+    const Dqword V1(0.0, 0.0, 0.0, 0.0);                                       \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+    /* Fill Src from T0 and Dst from T1, then copy Src into Dst. */            \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T1));                          \
+    __ movups(Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());                       \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(V0, test.Dst<Dqword>()) << TestString;                           \
+    reset();                                                                   \
+  } while (false)
+
+  TestMovupsCopy(xmm0, xmm1);
+  TestMovupsCopy(xmm1, xmm2);
+  TestMovupsCopy(xmm2, xmm3);
+  TestMovupsCopy(xmm3, xmm4);
+  TestMovupsCopy(xmm4, xmm5);
+  TestMovupsCopy(xmm5, xmm6);
+  TestMovupsCopy(xmm6, xmm7);
+  TestMovupsCopy(xmm7, xmm8);
+  TestMovupsCopy(xmm8, xmm9);
+  TestMovupsCopy(xmm9, xmm10);
+  TestMovupsCopy(xmm10, xmm11);
+  TestMovupsCopy(xmm11, xmm12);
+  TestMovupsCopy(xmm12, xmm13);
+  TestMovupsCopy(xmm13, xmm14);
+  TestMovupsCopy(xmm14, xmm15);
+  TestMovupsCopy(xmm15, xmm0);
+
+#undef TestMovupsCopy
+}
+
+TEST_F(AssemblerX8664Test, MovapsXmmXmm) { // movaps xmm, xmm
+#define TestMovapsCopy(Dst, Src)                                               \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Src ")";               \
+    const Dqword V0(1.0f, -1.0, std::numeric_limits<float>::quiet_NaN(),       \
+                    std::numeric_limits<float>::infinity());                   \
+    const Dqword V1(0.0, 0.0, 0.0, 0.0);                                       \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+    /* Fill Src from T0 and Dst from T1, then copy Src into Dst. */            \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T1));                          \
+    __ movaps(Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());                       \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(V0, test.Dst<Dqword>()) << TestString;                           \
+    reset();                                                                   \
+  } while (false)
+
+  TestMovapsCopy(xmm0, xmm1);
+  TestMovapsCopy(xmm1, xmm2);
+  TestMovapsCopy(xmm2, xmm3);
+  TestMovapsCopy(xmm3, xmm4);
+  TestMovapsCopy(xmm4, xmm5);
+  TestMovapsCopy(xmm5, xmm6);
+  TestMovapsCopy(xmm6, xmm7);
+  TestMovapsCopy(xmm7, xmm8);
+  TestMovapsCopy(xmm8, xmm9);
+  TestMovapsCopy(xmm9, xmm10);
+  TestMovapsCopy(xmm10, xmm11);
+  TestMovapsCopy(xmm11, xmm12);
+  TestMovapsCopy(xmm12, xmm13);
+  TestMovapsCopy(xmm13, xmm14);
+  TestMovapsCopy(xmm14, xmm15);
+  TestMovapsCopy(xmm15, xmm0);
+
+#undef TestMovapsCopy
+}
+
+TEST_F(AssemblerX8664Test, Movhlps_Movlhps) { // movhlps / movlhps xmm, xmm
+#define TestImplSingle(Dst, Src, Inst, Expect)                                 \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")";    \
+    const Dqword V0(uint64_t(0xAAAAAAAABBBBBBBBull),                           \
+                    uint64_t(0xCCCCCCCCDDDDDDDDull));                          \
+    const Dqword V1(uint64_t(0xEEEEEEEEFFFFFFFFull),                           \
+                    uint64_t(0x9999999988888888ull));                          \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+    /* Dst starts as V0, Src as V1; Inst then moves part of Src into Dst. */   \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T1));                          \
+    __ Inst(Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());                         \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Dqword Expect, test.Dst<Dqword>()) << TestString;                \
+    reset();                                                                   \
+  } while (false)
+
+#define TestBothInsts(Dst, Src)                                                \
+  do {                                                                         \
+    TestImplSingle(Dst, Src, movhlps, (uint64_t(0x9999999988888888ull),        \
+                                       uint64_t(0xCCCCCCCCDDDDDDDDull)));      \
+    TestImplSingle(Dst, Src, movlhps, (uint64_t(0xAAAAAAAABBBBBBBBull),        \
+                                       uint64_t(0xEEEEEEEEFFFFFFFFull)));      \
+  } while (false)
+
+  TestBothInsts(xmm0, xmm1);
+  TestBothInsts(xmm1, xmm2);
+  TestBothInsts(xmm2, xmm3);
+  TestBothInsts(xmm3, xmm4);
+  TestBothInsts(xmm4, xmm5);
+  TestBothInsts(xmm5, xmm6);
+  TestBothInsts(xmm6, xmm7);
+  TestBothInsts(xmm7, xmm8);
+  TestBothInsts(xmm8, xmm9);
+  TestBothInsts(xmm9, xmm10);
+  TestBothInsts(xmm10, xmm11);
+  TestBothInsts(xmm11, xmm12);
+  TestBothInsts(xmm12, xmm13);
+  TestBothInsts(xmm13, xmm14);
+  TestBothInsts(xmm14, xmm15);
+  TestBothInsts(xmm15, xmm0);
+
+#undef TestBothInsts
+#undef TestImplSingle
+}
+
+TEST_F(AssemblerX8664Test, Movmsk) { // movmskps / movmskpd gpr, xmm
+#define TestMovmskGPRXmm(GPR, Src, Value1, Expected, Inst)                     \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #GPR ", " #Src ", " #Value1 ", " #Expected ", " #Inst ")";         \
+    const Dqword V0 Value1;                                                    \
+    const uint32_t T0 = allocateDqword();                                      \
+    /* Load Src from T0, then run Inst with GPR as the destination. */         \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T0));                          \
+    __ Inst(Encoded_GPR_##GPR(), Encoded_Xmm_##Src());                         \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.GPR()) << TestString;                             \
+    reset();                                                                   \
+  } while (false)
+
+#define TestMovmskBoth(GPR, Src)                                               \
+  do {                                                                         \
+    TestMovmskGPRXmm(GPR, Src, (-1.0, 1.0, -1.0, 1.0), 0x05ul, movmskps);      \
+    TestMovmskGPRXmm(GPR, Src, (1.0, -1.0), 0x02ul, movmskpd);                 \
+  } while (false)
+
+  TestMovmskBoth(r1, xmm0);
+  TestMovmskBoth(r2, xmm1);
+  TestMovmskBoth(r3, xmm2);
+  TestMovmskBoth(r4, xmm3);
+  TestMovmskBoth(r5, xmm4);
+  TestMovmskBoth(r6, xmm5);
+  TestMovmskBoth(r7, xmm6);
+  TestMovmskBoth(r8, xmm7);
+  TestMovmskBoth(r10, xmm8);
+  TestMovmskBoth(r11, xmm9);
+  TestMovmskBoth(r12, xmm10);
+  TestMovmskBoth(r13, xmm11);
+  TestMovmskBoth(r14, xmm12);
+  TestMovmskBoth(r15, xmm13);
+  TestMovmskBoth(r1, xmm14);
+  TestMovmskBoth(r2, xmm15);
+
+#undef TestMovmskGPRXmm
+#undef TestMovmskBoth
+}
+
+TEST_F(AssemblerX8664Test, Pmovsxdq) { // pmovsxdq xmm, xmm
+#define TestPmovsxdqXmmXmm(Dst, Src, Value1)                                   \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Value1 ")";  \
+    const Dqword V0 Value1;                                                    \
+    const Dqword V1(uint64_t(0), uint64_t(0));                                 \
+    const Dqword Expected(uint64_t(V0.I32[0]), uint64_t(V0.I32[1]));           \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+    /* Src holds V0, Dst starts zeroed; pmovsxdq then writes Dst from Src. */  \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T1));                          \
+    __ pmovsxdq(Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());                     \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (false)
+
+#define TestPmovsxdqAll(Dst, Src)                                              \
+  do {                                                                         \
+    TestPmovsxdqXmmXmm(Dst, Src, (uint64_t(0x700000007FFFFFFFull),             \
+                                  uint64_t(0xAAAAAAAAEEEEEEEEull)));           \
+    TestPmovsxdqXmmXmm(Dst, Src, (uint64_t(0x800000007FFFFFFFull),             \
+                                  uint64_t(0xAAAAAAAAEEEEEEEEull)));           \
+    TestPmovsxdqXmmXmm(Dst, Src, (uint64_t(0x70000000FFFFFFFFull),             \
+                                  uint64_t(0xAAAAAAAAEEEEEEEEull)));           \
+    TestPmovsxdqXmmXmm(Dst, Src, (uint64_t(0x80000000FFFFFFFFull),             \
+                                  uint64_t(0xAAAAAAAAEEEEEEEEull)));           \
+  } while (false)
+
+  TestPmovsxdqAll(xmm0, xmm1);
+  TestPmovsxdqAll(xmm1, xmm2);
+  TestPmovsxdqAll(xmm2, xmm3);
+  TestPmovsxdqAll(xmm3, xmm4);
+  TestPmovsxdqAll(xmm4, xmm5);
+  TestPmovsxdqAll(xmm5, xmm6);
+  TestPmovsxdqAll(xmm6, xmm7);
+  TestPmovsxdqAll(xmm7, xmm8);
+  TestPmovsxdqAll(xmm8, xmm9);
+  TestPmovsxdqAll(xmm9, xmm10);
+  TestPmovsxdqAll(xmm10, xmm11);
+  TestPmovsxdqAll(xmm11, xmm12);
+  TestPmovsxdqAll(xmm12, xmm13);
+  TestPmovsxdqAll(xmm13, xmm14);
+  TestPmovsxdqAll(xmm14, xmm15);
+  TestPmovsxdqAll(xmm15, xmm0);
+
+#undef TestPmovsxdqAll
+#undef TestPmovsxdqXmmXmm
+}
+
+} // end of anonymous namespace
+} // end of namespace Test
+} // end of namespace X8664
+} // end of namespace Ice
diff --git a/unittest/AssemblerX8664/GPRArith.cpp b/unittest/AssemblerX8664/GPRArith.cpp
new file mode 100644
index 0000000..f5404f8
--- /dev/null
+++ b/unittest/AssemblerX8664/GPRArith.cpp
@@ -0,0 +1,1905 @@
+//===- subzero/unittest/AssemblerX8664/GPRArith.cpp -----------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AssemblerX8664/TestUtil.h"
+
+namespace Ice {
+namespace X8664 {
+namespace Test {
+namespace {
+
+TEST_F(AssemblerX8664Test, PopAddr) {
+  // T0 starts out holding V0; the final check reads back the whole qword.
+  constexpr uint64_t V0 = 0x3AABBEFABBBAA3ull;
+  const uint32_t T0 = allocateQword();
+  const auto Eax = GPRRegister::Encoded_Reg_eax;
+
+  __ mov(IceType_i32, Eax, Immediate(0xC0FFEE));
+  __ pushl(Eax);
+  __ popl(dwordAddress(T0));
+
+  AssembledTest test = assemble();
+  test.setQwordTo(T0, V0);
+  test.run();
+  ASSERT_EQ(0xC0FFEEul, test.contentsOfQword(T0));
+}
+
+TEST_F(AssemblerX8664Test, SetCC) { // setcc after a 32-bit cmp of two GPRs
+#define TestSetCC(C, Dest, IsTrue, Src0, Value0, Src1, Value1)                 \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #C ", " #Dest ", " #IsTrue ", " #Src0 ", " #Value0 ", " #Src1      \
+        ", " #Value1 ")";                                                      \
+    const uint32_t T0 = allocateDword();                                       \
+    constexpr uint32_t V0 = 0xF00F00;                                          \
+    __ mov(IceType_i32, Encoded_GPR_##Src0(), Immediate(Value0));              \
+    __ mov(IceType_i32, Encoded_GPR_##Src1(), Immediate(Value1));              \
+    __ cmp(IceType_i32, Encoded_GPR_##Src0(), Encoded_GPR_##Src1());           \
+    __ mov(IceType_i32, Encoded_GPR_##Dest(), Immediate(0));                   \
+    __ setcc(Cond::Br_##C, RegX8664::getEncodedByteReg(Encoded_GPR_##Dest())); \
+    __ setcc(Cond::Br_##C, dwordAddress(T0));                                  \
+    /* Both setcc forms above are checked: register Dest and memory T0. */     \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    /* T0 starts as V0; the memory assert expects V0 with IsTrue OR-ed in. */  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(IsTrue, test.Dest()) << TestString;                              \
+    ASSERT_EQ((0xF00F00 | IsTrue), test.contentsOfDword(T0)) << TestString;    \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImpl(Dest, Src0, Src1)                                             \
+  do {                                                                         \
+    TestSetCC(o, Dest, 1u, Src0, 0x80000000u, Src1, 0x1u);                     \
+    TestSetCC(o, Dest, 0u, Src0, 0x1u, Src1, 0x10000000u);                     \
+    TestSetCC(no, Dest, 1u, Src0, 0x1u, Src1, 0x10000000u);                    \
+    TestSetCC(no, Dest, 0u, Src0, 0x80000000u, Src1, 0x1u);                    \
+    TestSetCC(b, Dest, 1u, Src0, 0x1, Src1, 0x80000000u);                      \
+    TestSetCC(b, Dest, 0u, Src0, 0x80000000u, Src1, 0x1u);                     \
+    TestSetCC(ae, Dest, 1u, Src0, 0x80000000u, Src1, 0x1u);                    \
+    TestSetCC(ae, Dest, 0u, Src0, 0x1u, Src1, 0x80000000u);                    \
+    TestSetCC(e, Dest, 1u, Src0, 0x1u, Src1, 0x1u);                            \
+    TestSetCC(e, Dest, 0u, Src0, 0x1u, Src1, 0x11111u);                        \
+    TestSetCC(ne, Dest, 1u, Src0, 0x80000000u, Src1, 0x1u);                    \
+    TestSetCC(ne, Dest, 0u, Src0, 0x1u, Src1, 0x1u);                           \
+    TestSetCC(be, Dest, 1u, Src0, 0x1u, Src1, 0x80000000u);                    \
+    TestSetCC(be, Dest, 0u, Src0, 0x80000000u, Src1, 0x1u);                    \
+    TestSetCC(a, Dest, 1u, Src0, 0x80000000u, Src1, 0x1u);                     \
+    TestSetCC(a, Dest, 0u, Src0, 0x1u, Src1, 0x80000000u);                     \
+    TestSetCC(s, Dest, 1u, Src0, 0x1u, Src1, 0x80000000u);                     \
+    TestSetCC(s, Dest, 0u, Src0, 0x80000000u, Src1, 0x1u);                     \
+    TestSetCC(ns, Dest, 1u, Src0, 0x80000000u, Src1, 0x1u);                    \
+    TestSetCC(ns, Dest, 0u, Src0, 0x1u, Src1, 0x80000000u);                    \
+    TestSetCC(p, Dest, 1u, Src0, 0x80000000u, Src1, 0x1u);                     \
+    TestSetCC(p, Dest, 0u, Src0, 0x1u, Src1, 0x80000000u);                     \
+    TestSetCC(np, Dest, 1u, Src0, 0x1u, Src1, 0x80000000u);                    \
+    TestSetCC(np, Dest, 0u, Src0, 0x80000000u, Src1, 0x1u);                    \
+    TestSetCC(l, Dest, 1u, Src0, 0x80000000u, Src1, 0x1u);                     \
+    TestSetCC(l, Dest, 0u, Src0, 0x1u, Src1, 0x80000000u);                     \
+    TestSetCC(ge, Dest, 1u, Src0, 0x1u, Src1, 0x80000000u);                    \
+    TestSetCC(ge, Dest, 0u, Src0, 0x80000000u, Src1, 0x1u);                    \
+    TestSetCC(le, Dest, 1u, Src0, 0x80000000u, Src1, 0x1u);                    \
+    TestSetCC(le, Dest, 0u, Src0, 0x1u, Src1, 0x80000000u);                    \
+  } while (0)
+
+  TestImpl(r1, r2, r3);
+  TestImpl(r2, r3, r4);
+  TestImpl(r3, r4, r5);
+  TestImpl(r4, r5, r6);
+  TestImpl(r5, r6, r7);
+  TestImpl(r6, r7, r8);
+  TestImpl(r7, r8, r10);
+  TestImpl(r8, r10, r11);
+  TestImpl(r10, r11, r12);
+  TestImpl(r11, r12, r13);
+  TestImpl(r12, r13, r14);
+  TestImpl(r13, r14, r15);
+  TestImpl(r14, r15, r1);
+  TestImpl(r15, r1, r2);
+
+#undef TestImpl
+#undef TestSetCC
+}
+
+// Emits 32-bit lea instructions for three operand shapes -- base + disp,
+// index * scale + disp, and base + index * scale + disp -- runs the assembled
+// code, and checks that each destination register holds the address computed
+// from the register values read back after the run.
+TEST_F(AssemblerX8664Test, Lea) {
+// lea Dst, [Base + Disp]. Base is loaded with BaseValue first, except when it
+// encodes to esp or r9, which are left untouched (presumably because the test
+// harness depends on them -- confirm against the fixture). The expectation
+// uses the value Base actually holds after the run, so it is valid either way.
+#define TestLeaBaseDisp(Base, BaseValue, Disp, Dst)                            \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Base ", " #BaseValue ", " #Dst ")";                               \
+    if (Encoded_GPR_##Base() != Encoded_GPR_esp() &&                           \
+        Encoded_GPR_##Base() != Encoded_GPR_r9()) {                            \
+      __ mov(IceType_i32, Encoded_GPR_##Base(), Immediate(BaseValue));         \
+    }                                                                          \
+    __ lea(IceType_i32, Encoded_GPR_##Dst(),                                   \
+           Address(Encoded_GPR_##Base(), Disp));                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    ASSERT_EQ(test.Base##d() + (Disp), test.Dst##d())                          \
+        << TestString << " with Disp " << Disp;                                \
+    reset();                                                                   \
+  } while (0)
+
+// lea DstN, [Index * scale + Disp] for the four scale factors TIMES_1,
+// TIMES_2, TIMES_4 and TIMES_8, one destination register per scale. Index is
+// loaded with IndexValue unless it encodes to r9. The expected value shifts
+// the index left by the scale-factor enumerator.
+#define TestLeaIndex32bitDisp(Index, IndexValue, Disp, Dst0, Dst1, Dst2, Dst3) \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Index ", " #IndexValue ", " #Dst0 ", " #Dst1 ", " #Dst2           \
+        ", " #Dst3 ")";                                                        \
+    if (Encoded_GPR_##Index() != Encoded_GPR_r9()) {                           \
+      __ mov(IceType_i32, Encoded_GPR_##Index(), Immediate(IndexValue));       \
+    }                                                                          \
+    __ lea(IceType_i32, Encoded_GPR_##Dst0(),                                  \
+           Address(Encoded_GPR_##Index(), Traits::TIMES_1, Disp));             \
+    __ lea(IceType_i32, Encoded_GPR_##Dst1(),                                  \
+           Address(Encoded_GPR_##Index(), Traits::TIMES_2, Disp));             \
+    __ lea(IceType_i32, Encoded_GPR_##Dst2(),                                  \
+           Address(Encoded_GPR_##Index(), Traits::TIMES_4, Disp));             \
+    __ lea(IceType_i32, Encoded_GPR_##Dst3(),                                  \
+           Address(Encoded_GPR_##Index(), Traits::TIMES_8, Disp));             \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    ASSERT_EQ((test.Index##d() << Traits::TIMES_1) + (Disp), test.Dst0##d())   \
+        << TestString << " " << Disp;                                          \
+    ASSERT_EQ((test.Index##d() << Traits::TIMES_2) + (Disp), test.Dst1##d())   \
+        << TestString << " " << Disp;                                          \
+    ASSERT_EQ((test.Index##d() << Traits::TIMES_4) + (Disp), test.Dst2##d())   \
+        << TestString << " " << Disp;                                          \
+    ASSERT_EQ((test.Index##d() << Traits::TIMES_8) + (Disp), test.Dst3##d())   \
+        << TestString << " " << Disp;                                          \
+    reset();                                                                   \
+  } while (0)
+
+// lea DstN, [Base + Index * scale + Disp] for the four scale factors. Base is
+// initialized unless it encodes to esp or r9; Index is initialized unless it
+// encodes to r9. If Index encodes to esp the expected index contribution is
+// forced to zero.
+#define TestLeaBaseIndexDisp(Base, BaseValue, Index, IndexValue, Disp, Dst0,   \
+                             Dst1, Dst2, Dst3)                                 \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Base ", " #BaseValue ", " #Index ", " #IndexValue ", " #Dst0      \
+        ", " #Dst1 ", " #Dst2 ", " #Dst3 ")";                                  \
+    if (Encoded_GPR_##Base() != Encoded_GPR_esp() &&                           \
+        Encoded_GPR_##Base() != Encoded_GPR_r9()) {                            \
+      __ mov(IceType_i32, Encoded_GPR_##Base(), Immediate(BaseValue));         \
+    }                                                                          \
+                                                                               \
+    if (Encoded_GPR_##Index() != Encoded_GPR_r9()) {                           \
+      __ mov(IceType_i32, Encoded_GPR_##Index(), Immediate(IndexValue));       \
+    }                                                                          \
+                                                                               \
+    __ lea(IceType_i32, Encoded_GPR_##Dst0(),                                  \
+           Address(Encoded_GPR_##Base(), Encoded_GPR_##Index(),                \
+                   Traits::TIMES_1, Disp));                                    \
+    __ lea(IceType_i32, Encoded_GPR_##Dst1(),                                  \
+           Address(Encoded_GPR_##Base(), Encoded_GPR_##Index(),                \
+                   Traits::TIMES_2, Disp));                                    \
+    __ lea(IceType_i32, Encoded_GPR_##Dst2(),                                  \
+           Address(Encoded_GPR_##Base(), Encoded_GPR_##Index(),                \
+                   Traits::TIMES_4, Disp));                                    \
+    __ lea(IceType_i32, Encoded_GPR_##Dst3(),                                  \
+           Address(Encoded_GPR_##Base(), Encoded_GPR_##Index(),                \
+                   Traits::TIMES_8, Disp));                                    \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    uint32_t ExpectedIndexValue = test.Index();                                \
+    if (Encoded_GPR_##Index() == Encoded_GPR_esp()) {                          \
+      ExpectedIndexValue = 0;                                                  \
+    }                                                                          \
+    ASSERT_EQ(test.Base##d() + (ExpectedIndexValue << Traits::TIMES_1) +       \
+                  (Disp),                                                      \
+              test.Dst0##d())                                                  \
+        << TestString << " " << Disp;                                          \
+    ASSERT_EQ(test.Base##d() + (ExpectedIndexValue << Traits::TIMES_2) +       \
+                  (Disp),                                                      \
+              test.Dst1##d())                                                  \
+        << TestString << " " << Disp;                                          \
+    ASSERT_EQ(test.Base##d() + (ExpectedIndexValue << Traits::TIMES_4) +       \
+                  (Disp),                                                      \
+              test.Dst2##d())                                                  \
+        << TestString << " " << Disp;                                          \
+    ASSERT_EQ(test.Base##d() + (ExpectedIndexValue << Traits::TIMES_8) +       \
+                  (Disp),                                                      \
+              test.Dst3##d())                                                  \
+        << TestString << " " << Disp;                                          \
+    reset();                                                                   \
+  } while (0)
+
+  // Each loop sweeps zero, small, medium and large displacements of both
+  // signs.
+  for (const int32_t Disp :
+       {0x00, 0x06, -0x06, 0x0600, -0x6000, 0x6000000, -0x6000000}) {
+    TestLeaBaseDisp(r0, 0x22080Fu, Disp, r1);
+    TestLeaBaseDisp(r1, 0x10000Fu, Disp, r2);
+    TestLeaBaseDisp(r2, 0x20000Fu, Disp, r3);
+    TestLeaBaseDisp(r3, 0x30000Fu, Disp, r4);
+    TestLeaBaseDisp(r4, 0x40000Fu, Disp, r5);
+    TestLeaBaseDisp(r5, 0x50000Fu, Disp, r6);
+    TestLeaBaseDisp(r6, 0x60000Fu, Disp, r7);
+    TestLeaBaseDisp(r7, 0x11000Fu, Disp, r8);
+    TestLeaBaseDisp(r8, 0x11200Fu, Disp, r10);
+    TestLeaBaseDisp(r9, 0x000000u, Disp, r10);
+    TestLeaBaseDisp(r10, 0x22000Fu, Disp, r11);
+    TestLeaBaseDisp(r11, 0x22030Fu, Disp, r12);
+    TestLeaBaseDisp(r12, 0x22040Fu, Disp, r13);
+    TestLeaBaseDisp(r13, 0x22050Fu, Disp, r14);
+    TestLeaBaseDisp(r14, 0x22060Fu, Disp, r15);
+    TestLeaBaseDisp(r15, 0x22070Fu, Disp, r1);
+  }
+
+  // esp is not a valid index register.
+  // ebp is not valid in this addressing mode (rm = 0).
+  for (const int32_t Disp :
+       {0x00, 0x06, -0x06, 0x0600, -0x6000, 0x6000000, -0x6000000}) {
+    TestLeaIndex32bitDisp(r1, 0x2000u, Disp, r2, r3, r4, r6);
+    TestLeaIndex32bitDisp(r2, 0x4010u, Disp, r3, r4, r6, r7);
+    TestLeaIndex32bitDisp(r3, 0x6020u, Disp, r4, r6, r7, r5);
+    TestLeaIndex32bitDisp(r4, 0x8030u, Disp, r6, r7, r5, r10);
+    TestLeaIndex32bitDisp(r6, 0xA040u, Disp, r7, r5, r10, r1);
+    TestLeaIndex32bitDisp(r7, 0xC050u, Disp, r5, r10, r1, r11);
+    TestLeaIndex32bitDisp(r8, 0xC060u, Disp, r10, r1, r11, r12);
+    TestLeaIndex32bitDisp(r9, 0x0000u, Disp, r1, r11, r12, r13);
+    TestLeaIndex32bitDisp(r10, 0xC008u, Disp, r11, r12, r13, r14);
+    TestLeaIndex32bitDisp(r11, 0xC009u, Disp, r12, r13, r14, r15);
+    TestLeaIndex32bitDisp(r12, 0xC00Au, Disp, r13, r14, r15, r1);
+    TestLeaIndex32bitDisp(r13, 0xC00Bu, Disp, r14, r15, r1, r2);
+    TestLeaIndex32bitDisp(r14, 0xC00Cu, Disp, r15, r1, r2, r3);
+    TestLeaIndex32bitDisp(r15, 0xC00Du, Disp, r1, r2, r3, r4);
+  }
+
+  for (const int32_t Disp :
+       {0x00, 0x06, -0x06, 0x0600, -0x6000, 0x6000000, -0x6000000}) {
+    TestLeaBaseIndexDisp(r1, 0x100000u, r2, 0x600u, Disp, r3, r4, r6, r7);
+    TestLeaBaseIndexDisp(r2, 0x200000u, r3, 0x500u, Disp, r4, r6, r7, r8);
+    TestLeaBaseIndexDisp(r3, 0x300000u, r4, 0x400u, Disp, r6, r7, r8, r5);
+    TestLeaBaseIndexDisp(r4, 0x400000u, r6, 0x300u, Disp, r7, r8, r5, r10);
+    TestLeaBaseIndexDisp(r6, 0x500000u, r7, 0x200u, Disp, r8, r5, r10, r11);
+    TestLeaBaseIndexDisp(r7, 0x600000u, r8, 0x100u, Disp, r5, r10, r11, r12);
+    TestLeaBaseIndexDisp(r8, 0x600000u, r9, 0x1A0u, Disp, r10, r11, r12, r13);
+    TestLeaBaseIndexDisp(r9, 0x000000u, r10, 0x1B0u, Disp, r11, r12, r13, r14);
+    TestLeaBaseIndexDisp(r10, 0x602000u, r11, 0x1C0u, Disp, r12, r13, r14, r15);
+    TestLeaBaseIndexDisp(r11, 0x603000u, r12, 0x1D0u, Disp, r13, r14, r15, r1);
+    TestLeaBaseIndexDisp(r12, 0x604000u, r13, 0x1E0u, Disp, r14, r15, r1, r2);
+    TestLeaBaseIndexDisp(r13, 0x605000u, r14, 0x1F0u, Disp, r15, r1, r2, r3);
+    TestLeaBaseIndexDisp(r14, 0x606000u, r15, 0x10Au, Disp, r1, r2, r3, r4);
+    TestLeaBaseIndexDisp(r15, 0x607000u, r1, 0x10Bu, Disp, r2, r3, r4, r6);
+
+    TestLeaBaseIndexDisp(r0, 0, r2, 0x600u, Disp, r3, r4, r6, r7);
+    TestLeaBaseIndexDisp(r0, 0, r3, 0x500u, Disp, r4, r6, r7, r8);
+    TestLeaBaseIndexDisp(r0, 0, r4, 0x400u, Disp, r6, r7, r8, r5);
+    TestLeaBaseIndexDisp(r0, 0, r6, 0x300u, Disp, r7, r8, r5, r10);
+    TestLeaBaseIndexDisp(r0, 0, r7, 0x200u, Disp, r8, r5, r10, r11);
+    TestLeaBaseIndexDisp(r0, 0, r8, 0x100u, Disp, r5, r10, r11, r12);
+    TestLeaBaseIndexDisp(r0, 0, r9, 0x000u, Disp, r10, r11, r12, r13);
+    TestLeaBaseIndexDisp(r0, 0, r10, 0x1B0u, Disp, r11, r12, r13, r14);
+    TestLeaBaseIndexDisp(r0, 0, r11, 0x1C0u, Disp, r12, r13, r14, r15);
+    TestLeaBaseIndexDisp(r0, 0, r12, 0x1D0u, Disp, r13, r14, r15, r1);
+    TestLeaBaseIndexDisp(r0, 0, r13, 0x1E0u, Disp, r14, r15, r1, r2);
+    TestLeaBaseIndexDisp(r0, 0, r14, 0x1F0u, Disp, r15, r1, r2, r3);
+    TestLeaBaseIndexDisp(r0, 0, r15, 0x10Au, Disp, r1, r2, r3, r4);
+    TestLeaBaseIndexDisp(r0, 0, r1, 0x10Bu, Disp, r2, r3, r4, r6);
+
+    TestLeaBaseIndexDisp(r5, 0x100000u, r2, 0x600u, Disp, r3, r4, r6, r7);
+    TestLeaBaseIndexDisp(r5, 0x200000u, r3, 0x500u, Disp, r4, r6, r7, r8);
+    TestLeaBaseIndexDisp(r5, 0x300000u, r4, 0x400u, Disp, r6, r7, r8, r1);
+    TestLeaBaseIndexDisp(r5, 0x400000u, r6, 0x300u, Disp, r7, r8, r1, r10);
+    TestLeaBaseIndexDisp(r5, 0x500000u, r7, 0x200u, Disp, r8, r1, r10, r11);
+    TestLeaBaseIndexDisp(r5, 0x600000u, r8, 0x100u, Disp, r1, r10, r11, r12);
+    TestLeaBaseIndexDisp(r5, 0x600000u, r9, 0x000u, Disp, r10, r11, r12, r13);
+    TestLeaBaseIndexDisp(r5, 0x601000u, r10, 0x1B0u, Disp, r11, r12, r13, r14);
+    TestLeaBaseIndexDisp(r5, 0x602000u, r11, 0x1C0u, Disp, r12, r13, r14, r15);
+    TestLeaBaseIndexDisp(r5, 0x603000u, r12, 0x1D0u, Disp, r13, r14, r15, r1);
+    TestLeaBaseIndexDisp(r5, 0x604000u, r13, 0x1E0u, Disp, r14, r15, r1, r2);
+    TestLeaBaseIndexDisp(r5, 0x605000u, r14, 0x1F0u, Disp, r15, r1, r2, r3);
+    TestLeaBaseIndexDisp(r5, 0x606000u, r15, 0x10Au, Disp, r1, r2, r3, r4);
+    TestLeaBaseIndexDisp(r5, 0x607000u, r1, 0x10Bu, Disp, r2, r3, r4, r6);
+
+    TestLeaBaseIndexDisp(r2, 0x100000u, r5, 0x600u, Disp, r3, r4, r6, r7);
+    TestLeaBaseIndexDisp(r3, 0x200000u, r5, 0x500u, Disp, r4, r6, r7, r8);
+    TestLeaBaseIndexDisp(r4, 0x300000u, r5, 0x400u, Disp, r6, r7, r8, r1);
+    TestLeaBaseIndexDisp(r6, 0x400000u, r5, 0x300u, Disp, r7, r8, r1, r10);
+    TestLeaBaseIndexDisp(r7, 0x500000u, r5, 0x200u, Disp, r8, r1, r10, r11);
+    TestLeaBaseIndexDisp(r8, 0x600000u, r5, 0x100u, Disp, r1, r10, r11, r12);
+    TestLeaBaseIndexDisp(r9, 0x000000u, r5, 0x1A0u, Disp, r10, r11, r12, r13);
+    TestLeaBaseIndexDisp(r10, 0x601000u, r5, 0x1B0u, Disp, r11, r12, r13, r14);
+    TestLeaBaseIndexDisp(r11, 0x602000u, r5, 0x1C0u, Disp, r12, r13, r14, r15);
+    TestLeaBaseIndexDisp(r12, 0x603000u, r5, 0x1D0u, Disp, r13, r14, r15, r1);
+    TestLeaBaseIndexDisp(r13, 0x604000u, r5, 0x1E0u, Disp, r14, r15, r1, r2);
+    TestLeaBaseIndexDisp(r14, 0x605000u, r5, 0x1F0u, Disp, r15, r1, r2, r3);
+    TestLeaBaseIndexDisp(r15, 0x606000u, r5, 0x10Au, Disp, r1, r2, r3, r4);
+    TestLeaBaseIndexDisp(r1, 0x607000u, r5, 0x10Bu, Disp, r2, r3, r4, r6);
+
+    TestLeaBaseIndexDisp(r0, 0, r5, 0xC0BEBEEF, Disp, r2, r3, r4, r6);
+  }
+
+// Absolute addressing mode is tested in the Low Level tests. The encoding used
+// by the assembler has different meanings in x86-32 and x86-64.
+#undef TestLeaBaseIndexDisp
+#undef TestLeaIndex32bitDisp
+#undef TestLeaBaseDisp
+}
+
+// Verifies the exact bytes emitted for a 32-bit lea whose operand is an
+// absolute address. Nothing is executed: the test only inspects the code
+// buffer.
+TEST_F(AssemblerX8664LowLevelTest, LeaAbsolute) {
+// Emits lea Dst, [Value] and expects six bytes: the 0x8D opcode, a ModRM byte
+// with mod = 00, reg = Dst and rm = ebp, then Value as four bytes, least
+// significant byte first.
+#define TestLeaAbsolute(Dst, Value)                                            \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Value ")";             \
+    __ lea(IceType_i32, GPRRegister::Encoded_Reg_##Dst,                        \
+           Address(Address::ABSOLUTE, Value));                                 \
+    static constexpr uint32_t ByteCount = 6;                                   \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    static constexpr uint8_t Opcode = 0x8D;                                    \
+    static constexpr uint8_t ModRM =                                           \
+        /*mod=*/0x00 | /*reg*/ (GPRRegister::Encoded_Reg_##Dst << 3) |         \
+        /*rm*/ GPRRegister::Encoded_Reg_ebp;                                   \
+    ASSERT_TRUE(verifyBytes<ByteCount>(                                        \
+        codeBytes(), Opcode, ModRM, (Value)&0xFF, (Value >> 8) & 0xFF,         \
+        (Value >> 16) & 0xFF, (Value >> 24) & 0xFF));                          \
+    reset();                                                                   \
+  } while (0)
+
+  TestLeaAbsolute(eax, 0x11BEEF22);
+  TestLeaAbsolute(ebx, 0x33BEEF44);
+  TestLeaAbsolute(ecx, 0x55BEEF66);
+  TestLeaAbsolute(edx, 0x77BEEF88);
+  TestLeaAbsolute(esi, 0x99BEEFAA);
+  TestLeaAbsolute(edi, 0xBBBEEFBB);
+
+#undef TestLeaAbsolute
+}
+
+// Exercises the test instruction at 8-, 16- and 32-bit widths with reg/reg,
+// reg/imm, mem/reg and mem/imm operands. Each case records ValueIfTrue when
+// the Br_e branch after test is not taken and ValueIfFalse when it is, and
+// compares that against (Value0 & Value1) restricted to the operand width.
+TEST_F(AssemblerX8664Test, Test) {
+  // Masks selecting the bits that participate at each operand width.
+  static constexpr uint32_t Mask8 = 0xFF;
+  static constexpr uint32_t Mask16 = 0xFFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+
+// test Dst, Src with both operands in registers. Dst is overwritten with
+// ValueIfFalse between the test and the conditional jump, so the result
+// depends on that mov leaving the flags set by test intact. The ValueIfTrue
+// store is skipped when the Br_e branch is taken.
+#define TestImplRegReg(Dst, Value0, Src, Value1, Size)                         \
+  do {                                                                         \
+    static constexpr bool NearJump = true;                                     \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Size ")";           \
+    static constexpr uint32_t ValueIfTrue = 0xBEEFFEEB;                        \
+    static constexpr uint32_t ValueIfFalse = 0x11111111;                       \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst(), Immediate(Value0));           \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src(), Immediate(Value1));           \
+    __ test(IceType_i##Size, Encoded_GPR_##Dst(), Encoded_GPR_##Src());        \
+    __ mov(IceType_i32, Encoded_GPR_##Dst(), Immediate(ValueIfFalse));         \
+    Label Done;                                                                \
+    __ j(Cond::Br_e, &Done, NearJump);                                         \
+    __ mov(IceType_i32, Encoded_GPR_##Dst(), Immediate(ValueIfTrue));          \
+    __ bind(&Done);                                                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(((Value0)&Mask##Size) & ((Value1)&Mask##Size) ? ValueIfTrue      \
+                                                            : ValueIfFalse,    \
+              test.Dst())                                                      \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+// test Dst, Imm with a register and an immediate. The immediate is masked to
+// the operand width before it is handed to the assembler.
+#define TestImplRegImm(Dst, Value0, Imm, Size)                                 \
+  do {                                                                         \
+    static constexpr bool NearJump = true;                                     \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Imm ", " #Size ")";                        \
+    static constexpr uint32_t ValueIfTrue = 0xBEEFFEEB;                        \
+    static constexpr uint32_t ValueIfFalse = 0x11111111;                       \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst(), Immediate(Value0));           \
+    __ test(IceType_i##Size, Encoded_GPR_##Dst(),                              \
+            Immediate((Imm)&Mask##Size));                                      \
+    __ mov(IceType_i32, Encoded_GPR_##Dst(), Immediate(ValueIfFalse));         \
+    Label Done;                                                                \
+    __ j(Cond::Br_e, &Done, NearJump);                                         \
+    __ mov(IceType_i32, Encoded_GPR_##Dst(), Immediate(ValueIfTrue));          \
+    __ bind(&Done);                                                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(((Value0)&Mask##Size) & ((Imm)&Mask##Size) ? ValueIfTrue         \
+                                                         : ValueIfFalse,       \
+              test.Dst())                                                      \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+// test [T0], Src with a memory operand and a register. The dword T0 is given
+// Value0 through setDwordTo() after assembling and before running; the same
+// dword then receives ValueIfFalse/ValueIfTrue and is read back as the result.
+#define TestImplAddrReg(Value0, Src, Value1, Size)                             \
+  do {                                                                         \
+    static constexpr bool NearJump = true;                                     \
+    static constexpr char TestString[] =                                       \
+        "(Addr, " #Value0 ", " #Src ", " #Value1 ", " #Size ")";               \
+    static constexpr uint32_t ValueIfTrue = 0xBEEFFEEB;                        \
+    static constexpr uint32_t ValueIfFalse = 0x11111111;                       \
+    const uint32_t T0 = allocateDword();                                       \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src(), Immediate(Value1));           \
+    __ test(IceType_i##Size, dwordAddress(T0), Encoded_GPR_##Src());           \
+    __ mov(IceType_i32, dwordAddress(T0), Immediate(ValueIfFalse));            \
+    Label Done;                                                                \
+    __ j(Cond::Br_e, &Done, NearJump);                                         \
+    __ mov(IceType_i32, dwordAddress(T0), Immediate(ValueIfTrue));             \
+    __ bind(&Done);                                                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, uint32_t(Value0));                                     \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(((Value0)&Mask##Size) & ((Value1)&Mask##Size) ? ValueIfTrue      \
+                                                            : ValueIfFalse,    \
+              test.contentsOfDword(T0))                                        \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+// test [T0], Imm with a memory operand and an immediate masked to the operand
+// width. T0 is seeded with Value0 and read back as in TestImplAddrReg.
+#define TestImplAddrImm(Value0, Value1, Size)                                  \
+  do {                                                                         \
+    static constexpr bool NearJump = true;                                     \
+    static constexpr char TestString[] =                                       \
+        "(Addr, " #Value0 ", " #Value1 ", " #Size ")";                         \
+    static constexpr uint32_t ValueIfTrue = 0xBEEFFEEB;                        \
+    static constexpr uint32_t ValueIfFalse = 0x11111111;                       \
+    const uint32_t T0 = allocateDword();                                       \
+                                                                               \
+    __ test(IceType_i##Size, dwordAddress(T0),                                 \
+            Immediate((Value1)&Mask##Size));                                   \
+    __ mov(IceType_i32, dwordAddress(T0), Immediate(ValueIfFalse));            \
+    Label Done;                                                                \
+    __ j(Cond::Br_e, &Done, NearJump);                                         \
+    __ mov(IceType_i32, dwordAddress(T0), Immediate(ValueIfTrue));             \
+    __ bind(&Done);                                                            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, uint32_t(Value0));                                     \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(((Value0)&Mask##Size) & ((Value1)&Mask##Size) ? ValueIfTrue      \
+                                                            : ValueIfFalse,    \
+              test.contentsOfDword(T0))                                        \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+// Runs all four operand forms for one (Value0, Value1) pair at one width.
+#define TestImplValues(Dst, Value0, Src, Value1, Size)                         \
+  do {                                                                         \
+    TestImplRegReg(Dst, Value0, Src, Value1, Size);                            \
+    TestImplRegImm(Dst, Value0, Value1, Size);                                 \
+    TestImplAddrReg(Value0, Src, Value1, Size);                                \
+    TestImplAddrImm(Value0, Value1, Size);                                     \
+  } while (0)
+
+// Three value pairs per width: a zero second operand, identical operands with
+// only the top nibble set, and operands whose set bits do not overlap.
+#define TestImplSize(Dst, Src, Size)                                           \
+  do {                                                                         \
+    TestImplValues(Dst, 0xF0F12101, Src, 0x00000000, Size);                    \
+    TestImplValues(Dst, 0xF0000000, Src, 0xF0000000, Size);                    \
+    TestImplValues(Dst, 0x0F00000F, Src, 0xF00000F0, Size);                    \
+  } while (0)
+
+// Covers the 8-, 16- and 32-bit widths for one register pair.
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplSize(Dst, Src, 8);                                                 \
+    TestImplSize(Dst, Src, 16);                                                \
+    TestImplSize(Dst, Src, 32);                                                \
+  } while (0)
+
+  // Register pairs below never include r9, either as Dst or as Src.
+  TestImpl(r1, r2);
+  TestImpl(r2, r3);
+  TestImpl(r3, r4);
+  TestImpl(r4, r5);
+  TestImpl(r5, r6);
+  TestImpl(r6, r7);
+  TestImpl(r7, r8);
+  TestImpl(r8, r10);
+  TestImpl(r10, r11);
+  TestImpl(r11, r12);
+  TestImpl(r12, r13);
+  TestImpl(r13, r14);
+  TestImpl(r14, r15);
+  TestImpl(r15, r1);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplValues
+#undef TestImplAddrImm
+#undef TestImplAddrReg
+#undef TestImplRegImm
+#undef TestImplRegReg
+}
+
+// No mull/div because x86.
+// No shift because x86.
+TEST_F(AssemblerX8664Test, Arith_most) {
+  static constexpr uint32_t Mask8 = 0xFF;
+  static constexpr uint32_t Mask16 = 0xFFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+
+#define TestImplRegReg(Inst, Dst, Value0, Src, Value1, Type, Size, Op)         \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Value0 ", " #Src ", " #Value1                \
+        ", " #Type #Size "_t, " #Op ")";                                       \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst(), Immediate(Value0));           \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src(), Immediate(Value1));           \
+    __ Inst(IceType_i##Size, Encoded_GPR_##Dst(), Encoded_GPR_##Src());        \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Mask##Size &static_cast<uint32_t>(                               \
+                  static_cast<Type##Size##_t>((Value0)&Mask##Size)             \
+                      Op static_cast<Type##Size##_t>((Value1)&Mask##Size)),    \
+              Mask##Size &test.Dst())                                          \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplRegAddr(Inst, Dst, Value0, Value1, Type, Size, Op)             \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Value0 ", Addr, " #Value1 ", " #Type #Size   \
+        "_t, " #Op ")";                                                        \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = Value1;                                                \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst(), Immediate(Value0));           \
+    __ mov(IceType_i##Size, dwordAddress(T0), Immediate(Value1));              \
+    __ Inst(IceType_i##Size, Encoded_GPR_##Dst(), dwordAddress(T0));           \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Mask##Size &static_cast<uint32_t>(                               \
+                  static_cast<Type##Size##_t>((Value0)&Mask##Size)             \
+                      Op static_cast<Type##Size##_t>((Value1)&Mask##Size)),    \
+              Mask##Size &test.Dst())                                          \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplRegImm(Inst, Dst, Value0, Imm, Type, Size, Op)                 \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Value0 ", Imm(" #Imm "), " #Type #Size       \
+        "_t, " #Op ")";                                                        \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst(), Immediate(Value0));           \
+    __ Inst(IceType_i##Size, Encoded_GPR_##Dst(),                              \
+            Immediate((Imm)&Mask##Size));                                      \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Mask##Size &static_cast<uint32_t>(                               \
+                  static_cast<Type##Size##_t>((Value0)&Mask##Size)             \
+                      Op static_cast<Type##Size##_t>((Imm)&Mask##Size)),       \
+              Mask##Size &test.Dst())                                          \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplAddrReg(Inst, Value0, Src, Value1, Type, Size, Op)             \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", Addr, " #Value0 ", " #Src ", " #Value1 ", " #Type #Size   \
+        "_t, " #Op ")";                                                        \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = Value0;                                                \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src(), Immediate(Value1));           \
+    __ Inst(IceType_i##Size, dwordAddress(T0), Encoded_GPR_##Src());           \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Mask##Size &static_cast<uint32_t>(                               \
+                  static_cast<Type##Size##_t>((Value0)&Mask##Size)             \
+                      Op static_cast<Type##Size##_t>((Value1)&Mask##Size)),    \
+              Mask##Size &test.contentsOfDword(T0))                            \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplAddrImm(Inst, Value0, Imm, Type, Size, Op)                     \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", Addr, " #Value0 ", Imm, " #Imm ", " #Type #Size           \
+        "_t, " #Op ")";                                                        \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = Value0;                                                \
+                                                                               \
+    __ Inst(IceType_i##Size, dwordAddress(T0), Immediate((Imm)&Mask##Size));   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Mask##Size &static_cast<uint32_t>(                               \
+                  static_cast<Type##Size##_t>((Value0)&Mask##Size)             \
+                      Op static_cast<Type##Size##_t>((Imm)&Mask##Size)),       \
+              Mask##Size &test.contentsOfDword(T0))                            \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplOp(Inst, Dst, Value0, Src, Value1, Type, Size, Op)             \
+  do {                                                                         \
+    TestImplRegReg(Inst, Dst, Value0, Src, Value1, Type, Size, Op);            \
+    TestImplRegAddr(Inst, Dst, Value0, Value1, Type, Size, Op);                \
+    TestImplRegImm(Inst, Dst, Value0, Value1, Type, Size, Op);                 \
+    TestImplAddrReg(Inst, Value0, Src, Value1, Type, Size, Op);                \
+    TestImplAddrImm(Inst, Value0, Value1, Type, Size, Op);                     \
+  } while (0)
+
+#define TestImplValues(Dst, Value0, Src, Value1, Size)                         \
+  do {                                                                         \
+    TestImplOp(And, Dst, Value0, Src, Value1, int, Size, &);                   \
+    TestImplOp(And, Dst, Value0, Src, Value1, uint, Size, &);                  \
+    TestImplOp(Or, Dst, Value0, Src, Value1, int, Size, | );                   \
+    TestImplOp(Or, Dst, Value0, Src, Value1, uint, Size, | );                  \
+    TestImplOp(Xor, Dst, Value0, Src, Value1, int, Size, ^);                   \
+    TestImplOp(Xor, Dst, Value0, Src, Value1, uint, Size, ^);                  \
+    TestImplOp(add, Dst, Value0, Src, Value1, int, Size, +);                   \
+    TestImplOp(add, Dst, Value0, Src, Value1, uint, Size, +);                  \
+    TestImplOp(sub, Dst, Value0, Src, Value1, int, Size, -);                   \
+    TestImplOp(sub, Dst, Value0, Src, Value1, uint, Size, -);                  \
+  } while (0)
+
+#define TestImplSize(Dst, Src, Size)                                           \
+  do {                                                                         \
+    TestImplValues(Dst, 0xF0F12101, Src, 0x00000000, Size);                    \
+    TestImplValues(Dst, 0xF0000000, Src, 0xF0000000, Size);                    \
+    TestImplValues(Dst, 0x0F00000F, Src, 0xF0000070, Size);                    \
+    TestImplValues(Dst, 0x0F00F00F, Src, 0xF000F070, Size);                    \
+  } while (0)
+
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplSize(Dst, Src, 8);                                                 \
+    TestImplSize(Dst, Src, 16);                                                \
+    TestImplSize(Dst, Src, 32);                                                \
+  } while (0)
+
+  TestImpl(r1, r2);
+  TestImpl(r2, r3);
+  TestImpl(r3, r4);
+  TestImpl(r4, r5);
+  TestImpl(r5, r6);
+  TestImpl(r6, r7);
+  TestImpl(r7, r8);
+  TestImpl(r8, r10);
+  TestImpl(r10, r11);
+  TestImpl(r11, r12);
+  TestImpl(r12, r13);
+  TestImpl(r13, r14);
+  TestImpl(r14, r15);
+  TestImpl(r15, r1);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplValues
+#undef TestImplOp
+#undef TestImplAddrImm
+#undef TestImplAddrReg
+#undef TestImplRegImm
+#undef TestImplRegAddr
+#undef TestImplRegReg
+}
+
+TEST_F(AssemblerX8664Test, Arith_BorrowNCarry) {
+  const uint32_t Mask8 = 0x000000FF;
+  const uint32_t Mask16 = 0x0000FFFF;
+  const uint32_t Mask32 = 0xFFFFFFFF;
+  // ResultMaskN keeps 2*N bits: the low and high N-bit halves of one operand.
+  const uint64_t ResultMask8 = 0x000000000000FFFFull;
+  const uint64_t ResultMask16 = 0x00000000FFFFFFFFull;
+  const uint64_t ResultMask32 = 0xFFFFFFFFFFFFFFFFull;
+  // Register/register: Inst0 on the low halves, Inst1 on the high halves.
+#define TestImplRegReg(Inst0, Inst1, Dst0, Dst1, Value0, Src0, Src1, Value1,   \
+                       Op, Size)                                               \
+  do {                                                                         \
+    static_assert(Size == 8 || Size == 16 || Size == 32,                       \
+                  "Invalid size " #Size);                                      \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst0 ", " #Inst1 ", " #Dst0 ", " #Dst1 ", " #Value0 ", " #Src0   \
+        ", " #Src1 ", " #Value1 ", " #Op ", " #Size ")";                       \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst0(),                              \
+           Immediate(uint64_t(Value0) & Mask##Size));                          \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst1(),                              \
+           Immediate((uint64_t(Value0) >> Size) & Mask##Size));                \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src0(),                              \
+           Immediate(uint64_t(Value1) & Mask##Size));                          \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src1(),                              \
+           Immediate((uint64_t(Value1) >> Size) & Mask##Size));                \
+    __ Inst0(IceType_i##Size, Encoded_GPR_##Dst0(), Encoded_GPR_##Src0());     \
+    __ Inst1(IceType_i##Size, Encoded_GPR_##Dst1(), Encoded_GPR_##Src1());     \
+    /* Assemble and run the instructions emitted above. */                     \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    /* Reference result: Op applied once to the 2*Size-bit wide operands. */   \
+    static constexpr uint64_t Result =                                         \
+        (uint64_t(Value0) & ResultMask##Size)Op(uint64_t(Value1) &             \
+                                                ResultMask##Size);             \
+    static constexpr uint32_t Expected0 = Result & Mask##Size;                 \
+    static constexpr uint32_t Expected1 = (Result >> Size) & Mask##Size;       \
+    ASSERT_EQ(Expected0, test.Dst0()) << TestString << ": 0";                  \
+    ASSERT_EQ(Expected1, test.Dst1()) << TestString << ": 1";                  \
+    reset();                                                                   \
+  } while (0)
+  // Register destination, memory source (Value1 halves stored at T0 and T1).
+#define TestImplRegAddr(Inst0, Inst1, Dst0, Dst1, Value0, Value1, Op, Size)    \
+  do {                                                                         \
+    static_assert(Size == 8 || Size == 16 || Size == 32,                       \
+                  "Invalid size " #Size);                                      \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst0 ", " #Inst1 ", " #Dst0 ", " #Dst1 ", " #Value0              \
+        ", Addr, " #Value1 ", " #Op ", " #Size ")";                            \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = uint64_t(Value1) & Mask##Size;                         \
+    const uint32_t T1 = allocateDword();                                       \
+    const uint32_t V1 = (uint64_t(Value1) >> Size) & Mask##Size;               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst0(),                              \
+           Immediate(uint64_t(Value0) & Mask##Size));                          \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst1(),                              \
+           Immediate((uint64_t(Value0) >> Size) & Mask##Size));                \
+    __ Inst0(IceType_i##Size, Encoded_GPR_##Dst0(), dwordAddress(T0));         \
+    __ Inst1(IceType_i##Size, Encoded_GPR_##Dst1(), dwordAddress(T1));         \
+    /* Assemble, store the memory operands, then run. */                       \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.setDwordTo(T1, V1);                                                   \
+    test.run();                                                                \
+    /* Reference result: Op applied once to the 2*Size-bit wide operands. */   \
+    static constexpr uint64_t Result =                                         \
+        (uint64_t(Value0) & ResultMask##Size)Op(uint64_t(Value1) &             \
+                                                ResultMask##Size);             \
+    static constexpr uint32_t Expected0 = Result & Mask##Size;                 \
+    static constexpr uint32_t Expected1 = (Result >> Size) & Mask##Size;       \
+    ASSERT_EQ(Expected0, test.Dst0()) << TestString << ": 0";                  \
+    ASSERT_EQ(Expected1, test.Dst1()) << TestString << ": 1";                  \
+    reset();                                                                   \
+  } while (0)
+  // Register destination, immediate source.
+#define TestImplRegImm(Inst0, Inst1, Dst0, Dst1, Value0, Imm, Op, Size)        \
+  do {                                                                         \
+    static_assert(Size == 8 || Size == 16 || Size == 32,                       \
+                  "Invalid size " #Size);                                      \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst0 ", " #Inst1 ", " #Dst0 ", " #Dst1 ", " #Value0              \
+        ", Imm(" #Imm "), " #Op ", " #Size ")";                                \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst0(),                              \
+           Immediate(uint64_t(Value0) & Mask##Size));                          \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst1(),                              \
+           Immediate((uint64_t(Value0) >> Size) & Mask##Size));                \
+    __ Inst0(IceType_i##Size, Encoded_GPR_##Dst0(),                            \
+             Immediate(uint64_t(Imm) & Mask##Size));                           \
+    __ Inst1(IceType_i##Size, Encoded_GPR_##Dst1(),                            \
+             Immediate((uint64_t(Imm) >> Size) & Mask##Size));                 \
+    /* Assemble and run the instructions emitted above. */                     \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    /* Reference result: Op applied once to the 2*Size-bit wide operands. */   \
+    static constexpr uint64_t Result =                                         \
+        (uint64_t(Value0) & ResultMask##Size)Op(uint64_t(Imm) &                \
+                                                ResultMask##Size);             \
+    static constexpr uint32_t Expected0 = Result & Mask##Size;                 \
+    static constexpr uint32_t Expected1 = (Result >> Size) & Mask##Size;       \
+    ASSERT_EQ(Expected0, test.Dst0()) << TestString << ": 0";                  \
+    ASSERT_EQ(Expected1, test.Dst1()) << TestString << ": 1";                  \
+    reset();                                                                   \
+  } while (0)
+  // Memory destination (Value0 halves at T0 and T1), register source.
+#define TestImplAddrReg(Inst0, Inst1, Value0, Src0, Src1, Value1, Op, Size)    \
+  do {                                                                         \
+    static_assert(Size == 8 || Size == 16 || Size == 32,                       \
+                  "Invalid size " #Size);                                      \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst0 ", " #Inst1 ", Addr, " #Value0 ", " #Src0 ", " #Src1        \
+        ", " #Value1 ", " #Op ", " #Size ")";                                  \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = uint64_t(Value0) & Mask##Size;                         \
+    const uint32_t T1 = allocateDword();                                       \
+    const uint32_t V1 = (uint64_t(Value0) >> Size) & Mask##Size;               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src0(),                              \
+           Immediate(uint64_t(Value1) & Mask##Size));                          \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src1(),                              \
+           Immediate((uint64_t(Value1) >> Size) & Mask##Size));                \
+    __ Inst0(IceType_i##Size, dwordAddress(T0), Encoded_GPR_##Src0());         \
+    __ Inst1(IceType_i##Size, dwordAddress(T1), Encoded_GPR_##Src1());         \
+    /* Assemble, store the memory operands, then run. */                       \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.setDwordTo(T1, V1);                                                   \
+    test.run();                                                                \
+    /* Reference result: Op applied once to the 2*Size-bit wide operands. */   \
+    static constexpr uint64_t Result =                                         \
+        (uint64_t(Value0) & ResultMask##Size)Op(uint64_t(Value1) &             \
+                                                ResultMask##Size);             \
+    static constexpr uint32_t Expected0 = Result & Mask##Size;                 \
+    static constexpr uint32_t Expected1 = (Result >> Size) & Mask##Size;       \
+    ASSERT_EQ(Expected0, test.contentsOfDword(T0)) << TestString << ": 0";     \
+    ASSERT_EQ(Expected1, test.contentsOfDword(T1)) << TestString << ": 1";     \
+    reset();                                                                   \
+  } while (0)
+  // Memory destination, immediate source.
+#define TestImplAddrImm(Inst0, Inst1, Value0, Imm, Op, Size)                   \
+  do {                                                                         \
+    static_assert(Size == 8 || Size == 16 || Size == 32,                       \
+                  "Invalid size " #Size);                                      \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst0 ", " #Inst1 ", Addr, " #Value0 ", Imm(" #Imm "), " #Op      \
+        ", " #Size ")";                                                        \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = uint64_t(Value0) & Mask##Size;                         \
+    const uint32_t T1 = allocateDword();                                       \
+    const uint32_t V1 = (uint64_t(Value0) >> Size) & Mask##Size;               \
+    __ Inst0(IceType_i##Size, dwordAddress(T0),                                \
+             Immediate(uint64_t(Imm) & Mask##Size));                           \
+    __ Inst1(IceType_i##Size, dwordAddress(T1),                                \
+             Immediate((uint64_t(Imm) >> Size) & Mask##Size));                 \
+    /* Assemble, store the memory operands, then run. */                       \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.setDwordTo(T1, V1);                                                   \
+    test.run();                                                                \
+    /* Reference result: Op applied once to the 2*Size-bit wide operands. */   \
+    static constexpr uint64_t Result =                                         \
+        (uint64_t(Value0) & ResultMask##Size)Op(uint64_t(Imm) &                \
+                                                ResultMask##Size);             \
+    static constexpr uint32_t Expected0 = Result & Mask##Size;                 \
+    static constexpr uint32_t Expected1 = (Result >> Size) & Mask##Size;       \
+    ASSERT_EQ(Expected0, test.contentsOfDword(T0)) << TestString << ": 0";     \
+    ASSERT_EQ(Expected1, test.contentsOfDword(T1)) << TestString << ": 1";     \
+    reset();                                                                   \
+  } while (0)
+  // Runs one Inst0/Inst1 pair through all five operand forms above.
+#define TestImplOp(Inst0, Inst1, Dst0, Dst1, Value0, Src0, Src1, Value1, Op,   \
+                   Size)                                                       \
+  do {                                                                         \
+    TestImplRegReg(Inst0, Inst1, Dst0, Dst1, Value0, Src0, Src1, Value1, Op,   \
+                   Size);                                                      \
+    TestImplRegAddr(Inst0, Inst1, Dst0, Dst1, Value0, Value1, Op, Size);       \
+    TestImplRegImm(Inst0, Inst1, Dst0, Dst1, Value0, Value1, Op, Size);        \
+    TestImplAddrReg(Inst0, Inst1, Value0, Src0, Src1, Value1, Op, Size);       \
+    TestImplAddrImm(Inst0, Inst1, Value0, Value1, Op, Size);                   \
+  } while (0)
+  // Exercises add/adc (carry) and sub/sbb (borrow) on the same operands.
+#define TestImplValues(Dst0, Dst1, Value0, Src0, Src1, Value1, Size)           \
+  do {                                                                         \
+    TestImplOp(add, adc, Dst0, Dst1, Value0, Src0, Src1, Value1, +, Size);     \
+    TestImplOp(sub, sbb, Dst0, Dst1, Value0, Src0, Src1, Value1, -, Size);     \
+  } while (0)
+  // Fixed 64-bit test operands; each form uses only their low 2*Size bits.
+#define TestImplSize(Dst0, Dst1, Src0, Src1, Size)                             \
+  do {                                                                         \
+    TestImplValues(Dst0, Dst1, 0xFFFFFFFFFFFFFF00ull, Src0, Src1,              \
+                   0xFFFFFFFF0000017Full, Size);                               \
+  } while (0)
+  // Repeats the test for 8-, 16- and 32-bit operand sizes.
+#define TestImpl(Dst0, Dst1, Src0, Src1)                                       \
+  do {                                                                         \
+    TestImplSize(Dst0, Dst1, Src0, Src1, 8);                                   \
+    TestImplSize(Dst0, Dst1, Src0, Src1, 16);                                  \
+    TestImplSize(Dst0, Dst1, Src0, Src1, 32);                                  \
+  } while (0)
+  // Each of r1-r8 and r10-r15 takes every operand role once; r9 is not used.
+  TestImpl(r1, r2, r3, r5);
+  TestImpl(r2, r3, r4, r6);
+  TestImpl(r3, r4, r5, r7);
+  TestImpl(r4, r5, r6, r8);
+  TestImpl(r5, r6, r7, r10);
+  TestImpl(r6, r7, r8, r11);
+  TestImpl(r7, r8, r10, r12);
+  TestImpl(r8, r10, r11, r13);
+  TestImpl(r10, r11, r12, r14);
+  TestImpl(r11, r12, r13, r15);
+  TestImpl(r12, r13, r14, r1);
+  TestImpl(r13, r14, r15, r2);
+  TestImpl(r14, r15, r1, r3);
+  TestImpl(r15, r1, r2, r4);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplValues
+#undef TestImplOp
+#undef TestImplAddrImm
+#undef TestImplAddrReg
+#undef TestImplRegImm
+#undef TestImplRegAddr
+#undef TestImplRegReg
+}
+
+TEST_F(AssemblerX8664LowLevelTest, Cbw_Cwd_Cdq) {
+#define TestImpl(Inst, BytesSize, ...)                                         \
+  do {                                                                         \
+    __ Inst();                                                                 \
+    ASSERT_EQ(BytesSize, codeBytesSize()) << #Inst;                            \
+    ASSERT_TRUE(verifyBytes<BytesSize>(codeBytes(), __VA_ARGS__)) << #Inst;    \
+    reset();                                                                   \
+  } while (0)
+  // cbw and cwd carry the 0x66 operand-size prefix; cdq is the bare opcode.
+  TestImpl(cbw, 2u, 0x66, 0x98);
+  TestImpl(cwd, 2u, 0x66, 0x99);
+  TestImpl(cdq, 1u, 0x99);
+
+#undef TestImpl
+}
+
+TEST_F(AssemblerX8664Test, SingleOperandMul) {
+  static constexpr uint32_t Mask8 = 0x000000FF;
+  static constexpr uint32_t Mask16 = 0x0000FFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+  // mul/imul by one operand; low half of the product in eax, high half in edx.
+#define TestImplReg(Inst, Value0, Src, Value1, Type, Size)                     \
+  do {                                                                         \
+    static_assert(Encoded_GPR_eax() != Encoded_GPR_##Src(),                    \
+                  "eax can not be src1.");                                     \
+    /* Register operand: eax <- Value0, Src <- Value1, then Inst(Src). */      \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Value0 ", " #Src ", " #Value1 ", " #Type ", " #Size    \
+        ")";                                                                   \
+    static constexpr Type##64_t OperandEax =                                   \
+        static_cast<Type##Size##_t>((Value0)&Mask##Size);                      \
+    static constexpr Type##64_t OperandOther =                                 \
+        static_cast<Type##Size##_t>((Value1)&Mask##Size);                      \
+    static constexpr uint32_t ExpectedEax =                                    \
+        Mask##Size & (OperandEax * OperandOther);                              \
+    static constexpr uint32_t ExpectedEdx =                                    \
+        Mask##Size & ((OperandEax * OperandOther) >> Size);                    \
+    /* Emit the operand loads and the multiply under test. */                  \
+    __ mov(IceType_i##Size, Encoded_GPR_eax(),                                 \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src(),                               \
+           Immediate((Value1)&Mask##Size));                                    \
+    __ Inst(IceType_i##Size, Encoded_GPR_##Src());                             \
+    /* An 8-bit multiply leaves its product in ax; split it into al and dl. */ \
+    if (Size == 8) {                                                           \
+      /* mov %ah, %dl */                                                       \
+      __ mov(IceType_i16, Encoded_GPR_dx(), Encoded_GPR_ax());                 \
+      __ shr(IceType_i32, Encoded_GPR_edx(), Immediate(8));                    \
+      __ And(IceType_i16, Encoded_GPR_ax(), Immediate(0x00FF));                \
+    }                                                                          \
+    /* Assemble and run the instructions emitted above. */                     \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    /* eax must hold the low half of the product, edx the high half. */        \
+    ASSERT_EQ(ExpectedEax, test.eax()) << TestString;                          \
+    ASSERT_EQ(ExpectedEdx, test.edx()) << TestString;                          \
+    reset();                                                                   \
+  } while (0)
+  // Memory operand form; T0 is allocated afresh on every run (not static).
+#define TestImplAddr(Inst, Value0, Value1, Type, Size)                         \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Value0 ", Addr, " #Value1 ", " #Type ", " #Size ")";   \
+    const uint32_t T0 = allocateDword();                                       \
+    static constexpr uint32_t V0 = Value1;                                     \
+    static constexpr Type##64_t OperandEax =                                   \
+        static_cast<Type##Size##_t>((Value0)&Mask##Size);                      \
+    static constexpr Type##64_t OperandOther =                                 \
+        static_cast<Type##Size##_t>((Value1)&Mask##Size);                      \
+    static constexpr uint32_t ExpectedEax =                                    \
+        Mask##Size & (OperandEax * OperandOther);                              \
+    static constexpr uint32_t ExpectedEdx =                                    \
+        Mask##Size & ((OperandEax * OperandOther) >> Size);                    \
+    /* Emit the eax load and the multiply by the memory operand at T0. */      \
+    __ mov(IceType_i##Size, Encoded_GPR_eax(),                                 \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ Inst(IceType_i##Size, dwordAddress(T0));                                \
+    /* An 8-bit multiply leaves its product in ax; split it into al and dl. */ \
+    if (Size == 8) {                                                           \
+      /* mov %ah, %dl */                                                       \
+      __ mov(IceType_i16, Encoded_GPR_dx(), Encoded_GPR_ax());                 \
+      __ shr(IceType_i32, Encoded_GPR_edx(), Immediate(8));                    \
+      __ And(IceType_i16, Encoded_GPR_ax(), Immediate(0x00FF));                \
+    }                                                                          \
+    /* Assemble, store the memory operand, then run. */                        \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+    /* eax must hold the low half of the product, edx the high half. */        \
+    ASSERT_EQ(ExpectedEax, test.eax()) << TestString;                          \
+    ASSERT_EQ(ExpectedEdx, test.edx()) << TestString;                          \
+    reset();                                                                   \
+  } while (0)
+  // Runs Inst against both the register and the memory operand forms.
+#define TestImplOp(Inst, Value0, Src, Value1, Type, Size)                      \
+  do {                                                                         \
+    TestImplReg(Inst, Value0, Src, Value1, Type, Size);                        \
+    TestImplAddr(Inst, Value0, Value1, Type, Size);                            \
+  } while (0)
+  // mul is checked with unsigned operands, imul with signed ones.
+#define TestImplValue(Value0, Src, Value1, Size)                               \
+  do {                                                                         \
+    TestImplOp(mul, Value0, Src, Value1, uint, Size);                          \
+    TestImplOp(imul, Value0, Src, Value1, int, Size);                          \
+  } while (0)
+  // Operand pairs mixing positive and negative values.
+#define TestImplSize(Src, Size)                                                \
+  do {                                                                         \
+    TestImplValue(10, Src, 1, Size);                                           \
+    TestImplValue(10, Src, -1, Size);                                          \
+    TestImplValue(-10, Src, 37, Size);                                         \
+    TestImplValue(-10, Src, -15, Size);                                        \
+  } while (0)
+  // Repeats the test for 8-, 16- and 32-bit operand sizes.
+#define TestImpl(Src)                                                          \
+  do {                                                                         \
+    TestImplSize(Src, 8);                                                      \
+    TestImplSize(Src, 16);                                                     \
+    TestImplSize(Src, 32);                                                     \
+  } while (0)
+  // Src never aliases eax (enforced by the static_assert in TestImplReg).
+  TestImpl(r2);
+  TestImpl(r3);
+  TestImpl(r4);
+  TestImpl(r5);
+  TestImpl(r6);
+  TestImpl(r7);
+  TestImpl(r8);
+  TestImpl(r10);
+  TestImpl(r11);
+  TestImpl(r12);
+  TestImpl(r13);
+  TestImpl(r14);
+  TestImpl(r15);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplValue
+#undef TestImplOp
+#undef TestImplAddr
+#undef TestImplReg
+}
+
+TEST_F(AssemblerX8664Test, TwoOperandImul) {
+  static constexpr uint32_t Mask16 = 0x0000FFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+
+#define TestImplRegReg(Dst, Value0, Src, Value1, Size)                         \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Size ")";           \
+    static constexpr int64_t Operand0 =                                        \
+        static_cast<int##Size##_t>((Value0)&Mask##Size);                       \
+    static constexpr int64_t Operand1 =                                        \
+        static_cast<int##Size##_t>((Value1)&Mask##Size);                       \
+    static constexpr uint32_t Expected = Mask##Size & (Operand0 * Operand1);   \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst(),                               \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src(),                               \
+           Immediate((Value1)&Mask##Size));                                    \
+    __ imul(IceType_i##Size, Encoded_GPR_##Dst(), Encoded_GPR_##Src());        \
+                                                                               \
+    if (Size == 8) {                                                           \
+      /* mov %ah, %dl */                                                       \
+      __ mov(IceType_i16, Encoded_GPR_dx(), Encoded_GPR_ax());                 \
+      __ shr(IceType_i32, Encoded_GPR_edx(), Immediate(8));                    \
+      __ And(IceType_i16, Encoded_GPR_ax(), Immediate(0x00FF));                \
+    }                                                                          \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst()) << TestString;                             \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplRegImm(Dst, Value0, Imm, Size)                                 \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Imm(" #Imm "), " #Size ")";                   \
+    static constexpr int64_t Operand0 =                                        \
+        static_cast<int##Size##_t>((Value0)&Mask##Size);                       \
+    static constexpr int64_t Operand1 =                                        \
+        static_cast<int##Size##_t>((Imm)&Mask##Size);                          \
+    static constexpr uint32_t Expected = Mask##Size & (Operand0 * Operand1);   \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst(),                               \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ imul(IceType_i##Size, Encoded_GPR_##Dst(), Immediate(Imm));             \
+                                                                               \
+    if (Size == 8) {                                                           \
+      /* mov %ah, %dl */                                                       \
+      __ mov(IceType_i16, Encoded_GPR_dx(), Encoded_GPR_ax());                 \
+      __ shr(IceType_i32, Encoded_GPR_edx(), Immediate(8));                    \
+      __ And(IceType_i16, Encoded_GPR_ax(), Immediate(0x00FF));                \
+    }                                                                          \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst()) << TestString;                             \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplRegAddr(Dst, Value0, Value1, Size)                             \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr," #Value1 ", " #Size ")";                \
+    static constexpr int64_t Operand0 =                                        \
+        static_cast<int##Size##_t>((Value0)&Mask##Size);                       \
+    static constexpr int64_t Operand1 =                                        \
+        static_cast<int##Size##_t>((Value1)&Mask##Size);                       \
+    static constexpr uint32_t Expected = Mask##Size & (Operand0 * Operand1);   \
+    const uint32_t T0 = allocateDword();                                       \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst(),                               \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ imul(IceType_i##Size, Encoded_GPR_##Dst(), dwordAddress(T0));           \
+                                                                               \
+    if (Size == 8) {                                                           \
+      /* mov %ah, %dl */                                                       \
+      __ mov(IceType_i16, Encoded_GPR_dx(), Encoded_GPR_ax());                 \
+      __ shr(IceType_i32, Encoded_GPR_edx(), Immediate(8));                    \
+      __ And(IceType_i16, Encoded_GPR_ax(), Immediate(0x00FF));                \
+    }                                                                          \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, static_cast<uint32_t>(Operand1));                      \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst()) << TestString;                             \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplValue(Dst, Value0, Src, Value1, Size)                          \
+  do {                                                                         \
+    TestImplRegReg(Dst, Value0, Src, Value1, Size);                            \
+    TestImplRegImm(Dst, Value0, Value1, Size);                                 \
+    TestImplRegAddr(Dst, Value0, Value1, Size);                                \
+  } while (0)
+
+#define TestImplSize(Dst, Src, Size)                                           \
+  do {                                                                         \
+    TestImplValue(Dst, 1, Src, 1, Size);                                       \
+    TestImplValue(Dst, -10, Src, 0x4050AA20, Size);                            \
+    TestImplValue(Dst, -2, Src, -55, Size);                                    \
+  } while (0)
+
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplSize(Dst, Src, 16);                                                \
+    TestImplSize(Dst, Src, 32);                                                \
+  } while (0)
+
+  TestImpl(r1, r2);
+  TestImpl(r2, r3);
+  TestImpl(r3, r4);
+  TestImpl(r4, r5);
+  TestImpl(r5, r6);
+  TestImpl(r6, r7);
+  TestImpl(r7, r8);
+  TestImpl(r8, r10);
+  TestImpl(r10, r11);
+  TestImpl(r11, r12);
+  TestImpl(r12, r13);
+  TestImpl(r13, r14);
+  TestImpl(r14, r15);
+  TestImpl(r15, r1);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplValue
+#undef TestImplRegAddr
+#undef TestImplRegImm
+#undef TestImplRegReg
+}
+
+TEST_F(AssemblerX8664Test, Div) {
+  static constexpr uint32_t Mask8 = 0x000000FF;
+  static constexpr uint32_t Mask16 = 0x0000FFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+  // Masks for the double-width dividend: AX, DX:AX and EDX:EAX respectively.
+  static constexpr uint64_t Operand0Mask8 = 0x000000000000FFFFull;
+  static constexpr uint64_t Operand0Mask16 = 0x00000000FFFFFFFFull;
+  static constexpr uint64_t Operand0Mask32 = 0xFFFFFFFFFFFFFFFFull;
+  // The dividend type is twice as wide as the divisor for each operand size.
+  using Operand0Type_int8 = int16_t;
+  using Operand0Type_uint8 = uint16_t;
+  using Operand0Type_int16 = int32_t;
+  using Operand0Type_uint16 = uint32_t;
+  using Operand0Type_int32 = int64_t;
+  using Operand0Type_uint32 = uint64_t;
+  // TestImplReg: divide by a register operand; check quotient and remainder.
+#define TestImplReg(Inst, Value0, Src, Value1, Type, Size)                     \
+  do {                                                                         \
+    static_assert(Encoded_GPR_eax() != Encoded_GPR_##Src(),                    \
+                  "eax can not be src1.");                                     \
+    static_assert(Encoded_GPR_edx() != Encoded_GPR_##Src(),                    \
+                  "edx can not be src1.");                                     \
+                                                                               \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Value0 ", " #Src ", " #Value1 ", " #Type ", " #Size    \
+        ")";                                                                   \
+    static constexpr Operand0Type_##Type##Size Operand0 =                      \
+        static_cast<Type##64_t>(Value0) & Operand0Mask##Size;                  \
+    static constexpr Type##Size##_t Operand0Lo = Operand0 & Mask##Size;        \
+    static constexpr Type##Size##_t Operand0Hi =                               \
+        (Operand0 >> Size) & Mask##Size;                                       \
+    static constexpr Type##Size##_t Operand1 =                                 \
+        static_cast<Type##Size##_t>(Value1) & Mask##Size;                      \
+    if (Size == 8) {                                                           \
+      /* mov Operand0, %ax (high byte in %ah, low byte in %al) */              \
+      __ mov(                                                                  \
+          IceType_i16, Encoded_GPR_eax(),                                      \
+          Immediate(static_cast<uint16_t>(Operand0)));                         \
+    } else {                                                                   \
+      __ mov(IceType_i##Size, Encoded_GPR_eax(), Immediate(Operand0Lo));       \
+      __ mov(IceType_i##Size, Encoded_GPR_edx(), Immediate(Operand0Hi));       \
+    }                                                                          \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src(), Immediate(Operand1));         \
+    __ Inst(IceType_i##Size, Encoded_GPR_##Src());                             \
+    if (Size == 8) {                                                           \
+      /* mov %ah, %dl */                                                       \
+      __ mov(IceType_i16, Encoded_GPR_dx(), Encoded_GPR_ax());                 \
+      __ shr(IceType_i32, Encoded_GPR_edx(), Immediate(8));                    \
+      __ And(IceType_i16, Encoded_GPR_eax(), Immediate(0x00FF));               \
+      if (Encoded_GPR_##Src() == Encoded_GPR_esi()) {                          \
+        __ And(IceType_i16, Encoded_GPR_edx(), Immediate(0x00FF));             \
+      }                                                                        \
+    }                                                                          \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+    /* The expected results are folded at compile time. */                     \
+    static constexpr uint32_t Quotient = (Operand0 / Operand1) & Mask##Size;   \
+    static constexpr uint32_t Remainder = (Operand0 % Operand1) & Mask##Size;  \
+    ASSERT_EQ(Quotient, test.eax()) << TestString;                             \
+    ASSERT_EQ(Remainder, test.edx()) << TestString;                            \
+    reset();                                                                   \
+  } while (0)
+  // TestImplAddr: same as TestImplReg, but the divisor is read from memory.
+#define TestImplAddr(Inst, Value0, Value1, Type, Size)                         \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Value0 ", Addr, " #Value1 ", " #Type ", " #Size ")";   \
+    static constexpr Operand0Type_##Type##Size Operand0 =                      \
+        static_cast<Type##64_t>(Value0) & Operand0Mask##Size;                  \
+    static constexpr Type##Size##_t Operand0Lo = Operand0 & Mask##Size;        \
+    static constexpr Type##Size##_t Operand0Hi =                               \
+        (Operand0 >> Size) & Mask##Size;                                       \
+    const uint32_t T0 = allocateDword();                                       \
+    static constexpr Type##Size##_t Operand1 =                                 \
+        static_cast<Type##Size##_t>(Value1) & Mask##Size;                      \
+    if (Size == 8) {                                                           \
+      /* mov Operand0, %ax (high byte in %ah, low byte in %al) */              \
+      __ mov(                                                                  \
+          IceType_i16, Encoded_GPR_eax(),                                      \
+          Immediate(static_cast<uint16_t>(Operand0)));                         \
+    } else {                                                                   \
+      __ mov(IceType_i##Size, Encoded_GPR_eax(), Immediate(Operand0Lo));       \
+      __ mov(IceType_i##Size, Encoded_GPR_edx(), Immediate(Operand0Hi));       \
+    }                                                                          \
+    __ Inst(IceType_i##Size, dwordAddress(T0));                                \
+    if (Size == 8) {                                                           \
+      /* mov %ah, %dl */                                                       \
+      __ mov(IceType_i16, Encoded_GPR_dx(), Encoded_GPR_ax());                 \
+      __ shr(IceType_i32, Encoded_GPR_edx(), Immediate(8));                    \
+      __ And(IceType_i16, Encoded_GPR_eax(), Immediate(0x00FF));               \
+    }                                                                          \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, static_cast<uint32_t>(Operand1));                      \
+    test.run();                                                                \
+    /* The expected results are folded at compile time. */                     \
+    static constexpr uint32_t Quotient = (Operand0 / Operand1) & Mask##Size;   \
+    static constexpr uint32_t Remainder = (Operand0 % Operand1) & Mask##Size;  \
+    ASSERT_EQ(Quotient, test.eax()) << TestString;                             \
+    ASSERT_EQ(Remainder, test.edx()) << TestString;                            \
+    reset();                                                                   \
+  } while (0)
+  // Runs one instruction against both a register and a memory divisor.
+#define TestImplOp(Inst, Value0, Src, Value1, Type, Size)                      \
+  do {                                                                         \
+    TestImplReg(Inst, Value0, Src, Value1, Type, Size);                        \
+    TestImplAddr(Inst, Value0, Value1, Type, Size);                            \
+  } while (0)
+  // Exercises the unsigned (div) and signed (idiv) forms with the same values.
+#define TestImplValue(Value0, Src, Value1, Size)                               \
+  do {                                                                         \
+    TestImplOp(div, Value0, Src, Value1, uint, Size);                          \
+    TestImplOp(idiv, Value0, Src, Value1, int, Size);                          \
+  } while (0)
+
+#define TestImplSize(Src, Size)                                                \
+  do {                                                                         \
+    TestImplValue(10, Src, 1, Size);                                           \
+    TestImplValue(10, Src, -1, Size);                                          \
+  } while (0)
+
+#define TestImpl(Src)                                                          \
+  do {                                                                         \
+    TestImplSize(Src, 8);                                                      \
+    TestImplSize(Src, 16);                                                     \
+    TestImplSize(Src, 32);                                                     \
+  } while (0)
+
+  TestImpl(r2);
+  TestImpl(r3);
+  TestImpl(r5);
+  TestImpl(r6);
+  TestImpl(r7);
+  TestImpl(r8);
+  TestImpl(r10);
+  TestImpl(r11);
+  TestImpl(r12);
+  TestImpl(r13);
+  TestImpl(r14);
+  TestImpl(r15);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplValue
+#undef TestImplOp
+#undef TestImplAddr
+#undef TestImplReg
+}
+
+TEST_F(AssemblerX8664Test, Incl_Decl_Addr) {
+#define TestImpl(Inst, Value0)                                                 \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Inst ", " #Value0 ")";           \
+    const bool IsInc = std::string(#Inst).find("incl") != std::string::npos;   \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = (Value0);                                              \
+    const uint32_t Expected = IsInc ? V0 + 1 : V0 - 1;                         \
+    /* Value0 is evaluated once (as V0); uint32_t arithmetic may wrap. */      \
+    __ Inst(dwordAddress(T0));                                                 \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.contentsOfDword(T0)) << TestString;               \
+    reset();                                                                   \
+  } while (0)
+  // Thin wrappers selecting the instruction under test.
+#define TestInc(Value0)                                                        \
+  do {                                                                         \
+    TestImpl(incl, Value0);                                                    \
+  } while (0)
+
+#define TestDec(Value0)                                                        \
+  do {                                                                         \
+    TestImpl(decl, Value0);                                                    \
+  } while (0)
+  // incl/decl on a memory dword: the stored value must change by exactly one.
+  TestInc(230);
+
+  TestDec(30);
+
+#undef TestInc
+#undef TestDec
+#undef TestImpl
+}
+
+TEST_F(AssemblerX8664Test, Shifts) {
+  static constexpr uint32_t Mask8 = 0x000000FF;
+  static constexpr uint32_t Mask16 = 0x0000FFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+  // Each macro below emits one operand form of Inst and checks the result.
+#define TestImplRegImm(Inst, Dst, Value0, Imm, Op, Type, Size)                 \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Value0 ", Imm(" #Imm "), " #Op ", " #Type    \
+        ", " #Size ")";                                                        \
+    const bool IsRol = std::string(#Inst).find("rol") != std::string::npos;    \
+    const uint##Size##_t Expected =                                            \
+        Mask##Size & (static_cast<Type##Size##_t>(Value0) Op(Imm) |            \
+                      (!IsRol ? 0 : ((Value0)&Mask##Size) >> (Size - Imm)));   \
+    /* For rol, the operand's high bits wrap around into the low bits. */      \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst(),                               \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ Inst(IceType_i##Size, Encoded_GPR_##Dst(),                              \
+            Immediate((Imm)&Mask##Size));                                      \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Expected), test.Dst()) << TestString;      \
+    reset();                                                                   \
+  } while (0)
+  // Three-operand form (Dst, Src, Imm): Expected merges bits from both values.
+#define TestImplRegRegImm(Inst, Dst, Value0, Src, Value1, Count, Op0, Op1,     \
+                          Type, Size)                                          \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Value0 ", " #Src ", " #Value1                \
+        ", Imm(" #Count "), " #Op0 ", " #Op1 ", " #Type ", " #Size ")";        \
+    const uint##Size##_t Expected =                                            \
+        Mask##Size & (static_cast<Type##Size##_t>(Value0) Op0(Count) |         \
+                      (static_cast<Type##64_t>(Value1) Op1(Size - Count)));    \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst(),                               \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src(),                               \
+           Immediate((Value1)&Mask##Size));                                    \
+    __ Inst(IceType_i##Size, Encoded_GPR_##Dst(), Encoded_GPR_##Src(),         \
+            Immediate(Count));                                                 \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Expected), test.Dst()) << TestString;      \
+    reset();                                                                   \
+  } while (0)
+  // Register operand shifted/rotated by a count loaded into ecx.
+#define TestImplRegCl(Inst, Dst, Value0, Count, Op, Type, Size)                \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Value0 ", " #Count ", " #Op ", " #Type       \
+        ", " #Size ")";                                                        \
+    const bool IsRol = std::string(#Inst).find("rol") != std::string::npos;    \
+    const uint##Size##_t Expected =                                            \
+        Mask##Size & (static_cast<Type##Size##_t>(Value0) Op(Count) |          \
+                      (!IsRol ? 0 : ((Value0)&Mask##Size) >> (Size - Count))); \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst(),                               \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ mov(IceType_i8, Encoded_GPR_ecx(), Immediate((Count)&Mask##Size));      \
+    __ Inst(IceType_i##Size, Encoded_GPR_##Dst(), Encoded_GPR_ecx());          \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Expected), test.Dst()) << TestString;      \
+    reset();                                                                   \
+  } while (0)
+  // Three-operand form (Dst, Src) with the count loaded into ecx.
+#define TestImplRegRegCl(Inst, Dst, Value0, Src, Value1, Count, Op0, Op1,      \
+                         Type, Size)                                           \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Count    \
+        ", " #Op0 ", " #Op1 ", " #Type ", " #Size ")";                         \
+    const uint##Size##_t Expected =                                            \
+        Mask##Size & (static_cast<Type##Size##_t>(Value0) Op0(Count) |         \
+                      (static_cast<Type##64_t>(Value1) Op1(Size - Count)));    \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst(),                               \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src(),                               \
+           Immediate((Value1)&Mask##Size));                                    \
+    __ mov(IceType_i##Size, Encoded_GPR_ecx(), Immediate((Count)&0x7F));       \
+    __ Inst(IceType_i##Size, Encoded_GPR_##Dst(), Encoded_GPR_##Src());        \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Expected), test.Dst()) << TestString;      \
+    reset();                                                                   \
+  } while (0)
+  // Memory operand shifted/rotated by a count loaded into ecx.
+#define TestImplAddrCl(Inst, Value0, Count, Op, Type, Size)                    \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", Addr, " #Value0 ", " #Count ", " #Op ", " #Type           \
+        ", " #Size ")";                                                        \
+    const bool IsRol = std::string(#Inst).find("rol") != std::string::npos;    \
+    const uint##Size##_t Expected =                                            \
+        Mask##Size & (static_cast<Type##Size##_t>(Value0) Op(Count) |          \
+                      (!IsRol ? 0 : ((Value0)&Mask##Size) >> (Size - Count))); \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = Value0;                                                \
+                                                                               \
+    __ mov(IceType_i8, Encoded_GPR_ecx(), Immediate((Count)&Mask##Size));      \
+    __ Inst(IceType_i##Size, dwordAddress(T0), Encoded_GPR_ecx());             \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Expected),                                 \
+              Mask##Size &test.contentsOfDword(T0)) << TestString;             \
+    reset();                                                                   \
+  } while (0)
+  // Three-operand form (memory, Src) with the count loaded into ecx.
+#define TestImplAddrRegCl(Inst, Value0, Src, Value1, Count, Op0, Op1, Type,    \
+                          Size)                                                \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", Addr, " #Value0 ", " #Src ", " #Value1 ", " #Count        \
+        ", " #Op0 ", " #Op1 ", " #Type ", " #Size ")";                         \
+    const uint##Size##_t Expected =                                            \
+        Mask##Size & (static_cast<Type##Size##_t>(Value0) Op0(Count) |         \
+                      (static_cast<Type##64_t>(Value1) Op1(Size - Count)));    \
+    const uint32_t T0 = allocateDword();                                       \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src(),                               \
+           Immediate((Value1)&Mask##Size));                                    \
+    __ mov(IceType_i##Size, Encoded_GPR_ecx(), Immediate((Count)&0x7F));       \
+    __ Inst(IceType_i##Size, dwordAddress(T0), Encoded_GPR_##Src());           \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, static_cast<uint32_t>(Value0));                        \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Expected), test.contentsOfDword(T0))       \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+  // Runs a two-operand instruction through every operand form defined above.
+#define TestImplOp(Inst, Dst, Value0, Count, Op, Type, Size)                   \
+  do {                                                                         \
+    static_assert(Encoded_GPR_##Dst() != Encoded_GPR_ecx(),                    \
+                  "ecx should not be specified as Dst");                       \
+    TestImplRegImm(Inst, Dst, Value0, Count, Op, Type, Size);                  \
+    TestImplRegImm(Inst, ecx, Value0, Count, Op, Type, Size);                  \
+    TestImplRegCl(Inst, Dst, Value0, Count, Op, Type, Size);                   \
+    TestImplAddrCl(Inst, Value0, Count, Op, Type, Size);                       \
+  } while (0)
+  // Runs a three-operand instruction through every operand form defined above.
+#define TestImplThreeOperandOp(Inst, Dst, Value0, Src, Value1, Count, Op0,     \
+                               Op1, Type, Size)                                \
+  do {                                                                         \
+    static_assert(Encoded_GPR_##Dst() != Encoded_GPR_ecx(),                    \
+                  "ecx should not be specified as Dst");                       \
+    static_assert(Encoded_GPR_##Src() != Encoded_GPR_ecx(),                    \
+                  "ecx should not be specified as Src");                       \
+    TestImplRegRegImm(Inst, Dst, Value0, Src, Value1, Count, Op0, Op1, Type,   \
+                      Size);                                                   \
+    TestImplRegRegCl(Inst, Dst, Value0, Src, Value1, Count, Op0, Op1, Type,    \
+                     Size);                                                    \
+    TestImplAddrRegCl(Inst, Value0, Src, Value1, Count, Op0, Op1, Type, Size); \
+  } while (0)
+
+#define TestImplValue(Dst, Value0, Count, Size)                                \
+  do {                                                                         \
+    TestImplOp(rol, Dst, Value0, Count, <<, uint, Size);                       \
+    TestImplOp(shl, Dst, Value0, Count, <<, uint, Size);                       \
+    TestImplOp(shr, Dst, Value0, Count, >>, uint, Size);                       \
+    TestImplOp(sar, Dst, Value0, Count, >>, int, Size);                        \
+  } while (0)
+
+#define TestImplThreeOperandValue(Dst, Value0, Src, Value1, Count, Size)       \
+  do {                                                                         \
+    TestImplThreeOperandOp(shld, Dst, Value0, Src, Value1, Count, <<, >>,      \
+                           uint, Size);                                        \
+    TestImplThreeOperandOp(shrd, Dst, Value0, Src, Value1, Count, >>, <<,      \
+                           uint, Size);                                        \
+  } while (0)
+
+#define TestImplSize(Dst, Size)                                                \
+  do {                                                                         \
+    TestImplValue(Dst, 0x8F, 3, Size);                                         \
+    TestImplValue(Dst, 0x8FFF, 7, Size);                                       \
+    TestImplValue(Dst, 0x8FFFF, 7, Size);                                      \
+  } while (0)
+
+#define TestImplThreeOperandSize(Dst, Src, Size)                               \
+  do {                                                                         \
+    TestImplThreeOperandValue(Dst, 0xFFF3, Src, 0xA000, 8, Size);              \
+  } while (0)
+
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplSize(Dst, 8);                                                      \
+    TestImplSize(Dst, 16);                                                     \
+    TestImplThreeOperandSize(Dst, Src, 16);                                    \
+    TestImplSize(Dst, 32);                                                     \
+    TestImplThreeOperandSize(Dst, Src, 32);                                    \
+  } while (0)
+
+  TestImpl(r1, r2);
+  TestImpl(r2, r4);
+  TestImpl(r4, r5);
+  TestImpl(r5, r6);
+  TestImpl(r6, r7);
+  TestImpl(r7, r8);
+  TestImpl(r8, r10);
+  TestImpl(r10, r11);
+  TestImpl(r11, r12);
+  TestImpl(r12, r13);
+  TestImpl(r13, r14);
+  TestImpl(r14, r15);
+  TestImpl(r15, r1);
+
+#undef TestImpl
+#undef TestImplThreeOperandSize
+#undef TestImplSize
+#undef TestImplValue
+#undef TestImplThreeOperandValue
+#undef TestImplOp
+#undef TestImplThreeOperandOp
+#undef TestImplAddrCl
+#undef TestImplAddrRegCl
+#undef TestImplRegRegCl
+#undef TestImplRegCl
+#undef TestImplRegRegImm
+#undef TestImplRegImm
+}
+
+TEST_F(AssemblerX8664Test, Neg) {
+  static constexpr uint32_t Mask8 = 0x000000ff;
+  static constexpr uint32_t Mask16 = 0x0000ffff;
+  static constexpr uint32_t Mask32 = 0xffffffff;
+  // Register form: load Value into Dst, neg it, compare the low Size bits.
+#define TestImplReg(Dst, Size)                                                 \
+  do {                                                                         \
+    static constexpr int32_t Value = 0xFF00A543;                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst(),                               \
+           Immediate(static_cast<int##Size##_t>(Value) & Mask##Size));         \
+    __ neg(IceType_i##Size, Encoded_GPR_##Dst());                              \
+    __ mov(IceType_i##Size, Encoded_GPR_eax(), Encoded_GPR_##Dst());           \
+    __ And(IceType_i32, Encoded_GPR_eax(), Immediate(Mask##Size));             \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(1 + (~static_cast<int##Size##_t>(Value) & Mask##Size),           \
+              test.eax())                                                      \
+        << "(" #Dst ", " #Size ")";                                            \
+    reset();                                                                   \
+  } while (0)
+  // Memory form: neg a dword slot in place, compare the whole dword after.
+#define TestImplAddr(Size)                                                     \
+  do {                                                                         \
+    static constexpr int32_t Value = 0xFF00A543;                               \
+    const uint32_t T0 = allocateDword();                                       \
+    __ neg(IceType_i##Size, dwordAddress(T0));                                 \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, Value &Mask##Size);                                    \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(1 + (~static_cast<int##Size##_t>(Value) & Mask##Size),           \
+              test.contentsOfDword(T0))                                        \
+        << "(Addr, " #Size ")";                                                \
+    reset();                                                                   \
+  } while (0)
+  // One memory check plus every listed register (r9 is not in the list).
+#define TestImpl(Size)                                                         \
+  do {                                                                         \
+    TestImplAddr(Size);                                                        \
+    TestImplReg(r1, Size);                                                     \
+    TestImplReg(r2, Size);                                                     \
+    TestImplReg(r3, Size);                                                     \
+    TestImplReg(r4, Size);                                                     \
+    TestImplReg(r5, Size);                                                     \
+    TestImplReg(r6, Size);                                                     \
+    TestImplReg(r7, Size);                                                     \
+    TestImplReg(r8, Size);                                                     \
+    TestImplReg(r10, Size);                                                    \
+    TestImplReg(r11, Size);                                                    \
+    TestImplReg(r12, Size);                                                    \
+    TestImplReg(r13, Size);                                                    \
+    TestImplReg(r14, Size);                                                    \
+    TestImplReg(r15, Size);                                                    \
+  } while (0)
+  // Note: 1 + (~V & Mask) matches neg only while V's low Size bits are != 0.
+  TestImpl(8);
+  TestImpl(16);
+  TestImpl(32);
+
+#undef TestImpl
+#undef TestImplAddr
+#undef TestImplReg
+}
+
+TEST_F(AssemblerX8664Test, Not) {
+#define TestImpl(Dst)                                                          \
+  do {                                                                         \
+    static constexpr uint32_t Value = 0xFF00A543;                              \
+    __ mov(IceType_i32, Encoded_GPR_##Dst(), Immediate(Value));                \
+    __ notl(Encoded_GPR_##Dst());                                              \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(~Value, test.Dst()) << "(" #Dst ")";                             \
+    reset();                                                                   \
+  } while (0)
+  // notl must leave ~Value in Dst; checked per register below (r9 skipped).
+  TestImpl(r1);
+  TestImpl(r2);
+  TestImpl(r3);
+  TestImpl(r4);
+  TestImpl(r5);
+  TestImpl(r6);
+  TestImpl(r7);
+  TestImpl(r8);
+  TestImpl(r10);
+  TestImpl(r11);
+  TestImpl(r12);
+  TestImpl(r13);
+  TestImpl(r14);
+  TestImpl(r15);
+
+#undef TestImpl
+}
+
+TEST_F(AssemblerX8664Test, Bswap) {
+#define TestImpl(Dst)                                                          \
+  do {                                                                         \
+    static constexpr uint32_t Value = 0xFF00A543;                              \
+    static constexpr uint32_t Expected = 0x43A500FF;                           \
+    __ mov(IceType_i32, Encoded_GPR_##Dst(), Immediate(Value));                \
+    __ bswap(IceType_i32, Encoded_GPR_##Dst());                                \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst()) << "(" #Dst ")";                           \
+    reset();                                                                   \
+  } while (0)
+  // Expected is Value with its four bytes reversed; run per register below.
+  TestImpl(r1);
+  TestImpl(r2);
+  TestImpl(r3);
+  TestImpl(r4);
+  TestImpl(r5);
+  TestImpl(r6);
+  TestImpl(r7);
+  TestImpl(r8);
+  TestImpl(r10);
+  TestImpl(r11);
+  TestImpl(r12);
+  TestImpl(r13);
+  TestImpl(r14);
+  TestImpl(r15);
+
+#undef TestImpl
+}
+
+TEST_F(AssemblerX8664Test, Bt) {
+#define TestImpl(Dst, Value0, Src, Value1)                                     \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ")";                      \
+    static constexpr uint32_t Expected = ((Value0) & (1u << (Value1))) != 0;   \
+                                                                               \
+    __ mov(IceType_i32, Encoded_GPR_##Dst(), Immediate(Value0));               \
+    __ mov(IceType_i32, Encoded_GPR_##Src(), Immediate(Value1));               \
+    __ bt(Encoded_GPR_##Dst(), Encoded_GPR_##Src());                           \
+    __ setcc(Cond::Br_b, ByteRegister::Encoded_Reg_al);                        \
+    __ And(IceType_i32, Encoded_GPR_eax(), Immediate(0xFFu));                  \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.eax()) << TestString;                             \
+    reset();                                                                   \
+  } while (0)
+  // Rows are (Dst, Value0, Src, Value1): eax must be bit Value1 of Value0.
+  TestImpl(r1, 0x08000000, r2, 27u);
+  TestImpl(r2, 0x08000000, r3, 23u);
+  TestImpl(r3, 0x00000000, r4, 1u);
+  TestImpl(r4, 0x08000300, r5, 9u);
+  TestImpl(r5, 0x08000300, r6, 10u);
+  TestImpl(r6, 0x7FFFEFFF, r7, 13u);
+  TestImpl(r7, 0x08000000, r8, 27u);
+  TestImpl(r8, 0x08000000, r10, 23u);
+  TestImpl(r10, 0x00000000, r11, 1u);
+  TestImpl(r11, 0x08000300, r12, 9u);
+  TestImpl(r12, 0x08000300, r13, 10u);
+  TestImpl(r13, 0x7FFFEFFF, r14, 13u);
+  TestImpl(r14, 0x08000000, r15, 27u);
+  TestImpl(r15, 0x08000000, r1, 23u);
+
+#undef TestImpl
+}
+
+template <uint32_t Value, uint32_t Bits> class BitScanHelper {
+  BitScanHelper() = delete;
+  // Compile-time model of bsf/bsr applied to the low Bits bits of Value.
+public:
+  static_assert(Bits == 16 || Bits == 32, "Bits must be 16 or 32");
+  using ValueType =
+      typename std::conditional<Bits == 16, uint16_t, uint32_t>::type;
+  // BitIndex walks from Index upwards (Forward) or downwards (!Forward).
+private:
+  static constexpr ValueType BitIndex(bool Forward, ValueType Index) {
+    return (static_cast<ValueType>(Value) == 0) // Only the low Bits count.
+               ? BitScanHelper<Value, Bits>::NoBitSet
+               : (Value & (1u << Index)
+                      ? Index
+                      : BitIndex(Forward, (Forward ? Index + 1 : Index - 1)));
+  }
+  // NoBitSet (all ones) is the result when no bit in the low Bits is set.
+public:
+  static constexpr ValueType NoBitSet = static_cast<ValueType>(-1);
+  static constexpr ValueType bsf = BitIndex(/*Forward*/ true, /*Index=*/0);
+  static constexpr ValueType bsr =
+      BitIndex(/*Forward*/ false, /*Index=*/Bits - 1);
+};
+
+TEST_F(AssemblerX8664Test, BitScanOperations) {
+#define TestImplRegReg(Inst, Dst, Src, Value1, Size)                           \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", " #Src ", " #Value1 ", " #Size ")";             \
+    static constexpr uint32_t Expected = BitScanHelper<Value1, Size>::Inst;    \
+    const uint32_t ZeroFlag = allocateDword();                                 \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src(), Immediate(Value1));           \
+    __ Inst(IceType_i##Size, Encoded_GPR_##Dst(), Encoded_GPR_##Src());        \
+    __ setcc(Cond::Br_e, dwordAddress(ZeroFlag));                              \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(ZeroFlag, 0u);                                             \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ((Expected == BitScanHelper<Value1, Size>::NoBitSet),             \
+              test.contentsOfDword(ZeroFlag))                                  \
+        << TestString;                                                         \
+    if ((Expected != BitScanHelper<Value1, Size>::NoBitSet)) {                 \
+      ASSERT_EQ(Expected, test.Dst()) << TestString;                           \
+    }                                                                          \
+    reset();                                                                   \
+  } while (0)
+  // Memory-source variant: same ZF/result checks, operand loaded from T0.
+#define TestImplRegAddr(Inst, Dst, Value1, Size)                               \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Inst ", " #Dst ", Addr, " #Value1 ", " #Size ")";                 \
+    static constexpr uint32_t Expected = BitScanHelper<Value1, Size>::Inst;    \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t ZeroFlag = allocateDword();                                 \
+    __ Inst(IceType_i##Size, Encoded_GPR_##Dst(), dwordAddress(T0));           \
+    __ setcc(Cond::Br_e, dwordAddress(ZeroFlag));                              \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, Value1);                                               \
+    test.setDwordTo(ZeroFlag, 0u);                                             \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ((Expected == BitScanHelper<Value1, Size>::NoBitSet),             \
+              test.contentsOfDword(ZeroFlag))                                  \
+        << TestString;                                                         \
+    if (Expected != BitScanHelper<Value1, Size>::NoBitSet) {                   \
+      ASSERT_EQ(Expected, test.Dst()) << TestString;                           \
+    }                                                                          \
+    reset();                                                                   \
+  } while (0)
+  // bsf and bsr, each with a register source and with a memory source.
+#define TestImplSize(Dst, Src, Value1, Size)                                   \
+  do {                                                                         \
+    TestImplRegReg(bsf, Dst, Src, Value1, Size);                               \
+    TestImplRegAddr(bsf, Dst, Value1, Size);                                   \
+    TestImplRegReg(bsr, Dst, Src, Value1, Size);                               \
+    TestImplRegAddr(bsr, Dst, Value1, Size);                                   \
+  } while (0)
+  // Each value is scanned with 16-bit and with 32-bit operands.
+#define TestImplValue(Dst, Src, Value1)                                        \
+  do {                                                                         \
+    TestImplSize(Dst, Src, Value1, 16);                                        \
+    TestImplSize(Dst, Src, Value1, 32);                                        \
+  } while (0)
+  // Inputs include zero, for which the helper yields NoBitSet (ZF expected).
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplValue(Dst, Src, 0x80000001);                                       \
+    TestImplValue(Dst, Src, 0x00000000);                                       \
+    TestImplValue(Dst, Src, 0x80001000);                                       \
+    TestImplValue(Dst, Src, 0x00FFFF00);                                       \
+  } while (0)
+
+  TestImpl(r1, r2);
+  TestImpl(r2, r3);
+  TestImpl(r3, r4);
+  TestImpl(r4, r5);
+  TestImpl(r5, r6);
+  TestImpl(r6, r7);
+  TestImpl(r7, r8);
+  TestImpl(r8, r10);
+  TestImpl(r10, r11);
+  TestImpl(r11, r12);
+  TestImpl(r12, r13);
+  TestImpl(r13, r14);
+  TestImpl(r14, r15);
+  TestImpl(r15, r1);
+
+#undef TestImpl
+#undef TestImplValue
+#undef TestImplSize
+#undef TestImplRegAddr
+#undef TestImplRegReg
+}
+
+} // end of anonymous namespace
+} // end of namespace Test
+} // end of namespace X8664
+} // end of namespace Ice
diff --git a/unittest/AssemblerX8664/Locked.cpp b/unittest/AssemblerX8664/Locked.cpp
new file mode 100644
index 0000000..f93f2d2
--- /dev/null
+++ b/unittest/AssemblerX8664/Locked.cpp
@@ -0,0 +1,449 @@
+//===- subzero/unittest/AssemblerX8664/Locked.cpp -------------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AssemblerX8664/TestUtil.h"
+
+namespace Ice {
+namespace X8664 {
+namespace Test {
+namespace {
+
+TEST_F(AssemblerX8664LowLevelTest, Mfence) {
+  __ mfence();
+  // Expected encoding: 0F AE F0. Assert verifyBytes so a mismatch fails.
+  static constexpr uint8_t ByteCount = 3;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), 0x0F, 0xAE, 0xF0));
+}
+
+TEST_F(AssemblerX8664LowLevelTest, Lock) {
+  __ lock();
+  // Expected encoding: the single byte F0. Assert the byte comparison.
+  static constexpr uint8_t ByteCount = 1;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), 0xF0));
+}
+
+TEST_F(AssemblerX8664Test, Xchg) {
+  static constexpr uint32_t Mask8 = 0x000000FF;
+  static constexpr uint32_t Mask16 = 0x0000FFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+  // xchg: the register and the memory slot must trade their low Size bits.
+#define TestImplAddrReg(Value0, Dst1, Value1, Size)                            \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Value0 ", " #Dst1 ", " #Value1 ", " #Size ")";                    \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = (Value0)&Mask##Size;                                   \
+    const uint32_t V1 = (Value1)&Mask##Size;                                   \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst1(), Immediate(Value1));          \
+    __ xchg(IceType_i##Size, dwordAddress(T0), Encoded_GPR_##Dst1());          \
+    __ And(IceType_i32, Encoded_GPR_##Dst1(), Immediate(Mask##Size));          \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(V0, test.Dst1()) << TestString;                                  \
+    ASSERT_EQ(V1, test.contentsOfDword(T0)) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+  // Fixed pair: memory starts as 0xa2b34567, the register as 0x0507ddee.
+#define TestImplSize(Dst1, Size)                                               \
+  do {                                                                         \
+    TestImplAddrReg(0xa2b34567, Dst1, 0x0507ddee, Size);                       \
+  } while (0)
+  // Operand sizes covered per register: 8, 16 and 32 bits.
+#define TestImpl(Dst1)                                                         \
+  do {                                                                         \
+    TestImplSize(Dst1, 8);                                                     \
+    TestImplSize(Dst1, 16);                                                    \
+    TestImplSize(Dst1, 32);                                                    \
+  } while (0)
+
+  TestImpl(r1);
+  TestImpl(r2);
+  TestImpl(r3);
+  TestImpl(r4);
+  TestImpl(r5);
+  TestImpl(r6);
+  TestImpl(r7);
+  TestImpl(r8);
+  TestImpl(r10);
+  TestImpl(r11);
+  TestImpl(r12);
+  TestImpl(r13);
+  TestImpl(r14);
+  TestImpl(r15);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplAddrReg
+}
+
+TEST_F(AssemblerX8664Test, Xadd) {
+  static constexpr bool NotLocked = false;
+  static constexpr bool Locked = true;
+  // MaskN keeps the low N bits; the macros below paste it as Mask##Size.
+  static constexpr uint32_t Mask8 = 0x000000FF;
+  static constexpr uint32_t Mask16 = 0x0000FFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+  // xadd: Dst1 must receive the old memory value, memory the masked sum.
+#define TestImplAddrReg(Value0, Dst1, Value1, LockedOrNot, Size)               \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Value0 ", " #Dst1 ", " #Value1 ", " #LockedOrNot ", " #Size ")";  \
+    const uint32_t T0 = allocateDword();                                       \
+    const uint32_t V0 = (Value0)&Mask##Size;                                   \
+    const uint32_t V1 = (Value1)&Mask##Size;                                   \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Dst1(), Immediate(Value1));          \
+    __ xadd(IceType_i##Size, dwordAddress(T0), Encoded_GPR_##Dst1(),           \
+            LockedOrNot);                                                      \
+    __ And(IceType_i32, Encoded_GPR_##Dst1(), Immediate(Mask##Size));          \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(V0, test.Dst1()) << TestString;                                  \
+    ASSERT_EQ(Mask##Size &(V1 + V0), test.contentsOfDword(T0)) << TestString;  \
+    reset();                                                                   \
+  } while (0)
+  // Same values with and without the lock flag; TestString names which.
+#define TestImplSize(Dst1, Size)                                               \
+  do {                                                                         \
+    TestImplAddrReg(0xa2b34567, Dst1, 0x0507ddee, NotLocked, Size);            \
+    TestImplAddrReg(0xa2b34567, Dst1, 0x0507ddee, Locked, Size);               \
+  } while (0)
+  // Operand sizes covered per register: 8, 16 and 32 bits.
+#define TestImpl(Dst1)                                                         \
+  do {                                                                         \
+    TestImplSize(Dst1, 8);                                                     \
+    TestImplSize(Dst1, 16);                                                    \
+    TestImplSize(Dst1, 32);                                                    \
+  } while (0)
+
+  TestImpl(r1);
+  TestImpl(r2);
+  TestImpl(r3);
+  TestImpl(r4);
+  TestImpl(r5);
+  TestImpl(r6);
+  TestImpl(r7);
+  TestImpl(r8);
+  TestImpl(r10);
+  TestImpl(r11);
+  TestImpl(r12);
+  TestImpl(r13);
+  TestImpl(r14);
+  TestImpl(r15);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplAddrReg
+}
+
+TEST_F(AssemblerX8664LowLevelTest, Xadd) {
+  static constexpr bool NotLocked = false;
+  static constexpr bool Locked = true;
+  // Operands throughout: absolute address 0x1FF00 and r14.
+  // Ensures that xadd emits a lock prefix accordingly.
+  {
+    __ xadd(IceType_i8, Address::Absolute(0x1FF00), Encoded_GPR_r14(),
+            NotLocked);
+    static constexpr uint8_t ByteCountNotLocked8 = 8;
+    ASSERT_EQ(ByteCountNotLocked8, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountNotLocked8>(codeBytes(), 0x44, 0x0F, 0xC0,
+                                                 0x35, 0x00, 0xFF, 0x01, 0x00));
+    reset();
+
+    __ xadd(IceType_i8, Address::Absolute(0x1FF00), Encoded_GPR_r14(), Locked);
+    static constexpr uint8_t ByteCountLocked8 = 1 + ByteCountNotLocked8;
+    ASSERT_EQ(ByteCountLocked8, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountLocked8>(
+        codeBytes(), 0xF0, 0x44, 0x0F, 0xC0, 0x35, 0x00, 0xFF, 0x01, 0x00));
+    reset();
+  }
+  // 16-bit form: 0x66 comes first and, when locked, 0xF0 follows it.
+  {
+    __ xadd(IceType_i16, Address::Absolute(0x1FF00), Encoded_GPR_r14(),
+            NotLocked);
+    static constexpr uint8_t ByteCountNotLocked16 = 9;
+    ASSERT_EQ(ByteCountNotLocked16, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountNotLocked16>(
+        codeBytes(), 0x66, 0x44, 0x0F, 0xC1, 0x35, 0x00, 0xFF, 0x01, 0x00));
+    reset();
+
+    __ xadd(IceType_i16, Address::Absolute(0x1FF00), Encoded_GPR_r14(), Locked);
+    static constexpr uint8_t ByteCountLocked16 = 1 + ByteCountNotLocked16;
+    ASSERT_EQ(ByteCountLocked16, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountLocked16>(codeBytes(), 0x66, 0xF0, 0x44,
+                                               0x0F, 0xC1, 0x35, 0x00, 0xFF,
+                                               0x01, 0x00));
+    reset();
+  }
+  // 32-bit form: opcode byte 0xC1 as in the 16-bit case, without 0x66.
+  {
+    __ xadd(IceType_i32, Address::Absolute(0x1FF00), Encoded_GPR_r14(),
+            NotLocked);
+    static constexpr uint8_t ByteCountNotLocked32 = 8;
+    ASSERT_EQ(ByteCountNotLocked32, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountNotLocked32>(
+        codeBytes(), 0x44, 0x0F, 0xC1, 0x35, 0x00, 0xFF, 0x01, 0x00));
+    reset();
+
+    __ xadd(IceType_i32, Address::Absolute(0x1FF00), Encoded_GPR_r14(), Locked);
+    static constexpr uint8_t ByteCountLocked32 = 1 + ByteCountNotLocked32;
+    ASSERT_EQ(ByteCountLocked32, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountLocked32>(
+        codeBytes(), 0xF0, 0x44, 0x0F, 0xC1, 0x35, 0x00, 0xFF, 0x01, 0x00));
+    reset();
+  }
+}
+
+TEST_F(AssemblerX8664LowLevelTest, EmitSegmentOverride) {
+#define TestImpl(Prefix)                                                       \
+  do {                                                                         \
+    static constexpr uint8_t ByteCount = 1;                                    \
+    __ emitSegmentOverride(Prefix);                                            \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << Prefix;                           \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), Prefix));                  \
+    reset();                                                                   \
+  } while (0)
+  // Each prefix value must come out as exactly that one byte.
+  TestImpl(0x26);
+  TestImpl(0x2E);
+  TestImpl(0x36);
+  TestImpl(0x3E);
+  TestImpl(0x64);
+  TestImpl(0x65);
+  TestImpl(0x66);
+  TestImpl(0x67);
+
+#undef TestImpl
+}
+
+TEST_F(AssemblerX8664Test, Cmpxchg8b) {
+  static constexpr bool NotLocked = false;
+  static constexpr bool Locked = true;
+  // Args: Value0 -> edx:eax, Value1 -> ecx:ebx, ValueMem -> initial memory.
+#define TestImpl(Value0, Value1, ValueMem, LockedOrNot)                        \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Value0 ", " #Value1 ", " #ValueMem ", " #LockedOrNot ")";         \
+    const uint32_t T0 = allocateQword();                                       \
+    static constexpr uint64_t V0 = ValueMem;                                   \
+    const uint32_t ZeroFlag = allocateDword();                                 \
+                                                                               \
+    __ mov(IceType_i32, Encoded_GPR_eax(),                                     \
+           Immediate(uint64_t(Value0) & 0xFFFFFFFF));                          \
+    __ mov(IceType_i32, Encoded_GPR_edx(), Immediate(uint64_t(Value0) >> 32)); \
+    __ mov(IceType_i32, Encoded_GPR_ebx(),                                     \
+           Immediate(uint64_t(Value1) & 0xFFFFFFFF));                          \
+    __ mov(IceType_i32, Encoded_GPR_ecx(), Immediate(uint64_t(Value1) >> 32)); \
+    __ cmpxchg8b(dwordAddress(T0), LockedOrNot);                               \
+    __ setcc(Cond::Br_e, dwordAddress(ZeroFlag));                              \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setQwordTo(T0, V0);                                                   \
+    test.setDwordTo(ZeroFlag, uint32_t(0xFF));                                 \
+    test.run();                                                                \
+                                                                               \
+    if (V0 == (Value0)) {                                                      \
+      ASSERT_EQ(uint64_t(Value1), test.contentsOfQword(T0)) << TestString;     \
+      ASSERT_EQ(1u, test.contentsOfDword(ZeroFlag)) << TestString;             \
+    } else {                                                                   \
+      ASSERT_EQ(uint64_t(ValueMem) & 0xFFFFFFFF, test.eax()) << TestString;    \
+      ASSERT_EQ((uint64_t(ValueMem) >> 32) & 0xFFFFFFFF, test.edx())           \
+          << TestString;                                                       \
+      ASSERT_EQ(0u, test.contentsOfDword(ZeroFlag)) << TestString;             \
+    }                                                                          \
+    reset();                                                                   \
+  } while (0)
+  // First two rows: memory equals Value0 (store expected); last two differ.
+  TestImpl(0x98987676543210ull, 0x1, 0x98987676543210ull, NotLocked);
+  TestImpl(0x98987676543210ull, 0x1, 0x98987676543210ull, Locked);
+  TestImpl(0x98987676543210ull, 0x1, 0x98987676543211ull, NotLocked);
+  TestImpl(0x98987676543210ull, 0x1, 0x98987676543211ull, Locked);
+
+#undef TestImpl
+}
+
+TEST_F(AssemblerX8664LowLevelTest, Cmpxchg8b) {
+  static constexpr bool NotLocked = false;
+  static constexpr bool Locked = true;
+  // Operand: absolute address 0x1FF00; the unlocked encoding is 7 bytes.
+  // Ensures that cmpxchg8b emits a lock prefix accordingly.
+  __ cmpxchg8b(Address::Absolute(0x1FF00), NotLocked);
+  static constexpr uint8_t ByteCountNotLocked = 7;
+  ASSERT_EQ(ByteCountNotLocked, codeBytesSize());
+  ASSERT_TRUE(verifyBytes<ByteCountNotLocked>(codeBytes(), 0x0F, 0xC7, 0x0D,
+                                              0x00, 0xFF, 0x01, 0x00));
+  reset();
+  // The locked form is the same bytes with a leading 0xF0.
+  __ cmpxchg8b(Address::Absolute(0x1FF00), Locked);
+  static constexpr uint8_t ByteCountLocked = 1 + ByteCountNotLocked;
+  ASSERT_EQ(ByteCountLocked, codeBytesSize());
+  ASSERT_TRUE(verifyBytes<ByteCountLocked>(codeBytes(), 0xF0, 0x0F, 0xC7, 0x0D,
+                                           0x00, 0xFF, 0x01, 0x00));
+  reset();
+}
+
+TEST_F(AssemblerX8664Test, Cmpxchg) {
+  static constexpr bool NotLocked = false;
+  static constexpr bool Locked = true;
+  // MaskN keeps the low N bits; the macros below paste it as Mask##Size.
+  static constexpr uint32_t Mask8 = 0x000000FF;
+  static constexpr uint32_t Mask16 = 0x0000FFFF;
+  static constexpr uint32_t Mask32 = 0xFFFFFFFF;
+  // eax holds Value0, Src holds Value1, memory starts as ValueMem (masked).
+#define TestImplAddrReg(Value0, Src, Value1, ValueMem, LockedOrNot, Size)      \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Value0 ", " #Src ", " #Value1 ", " #ValueMem ", " #LockedOrNot    \
+        ", " #Size ")";                                                        \
+    const uint32_t T0 = allocateDword();                                       \
+    static constexpr uint32_t V0 = (ValueMem)&Mask##Size;                      \
+    const uint32_t ZeroFlag = allocateDword();                                 \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_eax(),                                 \
+           Immediate((Value0)&Mask##Size));                                    \
+    __ mov(IceType_i##Size, Encoded_GPR_##Src(),                               \
+           Immediate((Value1)&Mask##Size));                                    \
+    __ cmpxchg(IceType_i##Size, dwordAddress(T0), Encoded_GPR_##Src(),         \
+               LockedOrNot);                                                   \
+    __ setcc(Cond::Br_e, dwordAddress(ZeroFlag));                              \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDwordTo(T0, V0);                                                   \
+    test.setDwordTo(ZeroFlag, uint32_t(0xFF));                                 \
+    test.run();                                                                \
+                                                                               \
+    if (V0 == (Mask##Size & (Value0))) {                                       \
+      ASSERT_EQ(uint32_t((Value1)&Mask##Size), test.contentsOfDword(T0))       \
+          << TestString;                                                       \
+      ASSERT_EQ(1u, test.contentsOfDword(ZeroFlag)) << TestString;             \
+    } else {                                                                   \
+      ASSERT_EQ(uint32_t((ValueMem)&Mask##Size), test.eax()) << TestString;    \
+      ASSERT_EQ(0u, test.contentsOfDword(ZeroFlag)) << TestString;             \
+    }                                                                          \
+    reset();                                                                   \
+  } while (0)
+  // Repeats one case for operand sizes 8, 16 and 32.
+#define TestImplValue(Value0, Src, Value1, ValueMem, LockedOrNot)              \
+  do {                                                                         \
+    TestImplAddrReg(Value0, Src, Value1, ValueMem, LockedOrNot, 8);            \
+    TestImplAddrReg(Value0, Src, Value1, ValueMem, LockedOrNot, 16);           \
+    TestImplAddrReg(Value0, Src, Value1, ValueMem, LockedOrNot, 32);           \
+  } while (0)
+  // First row: eax equals memory (store expected); second row: mismatch.
+#define TestImpl(Src, LockedOrNot)                                             \
+  do {                                                                         \
+    TestImplValue(0xFFFFFFFF, Src, 0x1, 0xFFFFFFFF, LockedOrNot);              \
+    TestImplValue(0x0FFF0F0F, Src, 0x1, 0xFFFFFFFF, LockedOrNot);              \
+  } while (0)
+  // Src registers start at r2 and skip r9.
+  TestImpl(r2, Locked);
+  TestImpl(r2, NotLocked);
+  TestImpl(r3, Locked);
+  TestImpl(r3, NotLocked);
+  TestImpl(r4, Locked);
+  TestImpl(r4, NotLocked);
+  TestImpl(r5, Locked);
+  TestImpl(r5, NotLocked);
+  TestImpl(r6, Locked);
+  TestImpl(r6, NotLocked);
+  TestImpl(r7, Locked);
+  TestImpl(r7, NotLocked);
+  TestImpl(r8, Locked);
+  TestImpl(r8, NotLocked);
+  TestImpl(r10, Locked);
+  TestImpl(r10, NotLocked);
+  TestImpl(r11, Locked);
+  TestImpl(r11, NotLocked);
+  TestImpl(r12, Locked);
+  TestImpl(r12, NotLocked);
+  TestImpl(r13, Locked);
+  TestImpl(r13, NotLocked);
+  TestImpl(r14, Locked);
+  TestImpl(r14, NotLocked);
+  TestImpl(r15, Locked);
+  TestImpl(r15, NotLocked);
+
+#undef TestImpl
+#undef TestImplValue
+#undef TestImplAddrReg
+}
+
+TEST_F(AssemblerX8664LowLevelTest, Cmpxchg) {
+  static constexpr bool NotLocked = false;
+  static constexpr bool Locked = true;
+  // Operands throughout: absolute address 0x1FF00 and r14.
+  // Ensures that cmpxchg emits a lock prefix accordingly.
+  {
+    __ cmpxchg(IceType_i8, Address::Absolute(0x1FF00), Encoded_GPR_r14(),
+               NotLocked);
+    static constexpr uint8_t ByteCountNotLocked8 = 8;
+    ASSERT_EQ(ByteCountNotLocked8, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountNotLocked8>(codeBytes(), 0x44, 0x0F, 0xB0,
+                                                 0x35, 0x00, 0xFF, 0x01, 0x00));
+    reset();
+
+    __ cmpxchg(IceType_i8, Address::Absolute(0x1FF00), Encoded_GPR_r14(),
+               Locked);
+    static constexpr uint8_t ByteCountLocked8 = 1 + ByteCountNotLocked8;
+    ASSERT_EQ(ByteCountLocked8, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountLocked8>(
+        codeBytes(), 0xF0, 0x44, 0x0F, 0xB0, 0x35, 0x00, 0xFF, 0x01, 0x00));
+    reset();
+  }
+  // 16-bit form: 0x66 comes first and, when locked, 0xF0 follows it.
+  {
+    __ cmpxchg(IceType_i16, Address::Absolute(0x1FF00), Encoded_GPR_r14(),
+               NotLocked);
+    static constexpr uint8_t ByteCountNotLocked16 = 9;
+    ASSERT_EQ(ByteCountNotLocked16, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountNotLocked16>(
+        codeBytes(), 0x66, 0x44, 0x0F, 0xB1, 0x35, 0x00, 0xFF, 0x01, 0x00));
+    reset();
+
+    __ cmpxchg(IceType_i16, Address::Absolute(0x1FF00), Encoded_GPR_r14(),
+               Locked);
+    static constexpr uint8_t ByteCountLocked16 = 1 + ByteCountNotLocked16;
+    ASSERT_EQ(ByteCountLocked16, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountLocked16>(codeBytes(), 0x66, 0xF0, 0x44,
+                                               0x0F, 0xB1, 0x35, 0x00, 0xFF,
+                                               0x01, 0x00));
+    reset();
+  }
+  // 32-bit form: opcode byte 0xB1 as in the 16-bit case, without 0x66.
+  {
+    __ cmpxchg(IceType_i32, Address::Absolute(0x1FF00), Encoded_GPR_r14(),
+               NotLocked);
+    static constexpr uint8_t ByteCountNotLocked32 = 8;
+    ASSERT_EQ(ByteCountNotLocked32, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountNotLocked32>(
+        codeBytes(), 0x44, 0x0F, 0xB1, 0x35, 0x00, 0xFF, 0x01, 0x00));
+    reset();
+
+    __ cmpxchg(IceType_i32, Address::Absolute(0x1FF00), Encoded_GPR_r14(),
+               Locked);
+    static constexpr uint8_t ByteCountLocked32 = 1 + ByteCountNotLocked32;
+    ASSERT_EQ(ByteCountLocked32, codeBytesSize());
+    ASSERT_TRUE(verifyBytes<ByteCountLocked32>(
+        codeBytes(), 0xF0, 0x44, 0x0F, 0xB1, 0x35, 0x00, 0xFF, 0x01, 0x00));
+    reset();
+  }
+}
+
+} // end of anonymous namespace
+} // end of namespace Test
+} // end of namespace X8664
+} // end of namespace Ice
diff --git a/unittest/AssemblerX8664/LowLevel.cpp b/unittest/AssemblerX8664/LowLevel.cpp
new file mode 100644
index 0000000..3c87d8b
--- /dev/null
+++ b/unittest/AssemblerX8664/LowLevel.cpp
@@ -0,0 +1,1124 @@
+//===- subzero/unittest/AssemblerX8664/LowLevel.cpp -----------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AssemblerX8664/TestUtil.h"
+
+namespace Ice {
+namespace X8664 {
+namespace Test {
+namespace {
+
+TEST_F(AssemblerX8664LowLevelTest, Ret) {
+  __ ret();
+  // A return with no operand is expected to encode as the single byte C3.
+  constexpr size_t ByteCount = 1;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  // Assert on verifyBytes' result, as the other tests in this change do.
+  ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), 0xc3));
+}
+
+TEST_F(AssemblerX8664LowLevelTest, RetImm) {
+  __ ret(Immediate(0x20));
+  // Expect opcode C2 followed by the 16-bit immediate in little-endian order.
+  constexpr size_t ByteCount = 3;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  // Assert on verifyBytes' result, as the other tests in this change do.
+  ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), 0xC2, 0x20, 0x00));
+}
+
+TEST_F(AssemblerX8664LowLevelTest, CallImm4) {
+  __ call(Immediate(4));
+  // Expect E8 plus a zero 32-bit field; the verifyBytes result is asserted.
+  constexpr size_t ByteCount = 5;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  ASSERT_TRUE(
+      verifyBytes<ByteCount>(codeBytes(), 0xe8, 0x00, 0x00, 0x00, 0x00));
+}
+
+TEST_F(AssemblerX8664LowLevelTest, PopRegs) {
+  __ popl(Encoded_GPR_eax());
+  __ popl(Encoded_GPR_ebx());
+  __ popl(Encoded_GPR_ecx());
+  __ popl(Encoded_GPR_edx());
+  __ popl(Encoded_GPR_edi());
+  __ popl(Encoded_GPR_esi());
+  __ popl(Encoded_GPR_ebp());
+  __ popl(Encoded_GPR_r8());
+  __ popl(Encoded_GPR_r9());
+  __ popl(Encoded_GPR_r10());
+  __ popl(Encoded_GPR_r11());
+  __ popl(Encoded_GPR_r12());
+  __ popl(Encoded_GPR_r13());
+  __ popl(Encoded_GPR_r14());
+  __ popl(Encoded_GPR_r15());
+  // Seven legacy registers take one byte each; r8-r15 add a REX.B prefix.
+  constexpr size_t ByteCount = 23;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  // Assert on verifyBytes' result, as the other tests in this change do.
+  constexpr uint8_t Rex_B = 0x41;
+  constexpr uint8_t PopOpcode = 0x58;
+  ASSERT_TRUE(verifyBytes<ByteCount>(
+      codeBytes(), PopOpcode | Encoded_GPR_eax(), PopOpcode | Encoded_GPR_ebx(),
+      PopOpcode | Encoded_GPR_ecx(), PopOpcode | Encoded_GPR_edx(),
+      PopOpcode | Encoded_GPR_edi(), PopOpcode | Encoded_GPR_esi(),
+      PopOpcode | Encoded_GPR_ebp(), Rex_B, PopOpcode | (Encoded_GPR_r8() & 7),
+      Rex_B, PopOpcode | (Encoded_GPR_r9() & 7), Rex_B,
+      PopOpcode | (Encoded_GPR_r10() & 7), Rex_B,
+      PopOpcode | (Encoded_GPR_r11() & 7), Rex_B,
+      PopOpcode | (Encoded_GPR_r12() & 7), Rex_B,
+      PopOpcode | (Encoded_GPR_r13() & 7), Rex_B,
+      PopOpcode | (Encoded_GPR_r14() & 7), Rex_B,
+      PopOpcode | (Encoded_GPR_r15() & 7)));
+}
+
+TEST_F(AssemblerX8664LowLevelTest, PushRegs) {
+  __ pushl(Encoded_GPR_eax());
+  __ pushl(Encoded_GPR_ebx());
+  __ pushl(Encoded_GPR_ecx());
+  __ pushl(Encoded_GPR_edx());
+  __ pushl(Encoded_GPR_edi());
+  __ pushl(Encoded_GPR_esi());
+  __ pushl(Encoded_GPR_ebp());
+  __ pushl(Encoded_GPR_r8());
+  __ pushl(Encoded_GPR_r9());
+  __ pushl(Encoded_GPR_r10());
+  __ pushl(Encoded_GPR_r11());
+  __ pushl(Encoded_GPR_r12());
+  __ pushl(Encoded_GPR_r13());
+  __ pushl(Encoded_GPR_r14());
+  __ pushl(Encoded_GPR_r15());
+  // Seven legacy registers take one byte each; r8-r15 add a REX.B prefix.
+  constexpr size_t ByteCount = 23;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  // Assert on verifyBytes' result, as the other tests in this change do.
+  constexpr uint8_t Rex_B = 0x41;
+  constexpr uint8_t PushOpcode = 0x50;
+  ASSERT_TRUE(verifyBytes<ByteCount>(
+      codeBytes(), PushOpcode | Encoded_GPR_eax(),
+      PushOpcode | Encoded_GPR_ebx(), PushOpcode | Encoded_GPR_ecx(),
+      PushOpcode | Encoded_GPR_edx(), PushOpcode | Encoded_GPR_edi(),
+      PushOpcode | Encoded_GPR_esi(), PushOpcode | Encoded_GPR_ebp(), Rex_B,
+      PushOpcode | (Encoded_GPR_r8() & 7), Rex_B,
+      PushOpcode | (Encoded_GPR_r9() & 7), Rex_B,
+      PushOpcode | (Encoded_GPR_r10() & 7), Rex_B,
+      PushOpcode | (Encoded_GPR_r11() & 7), Rex_B,
+      PushOpcode | (Encoded_GPR_r12() & 7), Rex_B,
+      PushOpcode | (Encoded_GPR_r13() & 7), Rex_B,
+      PushOpcode | (Encoded_GPR_r14() & 7), Rex_B,
+      PushOpcode | (Encoded_GPR_r15() & 7)));
+}
+
+TEST_F(AssemblerX8664LowLevelTest, MovRegisterZero) {
+  __ mov(IceType_i32, Encoded_GPR_eax(), Immediate(0x00));
+  __ mov(IceType_i32, Encoded_GPR_ebx(), Immediate(0x00));
+  __ mov(IceType_i32, Encoded_GPR_ecx(), Immediate(0x00));
+  __ mov(IceType_i32, Encoded_GPR_edx(), Immediate(0x00));
+  __ mov(IceType_i32, Encoded_GPR_edi(), Immediate(0x00));
+  __ mov(IceType_i32, Encoded_GPR_esi(), Immediate(0x00));
+  __ mov(IceType_i32, Encoded_GPR_ebp(), Immediate(0x00));
+  __ mov(IceType_i32, Encoded_GPR_r8(), Immediate(0x00));
+  __ mov(IceType_i32, Encoded_GPR_r10(), Immediate(0x00));
+  __ mov(IceType_i32, Encoded_GPR_r11(), Immediate(0x00));
+  __ mov(IceType_i32, Encoded_GPR_r12(), Immediate(0x00));
+  __ mov(IceType_i32, Encoded_GPR_r13(), Immediate(0x00));
+  __ mov(IceType_i32, Encoded_GPR_r14(), Immediate(0x00));
+  __ mov(IceType_i32, Encoded_GPR_r15(), Immediate(0x00));
+  // NOTE(review): r9 is not exercised here; confirm the omission is intended.
+  constexpr uint8_t Rex_B = 0x41;
+  constexpr size_t MovReg32BitImmBytes = 5;
+  constexpr size_t ByteCount = 14 * MovReg32BitImmBytes + 7 /*Rex_B*/;
+  // Each mov is opcode B8+reg and a 32-bit immediate; r8-r15 add REX.B.
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  // Assert on verifyBytes' result, as the other tests in this change do.
+  constexpr uint8_t MovOpcode = 0xb8;
+  ASSERT_TRUE(verifyBytes<ByteCount>(
+      codeBytes(), MovOpcode | Encoded_GPR_eax(), 0x00, 0x00, 0x00, 0x00,
+      MovOpcode | Encoded_GPR_ebx(), 0x00, 0x00, 0x00, 0x00,
+      MovOpcode | Encoded_GPR_ecx(), 0x00, 0x00, 0x00, 0x00,
+      MovOpcode | Encoded_GPR_edx(), 0x00, 0x00, 0x00, 0x00,
+      MovOpcode | Encoded_GPR_edi(), 0x00, 0x00, 0x00, 0x00,
+      MovOpcode | Encoded_GPR_esi(), 0x00, 0x00, 0x00, 0x00,
+      MovOpcode | Encoded_GPR_ebp(), 0x00, 0x00, 0x00, 0x00, Rex_B,
+      MovOpcode | (Encoded_GPR_r8() & 7), 0x00, 0x00, 0x00, 0x00, Rex_B,
+      MovOpcode | (Encoded_GPR_r10() & 7), 0x00, 0x00, 0x00, 0x00, Rex_B,
+      MovOpcode | (Encoded_GPR_r11() & 7), 0x00, 0x00, 0x00, 0x00, Rex_B,
+      MovOpcode | (Encoded_GPR_r12() & 7), 0x00, 0x00, 0x00, 0x00, Rex_B,
+      MovOpcode | (Encoded_GPR_r13() & 7), 0x00, 0x00, 0x00, 0x00, Rex_B,
+      MovOpcode | (Encoded_GPR_r14() & 7), 0x00, 0x00, 0x00, 0x00, Rex_B,
+      MovOpcode | (Encoded_GPR_r15() & 7), 0x00, 0x00, 0x00, 0x00));
+}
+
+TEST_F(AssemblerX8664LowLevelTest, Cmp) {
+#define TestRegReg(Inst, Dst, Src, OpType, ByteCountUntyped, ...)              \
+  do { /* Dst reg, Src reg: emit, check size and bytes, then reset(). */       \
+    static constexpr char TestString[] = /* appended to failure messages */    \
+        "(" #Inst ", " #Dst ", " #Src ", " #OpType ", " #ByteCountUntyped      \
+        ",  " #__VA_ARGS__ ")";                                                \
+    static constexpr uint8_t ByteCount = ByteCountUntyped;                     \
+    __ Inst(IceType_##OpType, Encoded_GPR_##Dst(), Encoded_GPR_##Src());       \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), __VA_ARGS__))              \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestRegImm(Inst, Dst, Imm, OpType, ByteCountUntyped, ...)              \
+  do { /* Dst reg, immediate Imm: emit, check size and bytes, then reset(). */ \
+    static constexpr char TestString[] = /* appended to failure messages */    \
+        "(" #Inst ", " #Dst ", " #Imm ", " #OpType ", " #ByteCountUntyped      \
+        ",  " #__VA_ARGS__ ")";                                                \
+    static constexpr uint8_t ByteCount = ByteCountUntyped;                     \
+    __ Inst(IceType_##OpType, Encoded_GPR_##Dst(), Immediate(Imm));            \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), __VA_ARGS__))              \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestRegAbsoluteAddr(Inst, Dst, Disp, OpType, ByteCountUntyped, ...)    \
+  do { /* Dst reg, absolute address Disp: emit, check size/bytes, reset(). */  \
+    static constexpr char TestString[] = /* appended to failure messages */    \
+        "(" #Inst ", " #Dst ", " #Disp ", " #OpType ", " #ByteCountUntyped     \
+        ",  " #__VA_ARGS__ ")";                                                \
+    static constexpr uint8_t ByteCount = ByteCountUntyped;                     \
+    __ Inst(IceType_##OpType, Encoded_GPR_##Dst(),                             \
+            Address(Address::ABSOLUTE, Disp));                                 \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), __VA_ARGS__))              \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestRegAddrBase(Inst, Dst, Base, Disp, OpType, ByteCountUntyped, ...)  \
+  do { /* Dst reg, address Disp(Base): emit, check size/bytes, reset(). */     \
+    static constexpr char TestString[] = /* appended to failure messages */    \
+        "(" #Inst ", " #Dst ", " #Base ", " #Disp ", " #OpType                 \
+        ", " #ByteCountUntyped ",  " #__VA_ARGS__ ")";                         \
+    static constexpr uint8_t ByteCount = ByteCountUntyped;                     \
+    __ Inst(IceType_##OpType, Encoded_GPR_##Dst(),                             \
+            Address(Encoded_GPR_##Base(), Disp));                              \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), __VA_ARGS__))              \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestRegAddrScaledIndex(Inst, Dst, Index, Scale, Disp, OpType,          \
+                               ByteCountUntyped, ...)                          \
+  do { /* Dst reg, address Disp(,Index,Scale): emit, check, reset(). */        \
+    static constexpr char TestString[] = /* appended to failure messages */    \
+        "(" #Inst ", " #Dst ", " #Index ", " #Scale ", " #Disp ", " #OpType    \
+        ", " #ByteCountUntyped ",  " #__VA_ARGS__ ")";                         \
+    static constexpr uint8_t ByteCount = ByteCountUntyped;                     \
+    __ Inst(IceType_##OpType, Encoded_GPR_##Dst(),                             \
+            Address(Encoded_GPR_##Index(), Traits::TIMES_##Scale, Disp));      \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), __VA_ARGS__))              \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestRegAddrBaseScaledIndex(Inst, Dst, Base, Index, Scale, Disp,        \
+                                   OpType, ByteCountUntyped, ...)              \
+  do { /* Dst reg, address Disp(Base,Index,Scale): emit, check, reset(). */    \
+    static constexpr char TestString[] = /* appended to failure messages */    \
+        "(" #Inst ", " #Dst ", " #Base ", " #Index ", " #Scale ", " #Disp      \
+        ", " #OpType ", " #ByteCountUntyped ",  " #__VA_ARGS__ ")";            \
+    static constexpr uint8_t ByteCount = ByteCountUntyped;                     \
+    __ Inst(IceType_##OpType, Encoded_GPR_##Dst(),                             \
+            Address(Encoded_GPR_##Base(), Encoded_GPR_##Index(),               \
+                    Traits::TIMES_##Scale, Disp));                             \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), __VA_ARGS__))              \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestAddrBaseScaledIndexImm(Inst, Base, Index, Scale, Disp, Imm,        \
+                                   OpType, ByteCountUntyped, ...)              \
+  do { /* Address Disp(Base,Index,Scale), Imm: emit, check, reset(). */        \
+    static constexpr char TestString[] = /* appended to failure messages */    \
+        "(" #Inst ", " #Base ", " #Index ", " #Scale ", " #Disp ", " #Imm      \
+        ", " #OpType ", " #ByteCountUntyped ",  " #__VA_ARGS__ ")";            \
+    static constexpr uint8_t ByteCount = ByteCountUntyped;                     \
+    __ Inst(IceType_##OpType,                                                  \
+            Address(Encoded_GPR_##Base(), Encoded_GPR_##Index(),               \
+                    Traits::TIMES_##Scale, Disp),                              \
+            Immediate(Imm));                                                   \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), __VA_ARGS__))              \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestAddrBaseScaledIndexReg(Inst, Base, Index, Scale, Disp, Src,        \
+                                   OpType, ByteCountUntyped, ...)              \
+  do { /* Address Disp(Base,Index,Scale), Src reg: emit, check, reset(). */    \
+    static constexpr char TestString[] = /* appended to failure messages */    \
+        "(" #Inst ", " #Base ", " #Index ", " #Scale ", " #Disp ", " #Src      \
+        ", " #OpType ", " #ByteCountUntyped ",  " #__VA_ARGS__ ")";            \
+    static constexpr uint8_t ByteCount = ByteCountUntyped;                     \
+    __ Inst(IceType_##OpType,                                                  \
+            Address(Encoded_GPR_##Base(), Encoded_GPR_##Index(),               \
+                    Traits::TIMES_##Scale, Disp),                              \
+            Encoded_GPR_##Src());                                              \
+    ASSERT_EQ(ByteCount, codeBytesSize()) << TestString;                       \
+    ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), __VA_ARGS__))              \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+  /* cmp GPR, GPR */
+  TestRegReg(cmp, eax, ecx, i32, 2, 0x3B, 0xC1);
+  TestRegReg(cmp, ecx, edx, i32, 2, 0x3B, 0xCA);
+  TestRegReg(cmp, edx, ebx, i32, 2, 0x3B, 0xD3);
+  TestRegReg(cmp, ebx, esp, i32, 2, 0x3B, 0xDC);
+  TestRegReg(cmp, esp, ebp, i32, 2, 0x3B, 0xE5);
+  TestRegReg(cmp, ebp, esi, i32, 2, 0x3B, 0xEE);
+  TestRegReg(cmp, esi, edi, i32, 2, 0x3B, 0xF7);
+  TestRegReg(cmp, edi, r8, i32, 3, 0x41, 0x3B, 0xF8);
+  TestRegReg(cmp, r8, r9, i32, 3, 0x45, 0x3B, 0xC1);
+  TestRegReg(cmp, r9, r10, i32, 3, 0x45, 0x3B, 0xCA);
+  TestRegReg(cmp, r10, r11, i32, 3, 0x45, 0x3B, 0xD3);
+  TestRegReg(cmp, r11, r12, i32, 3, 0x45, 0x3B, 0xDC);
+  TestRegReg(cmp, r12, r13, i32, 3, 0x45, 0x3B, 0xE5);
+  TestRegReg(cmp, r13, r14, i32, 3, 0x45, 0x3B, 0xEE);
+  TestRegReg(cmp, r14, r15, i32, 3, 0x45, 0x3B, 0xF7);
+  TestRegReg(cmp, r15, eax, i32, 3, 0x44, 0x3B, 0xF8);
+
+  TestRegReg(cmp, eax, ecx, i16, 3, 0x66, 0x3B, 0xC1);
+  TestRegReg(cmp, ecx, edx, i16, 3, 0x66, 0x3B, 0xCA);
+  TestRegReg(cmp, edx, ebx, i16, 3, 0x66, 0x3B, 0xD3);
+  TestRegReg(cmp, ebx, esp, i16, 3, 0x66, 0x3B, 0xDC);
+  TestRegReg(cmp, esp, ebp, i16, 3, 0x66, 0x3B, 0xE5);
+  TestRegReg(cmp, ebp, esi, i16, 3, 0x66, 0x3B, 0xEE);
+  TestRegReg(cmp, esi, edi, i16, 3, 0x66, 0x3B, 0xF7);
+  TestRegReg(cmp, edi, r8, i16, 4, 0x66, 0x41, 0x3B, 0xF8);
+  TestRegReg(cmp, r8, r9, i16, 4, 0x66, 0x45, 0x3B, 0xC1);
+  TestRegReg(cmp, r9, r10, i16, 4, 0x66, 0x45, 0x3B, 0xCA);
+  TestRegReg(cmp, r10, r11, i16, 4, 0x66, 0x45, 0x3B, 0xD3);
+  TestRegReg(cmp, r11, r12, i16, 4, 0x66, 0x45, 0x3B, 0xDC);
+  TestRegReg(cmp, r12, r13, i16, 4, 0x66, 0x45, 0x3B, 0xE5);
+  TestRegReg(cmp, r13, r14, i16, 4, 0x66, 0x45, 0x3B, 0xEE);
+  TestRegReg(cmp, r14, r15, i16, 4, 0x66, 0x45, 0x3B, 0xF7);
+  TestRegReg(cmp, r15, eax, i16, 4, 0x66, 0x44, 0x3B, 0xF8);
+
+  TestRegReg(cmp, eax, ecx, i8, 2, 0x3A, 0xC1);
+  TestRegReg(cmp, ecx, edx, i8, 2, 0x3A, 0xCA);
+  TestRegReg(cmp, edx, ebx, i8, 2, 0x3A, 0xD3);
+  TestRegReg(cmp, ebx, esp, i8, 3, 0x40, 0x3A, 0xDC);
+  TestRegReg(cmp, esp, ebp, i8, 3, 0x40, 0x3A, 0xE5);
+  TestRegReg(cmp, ebp, esi, i8, 3, 0x40, 0x3A, 0xEE);
+  TestRegReg(cmp, esi, edi, i8, 3, 0x40, 0x3A, 0xF7);
+  TestRegReg(cmp, edi, r8, i8, 3, 0x41, 0x3A, 0xF8);
+  TestRegReg(cmp, r8, r9, i8, 3, 0x45, 0x3A, 0xC1);
+  TestRegReg(cmp, r9, r10, i8, 3, 0x45, 0x3A, 0xCA);
+  TestRegReg(cmp, r10, r11, i8, 3, 0x45, 0x3A, 0xD3);
+  TestRegReg(cmp, r11, r12, i8, 3, 0x45, 0x3A, 0xDC);
+  TestRegReg(cmp, r12, r13, i8, 3, 0x45, 0x3A, 0xE5);
+  TestRegReg(cmp, r13, r14, i8, 3, 0x45, 0x3A, 0xEE);
+  TestRegReg(cmp, r14, r15, i8, 3, 0x45, 0x3A, 0xF7);
+  TestRegReg(cmp, r15, eax, i8, 3, 0x44, 0x3A, 0xF8);
+
+  /* cmp GPR, Imm8 */
+  TestRegImm(cmp, eax, 5, i32, 3, 0x83, 0xF8, 0x05);
+  TestRegImm(cmp, ecx, 5, i32, 3, 0x83, 0xF9, 0x05);
+  TestRegImm(cmp, edx, 5, i32, 3, 0x83, 0xFA, 0x05);
+  TestRegImm(cmp, ebx, 5, i32, 3, 0x83, 0xFB, 0x05);
+  TestRegImm(cmp, esp, 5, i32, 3, 0x83, 0xFC, 0x05);
+  TestRegImm(cmp, ebp, 5, i32, 3, 0x83, 0xFD, 0x05);
+  TestRegImm(cmp, esi, 5, i32, 3, 0x83, 0xFE, 0x05);
+  TestRegImm(cmp, edi, 5, i32, 3, 0x83, 0xFF, 0x05);
+  TestRegImm(cmp, r8, 5, i32, 4, 0x41, 0x83, 0xF8, 0x05);
+  TestRegImm(cmp, r9, 5, i32, 4, 0x41, 0x83, 0xF9, 0x05);
+  TestRegImm(cmp, r10, 5, i32, 4, 0x41, 0x83, 0xFA, 0x05);
+  TestRegImm(cmp, r11, 5, i32, 4, 0x41, 0x83, 0xFB, 0x05);
+  TestRegImm(cmp, r12, 5, i32, 4, 0x41, 0x83, 0xFC, 0x05);
+  TestRegImm(cmp, r13, 5, i32, 4, 0x41, 0x83, 0xFD, 0x05);
+  TestRegImm(cmp, r14, 5, i32, 4, 0x41, 0x83, 0xFE, 0x05);
+  TestRegImm(cmp, r15, 5, i32, 4, 0x41, 0x83, 0xFF, 0x05);
+
+  TestRegImm(cmp, eax, 5, i16, 4, 0x66, 0x83, 0xF8, 0x05);
+  TestRegImm(cmp, ecx, 5, i16, 4, 0x66, 0x83, 0xF9, 0x05);
+  TestRegImm(cmp, edx, 5, i16, 4, 0x66, 0x83, 0xFA, 0x05);
+  TestRegImm(cmp, ebx, 5, i16, 4, 0x66, 0x83, 0xFB, 0x05);
+  TestRegImm(cmp, esp, 5, i16, 4, 0x66, 0x83, 0xFC, 0x05);
+  TestRegImm(cmp, ebp, 5, i16, 4, 0x66, 0x83, 0xFD, 0x05);
+  TestRegImm(cmp, esi, 5, i16, 4, 0x66, 0x83, 0xFE, 0x05);
+  TestRegImm(cmp, edi, 5, i16, 4, 0x66, 0x83, 0xFF, 0x05);
+  TestRegImm(cmp, r8, 5, i16, 5, 0x66, 0x41, 0x83, 0xF8, 0x05);
+  TestRegImm(cmp, r9, 5, i16, 5, 0x66, 0x41, 0x83, 0xF9, 0x05);
+  TestRegImm(cmp, r10, 5, i16, 5, 0x66, 0x41, 0x83, 0xFA, 0x05);
+  TestRegImm(cmp, r11, 5, i16, 5, 0x66, 0x41, 0x83, 0xFB, 0x05);
+  TestRegImm(cmp, r12, 5, i16, 5, 0x66, 0x41, 0x83, 0xFC, 0x05);
+  TestRegImm(cmp, r13, 5, i16, 5, 0x66, 0x41, 0x83, 0xFD, 0x05);
+  TestRegImm(cmp, r14, 5, i16, 5, 0x66, 0x41, 0x83, 0xFE, 0x05);
+  TestRegImm(cmp, r15, 5, i16, 5, 0x66, 0x41, 0x83, 0xFF, 0x05);
+
+  TestRegImm(cmp, eax, 5, i8, 2, 0x3C, 0x05);
+  TestRegImm(cmp, ecx, 5, i8, 3, 0x80, 0xF9, 0x05);
+  TestRegImm(cmp, edx, 5, i8, 3, 0x80, 0xFA, 0x05);
+  TestRegImm(cmp, ebx, 5, i8, 3, 0x80, 0xFB, 0x05);
+  TestRegImm(cmp, esp, 5, i8, 4, 0x40, 0x80, 0xFC, 0x05);
+  TestRegImm(cmp, ebp, 5, i8, 4, 0x40, 0x80, 0xFD, 0x05);
+  TestRegImm(cmp, esi, 5, i8, 4, 0x40, 0x80, 0xFE, 0x05);
+  TestRegImm(cmp, edi, 5, i8, 4, 0x40, 0x80, 0xFF, 0x05);
+  TestRegImm(cmp, r8, 5, i8, 4, 0x41, 0x80, 0xF8, 0x05);
+  TestRegImm(cmp, r9, 5, i8, 4, 0x41, 0x80, 0xF9, 0x05);
+  TestRegImm(cmp, r10, 5, i8, 4, 0x41, 0x80, 0xFA, 0x05);
+  TestRegImm(cmp, r11, 5, i8, 4, 0x41, 0x80, 0xFB, 0x05);
+  TestRegImm(cmp, r12, 5, i8, 4, 0x41, 0x80, 0xFC, 0x05);
+  TestRegImm(cmp, r13, 5, i8, 4, 0x41, 0x80, 0xFD, 0x05);
+  TestRegImm(cmp, r14, 5, i8, 4, 0x41, 0x80, 0xFE, 0x05);
+  TestRegImm(cmp, r15, 5, i8, 4, 0x41, 0x80, 0xFF, 0x05);
+
+  /* cmp GPR, Imm16 */
+  TestRegImm(cmp, eax, 0x100, i32, 5, 0x3D, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, ecx, 0x100, i32, 6, 0x81, 0xF9, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, edx, 0x100, i32, 6, 0x81, 0xFA, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, ebx, 0x100, i32, 6, 0x81, 0xFB, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, esp, 0x100, i32, 6, 0x81, 0xFC, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, ebp, 0x100, i32, 6, 0x81, 0xFD, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, esi, 0x100, i32, 6, 0x81, 0xFE, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, edi, 0x100, i32, 6, 0x81, 0xFF, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, r8, 0x100, i32, 7, 0x41, 0x81, 0xF8, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, r9, 0x100, i32, 7, 0x41, 0x81, 0xF9, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, r10, 0x100, i32, 7, 0x41, 0x81, 0xFA, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, r11, 0x100, i32, 7, 0x41, 0x81, 0xFB, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, r12, 0x100, i32, 7, 0x41, 0x81, 0xFC, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, r13, 0x100, i32, 7, 0x41, 0x81, 0xFD, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, r14, 0x100, i32, 7, 0x41, 0x81, 0xFE, 0x00, 0x01, 0x00, 0x00);
+  TestRegImm(cmp, r15, 0x100, i32, 7, 0x41, 0x81, 0xFF, 0x00, 0x01, 0x00, 0x00);
+
+  TestRegImm(cmp, eax, 0x100, i16, 4, 0x66, 0x3D, 0x00, 0x01);
+  TestRegImm(cmp, ecx, 0x100, i16, 5, 0x66, 0x81, 0xF9, 0x00, 0x01);
+  TestRegImm(cmp, edx, 0x100, i16, 5, 0x66, 0x81, 0xFA, 0x00, 0x01);
+  TestRegImm(cmp, ebx, 0x100, i16, 5, 0x66, 0x81, 0xFB, 0x00, 0x01);
+  TestRegImm(cmp, esp, 0x100, i16, 5, 0x66, 0x81, 0xFC, 0x00, 0x01);
+  TestRegImm(cmp, ebp, 0x100, i16, 5, 0x66, 0x81, 0xFD, 0x00, 0x01);
+  TestRegImm(cmp, esi, 0x100, i16, 5, 0x66, 0x81, 0xFE, 0x00, 0x01);
+  TestRegImm(cmp, edi, 0x100, i16, 5, 0x66, 0x81, 0xFF, 0x00, 0x01);
+  TestRegImm(cmp, r8, 0x100, i16, 6, 0x66, 0x41, 0x81, 0xF8, 0x00, 0x01);
+  TestRegImm(cmp, r9, 0x100, i16, 6, 0x66, 0x41, 0x81, 0xF9, 0x00, 0x01);
+  TestRegImm(cmp, r10, 0x100, i16, 6, 0x66, 0x41, 0x81, 0xFA, 0x00, 0x01);
+  TestRegImm(cmp, r11, 0x100, i16, 6, 0x66, 0x41, 0x81, 0xFB, 0x00, 0x01);
+  TestRegImm(cmp, r12, 0x100, i16, 6, 0x66, 0x41, 0x81, 0xFC, 0x00, 0x01);
+  TestRegImm(cmp, r13, 0x100, i16, 6, 0x66, 0x41, 0x81, 0xFD, 0x00, 0x01);
+  TestRegImm(cmp, r14, 0x100, i16, 6, 0x66, 0x41, 0x81, 0xFE, 0x00, 0x01);
+  TestRegImm(cmp, r15, 0x100, i16, 6, 0x66, 0x41, 0x81, 0xFF, 0x00, 0x01);
+
+  /* cmp GPR, Absolute */
+  TestRegAbsoluteAddr(cmp, eax, 0xF00FBEEF, i32, 6, 0x3B, 0x05, 0xEF, 0xBE,
+                      0x0F, 0xF0);
+  TestRegAbsoluteAddr(cmp, eax, 0xF00FBEEF, i16, 7, 0x66, 0x3B, 0x05, 0xEF,
+                      0xBE, 0x0F, 0xF0);
+  TestRegAbsoluteAddr(cmp, eax, 0xF00FBEEF, i8, 6, 0x3A, 0x05, 0xEF, 0xBE, 0x0F,
+                      0xF0);
+  TestRegAbsoluteAddr(cmp, r8, 0xF00FBEEF, i32, 7, 0x44, 0x3B, 0x05, 0xEF, 0xBE,
+                      0x0F, 0xF0);
+  TestRegAbsoluteAddr(cmp, r8, 0xF00FBEEF, i16, 8, 0x66, 0x44, 0x3B, 0x05, 0xEF,
+                      0xBE, 0x0F, 0xF0);
+  TestRegAbsoluteAddr(cmp, r8, 0xF00FBEEF, i8, 7, 0x44, 0x3A, 0x05, 0xEF, 0xBE,
+                      0x0F, 0xF0);
+
+  /* cmp GPR, 0(Base) */
+  TestRegAddrBase(cmp, eax, ecx, 0, i32, 2, 0x3B, 0x01);
+  TestRegAddrBase(cmp, ecx, edx, 0, i32, 2, 0x3B, 0x0A);
+  TestRegAddrBase(cmp, edx, ebx, 0, i32, 2, 0x3B, 0x13);
+  TestRegAddrBase(cmp, ebx, esp, 0, i32, 3, 0x3B, 0x1C, 0x24);
+  TestRegAddrBase(cmp, esp, ebp, 0, i32, 3, 0x3B, 0x65, 0x00);
+  TestRegAddrBase(cmp, ebp, esi, 0, i32, 2, 0x3B, 0x2E);
+  TestRegAddrBase(cmp, esi, edi, 0, i32, 2, 0x3B, 0x37);
+  TestRegAddrBase(cmp, edi, r8, 0, i32, 3, 0x41, 0x3B, 0x38);
+  TestRegAddrBase(cmp, r8, r9, 0, i32, 3, 0x45, 0x3B, 0x01);
+  TestRegAddrBase(cmp, r9, r10, 0, i32, 3, 0x45, 0x3B, 0x0A);
+  TestRegAddrBase(cmp, r10, r11, 0, i32, 3, 0x45, 0x3B, 0x13);
+  TestRegAddrBase(cmp, r11, r12, 0, i32, 4, 0x45, 0x3B, 0x1C, 0x24);
+  TestRegAddrBase(cmp, r12, r13, 0, i32, 4, 0x45, 0x3B, 0x65, 0x00);
+  TestRegAddrBase(cmp, r13, r14, 0, i32, 3, 0x45, 0x3B, 0x2E);
+  TestRegAddrBase(cmp, r14, r15, 0, i32, 3, 0x45, 0x3B, 0x37);
+  TestRegAddrBase(cmp, r15, eax, 0, i32, 3, 0x44, 0x3B, 0x38);
+
+  TestRegAddrBase(cmp, eax, ecx, 0, i16, 3, 0x66, 0x3B, 0x01);
+  TestRegAddrBase(cmp, ecx, edx, 0, i16, 3, 0x66, 0x3B, 0x0A);
+  TestRegAddrBase(cmp, edx, ebx, 0, i16, 3, 0x66, 0x3B, 0x13);
+  TestRegAddrBase(cmp, ebx, esp, 0, i16, 4, 0x66, 0x3B, 0x1C, 0x24);
+  TestRegAddrBase(cmp, esp, ebp, 0, i16, 4, 0x66, 0x3B, 0x65, 0x00);
+  TestRegAddrBase(cmp, ebp, esi, 0, i16, 3, 0x66, 0x3B, 0x2E);
+  TestRegAddrBase(cmp, esi, edi, 0, i16, 3, 0x66, 0x3B, 0x37);
+  TestRegAddrBase(cmp, edi, r8, 0, i16, 4, 0x66, 0x41, 0x3B, 0x38);
+  TestRegAddrBase(cmp, r8, r9, 0, i16, 4, 0x66, 0x45, 0x3B, 0x01);
+  TestRegAddrBase(cmp, r9, r10, 0, i16, 4, 0x66, 0x45, 0x3B, 0x0A);
+  TestRegAddrBase(cmp, r10, r11, 0, i16, 4, 0x66, 0x45, 0x3B, 0x13);
+  TestRegAddrBase(cmp, r11, r12, 0, i16, 5, 0x66, 0x45, 0x3B, 0x1C, 0x24);
+  TestRegAddrBase(cmp, r12, r13, 0, i16, 5, 0x66, 0x45, 0x3B, 0x65, 0x00);
+  TestRegAddrBase(cmp, r13, r14, 0, i16, 4, 0x66, 0x45, 0x3B, 0x2E);
+  TestRegAddrBase(cmp, r14, r15, 0, i16, 4, 0x66, 0x45, 0x3B, 0x37);
+  TestRegAddrBase(cmp, r15, eax, 0, i16, 4, 0x66, 0x44, 0x3B, 0x38);
+
+  TestRegAddrBase(cmp, eax, ecx, 0, i8, 2, 0x3A, 0x01);
+  TestRegAddrBase(cmp, ecx, edx, 0, i8, 2, 0x3A, 0x0A);
+  TestRegAddrBase(cmp, edx, ebx, 0, i8, 2, 0x3A, 0x13);
+  TestRegAddrBase(cmp, ebx, esp, 0, i8, 3, 0x3A, 0x1C, 0x24);
+  TestRegAddrBase(cmp, esp, ebp, 0, i8, 4, 0x40, 0x3A, 0x65, 0x00);
+  TestRegAddrBase(cmp, ebp, esi, 0, i8, 3, 0x40, 0x3A, 0x2E);
+  TestRegAddrBase(cmp, esi, edi, 0, i8, 3, 0x40, 0x3A, 0x37);
+  TestRegAddrBase(cmp, edi, r8, 0, i8, 3, 0x41, 0x3A, 0x38);
+  TestRegAddrBase(cmp, r8, r9, 0, i8, 3, 0x45, 0x3A, 0x01);
+  TestRegAddrBase(cmp, r9, r10, 0, i8, 3, 0x45, 0x3A, 0x0A);
+  TestRegAddrBase(cmp, r10, r11, 0, i8, 3, 0x45, 0x3A, 0x13);
+  TestRegAddrBase(cmp, r11, r12, 0, i8, 4, 0x45, 0x3A, 0x1C, 0x24);
+  TestRegAddrBase(cmp, r12, r13, 0, i8, 4, 0x45, 0x3A, 0x65, 0x00);
+  TestRegAddrBase(cmp, r13, r14, 0, i8, 3, 0x45, 0x3A, 0x2E);
+  TestRegAddrBase(cmp, r14, r15, 0, i8, 3, 0x45, 0x3A, 0x37);
+  TestRegAddrBase(cmp, r15, eax, 0, i8, 3, 0x44, 0x3A, 0x38);
+
+  /* cmp GPR, Imm8(Base) */
+  TestRegAddrBase(cmp, eax, ecx, 0x40, i32, 3, 0x3B, 0x41, 0x40);
+  TestRegAddrBase(cmp, ecx, edx, 0x40, i32, 3, 0x3B, 0x4A, 0x40);
+  TestRegAddrBase(cmp, edx, ebx, 0x40, i32, 3, 0x3B, 0x53, 0x40);
+  TestRegAddrBase(cmp, ebx, esp, 0x40, i32, 4, 0x3B, 0x5C, 0x24, 0x40);
+  TestRegAddrBase(cmp, esp, ebp, 0x40, i32, 3, 0x3B, 0x65, 0x40);
+  TestRegAddrBase(cmp, ebp, esi, 0x40, i32, 3, 0x3B, 0x6E, 0x40);
+  TestRegAddrBase(cmp, esi, edi, 0x40, i32, 3, 0x3B, 0x77, 0x40);
+  TestRegAddrBase(cmp, edi, r8, 0x40, i32, 4, 0x41, 0x3B, 0x78, 0x40);
+  TestRegAddrBase(cmp, r8, r9, 0x40, i32, 4, 0x45, 0x3B, 0x41, 0x40);
+  TestRegAddrBase(cmp, r9, r10, 0x40, i32, 4, 0x45, 0x3B, 0x4A, 0x40);
+  TestRegAddrBase(cmp, r10, r11, 0x40, i32, 4, 0x45, 0x3B, 0x53, 0x40);
+  TestRegAddrBase(cmp, r11, r12, 0x40, i32, 5, 0x45, 0x3B, 0x5C, 0x24, 0x40);
+  TestRegAddrBase(cmp, r12, r13, 0x40, i32, 4, 0x45, 0x3B, 0x65, 0x40);
+  TestRegAddrBase(cmp, r13, r14, 0x40, i32, 4, 0x45, 0x3B, 0x6E, 0x40);
+  TestRegAddrBase(cmp, r14, r15, 0x40, i32, 4, 0x45, 0x3B, 0x77, 0x40);
+  TestRegAddrBase(cmp, r15, eax, 0x40, i32, 4, 0x44, 0x3B, 0x78, 0x40);
+
+  TestRegAddrBase(cmp, eax, ecx, 0x40, i16, 4, 0x66, 0x3B, 0x41, 0x40);
+  TestRegAddrBase(cmp, ecx, edx, 0x40, i16, 4, 0x66, 0x3B, 0x4A, 0x40);
+  TestRegAddrBase(cmp, edx, ebx, 0x40, i16, 4, 0x66, 0x3B, 0x53, 0x40);
+  TestRegAddrBase(cmp, ebx, esp, 0x40, i16, 5, 0x66, 0x3B, 0x5C, 0x24, 0x40);
+  TestRegAddrBase(cmp, esp, ebp, 0x40, i16, 4, 0x66, 0x3B, 0x65, 0x40);
+  TestRegAddrBase(cmp, ebp, esi, 0x40, i16, 4, 0x66, 0x3B, 0x6E, 0x40);
+  TestRegAddrBase(cmp, esi, edi, 0x40, i16, 4, 0x66, 0x3B, 0x77, 0x40);
+  TestRegAddrBase(cmp, edi, r8, 0x40, i16, 5, 0x66, 0x41, 0x3B, 0x78, 0x40);
+  TestRegAddrBase(cmp, r8, r9, 0x40, i16, 5, 0x66, 0x45, 0x3B, 0x41, 0x40);
+  TestRegAddrBase(cmp, r9, r10, 0x40, i16, 5, 0x66, 0x45, 0x3B, 0x4A, 0x40);
+  TestRegAddrBase(cmp, r10, r11, 0x40, i16, 5, 0x66, 0x45, 0x3B, 0x53, 0x40);
+  TestRegAddrBase(cmp, r11, r12, 0x40, i16, 6, 0x66, 0x45, 0x3B, 0x5C, 0x24,
+                  0x40);
+  TestRegAddrBase(cmp, r12, r13, 0x40, i16, 5, 0x66, 0x45, 0x3B, 0x65, 0x40);
+  TestRegAddrBase(cmp, r13, r14, 0x40, i16, 5, 0x66, 0x45, 0x3B, 0x6E, 0x40);
+  TestRegAddrBase(cmp, r14, r15, 0x40, i16, 5, 0x66, 0x45, 0x3B, 0x77, 0x40);
+  TestRegAddrBase(cmp, r15, eax, 0x40, i16, 5, 0x66, 0x44, 0x3B, 0x78, 0x40);
+
+  TestRegAddrBase(cmp, eax, ecx, 0x40, i8, 3, 0x3A, 0x41, 0x40);
+  TestRegAddrBase(cmp, ecx, edx, 0x40, i8, 3, 0x3A, 0x4A, 0x40);
+  TestRegAddrBase(cmp, edx, ebx, 0x40, i8, 3, 0x3A, 0x53, 0x40);
+  TestRegAddrBase(cmp, ebx, esp, 0x40, i8, 4, 0x3A, 0x5C, 0x24, 0x40);
+  TestRegAddrBase(cmp, esp, ebp, 0x40, i8, 4, 0x40, 0x3A, 0x65, 0x40);
+  TestRegAddrBase(cmp, ebp, esi, 0x40, i8, 4, 0x40, 0x3A, 0x6E, 0x40);
+  TestRegAddrBase(cmp, esi, edi, 0x40, i8, 4, 0x40, 0x3A, 0x77, 0x40);
+  TestRegAddrBase(cmp, edi, r8, 0x40, i8, 4, 0x41, 0x3A, 0x78, 0x40);
+  TestRegAddrBase(cmp, r8, r9, 0x40, i8, 4, 0x45, 0x3A, 0x41, 0x40);
+  TestRegAddrBase(cmp, r9, r10, 0x40, i8, 4, 0x45, 0x3A, 0x4A, 0x40);
+  TestRegAddrBase(cmp, r10, r11, 0x40, i8, 4, 0x45, 0x3A, 0x53, 0x40);
+  TestRegAddrBase(cmp, r11, r12, 0x40, i8, 5, 0x45, 0x3A, 0x5C, 0x24, 0x40);
+  TestRegAddrBase(cmp, r12, r13, 0x40, i8, 4, 0x45, 0x3A, 0x65, 0x40);
+  TestRegAddrBase(cmp, r13, r14, 0x40, i8, 4, 0x45, 0x3A, 0x6E, 0x40);
+  TestRegAddrBase(cmp, r14, r15, 0x40, i8, 4, 0x45, 0x3A, 0x77, 0x40);
+  TestRegAddrBase(cmp, r15, eax, 0x40, i8, 4, 0x44, 0x3A, 0x78, 0x40);
+
+  /* cmp GPR, Imm32(Base) */
+  TestRegAddrBase(cmp, eax, ecx, 0xF0, i32, 6, 0x3B, 0x81, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, ecx, edx, 0xF0, i32, 6, 0x3B, 0x8A, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, edx, ebx, 0xF0, i32, 6, 0x3B, 0x93, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, ebx, esp, 0xF0, i32, 7, 0x3B, 0x9C, 0x24, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, esp, ebp, 0xF0, i32, 6, 0x3B, 0xA5, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, ebp, esi, 0xF0, i32, 6, 0x3B, 0xAE, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, esi, edi, 0xF0, i32, 6, 0x3B, 0xB7, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, edi, r8, 0xF0, i32, 7, 0x41, 0x3B, 0xB8, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, r8, r9, 0xF0, i32, 7, 0x45, 0x3B, 0x81, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, r9, r10, 0xF0, i32, 7, 0x45, 0x3B, 0x8A, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, r10, r11, 0xF0, i32, 7, 0x45, 0x3B, 0x93, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, r11, r12, 0xF0, i32, 8, 0x45, 0x3B, 0x9C, 0x24, 0xF0,
+                  0x00, 0x00, 0x00);
+  TestRegAddrBase(cmp, r12, r13, 0xF0, i32, 7, 0x45, 0x3B, 0xA5, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, r13, r14, 0xF0, i32, 7, 0x45, 0x3B, 0xAE, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, r14, r15, 0xF0, i32, 7, 0x45, 0x3B, 0xB7, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, r15, eax, 0xF0, i32, 7, 0x44, 0x3B, 0xB8, 0xF0, 0x00,
+                  0x00, 0x00);
+
+  TestRegAddrBase(cmp, eax, ecx, 0xF0, i16, 7, 0x66, 0x3B, 0x81, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, ecx, edx, 0xF0, i16, 7, 0x66, 0x3B, 0x8A, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, edx, ebx, 0xF0, i16, 7, 0x66, 0x3B, 0x93, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, ebx, esp, 0xF0, i16, 8, 0x66, 0x3B, 0x9C, 0x24, 0xF0,
+                  0x00, 0x00, 0x00);
+  TestRegAddrBase(cmp, esp, ebp, 0xF0, i16, 7, 0x66, 0x3B, 0xa5, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, ebp, esi, 0xF0, i16, 7, 0x66, 0x3B, 0xaE, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, esi, edi, 0xF0, i16, 7, 0x66, 0x3B, 0xb7, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, edi, r8, 0xF0, i16, 8, 0x66, 0x41, 0x3B, 0xb8, 0xF0,
+                  0x00, 0x00, 0x00);
+  TestRegAddrBase(cmp, r8, r9, 0xF0, i16, 8, 0x66, 0x45, 0x3B, 0x81, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, r9, r10, 0xF0, i16, 8, 0x66, 0x45, 0x3B, 0x8A, 0xF0,
+                  0x00, 0x00, 0x00);
+  TestRegAddrBase(cmp, r10, r11, 0xF0, i16, 8, 0x66, 0x45, 0x3B, 0x93, 0xF0,
+                  0x00, 0x00, 0x00);
+  TestRegAddrBase(cmp, r11, r12, 0xF0, i16, 9, 0x66, 0x45, 0x3B, 0x9C, 0x24,
+                  0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBase(cmp, r12, r13, 0xF0, i16, 8, 0x66, 0x45, 0x3B, 0xa5, 0xF0,
+                  0x00, 0x00, 0x00);
+  TestRegAddrBase(cmp, r13, r14, 0xF0, i16, 8, 0x66, 0x45, 0x3B, 0xaE, 0xF0,
+                  0x00, 0x00, 0x00);
+  TestRegAddrBase(cmp, r14, r15, 0xF0, i16, 8, 0x66, 0x45, 0x3B, 0xb7, 0xF0,
+                  0x00, 0x00, 0x00);
+  TestRegAddrBase(cmp, r15, eax, 0xF0, i16, 8, 0x66, 0x44, 0x3B, 0xb8, 0xF0,
+                  0x00, 0x00, 0x00);
+
+  TestRegAddrBase(cmp, eax, ecx, 0xF0, i8, 6, 0x3A, 0x81, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, ecx, edx, 0xF0, i8, 6, 0x3A, 0x8A, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, edx, ebx, 0xF0, i8, 6, 0x3A, 0x93, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, ebx, esp, 0xF0, i8, 7, 0x3A, 0x9C, 0x24, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, esp, ebp, 0xF0, i8, 7, 0x40, 0x3A, 0xA5, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, ebp, esi, 0xF0, i8, 7, 0x40, 0x3A, 0xAE, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, esi, edi, 0xF0, i8, 7, 0x40, 0x3A, 0xB7, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, edi, r8, 0xF0, i8, 7, 0x41, 0x3A, 0xB8, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, r8, r9, 0xF0, i8, 7, 0x45, 0x3A, 0x81, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, r9, r10, 0xF0, i8, 7, 0x45, 0x3A, 0x8A, 0xF0, 0x00, 0x00,
+                  0x00);
+  TestRegAddrBase(cmp, r10, r11, 0xF0, i8, 7, 0x45, 0x3A, 0x93, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, r11, r12, 0xF0, i8, 8, 0x45, 0x3A, 0x9C, 0x24, 0xF0,
+                  0x00, 0x00, 0x00);
+  TestRegAddrBase(cmp, r12, r13, 0xF0, i8, 7, 0x45, 0x3A, 0xA5, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, r13, r14, 0xF0, i8, 7, 0x45, 0x3A, 0xAE, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, r14, r15, 0xF0, i8, 7, 0x45, 0x3A, 0xB7, 0xF0, 0x00,
+                  0x00, 0x00);
+  TestRegAddrBase(cmp, r15, eax, 0xF0, i8, 7, 0x44, 0x3A, 0xB8, 0xF0, 0x00,
+                  0x00, 0x00);
+
+  /* cmp GPR, Imm(,Index,Scale) */
+  TestRegAddrScaledIndex(cmp, eax, ecx, 1, 0, i32, 7, 0x3B, 0x04, 0x0D, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ecx, edx, 2, 0, i32, 7, 0x3B, 0x0C, 0x55, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, edx, ebx, 4, 0, i32, 7, 0x3B, 0x14, 0x9D, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r8, r9, 1, 0, i32, 8, 0x46, 0x3B, 0x04, 0x0D,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r9, r10, 2, 0, i32, 8, 0x46, 0x3B, 0x0C, 0x55,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r10, r11, 4, 0, i32, 8, 0x46, 0x3B, 0x14, 0x9D,
+                         0x00, 0x00, 0x00, 0x00);
+  // esp cannot be a scaled index.
+  TestRegAddrScaledIndex(cmp, esp, ebp, 8, 0, i32, 7, 0x3B, 0x24, 0xED, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ebp, esi, 1, 0, i32, 7, 0x3B, 0x2C, 0x35, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, esi, edi, 2, 0, i32, 7, 0x3B, 0x34, 0x7D, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, edi, eax, 4, 0, i32, 7, 0x3B, 0x3C, 0x85, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ebx, ecx, 8, 0, i32, 7, 0x3B, 0x1C, 0xCD, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r12, r13, 8, 0, i32, 8, 0x46, 0x3B, 0x24, 0xED,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r13, r14, 1, 0, i32, 8, 0x46, 0x3B, 0x2C, 0x35,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r14, r15, 2, 0, i32, 8, 0x46, 0x3B, 0x34, 0x7D,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r15, r8, 4, 0, i32, 8, 0x46, 0x3B, 0x3C, 0x85,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r11, r9, 8, 0, i32, 8, 0x46, 0x3B, 0x1C, 0xCD,
+                         0x00, 0x00, 0x00, 0x00);
+
+  TestRegAddrScaledIndex(cmp, eax, ecx, 8, 0, i16, 8, 0x66, 0x3B, 0x04, 0xCD,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ecx, edx, 1, 0, i16, 8, 0x66, 0x3B, 0x0C, 0x15,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, edx, ebx, 2, 0, i16, 8, 0x66, 0x3B, 0x14, 0x5D,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r8, r9, 8, 0, i16, 9, 0x66, 0x46, 0x3B, 0x04,
+                         0xCD, 0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r9, r10, 1, 0, i16, 9, 0x66, 0x46, 0x3B, 0x0C,
+                         0x15, 0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r10, r11, 2, 0, i16, 9, 0x66, 0x46, 0x3B, 0x14,
+                         0x5D, 0x00, 0x00, 0x00, 0x00);
+  // esp cannot be a scaled index.
+  TestRegAddrScaledIndex(cmp, esp, ebp, 4, 0, i16, 8, 0x66, 0x3B, 0x24, 0xAD,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ebp, esi, 8, 0, i16, 8, 0x66, 0x3B, 0x2C, 0xF5,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, esi, edi, 1, 0, i16, 8, 0x66, 0x3B, 0x34, 0x3D,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, edi, eax, 2, 0, i16, 8, 0x66, 0x3B, 0x3C, 0x45,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ebx, ecx, 8, 0, i16, 8, 0x66, 0x3B, 0x1C, 0xCD,
+                         0x00, 0x00, 0x00, 0x00);
+
+  TestRegAddrScaledIndex(cmp, eax, ecx, 4, 0, i8, 7, 0x3A, 0x04, 0x8D, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ecx, edx, 8, 0, i8, 7, 0x3A, 0x0C, 0xD5, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, edx, ebx, 1, 0, i8, 7, 0x3A, 0x14, 0x1D, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r8, r9, 4, 0, i8, 8, 0x46, 0x3A, 0x04, 0x8D, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r9, r10, 8, 0, i8, 8, 0x46, 0x3A, 0x0C, 0xD5,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r10, r11, 1, 0, i8, 8, 0x46, 0x3A, 0x14, 0x1D,
+                         0x00, 0x00, 0x00, 0x00);
+  // esp cannot be a scaled index.
+  TestRegAddrScaledIndex(cmp, esp, ebp, 2, 0, i8, 8, 0x40, 0x3A, 0x24, 0x6D,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ebp, esi, 4, 0, i8, 8, 0x40, 0x3A, 0x2C, 0xB5,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, esi, edi, 8, 0, i8, 8, 0x40, 0x3A, 0x34, 0xFD,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, edi, eax, 1, 0, i8, 8, 0x40, 0x3A, 0x3C, 0x05,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, ebx, ecx, 8, 0, i8, 7, 0x3a, 0x1C, 0xCD, 0x00,
+                         0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r12, r13, 2, 0, i8, 8, 0x46, 0x3A, 0x24, 0x6D,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r13, r14, 4, 0, i8, 8, 0x46, 0x3A, 0x2C, 0xB5,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r14, r15, 8, 0, i8, 8, 0x46, 0x3A, 0x34, 0xFD,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r15, r8, 1, 0, i8, 8, 0x46, 0x3A, 0x3C, 0x05,
+                         0x00, 0x00, 0x00, 0x00);
+  TestRegAddrScaledIndex(cmp, r11, r9, 8, 0, i8, 8, 0x46, 0x3a, 0x1C, 0xCD,
+                         0x00, 0x00, 0x00, 0x00);
+
+  /* cmp GPR, 0(Base,Index,Scale) */
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0, i32, 3, 0x3B, 0x04,
+                             0x11);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0, i32, 3, 0x3B, 0x0C,
+                             0x5A);
+  TestRegAddrBaseScaledIndex(cmp, r8, r9, r10, 1, 0, i32, 4, 0x47, 0x3B, 0x04,
+                             0x11);
+  TestRegAddrBaseScaledIndex(cmp, r9, r10, r11, 2, 0, i32, 4, 0x47, 0x3B, 0x0C,
+                             0x5A);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0, i32, 3, 0x3B, 0x1C,
+                             0xAC);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0, i32, 4, 0x3B, 0x64, 0xF5,
+                             0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0, i32, 3, 0x3B, 0x2C,
+                             0x3E);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0, i32, 3, 0x3B, 0x34,
+                             0x47);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0, i32, 3, 0x3B, 0x3C,
+                             0x98);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0, i32, 3, 0x3B, 0x1C,
+                             0xD1);
+  TestRegAddrBaseScaledIndex(cmp, r11, r12, r13, 4, 0, i32, 4, 0x47, 0x3B, 0x1C,
+                             0xAC);
+  TestRegAddrBaseScaledIndex(cmp, r12, r13, r14, 8, 0, i32, 5, 0x47, 0x3B, 0x64,
+                             0xF5, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r13, r14, r15, 1, 0, i32, 4, 0x47, 0x3B, 0x2C,
+                             0x3E);
+  TestRegAddrBaseScaledIndex(cmp, r14, r15, r8, 2, 0, i32, 4, 0x47, 0x3B, 0x34,
+                             0x47);
+  TestRegAddrBaseScaledIndex(cmp, r15, r8, r11, 4, 0, i32, 4, 0x47, 0x3B, 0x3C,
+                             0x98);
+  TestRegAddrBaseScaledIndex(cmp, r11, r9, r10, 8, 0, i32, 4, 0x47, 0x3B, 0x1C,
+                             0xD1);
+
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0, i16, 4, 0x66, 0x3B, 0x04,
+                             0x11);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0, i16, 4, 0x66, 0x3B, 0x0C,
+                             0x5A);
+  TestRegAddrBaseScaledIndex(cmp, r8, r9, r10, 1, 0, i16, 5, 0x66, 0x47, 0x3B,
+                             0x04, 0x11);
+  TestRegAddrBaseScaledIndex(cmp, r9, r10, r11, 2, 0, i16, 5, 0x66, 0x47, 0x3B,
+                             0x0C, 0x5A);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0, i16, 4, 0x66, 0x3B, 0x1C,
+                             0xAC);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0, i16, 5, 0x66, 0x3B, 0x64,
+                             0xF5, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0, i16, 4, 0x66, 0x3B, 0x2C,
+                             0x3E);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0, i16, 4, 0x66, 0x3B, 0x34,
+                             0x47);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0, i16, 4, 0x66, 0x3B, 0x3C,
+                             0x98);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0, i16, 4, 0x66, 0x3B, 0x1C,
+                             0xD1);
+  TestRegAddrBaseScaledIndex(cmp, r11, r12, r13, 4, 0, i16, 5, 0x66, 0x47, 0x3B,
+                             0x1C, 0xAC);
+  TestRegAddrBaseScaledIndex(cmp, r12, r13, r14, 8, 0, i16, 6, 0x66, 0x47, 0x3B,
+                             0x64, 0xF5, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r13, r14, r15, 1, 0, i16, 5, 0x66, 0x47, 0x3B,
+                             0x2C, 0x3E);
+  TestRegAddrBaseScaledIndex(cmp, r14, r15, r8, 2, 0, i16, 5, 0x66, 0x47, 0x3B,
+                             0x34, 0x47);
+  TestRegAddrBaseScaledIndex(cmp, r15, r8, r11, 4, 0, i16, 5, 0x66, 0x47, 0x3B,
+                             0x3C, 0x98);
+  TestRegAddrBaseScaledIndex(cmp, r11, r9, r10, 8, 0, i16, 5, 0x66, 0x47, 0x3B,
+                             0x1C, 0xD1);
+
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0, i8, 3, 0x3A, 0x04, 0x11);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0, i8, 3, 0x3A, 0x0C, 0x5A);
+  TestRegAddrBaseScaledIndex(cmp, r8, r9, r10, 1, 0, i8, 4, 0x47, 0x3A, 0x04,
+                             0x11);
+  TestRegAddrBaseScaledIndex(cmp, r9, r10, r11, 2, 0, i8, 4, 0x47, 0x3A, 0x0C,
+                             0x5A);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0, i8, 3, 0x3A, 0x1C, 0xAC);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0, i8, 5, 0x40, 0x3A, 0x64,
+                             0xF5, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0, i8, 4, 0x40, 0x3A, 0x2C,
+                             0x3E);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0, i8, 4, 0x40, 0x3A, 0x34,
+                             0x47);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0, i8, 4, 0x40, 0x3A, 0x3C,
+                             0x98);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0, i8, 3, 0x3A, 0x1C, 0xD1);
+  TestRegAddrBaseScaledIndex(cmp, r11, r12, r13, 4, 0, i8, 4, 0x47, 0x3A, 0x1C,
+                             0xAC);
+  TestRegAddrBaseScaledIndex(cmp, r12, r13, r14, 8, 0, i8, 5, 0x47, 0x3A, 0x64,
+                             0xF5, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r13, r14, r15, 1, 0, i8, 4, 0x47, 0x3A, 0x2C,
+                             0x3E);
+  TestRegAddrBaseScaledIndex(cmp, r14, r15, r8, 2, 0, i8, 4, 0x47, 0x3A, 0x34,
+                             0x47);
+  TestRegAddrBaseScaledIndex(cmp, r15, r8, r11, 4, 0, i8, 4, 0x47, 0x3A, 0x3C,
+                             0x98);
+  TestRegAddrBaseScaledIndex(cmp, r11, r9, r10, 8, 0, i8, 4, 0x47, 0x3A, 0x1C,
+                             0xD1);
+
+  /* cmp GPR, Imm8(Base,Index,Scale) */
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0x40, i32, 4, 0x3B, 0x44,
+                             0x11, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0x40, i32, 4, 0x3B, 0x4C,
+                             0x5A, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r8, r9, r10, 1, 0x40, i32, 5, 0x47, 0x3B,
+                             0x44, 0x11, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r9, r10, r11, 2, 0x40, i32, 5, 0x47, 0x3B,
+                             0x4C, 0x5A, 0x40);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0x40, i32, 4, 0x3B, 0x5C,
+                             0xAC, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0x40, i32, 4, 0x3B, 0x64,
+                             0xF5, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0x40, i32, 4, 0x3B, 0x6C,
+                             0x3E, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0x40, i32, 4, 0x3B, 0x74,
+                             0x47, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0x40, i32, 4, 0x3B, 0x7C,
+                             0x98, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0x40, i32, 4, 0x3B, 0x5C,
+                             0xD1, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r11, r12, r13, 4, 0x40, i32, 5, 0x47, 0x3B,
+                             0x5C, 0xAC, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r12, r13, r14, 8, 0x40, i32, 5, 0x47, 0x3B,
+                             0x64, 0xF5, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r13, r14, r15, 1, 0x40, i32, 5, 0x47, 0x3B,
+                             0x6C, 0x3E, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r14, r15, r8, 2, 0x40, i32, 5, 0x47, 0x3B,
+                             0x74, 0x47, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r15, r8, r11, 4, 0x40, i32, 5, 0x47, 0x3B,
+                             0x7C, 0x98, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r11, r9, r10, 8, 0x40, i32, 5, 0x47, 0x3B,
+                             0x5C, 0xD1, 0x40);
+
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0x40, i16, 5, 0x66, 0x3B,
+                             0x44, 0x11, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0x40, i16, 5, 0x66, 0x3B,
+                             0x4C, 0x5A, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r8, r9, r10, 1, 0x40, i16, 6, 0x66, 0x47,
+                             0x3B, 0x44, 0x11, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r9, r10, r11, 2, 0x40, i16, 6, 0x66, 0x47,
+                             0x3B, 0x4C, 0x5A, 0x40);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0x40, i16, 5, 0x66, 0x3B,
+                             0x5C, 0xAC, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0x40, i16, 5, 0x66, 0x3B,
+                             0x64, 0xF5, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0x40, i16, 5, 0x66, 0x3B,
+                             0x6C, 0x3E, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0x40, i16, 5, 0x66, 0x3B,
+                             0x74, 0x47, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0x40, i16, 5, 0x66, 0x3B,
+                             0x7C, 0x98, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0x40, i16, 5, 0x66, 0x3B,
+                             0x5C, 0xD1, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r11, r12, r13, 4, 0x40, i16, 6, 0x66, 0x47,
+                             0x3B, 0x5C, 0xAC, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r12, r13, r14, 8, 0x40, i16, 6, 0x66, 0x47,
+                             0x3B, 0x64, 0xF5, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r13, r14, r15, 1, 0x40, i16, 6, 0x66, 0x47,
+                             0x3B, 0x6C, 0x3E, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r14, r15, r8, 2, 0x40, i16, 6, 0x66, 0x47,
+                             0x3B, 0x74, 0x47, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r15, r8, r11, 4, 0x40, i16, 6, 0x66, 0x47,
+                             0x3B, 0x7C, 0x98, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r11, r9, r10, 8, 0x40, i16, 6, 0x66, 0x47,
+                             0x3B, 0x5C, 0xD1, 0x40);
+
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0x40, i8, 4, 0x3A, 0x44,
+                             0x11, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0x40, i8, 4, 0x3A, 0x4C,
+                             0x5A, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r8, r9, r10, 1, 0x40, i8, 5, 0x47, 0x3A, 0x44,
+                             0x11, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r9, r10, r11, 2, 0x40, i8, 5, 0x47, 0x3A,
+                             0x4C, 0x5A, 0x40);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0x40, i8, 4, 0x3A, 0x5C,
+                             0xAC, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0x40, i8, 5, 0x40, 0x3A,
+                             0x64, 0xF5, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0x40, i8, 5, 0x40, 0x3A,
+                             0x6C, 0x3E, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0x40, i8, 5, 0x40, 0x3A,
+                             0x74, 0x47, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0x40, i8, 5, 0x40, 0x3A,
+                             0x7C, 0x98, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0x40, i8, 4, 0x3A, 0x5C,
+                             0xD1, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r11, r12, r13, 4, 0x40, i8, 5, 0x47, 0x3A,
+                             0x5C, 0xAC, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r12, r13, r14, 8, 0x40, i8, 5, 0x47, 0x3A,
+                             0x64, 0xF5, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r13, r14, r15, 1, 0x40, i8, 5, 0x47, 0x3A,
+                             0x6C, 0x3E, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r14, r15, r8, 2, 0x40, i8, 5, 0x47, 0x3A,
+                             0x74, 0x47, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r15, r8, r11, 4, 0x40, i8, 5, 0x47, 0x3A,
+                             0x7C, 0x98, 0x40);
+  TestRegAddrBaseScaledIndex(cmp, r11, r9, r10, 8, 0x40, i8, 5, 0x47, 0x3A,
+                             0x5C, 0xD1, 0x40);
+
+  /* cmp GPR, Imm32(Base,Index,Scale) */
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0xF0, i32, 7, 0x3B, 0x84,
+                             0x11, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0xF0, i32, 7, 0x3B, 0x8C,
+                             0x5A, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r8, r9, r10, 1, 0xF0, i32, 8, 0x47, 0x3B,
+                             0x84, 0x11, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r9, r10, r11, 2, 0xF0, i32, 8, 0x47, 0x3B,
+                             0x8C, 0x5A, 0xF0, 0x00, 0x00, 0x00);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0xF0, i32, 7, 0x3B, 0x9C,
+                             0xAC, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0xF0, i32, 7, 0x3B, 0xA4,
+                             0xF5, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0xF0, i32, 7, 0x3B, 0xAC,
+                             0x3E, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0xF0, i32, 7, 0x3B, 0xB4,
+                             0x47, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0xF0, i32, 7, 0x3B, 0xBC,
+                             0x98, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0xF0, i32, 7, 0x3B, 0x9C,
+                             0xD1, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r11, r12, r13, 4, 0xF0, i32, 8, 0x47, 0x3B,
+                             0x9C, 0xAC, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r12, r13, r14, 8, 0xF0, i32, 8, 0x47, 0x3B,
+                             0xA4, 0xF5, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r13, r14, r15, 1, 0xF0, i32, 8, 0x47, 0x3B,
+                             0xAC, 0x3E, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r14, r15, r8, 2, 0xF0, i32, 8, 0x47, 0x3B,
+                             0xB4, 0x47, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r15, r8, r11, 4, 0xF0, i32, 8, 0x47, 0x3B,
+                             0xBC, 0x98, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r11, r9, r10, 8, 0xF0, i32, 8, 0x47, 0x3B,
+                             0x9C, 0xD1, 0xF0, 0x00, 0x00, 0x00);
+
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0xF0, i16, 8, 0x66, 0x3B,
+                             0x84, 0x11, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0xF0, i16, 8, 0x66, 0x3B,
+                             0x8C, 0x5A, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r8, r9, r10, 1, 0xF0, i16, 9, 0x66, 0x47,
+                             0x3B, 0x84, 0x11, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r9, r10, r11, 2, 0xF0, i16, 9, 0x66, 0x47,
+                             0x3B, 0x8C, 0x5A, 0xF0, 0x00, 0x00, 0x00);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0xF0, i16, 8, 0x66, 0x3B,
+                             0x9C, 0xAC, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0xF0, i16, 8, 0x66, 0x3B,
+                             0xA4, 0xF5, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0xF0, i16, 8, 0x66, 0x3B,
+                             0xAC, 0x3E, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0xF0, i16, 8, 0x66, 0x3B,
+                             0xB4, 0x47, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0xF0, i16, 8, 0x66, 0x3B,
+                             0xBC, 0x98, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0xF0, i16, 8, 0x66, 0x3B,
+                             0x9C, 0xD1, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r11, r12, r13, 4, 0xF0, i16, 9, 0x66, 0x47,
+                             0x3B, 0x9C, 0xAC, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r12, r13, r14, 8, 0xF0, i16, 9, 0x66, 0x47,
+                             0x3B, 0xA4, 0xF5, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r13, r14, r15, 1, 0xF0, i16, 9, 0x66, 0x47,
+                             0x3B, 0xAC, 0x3E, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r14, r15, r8, 2, 0xF0, i16, 9, 0x66, 0x47,
+                             0x3B, 0xB4, 0x47, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r15, r8, r11, 4, 0xF0, i16, 9, 0x66, 0x47,
+                             0x3B, 0xBC, 0x98, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r11, r9, r10, 8, 0xF0, i16, 9, 0x66, 0x47,
+                             0x3B, 0x9C, 0xD1, 0xF0, 0x00, 0x00, 0x00);
+
+  TestRegAddrBaseScaledIndex(cmp, eax, ecx, edx, 1, 0xF0, i8, 7, 0x3A, 0x84,
+                             0x11, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ecx, edx, ebx, 2, 0xF0, i8, 7, 0x3A, 0x8C,
+                             0x5A, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r8, r9, r10, 1, 0xF0, i8, 8, 0x47, 0x3A, 0x84,
+                             0x11, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r9, r10, r11, 2, 0xF0, i8, 8, 0x47, 0x3A,
+                             0x8C, 0x5A, 0xF0, 0x00, 0x00, 0x00);
+  // esp cannot be a scaled index.
+  TestRegAddrBaseScaledIndex(cmp, ebx, esp, ebp, 4, 0xF0, i8, 7, 0x3A, 0x9C,
+                             0xAC, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, esp, ebp, esi, 8, 0xF0, i8, 8, 0x40, 0x3A,
+                             0xA4, 0xF5, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebp, esi, edi, 1, 0xF0, i8, 8, 0x40, 0x3A,
+                             0xAC, 0x3E, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, esi, edi, eax, 2, 0xF0, i8, 8, 0x40, 0x3A,
+                             0xB4, 0x47, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, edi, eax, ebx, 4, 0xF0, i8, 8, 0x40, 0x3A,
+                             0xBC, 0x98, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, ebx, ecx, edx, 8, 0xF0, i8, 7, 0x3A, 0x9C,
+                             0xD1, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r11, r12, r13, 4, 0xF0, i8, 8, 0x47, 0x3A,
+                             0x9C, 0xAC, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r12, r13, r14, 8, 0xF0, i8, 8, 0x47, 0x3A,
+                             0xA4, 0xF5, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r13, r14, r15, 1, 0xF0, i8, 8, 0x47, 0x3A,
+                             0xAC, 0x3E, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r14, r15, r8, 2, 0xF0, i8, 8, 0x47, 0x3A,
+                             0xB4, 0x47, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r15, r8, r11, 4, 0xF0, i8, 8, 0x47, 0x3A,
+                             0xBC, 0x98, 0xF0, 0x00, 0x00, 0x00);
+  TestRegAddrBaseScaledIndex(cmp, r11, r9, r10, 8, 0xF0, i8, 8, 0x47, 0x3A,
+                             0x9C, 0xD1, 0xF0, 0x00, 0x00, 0x00);
+
+  /* cmp Addr, Imm */
+  // Note: at this point we trust the assembler knows how to encode addresses,
+  // so no more exhaustive addressing mode testing.
+  TestAddrBaseScaledIndexImm(cmp, eax, ecx, 1, 0xF0, 0x12, i32, 8, 0x83, 0xBC,
+                             0x08, 0xF0, 0x00, 0x00, 0x00, 0x12);
+  TestAddrBaseScaledIndexImm(cmp, ecx, edx, 1, 0xF0, 0xF0, i32, 11, 0x81, 0xBC,
+                             0x11, 0xF0, 0x00, 0x00, 0x00, 0xF0, 0x00, 0x00,
+                             0x00);
+  TestAddrBaseScaledIndexImm(cmp, r8, r9, 1, 0xF0, 0x12, i32, 9, 0x43, 0x83,
+                             0xBC, 0x08, 0xF0, 0x00, 0x00, 0x00, 0x12);
+  TestAddrBaseScaledIndexImm(cmp, r9, r10, 1, 0xF0, 0xF0, i32, 12, 0x43, 0x81,
+                             0xBC, 0x11, 0xF0, 0x00, 0x00, 0x00, 0xF0, 0x00,
+                             0x00, 0x00);
+
+  TestAddrBaseScaledIndexImm(cmp, eax, ecx, 1, 0xF0, 0x12, i16, 9, 0x66, 0x83,
+                             0xBC, 0x08, 0xF0, 0x00, 0x00, 0x00, 0x12);
+  TestAddrBaseScaledIndexImm(cmp, ecx, edx, 1, 0xF0, 0xF0, i16, 10, 0x66, 0x81,
+                             0xBC, 0x11, 0xF0, 0x00, 0x00, 0x00, 0xF0, 0x00);
+  TestAddrBaseScaledIndexImm(cmp, r8, r9, 1, 0xF0, 0x12, i16, 10, 0x66, 0x43,
+                             0x83, 0xBC, 0x08, 0xF0, 0x00, 0x00, 0x00, 0x12);
+  TestAddrBaseScaledIndexImm(cmp, r9, r10, 1, 0xF0, 0xF0, i16, 11, 0x66, 0x43,
+                             0x81, 0xBC, 0x11, 0xF0, 0x00, 0x00, 0x00, 0xF0,
+                             0x00);
+
+  TestAddrBaseScaledIndexImm(cmp, eax, ecx, 1, 0xF0, 0x12, i8, 8, 0x80, 0xBC,
+                             0x08, 0xF0, 0x00, 0x00, 0x00, 0x12);
+  TestAddrBaseScaledIndexImm(cmp, r8, r9, 1, 0xF0, 0x12, i8, 9, 0x43, 0x80,
+                             0xBC, 0x08, 0xF0, 0x00, 0x00, 0x00, 0x12);
+
+  /* cmp Addr, GPR */
+  TestAddrBaseScaledIndexReg(cmp, eax, ecx, 1, 0xF0, edx, i32, 7, 0x39, 0x94,
+                             0x08, 0xF0, 0x00, 0x00, 0x00);
+  TestAddrBaseScaledIndexReg(cmp, r8, r9, 1, 0xF0, r10, i32, 8, 0x47, 0x39,
+                             0x94, 0x08, 0xF0, 0x00, 0x00, 0x00);
+
+  TestAddrBaseScaledIndexReg(cmp, eax, ecx, 1, 0xF0, edx, i16, 8, 0x66, 0x39,
+                             0x94, 0x08, 0xF0, 0x00, 0x00, 0x00);
+  TestAddrBaseScaledIndexReg(cmp, r8, r9, 1, 0xF0, r10, i16, 9, 0x66, 0x47,
+                             0x39, 0x94, 0x08, 0xF0, 0x00, 0x00, 0x00);
+
+  TestAddrBaseScaledIndexReg(cmp, eax, ecx, 1, 0xF0, edx, i8, 7, 0x38, 0x94,
+                             0x08, 0xF0, 0x00, 0x00, 0x00);
+  TestAddrBaseScaledIndexReg(cmp, r8, r9, 1, 0xF0, r10, i8, 8, 0x47, 0x38, 0x94,
+                             0x08, 0xF0, 0x00, 0x00, 0x00);
+
+#undef TestAddrBaseScaledIndexReg
+#undef TestAddrBaseScaledIndexImm
+#undef TestRegAddrBaseScaledIndex
+#undef TestRegAddrScaledIndex
+#undef TestRegAddrBase
+#undef TestRegAbsoluteAddr
+#undef TestRegImm
+#undef TestRegReg
+}
+
+TEST_F(AssemblerX8664Test, ScratchpadGettersAndSetters) {
+  const uint32_t Slot0 = allocateDword();
+  const uint32_t Slot1 = allocateDword();
+  const uint32_t Slot2 = allocateDword();
+  const uint32_t Slot3 = allocateDword();
+  AssembledTest Assembled = assemble();
+  Assembled.setDwordTo(Slot0, 0xBEEF0000u);
+  Assembled.setDwordTo(Slot1, 0xDEADu);
+  Assembled.setDwordTo(Slot2, 0x20406080u);
+  ASSERT_EQ(0xBEEF0000u, Assembled.contentsOfDword(Slot0));
+  ASSERT_EQ(0xDEADu, Assembled.contentsOfDword(Slot1));
+  ASSERT_EQ(0x20406080u, Assembled.contentsOfDword(Slot2));
+  ASSERT_EQ(0xDEADBEEF0000ull, Assembled.contentsOfQword(Slot0));
+  ASSERT_EQ(0x204060800000DEADull, Assembled.contentsOfQword(Slot1));
+  // A qword stored at Slot1 spans Slot1 (low half) and Slot2 (high half).
+  Assembled.setQwordTo(Slot1, 0x1234567890ABCDEFull);
+  ASSERT_EQ(0x1234567890ABCDEFull, Assembled.contentsOfQword(Slot1));
+  Assembled.setDwordTo(Slot0, 0xBEEF0000u);
+  ASSERT_EQ(0x90ABCDEFull, Assembled.contentsOfDword(Slot1));
+  ASSERT_EQ(0x12345678ull, Assembled.contentsOfDword(Slot2));
+  // The same storage is also written and read back as float and double.
+  Assembled.setDwordTo(Slot0, 1.0f);
+  ASSERT_FLOAT_EQ(1.0f, Assembled.contentsOfDword<float>(Slot0));
+  Assembled.setQwordTo(Slot0, 3.14);
+  ASSERT_DOUBLE_EQ(3.14, Assembled.contentsOfQword<double>(Slot0));
+  // A dqword stored at Slot0 is read back below one float per slot.
+  Assembled.setDqwordTo(Slot0, Dqword(1.0f, 2.0f, 3.0f, 4.0f));
+  ASSERT_EQ(Dqword(1.0f, 2.0f, 3.0f, 4.0f), Assembled.contentsOfDqword(Slot0));
+  EXPECT_FLOAT_EQ(1.0f, Assembled.contentsOfDword<float>(Slot0));
+  EXPECT_FLOAT_EQ(2.0f, Assembled.contentsOfDword<float>(Slot1));
+  EXPECT_FLOAT_EQ(3.0f, Assembled.contentsOfDword<float>(Slot2));
+  EXPECT_FLOAT_EQ(4.0f, Assembled.contentsOfDword<float>(Slot3));
+}
+
+} // end of anonymous namespace
+} // end of namespace Test
+} // end of namespace X8664
+} // end of namespace Ice
diff --git a/unittest/AssemblerX8664/Other.cpp b/unittest/AssemblerX8664/Other.cpp
new file mode 100644
index 0000000..f9f4753
--- /dev/null
+++ b/unittest/AssemblerX8664/Other.cpp
@@ -0,0 +1,68 @@
+//===- subzero/unittest/AssemblerX8664/Other.cpp --------------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AssemblerX8664/TestUtil.h"
+
+namespace Ice {
+namespace X8664 {
+namespace Test {
+namespace {
+
+// Checks the exact bytes emitted by nop(Size) for every Size from 1 to 8.
+TEST_F(AssemblerX8664LowLevelTest, Nop) {
+  // TestImpl emits a Size-byte nop, checks its length and encoding, and then
+  // resets the assembler so that the next case starts from an empty buffer.
+#define TestImpl(Size, ...)                                                    \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Size ", " #__VA_ARGS__ ")";      \
+    __ nop(Size);                                                              \
+    ASSERT_EQ(Size##u, codeBytesSize()) << TestString;                         \
+    ASSERT_TRUE(verifyBytes<Size>(codeBytes(), __VA_ARGS__)) << TestString;    \
+    reset();                                                                   \
+  } while (0)
+
+  TestImpl(1, 0x90);
+  TestImpl(2, 0x66, 0x90);
+  TestImpl(3, 0x0F, 0x1F, 0x00);
+  TestImpl(4, 0x0F, 0x1F, 0x40, 0x00);
+  TestImpl(5, 0x0F, 0x1F, 0x44, 0x00, 0x00);
+  TestImpl(6, 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00);
+  TestImpl(7, 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00);
+  TestImpl(8, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00);
+
+#undef TestImpl
+}
+
+// int3 must assemble to the single byte 0xCC.
+TEST_F(AssemblerX8664LowLevelTest, Int3) {
+  __ int3();
+  static constexpr uint32_t ByteCount = 1;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), 0xCC));
+}
+
+// hlt must assemble to the single byte 0xF4.
+TEST_F(AssemblerX8664LowLevelTest, Hlt) {
+  __ hlt();
+  static constexpr uint32_t ByteCount = 1;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), 0xF4));
+}
+
+// ud2 must assemble to the two bytes 0x0F 0x0B.
+TEST_F(AssemblerX8664LowLevelTest, Ud2) {
+  __ ud2();
+  static constexpr uint32_t ByteCount = 2;
+  ASSERT_EQ(ByteCount, codeBytesSize());
+  ASSERT_TRUE(verifyBytes<ByteCount>(codeBytes(), 0x0F, 0x0B));
+}
+
+} // end of anonymous namespace
+} // end of namespace Test
+} // end of namespace X8664
+} // end of namespace Ice
diff --git a/unittest/AssemblerX8664/TestUtil.h b/unittest/AssemblerX8664/TestUtil.h
new file mode 100644
index 0000000..4615459
--- /dev/null
+++ b/unittest/AssemblerX8664/TestUtil.h
@@ -0,0 +1,1097 @@
+
+//===- subzero/unittest/AssemblerX8664/TestUtil.h ---------------*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility classes for testing the X8664 Assembler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ASSEMBLERX8664_TESTUTIL_H_
+#define ASSEMBLERX8664_TESTUTIL_H_
+
+#include "IceAssemblerX8664.h"
+
+#include "gtest/gtest.h"
+
+#include <cassert>
+#include <sys/mman.h>
+
+namespace Ice {
+namespace X8664 {
+namespace Test {
+
+class AssemblerX8664TestBase : public ::testing::Test {
+protected:
+  using Address = AssemblerX8664::Traits::Address;
+  using ByteRegister = AssemblerX8664::Traits::ByteRegister;
+  using Cond = AssemblerX8664::Traits::Cond;
+  using GPRRegister = AssemblerX8664::Traits::GPRRegister;
+  using Traits = AssemblerX8664::Traits;
+  using XmmRegister = AssemblerX8664::Traits::XmmRegister;
+
+// The following are "nicknames" for all possible GPRs in x86-64. With those, we
+// can use, e.g., Encoded_GPR_al() instead of GPRRegister::Encoded_Reg_eax for 8
+// bit operands. They also introduce "regular" nicknames for the legacy x86-32
+// registers (e.g., eax becomes r1; esp, r0).
+//
+// Every alias of a register, regardless of the operand width its name suggests
+// (q, d, w, l suffixes or the 64/32/16/8-bit names), returns the same
+// GPRRegister encoding; the width is not part of the returned value.
+#define LegacyRegAliases(NewName, Name64, Name32, Name16, Name8)               \
+  static constexpr GPRRegister Encoded_GPR_##NewName() {                       \
+    return GPRRegister::Encoded_Reg_##Name32;                                  \
+  }                                                                            \
+  static constexpr GPRRegister Encoded_GPR_##NewName##q() {                    \
+    return GPRRegister::Encoded_Reg_##Name32;                                  \
+  }                                                                            \
+  static constexpr GPRRegister Encoded_GPR_##NewName##d() {                    \
+    return GPRRegister::Encoded_Reg_##Name32;                                  \
+  }                                                                            \
+  static constexpr GPRRegister Encoded_GPR_##NewName##w() {                    \
+    return GPRRegister::Encoded_Reg_##Name32;                                  \
+  }                                                                            \
+  static constexpr GPRRegister Encoded_GPR_##NewName##l() {                    \
+    return GPRRegister::Encoded_Reg_##Name32;                                  \
+  }                                                                            \
+  static constexpr GPRRegister Encoded_GPR_##Name64() {                        \
+    return GPRRegister::Encoded_Reg_##Name32;                                  \
+  }                                                                            \
+  static constexpr GPRRegister Encoded_GPR_##Name32() {                        \
+    return GPRRegister::Encoded_Reg_##Name32;                                  \
+  }                                                                            \
+  static constexpr GPRRegister Encoded_GPR_##Name16() {                        \
+    return GPRRegister::Encoded_Reg_##Name32;                                  \
+  }                                                                            \
+  static constexpr GPRRegister Encoded_GPR_##Name8() {                         \
+    return GPRRegister::Encoded_Reg_##Name32;                                  \
+  }
+#define NewRegAliases(Name)                                                    \
+  static constexpr GPRRegister Encoded_GPR_##Name() {                          \
+    return GPRRegister::Encoded_Reg_##Name##d;                                 \
+  }                                                                            \
+  static constexpr GPRRegister Encoded_GPR_##Name##q() {                       \
+    return GPRRegister::Encoded_Reg_##Name##d;                                 \
+  }                                                                            \
+  static constexpr GPRRegister Encoded_GPR_##Name##d() {                       \
+    return GPRRegister::Encoded_Reg_##Name##d;                                 \
+  }                                                                            \
+  static constexpr GPRRegister Encoded_GPR_##Name##w() {                       \
+    return GPRRegister::Encoded_Reg_##Name##d;                                 \
+  }                                                                            \
+  static constexpr GPRRegister Encoded_GPR_##Name##l() {                       \
+    return GPRRegister::Encoded_Reg_##Name##d;                                 \
+  }
+#define XmmRegAliases(Name)                                                    \
+  static constexpr XmmRegister Encoded_Xmm_##Name() {                          \
+    return XmmRegister::Encoded_Reg_##Name;                                    \
+  }
+  LegacyRegAliases(r0, rsp, esp, sp, spl);
+  LegacyRegAliases(r1, rax, eax, ax, al);
+  LegacyRegAliases(r2, rbx, ebx, bx, bl);
+  LegacyRegAliases(r3, rcx, ecx, cx, cl);
+  LegacyRegAliases(r4, rdx, edx, dx, dl);
+  LegacyRegAliases(r5, rbp, ebp, bp, bpl);
+  LegacyRegAliases(r6, rsi, esi, si, sil);
+  LegacyRegAliases(r7, rdi, edi, di, dil);
+  NewRegAliases(r8);
+  NewRegAliases(r9);
+  NewRegAliases(r10);
+  NewRegAliases(r11);
+  NewRegAliases(r12);
+  NewRegAliases(r13);
+  NewRegAliases(r14);
+  NewRegAliases(r15);
+  XmmRegAliases(xmm0);
+  XmmRegAliases(xmm1);
+  XmmRegAliases(xmm2);
+  XmmRegAliases(xmm3);
+  XmmRegAliases(xmm4);
+  XmmRegAliases(xmm5);
+  XmmRegAliases(xmm6);
+  XmmRegAliases(xmm7);
+  XmmRegAliases(xmm8);
+  XmmRegAliases(xmm9);
+  XmmRegAliases(xmm10);
+  XmmRegAliases(xmm11);
+  XmmRegAliases(xmm12);
+  XmmRegAliases(xmm13);
+  XmmRegAliases(xmm14);
+  XmmRegAliases(xmm15);
+#undef XmmRegAliases
+#undef NewRegAliases
+#undef LegacyRegAliases
+
+  AssemblerX8664TestBase() { reset(); }
+  // Replaces the current assembler with a newly constructed AssemblerX8664.
+  void reset() { Assembler.reset(new AssemblerX8664()); }
+
+  AssemblerX8664 *assembler() const { return Assembler.get(); }
+  // Size of the assembler's buffer view.
+  size_t codeBytesSize() const { return Assembler->getBufferView().size(); }
+  // Start of the assembler's buffer view, exposed as raw bytes.
+  const uint8_t *codeBytes() const {
+    return static_cast<const uint8_t *>(
+        static_cast<const void *>(Assembler->getBufferView().data()));
+  }
+
+private:
+  std::unique_ptr<AssemblerX8664> Assembler;
+};
+
+// __ is a helper macro. It allows test cases to emit X8664 assembly
+// instructions with
+//
+//   __ mov(GPRRegister::Reg_Eax, 1);
+//   __ ret();
+//
+// and so on. The idea of having this was "stolen" from dart's unit tests.
+#define __ (this->assembler())->
+
+// AssemblerX8664LowLevelTest verifies that the "basic" instructions the tests
+// rely on are encoded correctly. Therefore, instead of executing the assembled
+// code, these tests will verify that the assembled bytes are sane.
+class AssemblerX8664LowLevelTest : public AssemblerX8664TestBase {
+protected:
+  // verifyBytes<N>(Buffer, B0, ..., BN-1) checks Buffer[0..N-1] against the N
+  // given bytes. Each mismatch is reported through EXPECT_EQ, and the result is
+  // true only if all bytes match. This overload ends the recursion at I == N.
+  template <int N, int I> static bool verifyBytes(const uint8_t *) {
+    static_assert(I == N, "Invalid template instantiation.");
+    return true;
+  }
+  // Checks byte I of Buffer, then recurses on the remaining expected bytes.
+  template <int N, int I = 0, typename... Args>
+  static bool verifyBytes(const uint8_t *Buffer, uint8_t Byte,
+                          Args... OtherBytes) {
+    static_assert(I < N, "Invalid template instantiation.");
+    EXPECT_EQ(Byte, Buffer[I]) << "Byte " << (I + 1) << " of " << N;
+    return verifyBytes<N, I + 1>(Buffer, OtherBytes...) && Buffer[I] == Byte;
+  }
+};
+
+// After these tests we should have a sane environment; we know the following
+// work:
+//
+//  (*) zeroing eax, ebx, ecx, edx, edi, and esi;
+//  (*) call $4 instruction (used for ip materialization);
+//  (*) register push and pop;
+//  (*) cmp reg, reg; and
+//  (*) returning from functions.
+//
+// We can now dive into testing each emitting method in AssemblerX8664. Each
+// test will emit some instructions for performing the test. The assembled
+// instructions will operate in a "safe" environment. All x86-64 registers are
+// spilled to the program stack, and the registers are then zeroed out, with the
+// exception of %esp and %r9.
+//
+// The jitted code and the unittest code will share the same stack. Therefore,
+// test harnesses need to ensure it does not leave anything it pushed on the
+// stack.
+//
+// %r9 is initialized with a pointer for rIP-based addressing. This pointer is
+// used for position-independent access to a scratchpad area for use in tests.
+// In theory we could use rip-based addressing, but in practice that would
+// require creating fixups, which would, in turn, require creating a global
+// context. We therefore rely on the same technique used for pic code in x86-32
+// (i.e., IP materialization). Upon a test start up, a call(NextInstruction) is
+// executed. We then pop the return address from the stack, and use it for pic
+// addressing.
+//
+// The jitted code will look like the following:
+//
+// test:
+//       push   %r9
+//       call   test$materialize_ip
+// test$materialize_ip:                           <<------- %r9 will point here
+//       pop    %r9
+//       push   %rax
+//       push   %rbx
+//       push   %rcx
+//       push   %rdx
+//       push   %rbp
+//       push   %rdi
+//       push   %rsi
+//       push   %r8
+//       push   %r10
+//       push   %r11
+//       push   %r12
+//       push   %r13
+//       push   %r14
+//       push   %r15
+//       mov    $0, %rax
+//       mov    $0, %rbx
+//       mov    $0, %rcx
+//       mov    $0, %rdx
+//       mov    $0, %rbp
+//       mov    $0, %rdi
+//       mov    $0, %rsi
+//       mov    $0, %r8
+//       mov    $0, %r10
+//       mov    $0, %r11
+//       mov    $0, %r12
+//       mov    $0, %r13
+//       mov    $0, %r14
+//       mov    $0, %r15
+//
+//       << test code goes here >>
+//
+//       mov    %rax, {  0 + $ScratchpadOffset}(%rbp)
+//       mov    %rbx, {  8 + $ScratchpadOffset}(%rbp)
+//       mov    %rcx, { 16 + $ScratchpadOffset}(%rbp)
+//       mov    %rdx, { 24 + $ScratchpadOffset}(%rbp)
+//       mov    %rdi, { 32 + $ScratchpadOffset}(%rbp)
+//       mov    %rsi, { 40 + $ScratchpadOffset}(%rbp)
+//       mov    %rbp, { 48 + $ScratchpadOffset}(%rbp)
+//       mov    %rsp, { 56 + $ScratchpadOffset}(%rbp)
+//       mov    %r8,  { 64 + $ScratchpadOffset}(%rbp)
+//       mov    %r9,  { 72 + $ScratchpadOffset}(%rbp)
+//       mov    %r10, { 80 + $ScratchpadOffset}(%rbp)
+//       mov    %r11, { 88 + $ScratchpadOffset}(%rbp)
+//       mov    %r12, { 96 + $ScratchpadOffset}(%rbp)
+//       mov    %r13, {104 + $ScratchpadOffset}(%rbp)
+//       mov    %r14, {112 + $ScratchpadOffset}(%rbp)
+//       mov    %r15, {120 + $ScratchpadOffset}(%rbp)
+//       movups %xmm0,  {128 + $ScratchpadOffset}(%rbp)
+//       movups %xmm1,  {144 + $ScratchpadOffset}(%rbp)
+//       movups %xmm2,  {160 + $ScratchpadOffset}(%rbp)
+//       movups %xmm3,  {176 + $ScratchpadOffset}(%rbp)
+//       movups %xmm4,  {192 + $ScratchpadOffset}(%rbp)
+//       movups %xmm5,  {208 + $ScratchpadOffset}(%rbp)
+//       movups %xmm6,  {224 + $ScratchpadOffset}(%rbp)
+//       movups %xmm7,  {240 + $ScratchpadOffset}(%rbp)
+//       movups %xmm8,  {256 + $ScratchpadOffset}(%rbp)
+//       movups %xmm9,  {272 + $ScratchpadOffset}(%rbp)
+//       movups %xmm10, {288 + $ScratchpadOffset}(%rbp)
+//       movups %xmm11, {304 + $ScratchpadOffset}(%rbp)
+//       movups %xmm12, {320 + $ScratchpadOffset}(%rbp)
+//       movups %xmm13, {336 + $ScratchpadOffset}(%rbp)
+//       movups %xmm14, {352 + $ScratchpadOffset}(%rbp)
+//       movups %xmm15, {368 + $ScratchpadOffset}(%rbp)
+//
+//       pop    %r15
+//       pop    %r14
+//       pop    %r13
+//       pop    %r12
+//       pop    %r11
+//       pop    %r10
+//       pop    %r8
+//       pop    %rsi
+//       pop    %rdi
+//       pop    %rbp
+//       pop    %rdx
+//       pop    %rcx
+//       pop    %rbx
+//       pop    %rax
+//       pop    %r9
+//       ret
+//
+//      << ... >>
+//
+// scratchpad:                              <<------- accessed via $Offset(%ebp)
+//
+//      << test scratch area >>
+//
+// TODO(jpp): test the
+//
+//    mov %reg, $Offset(%ebp)
+//    movups %xmm, $Offset(%ebp)
+//
+// encodings using the low level assembler test ensuring that the register
+// values can be written to the scratchpad area.
+//
+// r9 was deliberately chosen so that every instruction accessing memory would
+// fail if the rex prefix was not emitted for it.
+class AssemblerX8664Test : public AssemblerX8664TestBase {
+protected:
+  // Dqword is used to represent 128-bit data types. The Dqword's contents are
+  // the same as the contents read from memory. Tests can then use the union
+  // members to verify the tests' outputs.
+  //
+  // NOTE: We want sizeof(Dqword) == sizeof(uint64_t) * 2. In other words, we
+  // want Dqword's contents to be **exactly** what the memory contents were so
+  // that we can do, e.g.,
+  //
+  // ...
+  // float Ret[4];
+  // // populate Ret
+  // return *reinterpret_cast<Dqword *>(&Ret);
+  //
+  // While being an ugly hack, this kind of return statement is used
+  // extensively in the PackedArith (see below) class.
+  union Dqword {
+    template <typename T0, typename T1, typename T2, typename T3,
+              typename = typename std::enable_if<
+                  std::is_floating_point<T0>::value>::type>
+    Dqword(T0 F0, T1 F1, T2 F2, T3 F3) { // Four float lanes.
+      F32[0] = F0;
+      F32[1] = F1;
+      F32[2] = F2;
+      F32[3] = F3;
+    }
+    // Four int32_t lanes; participates only when T deduces to int32_t.
+    template <typename T>
+    Dqword(typename std::enable_if<std::is_same<T, int32_t>::value, T>::type I0,
+           T I1, T I2, T I3) {
+      I32[0] = I0;
+      I32[1] = I1;
+      I32[2] = I2;
+      I32[3] = I3;
+    }
+    // Two uint64_t lanes; participates only when T deduces to uint64_t.
+    template <typename T>
+    Dqword(typename std::enable_if<std::is_same<T, uint64_t>::value, T>::type
+               U64_0,
+           T U64_1) {
+      U64[0] = U64_0;
+      U64[1] = U64_1;
+    }
+    // Two double lanes; participates only when T deduces to double.
+    template <typename T>
+    Dqword(typename std::enable_if<std::is_same<T, double>::value, T>::type D0,
+           T D1) {
+      F64[0] = D0;
+      F64[1] = D1;
+    }
+    // Equality compares the raw bytes of the union, not the lane values.
+    bool operator==(const Dqword &Rhs) const {
+      return std::memcmp(this, &Rhs, sizeof(*this)) == 0;
+    }
+    // Overlapping views of the same 16 bytes, one per element type.
+    double F64[2];
+    uint64_t U64[2];
+    int64_t I64[2];
+
+    float F32[4];
+    uint32_t U32[4];
+    int32_t I32[4];
+
+    uint16_t U16[8];
+    int16_t I16[8];
+
+    uint8_t U8[16];
+    int8_t I8[16];
+
+  private:
+    Dqword() = delete;
+  };
+
+  // As stated, we want this condition to hold, so we assert.
+  static_assert(sizeof(Dqword) == 2 * sizeof(uint64_t),
+                "Dqword has the wrong size.");
+
+  // PackedArith is an interface provider for Dqwords. PackedArith's C argument
+  // is the underlying Dqword's type, which is then used so that we can define
+  // operators in terms of C++ operators on the underlying elements' type.
+  template <typename C> class PackedArith {
+  public:
+    static constexpr uint32_t N = sizeof(Dqword) / sizeof(C);
+    static_assert(N * sizeof(C) == sizeof(Dqword),
+                  "Invalid template parameter.");
+    static_assert((N & 1) == 0, "N should be divisible by 2");
+    // Lane-wise fp comparison of element values; a true lane is -1, else 0.
+#define DefinePackedComparisonOperator(Op)                                     \
+  template <typename Container = C, int Size = N>                              \
+  typename std::enable_if<std::is_floating_point<Container>::value,            \
+                          Dqword>::type                                        \
+  operator Op(const Dqword &Rhs) const {                                       \
+    using ElemType =                                                           \
+        typename std::conditional<std::is_same<float, Container>::value,       \
+                                  int32_t, int64_t>::type;                     \
+    static_assert(sizeof(ElemType) == sizeof(Container),                       \
+                  "Check ElemType definition.");                               \
+    const Container *const RhsPtr =                                            \
+        reinterpret_cast<const Container *const>(&Rhs);                        \
+    const Container *const LhsPtr =                                            \
+        reinterpret_cast<const Container *const>(&Lhs);                        \
+    ElemType Ret[N];                                                           \
+    for (uint32_t i = 0; i < N; ++i) {                                         \
+      Ret[i] = (LhsPtr[i] Op RhsPtr[i]) ? -1 : 0;                              \
+    }                                                                          \
+    return *reinterpret_cast<Dqword *>(&Ret);                                  \
+  }
+
+    DefinePackedComparisonOperator(< );
+    DefinePackedComparisonOperator(<= );
+    DefinePackedComparisonOperator(> );
+    DefinePackedComparisonOperator(>= );
+    DefinePackedComparisonOperator(== );
+    DefinePackedComparisonOperator(!= );
+
+#undef DefinePackedComparisonOperator
+    // ord/unord: a lane is -1 when neither/either operand lane is a NaN.
+#define DefinePackedOrdUnordComparisonOperator(Op, Ordered)                    \
+  template <typename Container = C, int Size = N>                              \
+  typename std::enable_if<std::is_floating_point<Container>::value,            \
+                          Dqword>::type                                        \
+  Op(const Dqword &Rhs) const {                                                \
+    using ElemType =                                                           \
+        typename std::conditional<std::is_same<float, Container>::value,       \
+                                  int32_t, int64_t>::type;                     \
+    static_assert(sizeof(ElemType) == sizeof(Container),                       \
+                  "Check ElemType definition.");                               \
+    const Container *const RhsPtr =                                            \
+        reinterpret_cast<const Container *const>(&Rhs);                        \
+    const Container *const LhsPtr =                                            \
+        reinterpret_cast<const Container *const>(&Lhs);                        \
+    ElemType Ret[N];                                                           \
+    for (uint32_t i = 0; i < N; ++i) {                                         \
+      Ret[i] = (!(LhsPtr[i] == LhsPtr[i]) || !(RhsPtr[i] == RhsPtr[i])) !=     \
+                       (Ordered)                                               \
+                   ? -1                                                        \
+                   : 0;                                                        \
+    }                                                                          \
+    return *reinterpret_cast<Dqword *>(&Ret);                                  \
+  }
+
+    DefinePackedOrdUnordComparisonOperator(ord, true);
+    DefinePackedOrdUnordComparisonOperator(unord, false);
+#undef DefinePackedOrdUnordComparisonOperator
+    // Lane-wise Op. If RhsIndexChanges is false, every lane uses Rhs lane 0.
+#define DefinePackedArithOperator(Op, RhsIndexChanges, NeedsInt)               \
+  template <typename Container = C, int Size = N>                              \
+  Dqword operator Op(const Dqword &Rhs) const {                                \
+    using ElemTypeForFp = typename std::conditional<                           \
+        !(NeedsInt), Container,                                                \
+        typename std::conditional<                                             \
+            std::is_same<Container, float>::value, uint32_t,                   \
+            typename std::conditional<std::is_same<Container, double>::value,  \
+                                      uint64_t, void>::type>::type>::type;     \
+    using ElemType =                                                           \
+        typename std::conditional<std::is_integral<Container>::value,          \
+                                  Container, ElemTypeForFp>::type;             \
+    static_assert(!std::is_same<void, ElemType>::value,                        \
+                  "Check ElemType definition.");                               \
+    const ElemType *const RhsPtr =                                             \
+        reinterpret_cast<const ElemType *const>(&Rhs);                         \
+    const ElemType *const LhsPtr =                                             \
+        reinterpret_cast<const ElemType *const>(&Lhs);                         \
+    ElemType Ret[N];                                                           \
+    for (uint32_t i = 0; i < N; ++i) {                                         \
+      Ret[i] = LhsPtr[i] Op RhsPtr[(RhsIndexChanges) ? i : 0];                 \
+    }                                                                          \
+    return *reinterpret_cast<Dqword *>(&Ret);                                  \
+  }
+    // NeedsInt makes fp containers operate on same-sized unsigned integers.
+    DefinePackedArithOperator(>>, false, true);
+    DefinePackedArithOperator(<<, false, true);
+    DefinePackedArithOperator(+, true, false);
+    DefinePackedArithOperator(-, true, false);
+    DefinePackedArithOperator(/, true, false);
+    DefinePackedArithOperator(&, true, true);
+    DefinePackedArithOperator(|, true, true);
+    DefinePackedArithOperator (^, true, true);
+
+#undef DefinePackedArithOperator
+    // Lane-wise shift of every element by the immediate imm.
+#define DefinePackedArithShiftImm(Op)                                          \
+  template <typename Container = C, int Size = N>                              \
+  Dqword operator Op(uint8_t imm) const {                                      \
+    const Container *const LhsPtr =                                            \
+        reinterpret_cast<const Container *const>(&Lhs);                        \
+    Container Ret[N];                                                          \
+    for (uint32_t i = 0; i < N; ++i) {                                         \
+      Ret[i] = LhsPtr[i] Op imm;                                               \
+    }                                                                          \
+    return *reinterpret_cast<Dqword *>(&Ret);                                  \
+  }
+
+    DefinePackedArithShiftImm(>> );
+    DefinePackedArithShiftImm(<< );
+
+#undef DefinePackedArithShiftImm
+    // Lane-wise product for signed integer and fp element types.
+    template <typename Container = C, int Size = N>
+    typename std::enable_if<std::is_signed<Container>::value ||
+                                std::is_floating_point<Container>::value,
+                            Dqword>::type
+    operator*(const Dqword &Rhs) const {
+      static_assert((std::is_integral<Container>::value &&
+                     sizeof(Container) < sizeof(uint64_t)) ||
+                        std::is_floating_point<Container>::value,
+                    "* is only defined for i(8|16|32), and fp types.");
+
+      const Container *const RhsPtr =
+          reinterpret_cast<const Container *const>(&Rhs);
+      const Container *const LhsPtr =
+          reinterpret_cast<const Container *const>(&Lhs);
+      Container Ret[Size];
+      for (uint32_t i = 0; i < Size; ++i) {
+        Ret[i] = LhsPtr[i] * RhsPtr[i];
+      }
+      return *reinterpret_cast<Dqword *>(&Ret);
+    }
+    // Unsigned types: widening product of the even lanes (0, 2, ...) only.
+    template <typename Container = C, int Size = N,
+              typename = typename std::enable_if<
+                  !std::is_signed<Container>::value>::type>
+    Dqword operator*(const Dqword &Rhs) const {
+      static_assert(std::is_integral<Container>::value &&
+                        sizeof(Container) < sizeof(uint64_t),
+                    "* is only defined for ui(8|16|32)");
+      using NextType = typename std::conditional<
+          sizeof(Container) == 1, uint16_t,
+          typename std::conditional<sizeof(Container) == 2, uint32_t,
+                                    uint64_t>::type>::type;
+      static_assert(sizeof(Container) * 2 == sizeof(NextType),
+                    "Unexpected size");
+
+      const Container *const RhsPtr =
+          reinterpret_cast<const Container *const>(&Rhs);
+      const Container *const LhsPtr =
+          reinterpret_cast<const Container *const>(&Lhs);
+      NextType Ret[Size / 2];
+      for (uint32_t i = 0; i < Size; i += 2) {
+        Ret[i / 2] =
+            static_cast<NextType>(LhsPtr[i]) * static_cast<NextType>(RhsPtr[i]);
+      }
+      return *reinterpret_cast<Dqword *>(&Ret);
+    }
+    // Lane-wise bitwise complement, returned wrapped in a new PackedArith.
+    template <typename Container = C, int Size = N>
+    PackedArith<Container> operator~() const {
+      const Container *const LhsPtr =
+          reinterpret_cast<const Container *const>(&Lhs);
+      Container Ret[Size];
+      for (uint32_t i = 0; i < Size; ++i) {
+        Ret[i] = ~LhsPtr[i];
+      }
+      return PackedArith<Container>(*reinterpret_cast<Dqword *>(&Ret));
+    }
+    // Lane-wise std::min/std::max of the two operands (fp element types only).
+#define MinMaxOperations(Name, Suffix)                                         \
+  template <typename Container = C, int Size = N>                              \
+  Dqword Name##Suffix(const Dqword &Rhs) const {                               \
+    static_assert(std::is_floating_point<Container>::value,                    \
+                  #Name #Suffix " is only available for fp.");                 \
+    const Container *const RhsPtr =                                            \
+        reinterpret_cast<const Container *const>(&Rhs);                        \
+    const Container *const LhsPtr =                                            \
+        reinterpret_cast<const Container *const>(&Lhs);                        \
+    Container Ret[Size];                                                       \
+    for (uint32_t i = 0; i < Size; ++i) {                                      \
+      Ret[i] = std::Name(LhsPtr[i], RhsPtr[i]);                                \
+    }                                                                          \
+    return *reinterpret_cast<Dqword *>(&Ret);                                  \
+  }
+
+    MinMaxOperations(max, ps);
+    MinMaxOperations(max, pd);
+    MinMaxOperations(min, ps);
+    MinMaxOperations(min, pd);
+#undef MinMaxOperations
+    // Per lane, picks Rhs where the Mask lane is negative, else Lhs.
+    template <typename Container = C, int Size = N>
+    Dqword blendWith(const Dqword &Rhs, const Dqword &Mask) const {
+      using MaskType = typename std::conditional<
+          sizeof(Container) == 1, int8_t,
+          typename std::conditional<sizeof(Container) == 2, int16_t,
+                                    int32_t>::type>::type;
+      static_assert(sizeof(MaskType) == sizeof(Container),
+                    "MaskType has the wrong size.");
+      const Container *const RhsPtr =
+          reinterpret_cast<const Container *const>(&Rhs);
+      const Container *const LhsPtr =
+          reinterpret_cast<const Container *const>(&Lhs);
+      const MaskType *const MaskPtr =
+          reinterpret_cast<const MaskType *const>(&Mask);
+      Container Ret[Size];
+      for (int i = 0; i < Size; ++i) {
+        Ret[i] = ((MaskPtr[i] < 0) ? RhsPtr : LhsPtr)[i];
+      }
+      return *reinterpret_cast<Dqword *>(&Ret);
+    }
+
+  private:
+    // The AssemblerX8664Test class needs to be a friend so that it can create
+    // PackedArith objects (see below.)
+    friend class AssemblerX8664Test;
+
+    explicit PackedArith(const Dqword &MyLhs) : Lhs(MyLhs) {}
+
+    // Lhs can't be a & because operator~ returns a temporary object that needs
+    // access to its own Dqword.
+    const Dqword Lhs;
+  };
+
+  // Named constructor for PackedArith objects (whose constructor is private).
+  template <typename C> static PackedArith<C> packedAs(const Dqword &D) {
+    return PackedArith<C>(D);
+  }
+
+  AssemblerX8664Test() { reset(); } // reset() also emits the test prologue.
+
+  void reset() {
+    AssemblerX8664TestBase::reset();
+
+    NeedsEpilogue = true;
+    // These dwords are reserved for saving the register state (the GPR and xmm
+    // slots defined in AssembledTest) after the jitted code runs.
+    NumAllocatedDwords = AssembledTest::ScratchpadSlots;
+    addPrologue();
+  }
+
+  // AssembledTest wraps (and owns) a PROT_EXEC mmap'ed buffer holding the test
+  // code, including prologue/epilogue, followed by the scratchpad area. Every
+  // test uses the scratchpad to store the processor's registers after running;
+  // this class exposes helpers for reading that register state and for
+  // accessing the rest of the scratchpad.
+  class AssembledTest {
+    AssembledTest() = delete;
+    AssembledTest(const AssembledTest &) = delete;
+    AssembledTest &operator=(const AssembledTest &) = delete;
+
+  public:
+    static constexpr uint32_t MaximumCodeSize = 1 << 20;
+    static constexpr uint32_t raxSlot() { return 0; }
+    static constexpr uint32_t rbxSlot() { return 2; }
+    static constexpr uint32_t rcxSlot() { return 4; }
+    static constexpr uint32_t rdxSlot() { return 6; }
+    static constexpr uint32_t rdiSlot() { return 8; }
+    static constexpr uint32_t rsiSlot() { return 10; }
+    static constexpr uint32_t rbpSlot() { return 12; }
+    static constexpr uint32_t rspSlot() { return 14; }
+    static constexpr uint32_t r8Slot() { return 16; }
+    static constexpr uint32_t r9Slot() { return 18; }
+    static constexpr uint32_t r10Slot() { return 20; }
+    static constexpr uint32_t r11Slot() { return 22; }
+    static constexpr uint32_t r12Slot() { return 24; }
+    static constexpr uint32_t r13Slot() { return 26; }
+    static constexpr uint32_t r14Slot() { return 28; }
+    static constexpr uint32_t r15Slot() { return 30; }
+
+    // Each xmm register is saved in 4 consecutive dwords.
+    static constexpr uint32_t xmm0Slot() { return 32; }
+    static constexpr uint32_t xmm1Slot() { return 36; }
+    static constexpr uint32_t xmm2Slot() { return 40; }
+    static constexpr uint32_t xmm3Slot() { return 44; }
+    static constexpr uint32_t xmm4Slot() { return 48; }
+    static constexpr uint32_t xmm5Slot() { return 52; }
+    static constexpr uint32_t xmm6Slot() { return 56; }
+    static constexpr uint32_t xmm7Slot() { return 60; }
+    static constexpr uint32_t xmm8Slot() { return 64; }
+    static constexpr uint32_t xmm9Slot() { return 68; }
+    static constexpr uint32_t xmm10Slot() { return 72; }
+    static constexpr uint32_t xmm11Slot() { return 76; }
+    static constexpr uint32_t xmm12Slot() { return 80; }
+    static constexpr uint32_t xmm13Slot() { return 84; }
+    static constexpr uint32_t xmm14Slot() { return 88; }
+    static constexpr uint32_t xmm15Slot() { return 92; }
+
+    static constexpr uint32_t ScratchpadSlots = 96;
+
+    AssembledTest(const uint8_t *Data, const size_t MySize,
+                  const size_t ExtraStorageDwords)
+        : Size(MaximumCodeSize + 4 * ExtraStorageDwords) {
+      // MaxCodeSize is needed because EXPECT_LT needs a symbol with a name --
+      // presumably it binds by reference, odr-using the constexpr member.
+      uint32_t MaxCodeSize = MaximumCodeSize;
+      EXPECT_LT(MySize, MaxCodeSize);
+      assert(MySize < MaximumCodeSize);
+      ExecutableData = mmap(nullptr, Size, PROT_WRITE | PROT_READ | PROT_EXEC,
+                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+      EXPECT_NE(MAP_FAILED, ExecutableData) << strerror(errno);
+      assert(MAP_FAILED != ExecutableData);
+      std::memcpy(ExecutableData, Data, MySize);
+    }
+
+    // We allow AssembledTest to be moved so that we can return objects of
+    // this type.
+    AssembledTest(AssembledTest &&Buffer)
+        : ExecutableData(Buffer.ExecutableData), Size(Buffer.Size) {
+      Buffer.ExecutableData = nullptr;
+      Buffer.Size = 0;
+    }
+
+    AssembledTest &operator=(AssembledTest &&Buffer) {
+      if (this != &Buffer) {
+        // Unmap the buffer being replaced; overwriting it would leak it.
+        if (ExecutableData != nullptr)
+          munmap(ExecutableData, Size);
+        ExecutableData = Buffer.ExecutableData;
+        Buffer.ExecutableData = nullptr;
+        Size = Buffer.Size;
+        Buffer.Size = 0;
+      }
+      return *this;
+    }
+
+    ~AssembledTest() {
+      if (ExecutableData != nullptr) {
+        munmap(ExecutableData, Size);
+        ExecutableData = nullptr;
+      }
+    }
+
+    void run() const { reinterpret_cast<void (*)()>(ExecutableData)(); }
+
+#define LegacyRegAccessors(NewName, Name64, Name32, Name16, Name8)             \
+  static_assert(Encoded_GPR_##NewName() == Encoded_GPR_##Name64(),             \
+                "Invalid aliasing.");                                          \
+  uint64_t NewName() const {                                                   \
+    return contentsOfQword(AssembledTest::Name64##Slot());                     \
+  }                                                                            \
+  static_assert(Encoded_GPR_##NewName##q() == Encoded_GPR_##Name64(),          \
+                "Invalid aliasing.");                                          \
+  uint64_t NewName##q() const {                                                \
+    return contentsOfQword(AssembledTest::Name64##Slot());                     \
+  }                                                                            \
+  static_assert(Encoded_GPR_##NewName##d() == Encoded_GPR_##Name64(),          \
+                "Invalid aliasing.");                                          \
+  uint32_t NewName##d() const {                                                \
+    return contentsOfQword(AssembledTest::Name64##Slot());                     \
+  }                                                                            \
+  static_assert(Encoded_GPR_##NewName##w() == Encoded_GPR_##Name64(),          \
+                "Invalid aliasing.");                                          \
+  uint16_t NewName##w() const {                                                \
+    return contentsOfQword(AssembledTest::Name64##Slot());                     \
+  }                                                                            \
+  static_assert(Encoded_GPR_##NewName##l() == Encoded_GPR_##Name64(),          \
+                "Invalid aliasing.");                                          \
+  uint8_t NewName##l() const {                                                 \
+    return contentsOfQword(AssembledTest::Name64##Slot());                     \
+  }                                                                            \
+  uint64_t Name64() const {                                                    \
+    return contentsOfQword(AssembledTest::Name64##Slot());                     \
+  }                                                                            \
+  static_assert(Encoded_GPR_##Name32() == Encoded_GPR_##Name64(),              \
+                "Invalid aliasing.");                                          \
+  uint32_t Name32() const {                                                    \
+    return contentsOfQword(AssembledTest::Name64##Slot());                     \
+  }                                                                            \
+  static_assert(Encoded_GPR_##Name16() == Encoded_GPR_##Name64(),              \
+                "Invalid aliasing.");                                          \
+  uint16_t Name16() const {                                                    \
+    return contentsOfQword(AssembledTest::Name64##Slot());                     \
+  }                                                                            \
+  static_assert(Encoded_GPR_##Name8() == Encoded_GPR_##Name64(),               \
+                "Invalid aliasing.");                                          \
+  uint8_t Name8() const {                                                      \
+    return contentsOfQword(AssembledTest::Name64##Slot());                     \
+  }
+#define NewRegAccessors(NewName)                                               \
+  uint64_t NewName() const {                                                   \
+    return contentsOfQword(AssembledTest::NewName##Slot());                    \
+  }                                                                            \
+  uint64_t NewName##q() const {                                                \
+    return contentsOfQword(AssembledTest::NewName##Slot());                    \
+  }                                                                            \
+  uint32_t NewName##d() const {                                                \
+    return contentsOfQword(AssembledTest::NewName##Slot());                    \
+  }                                                                            \
+  uint16_t NewName##w() const {                                                \
+    return contentsOfQword(AssembledTest::NewName##Slot());                    \
+  }                                                                            \
+  uint8_t NewName##l() const {                                                 \
+    return contentsOfQword(AssembledTest::NewName##Slot());                    \
+  }
+#define XmmRegAccessor(Name)                                                   \
+  template <typename T> T Name() const {                                       \
+    return xmm<T>(AssembledTest::Name##Slot());                                \
+  }
+    LegacyRegAccessors(r0, rsp, esp, sp, spl);
+    LegacyRegAccessors(r1, rax, eax, ax, al);
+    LegacyRegAccessors(r2, rbx, ebx, bx, bl);
+    LegacyRegAccessors(r3, rcx, ecx, cx, cl);
+    LegacyRegAccessors(r4, rdx, edx, dx, dl);
+    LegacyRegAccessors(r5, rbp, ebp, bp, bpl);
+    LegacyRegAccessors(r6, rsi, esi, si, sil);
+    LegacyRegAccessors(r7, rdi, edi, di, dil);
+    NewRegAccessors(r8);
+    NewRegAccessors(r9);
+    NewRegAccessors(r10);
+    NewRegAccessors(r11);
+    NewRegAccessors(r12);
+    NewRegAccessors(r13);
+    NewRegAccessors(r14);
+    NewRegAccessors(r15);
+    XmmRegAccessor(xmm0);
+    XmmRegAccessor(xmm1);
+    XmmRegAccessor(xmm2);
+    XmmRegAccessor(xmm3);
+    XmmRegAccessor(xmm4);
+    XmmRegAccessor(xmm5);
+    XmmRegAccessor(xmm6);
+    XmmRegAccessor(xmm7);
+    XmmRegAccessor(xmm8);
+    XmmRegAccessor(xmm9);
+    XmmRegAccessor(xmm10);
+    XmmRegAccessor(xmm11);
+    XmmRegAccessor(xmm12);
+    XmmRegAccessor(xmm13);
+    XmmRegAccessor(xmm14);
+    XmmRegAccessor(xmm15);
+#undef XmmRegAccessor
+#undef NewRegAccessors
+#undef LegacyRegAccessors
+
+    // contentsOfDword is used for reading the values in the scratchpad area.
+    // Valid arguments are the dword ids returned by allocateDword() in
+    // AssemblerX8664Test; other inputs may break if the implementation changes.
+    template <typename T = uint32_t, typename = typename std::enable_if<
+                                         sizeof(T) == sizeof(uint32_t)>::type>
+    T contentsOfDword(uint32_t Dword) const {
+      return *reinterpret_cast<T *>(static_cast<uint8_t *>(ExecutableData) +
+                                    dwordOffset(Dword));
+    }
+
+    template <typename T = uint64_t, typename = typename std::enable_if<
+                                         sizeof(T) == sizeof(uint64_t)>::type>
+    T contentsOfQword(uint32_t InitialDword) const {
+      return *reinterpret_cast<T *>(static_cast<uint8_t *>(ExecutableData) +
+                                    dwordOffset(InitialDword));
+    }
+
+    Dqword contentsOfDqword(uint32_t InitialDword) const {
+      return *reinterpret_cast<Dqword *>(
+                 static_cast<uint8_t *>(ExecutableData) +
+                 dwordOffset(InitialDword));
+    }
+
+    template <typename T = uint32_t, typename = typename std::enable_if<
+                                         sizeof(T) == sizeof(uint32_t)>::type>
+    void setDwordTo(uint32_t Dword, T value) {
+      // memcpy stores value's bits without the aliasing UB of a pointer cast.
+      uint8_t *Dst = static_cast<uint8_t *>(ExecutableData);
+      std::memcpy(Dst + dwordOffset(Dword), &value, sizeof(value));
+    }
+
+    template <typename T = uint64_t, typename = typename std::enable_if<
+                                         sizeof(T) == sizeof(uint64_t)>::type>
+    void setQwordTo(uint32_t InitialDword, T value) {
+      uint8_t *Dst = static_cast<uint8_t *>(ExecutableData);
+      std::memcpy(Dst + dwordOffset(InitialDword), &value, sizeof(value));
+    }
+
+    void setDqwordTo(uint32_t InitialDword, const Dqword &qdword) {
+      setQwordTo(InitialDword, qdword.U64[0]);
+      setQwordTo(InitialDword + 2, qdword.U64[1]);
+    }
+
+  private:
+    template <typename T>
+    typename std::enable_if<std::is_same<T, Dqword>::value, Dqword>::type
+    xmm(uint8_t Slot) const {
+      return contentsOfDqword(Slot);
+    }
+
+    template <typename T>
+    typename std::enable_if<!std::is_same<T, Dqword>::value, T>::type
+    xmm(uint8_t Slot) const {
+      constexpr bool TIs64Bit = sizeof(T) == sizeof(uint64_t);
+      using _64BitType = typename std::conditional<TIs64Bit, T, uint64_t>::type;
+      using _32BitType = typename std::conditional<TIs64Bit, uint32_t, T>::type;
+      if (TIs64Bit) {
+        return contentsOfQword<_64BitType>(Slot);
+      }
+      return contentsOfDword<_32BitType>(Slot);
+    }
+
+    static uint32_t dwordOffset(uint32_t Index) {
+      return MaximumCodeSize + (Index * 4);
+    }
+
+    void *ExecutableData = nullptr;
+    size_t Size;
+  };
+
+  // assemble() packages the code emitted so far into an AssembledTest. The
+  // first call after a reset() also appends the epilogue to the jitted code,
+  // which is why this method cannot be const qualified.
+  AssembledTest assemble() {
+    if (NeedsEpilogue) {
+      // Emit the epilogue only once, however many times assemble() is called.
+      addEpilogue();
+      NeedsEpilogue = false;
+    }
+    return AssembledTest(codeBytes(), codeBytesSize(), NumAllocatedDwords);
+  }
+
+  // Allocates a new dword slot in the test's scratchpad area; returns its id.
+  uint32_t allocateDword() { return NumAllocatedDwords++; }
+
+  // Allocates a qword (two consecutive dwords) in the test's scratchpad area.
+  uint32_t allocateQword() {
+    const uint32_t FirstDword = allocateDword();
+    allocateDword(); // Second dword of the qword, i.e., FirstDword + 1.
+    return FirstDword;
+  }
+
+  // Allocates a dqword (two consecutive qwords) in the test's scratchpad area.
+  uint32_t allocateDqword() {
+    const uint32_t FirstDword = allocateQword();
+    allocateQword(); // Second qword of the dqword, i.e., FirstDword + 2.
+    return FirstDword;
+  }
+
+  Address dwordAddress(uint32_t Dword) { // r9-relative; see dwordDisp().
+    return Address(Encoded_GPR_r9(), dwordDisp(Dword));
+  }
+
+private:
+  // <reg>SlotAddress returns an AssemblerX8664::Traits::Address that encodes an
+  // address operand for the scratchpad slot of the named register (GPR or xmm).
+  // These are all private because, when jitting the test code, tests should
+  // not tamper with these values. Besides, during the test execution these
+  // slots' contents are undefined and should not be accessed.
+  Address raxSlotAddress() { return dwordAddress(AssembledTest::raxSlot()); }
+  Address rbxSlotAddress() { return dwordAddress(AssembledTest::rbxSlot()); }
+  Address rcxSlotAddress() { return dwordAddress(AssembledTest::rcxSlot()); }
+  Address rdxSlotAddress() { return dwordAddress(AssembledTest::rdxSlot()); }
+  Address rdiSlotAddress() { return dwordAddress(AssembledTest::rdiSlot()); }
+  Address rsiSlotAddress() { return dwordAddress(AssembledTest::rsiSlot()); }
+  Address rbpSlotAddress() { return dwordAddress(AssembledTest::rbpSlot()); }
+  Address rspSlotAddress() { return dwordAddress(AssembledTest::rspSlot()); }
+  Address r8SlotAddress() { return dwordAddress(AssembledTest::r8Slot()); }
+  Address r9SlotAddress() { return dwordAddress(AssembledTest::r9Slot()); }
+  Address r10SlotAddress() { return dwordAddress(AssembledTest::r10Slot()); }
+  Address r11SlotAddress() { return dwordAddress(AssembledTest::r11Slot()); }
+  Address r12SlotAddress() { return dwordAddress(AssembledTest::r12Slot()); }
+  Address r13SlotAddress() { return dwordAddress(AssembledTest::r13Slot()); }
+  Address r14SlotAddress() { return dwordAddress(AssembledTest::r14Slot()); }
+  Address r15SlotAddress() { return dwordAddress(AssembledTest::r15Slot()); }
+  Address xmm0SlotAddress() { return dwordAddress(AssembledTest::xmm0Slot()); }
+  Address xmm1SlotAddress() { return dwordAddress(AssembledTest::xmm1Slot()); }
+  Address xmm2SlotAddress() { return dwordAddress(AssembledTest::xmm2Slot()); }
+  Address xmm3SlotAddress() { return dwordAddress(AssembledTest::xmm3Slot()); }
+  Address xmm4SlotAddress() { return dwordAddress(AssembledTest::xmm4Slot()); }
+  Address xmm5SlotAddress() { return dwordAddress(AssembledTest::xmm5Slot()); }
+  Address xmm6SlotAddress() { return dwordAddress(AssembledTest::xmm6Slot()); }
+  Address xmm7SlotAddress() { return dwordAddress(AssembledTest::xmm7Slot()); }
+  Address xmm8SlotAddress() { return dwordAddress(AssembledTest::xmm8Slot()); }
+  Address xmm9SlotAddress() { return dwordAddress(AssembledTest::xmm9Slot()); }
+  Address xmm10SlotAddress() {
+    return dwordAddress(AssembledTest::xmm10Slot());
+  }
+  Address xmm11SlotAddress() {
+    return dwordAddress(AssembledTest::xmm11Slot());
+  }
+  Address xmm12SlotAddress() {
+    return dwordAddress(AssembledTest::xmm12Slot());
+  }
+  Address xmm13SlotAddress() {
+    return dwordAddress(AssembledTest::xmm13Slot());
+  }
+  Address xmm14SlotAddress() {
+    return dwordAddress(AssembledTest::xmm14Slot());
+  }
+  Address xmm15SlotAddress() {
+    return dwordAddress(AssembledTest::xmm15Slot());
+  }
+
+  // Computes the displacement, relative to r9, of the given scratchpad Dword.
+  // r9 holds the IP materialized by the prologue's call/pop pair, so the size
+  // of the instructions emitted before that point (push r9 and call imm) is
+  // discounted from the dword's offset within the buffer.
+  uint32_t dwordDisp(uint32_t Dword) const {
+    EXPECT_LT(Dword, NumAllocatedDwords);
+    assert(Dword < NumAllocatedDwords);
+    static constexpr uint8_t PushR9Bytes = 2;
+    static constexpr uint8_t CallImmBytes = 5;
+    const uint32_t BufferOffset = AssembledTest::MaximumCodeSize + (Dword * 4);
+    return BufferOffset - (PushR9Bytes + CallImmBytes);
+  }
+
+  void addPrologue() { // Emitted by reset() before any test instruction.
+    __ pushl(Encoded_GPR_r9());
+    __ call(Immediate(4)); // With the pop below, materializes the IP in r9.
+    __ popl(Encoded_GPR_r9());
+    // Save the other GPRs (except rsp); addEpilogue() pops them in reverse.
+    __ pushl(Encoded_GPR_rax());
+    __ pushl(Encoded_GPR_rbx());
+    __ pushl(Encoded_GPR_rcx());
+    __ pushl(Encoded_GPR_rdx());
+    __ pushl(Encoded_GPR_rbp());
+    __ pushl(Encoded_GPR_rdi());
+    __ pushl(Encoded_GPR_rsi());
+    __ pushl(Encoded_GPR_r8());
+    __ pushl(Encoded_GPR_r10());
+    __ pushl(Encoded_GPR_r11());
+    __ pushl(Encoded_GPR_r12());
+    __ pushl(Encoded_GPR_r13());
+    __ pushl(Encoded_GPR_r14());
+    __ pushl(Encoded_GPR_r15());
+    // Load 0 into every register saved above (r9 and rsp keep their values).
+    __ mov(IceType_i32, Encoded_GPR_rax(), Immediate(0x00));
+    __ mov(IceType_i32, Encoded_GPR_rbx(), Immediate(0x00));
+    __ mov(IceType_i32, Encoded_GPR_rcx(), Immediate(0x00));
+    __ mov(IceType_i32, Encoded_GPR_rdx(), Immediate(0x00));
+    __ mov(IceType_i32, Encoded_GPR_rbp(), Immediate(0x00));
+    __ mov(IceType_i32, Encoded_GPR_rdi(), Immediate(0x00));
+    __ mov(IceType_i32, Encoded_GPR_rsi(), Immediate(0x00));
+    __ mov(IceType_i32, Encoded_GPR_r8(), Immediate(0x00));
+    __ mov(IceType_i32, Encoded_GPR_r10(), Immediate(0x00));
+    __ mov(IceType_i32, Encoded_GPR_r11(), Immediate(0x00));
+    __ mov(IceType_i32, Encoded_GPR_r12(), Immediate(0x00));
+    __ mov(IceType_i32, Encoded_GPR_r13(), Immediate(0x00));
+    __ mov(IceType_i32, Encoded_GPR_r14(), Immediate(0x00));
+    __ mov(IceType_i32, Encoded_GPR_r15(), Immediate(0x00));
+  }
+
+  void addEpilogue() { // Dumps GPRs/xmms to their slots, then restores GPRs.
+    __ mov(IceType_i64, raxSlotAddress(), Encoded_GPR_rax());
+    __ mov(IceType_i64, rbxSlotAddress(), Encoded_GPR_rbx());
+    __ mov(IceType_i64, rcxSlotAddress(), Encoded_GPR_rcx());
+    __ mov(IceType_i64, rdxSlotAddress(), Encoded_GPR_rdx());
+    __ mov(IceType_i64, rdiSlotAddress(), Encoded_GPR_rdi());
+    __ mov(IceType_i64, rsiSlotAddress(), Encoded_GPR_rsi());
+    __ mov(IceType_i64, rbpSlotAddress(), Encoded_GPR_rbp());
+    __ mov(IceType_i64, rspSlotAddress(), Encoded_GPR_rsp());
+    __ mov(IceType_i64, r8SlotAddress(), Encoded_GPR_r8());
+    __ mov(IceType_i64, r9SlotAddress(), Encoded_GPR_r9());
+    __ mov(IceType_i64, r10SlotAddress(), Encoded_GPR_r10());
+    __ mov(IceType_i64, r11SlotAddress(), Encoded_GPR_r11());
+    __ mov(IceType_i64, r12SlotAddress(), Encoded_GPR_r12());
+    __ mov(IceType_i64, r13SlotAddress(), Encoded_GPR_r13());
+    __ mov(IceType_i64, r14SlotAddress(), Encoded_GPR_r14());
+    __ mov(IceType_i64, r15SlotAddress(), Encoded_GPR_r15());
+    __ movups(xmm0SlotAddress(), Encoded_Xmm_xmm0());
+    __ movups(xmm1SlotAddress(), Encoded_Xmm_xmm1());
+    __ movups(xmm2SlotAddress(), Encoded_Xmm_xmm2());
+    __ movups(xmm3SlotAddress(), Encoded_Xmm_xmm3());
+    __ movups(xmm4SlotAddress(), Encoded_Xmm_xmm4());
+    __ movups(xmm5SlotAddress(), Encoded_Xmm_xmm5());
+    __ movups(xmm6SlotAddress(), Encoded_Xmm_xmm6());
+    __ movups(xmm7SlotAddress(), Encoded_Xmm_xmm7());
+    __ movups(xmm8SlotAddress(), Encoded_Xmm_xmm8());
+    __ movups(xmm9SlotAddress(), Encoded_Xmm_xmm9());
+    __ movups(xmm10SlotAddress(), Encoded_Xmm_xmm10());
+    __ movups(xmm11SlotAddress(), Encoded_Xmm_xmm11());
+    __ movups(xmm12SlotAddress(), Encoded_Xmm_xmm12());
+    __ movups(xmm13SlotAddress(), Encoded_Xmm_xmm13());
+    __ movups(xmm14SlotAddress(), Encoded_Xmm_xmm14());
+    __ movups(xmm15SlotAddress(), Encoded_Xmm_xmm15());
+    // Restore what addPrologue() pushed, in reverse order (r9 was first in).
+    __ popl(Encoded_GPR_r15());
+    __ popl(Encoded_GPR_r14());
+    __ popl(Encoded_GPR_r13());
+    __ popl(Encoded_GPR_r12());
+    __ popl(Encoded_GPR_r11());
+    __ popl(Encoded_GPR_r10());
+    __ popl(Encoded_GPR_r8());
+    __ popl(Encoded_GPR_rsi());
+    __ popl(Encoded_GPR_rdi());
+    __ popl(Encoded_GPR_rbp());
+    __ popl(Encoded_GPR_rdx());
+    __ popl(Encoded_GPR_rcx());
+    __ popl(Encoded_GPR_rbx());
+    __ popl(Encoded_GPR_rax());
+    __ popl(Encoded_GPR_r9());
+    // Return to whoever called into the buffer (see AssembledTest::run()).
+    __ ret();
+  }
+
+  bool NeedsEpilogue;
+  uint32_t NumAllocatedDwords;
+};
+
+} // end of namespace Test
+} // end of namespace X8664
+} // end of namespace Ice
+
+#endif // ASSEMBLERX8664_TESTUTIL_H_
diff --git a/unittest/AssemblerX8664/XmmArith.cpp b/unittest/AssemblerX8664/XmmArith.cpp
new file mode 100644
index 0000000..ac51c02
--- /dev/null
+++ b/unittest/AssemblerX8664/XmmArith.cpp
@@ -0,0 +1,1914 @@
+//===- subzero/unittest/AssemblerX8664/XmmArith.cpp -----------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AssemblerX8664/TestUtil.h"
+
+namespace Ice {
+namespace X8664 {
+namespace Test {
+namespace {
+
+TEST_F(AssemblerX8664Test, ArithSS) { // addss/subss/mulss/divss, f32 & f64.
+#define TestArithSSXmmXmm(FloatSize, Src, Value0, Dst, Value1, Inst, Op)       \
+  do {                                                                         \
+    static_assert(FloatSize == 32 || FloatSize == 64,                          \
+                  "Invalid fp size " #FloatSize);                              \
+    static constexpr char TestString[] =                                       \
+        "(" #FloatSize ", " #Src ", " #Value0 ", " #Dst ", " #Value1           \
+        ", " #Inst ", " #Op ")";                                               \
+    static constexpr bool IsDouble = FloatSize == 64;                          \
+    using Type = std::conditional<IsDouble, double, float>::type;              \
+    const uint32_t T0 = allocateQword();                                       \
+    const Type V0 = Value0;                                                    \
+    const uint32_t T1 = allocateQword();                                       \
+    const Type V1 = Value1;                                                    \
+                                                                               \
+    __ movss(IceType_f##FloatSize, Encoded_Xmm_##Dst(), dwordAddress(T0));     \
+    __ movss(IceType_f##FloatSize, Encoded_Xmm_##Src(), dwordAddress(T1));     \
+    __ Inst(IceType_f##FloatSize, Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    if (IsDouble) {                                                            \
+      test.setQwordTo(T0, static_cast<double>(V0));                            \
+      test.setQwordTo(T1, static_cast<double>(V1));                            \
+    } else {                                                                   \
+      test.setDwordTo(T0, static_cast<float>(V0));                             \
+      test.setDwordTo(T1, static_cast<float>(V1));                             \
+    }                                                                          \
+                                                                               \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_DOUBLE_EQ(V0 Op V1, test.Dst<Type>()) << TestString;                \
+    reset();                                                                   \
+  } while (0)
+// Same check as TestArithSSXmmXmm, but the source operand is a memory address.
+#define TestArithSSXmmAddr(FloatSize, Value0, Dst, Value1, Inst, Op)           \
+  do {                                                                         \
+    static_assert(FloatSize == 32 || FloatSize == 64,                          \
+                  "Invalid fp size " #FloatSize);                              \
+    static constexpr char TestString[] =                                       \
+        "(" #FloatSize ", Addr, " #Value0 ", " #Dst ", " #Value1 ", " #Inst    \
+        ", " #Op ")";                                                          \
+    static constexpr bool IsDouble = FloatSize == 64;                          \
+    using Type = std::conditional<IsDouble, double, float>::type;              \
+    const uint32_t T0 = allocateQword();                                       \
+    const Type V0 = Value0;                                                    \
+    const uint32_t T1 = allocateQword();                                       \
+    const Type V1 = Value1;                                                    \
+                                                                               \
+    __ movss(IceType_f##FloatSize, Encoded_Xmm_##Dst(), dwordAddress(T0));     \
+    __ Inst(IceType_f##FloatSize, Encoded_Xmm_##Dst(), dwordAddress(T1));      \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    if (IsDouble) {                                                            \
+      test.setQwordTo(T0, static_cast<double>(V0));                            \
+      test.setQwordTo(T1, static_cast<double>(V1));                            \
+    } else {                                                                   \
+      test.setDwordTo(T0, static_cast<float>(V0));                             \
+      test.setDwordTo(T1, static_cast<float>(V1));                             \
+    }                                                                          \
+                                                                               \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_DOUBLE_EQ(V0 Op V1, test.Dst<Type>()) << TestString;                \
+    reset();                                                                   \
+  } while (0)
+// Exercises addss, subss, mulss and divss in both operand forms.
+#define TestArithSS(FloatSize, Src, Dst0, Dst1)                                \
+  do {                                                                         \
+    TestArithSSXmmXmm(FloatSize, Src, 1.0, Dst0, 10.0, addss, +);              \
+    TestArithSSXmmAddr(FloatSize, 2.0, Dst1, 20.0, addss, +);                  \
+    TestArithSSXmmXmm(FloatSize, Src, 3.0, Dst0, 30.0, subss, -);              \
+    TestArithSSXmmAddr(FloatSize, 4.0, Dst1, 40.0, subss, -);                  \
+    TestArithSSXmmXmm(FloatSize, Src, 5.0, Dst0, 50.0, mulss, *);              \
+    TestArithSSXmmAddr(FloatSize, 6.0, Dst1, 60.0, mulss, *);                  \
+    TestArithSSXmmXmm(FloatSize, Src, 7.0, Dst0, 70.0, divss, / );             \
+    TestArithSSXmmAddr(FloatSize, 8.0, Dst1, 80.0, divss, / );                 \
+  } while (0)
+// Instantiates TestArithSS for 32-bit and 64-bit floating point values.
+#define TestImpl(Src, Dst0, Dst1)                                              \
+  do {                                                                         \
+    TestArithSS(32, Src, Dst0, Dst1);                                          \
+    TestArithSS(64, Src, Dst0, Dst1);                                          \
+  } while (0)
+  // Rotate Src/Dst0/Dst1 through all sixteen xmm registers.
+  TestImpl(xmm0, xmm1, xmm2);
+  TestImpl(xmm1, xmm2, xmm3);
+  TestImpl(xmm2, xmm3, xmm4);
+  TestImpl(xmm3, xmm4, xmm5);
+  TestImpl(xmm4, xmm5, xmm6);
+  TestImpl(xmm5, xmm6, xmm7);
+  TestImpl(xmm6, xmm7, xmm8);
+  TestImpl(xmm7, xmm8, xmm9);
+  TestImpl(xmm8, xmm9, xmm10);
+  TestImpl(xmm9, xmm10, xmm11);
+  TestImpl(xmm10, xmm11, xmm12);
+  TestImpl(xmm11, xmm12, xmm13);
+  TestImpl(xmm12, xmm13, xmm14);
+  TestImpl(xmm13, xmm14, xmm15);
+  TestImpl(xmm14, xmm15, xmm0);
+  TestImpl(xmm15, xmm0, xmm1);
+
+#undef TestImpl
+#undef TestArithSS
+#undef TestArithSSXmmAddr
+#undef TestArithSSXmmXmm
+}
+
+TEST_F(AssemblerX8664Test, PArith) {
+#define TestPArithXmmXmm(Dst, Value0, Src, Value1, Inst, Op, Type, Size)       \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Inst ", " #Op       \
+        ", " #Type ", " #Size ")";                                             \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+                                                                               \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T1));                          \
+    __ Inst(IceType_i##Size, Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());        \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<Type##Size##_t>(V0) Op V1, test.Dst<Dqword>())          \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestPArithXmmAddr(Dst, Value0, Value1, Inst, Op, Type, Size)           \
+  do { /* Packed-integer Inst with an xmm Dst and a memory operand. */         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", " #Inst ", " #Op           \
+        ", " #Type ", " #Size ")";                                             \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    /* Second operand; Inst reads it from memory at T1. */                     \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+    /* Code under test: load Value0 into Dst, then apply Inst. */              \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ Inst(IceType_i##Size, Encoded_Xmm_##Dst(), dwordAddress(T1));           \
+    /* Assemble, seed both memory operands, then execute. */                   \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+    /* Dst must equal the reference packedAs<>(V0) Op V1. */                   \
+    ASSERT_EQ(packedAs<Type##Size##_t>(V0) Op V1, test.Dst<Dqword>())          \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestPArithXmmImm(Dst, Value0, Imm, Inst, Op, Type, Size)               \
+  do { /* Packed-integer Inst with an xmm Dst and an immediate. */             \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Imm ", " #Inst ", " #Op ", " #Type         \
+        ", " #Size ")";                                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    /* Code under test: load Value0 into Dst, then apply Inst. */              \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ Inst(IceType_i##Size, Encoded_Xmm_##Dst(), Immediate(Imm));             \
+    /* Assemble, seed the memory operand, then execute. */                     \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.run();                                                                \
+    /* Dst must equal the reference packedAs<>(V0) Op Imm. */                  \
+    ASSERT_EQ(packedAs<Type##Size##_t>(V0) Op Imm, test.Dst<Dqword>())         \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestPAndnXmmXmm(Dst, Value0, Src, Value1, Type, Size)                  \
+  do { /* pandn with xmm Dst and xmm Src; expects ~Value0 & Value1. */         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", pandn, " #Type         \
+        ", " #Size ")";                                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    /* Second operand; it is loaded into Src below. */                         \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+    /* Code under test: load both registers, then pandn Dst, Src. */           \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T1));                          \
+    __ pandn(IceType_i##Size, Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());       \
+    /* Assemble, seed both memory operands, then execute. */                   \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+    /* Dst must equal the reference ~packedAs<>(V0) & V1. */                   \
+    ASSERT_EQ(~(packedAs<Type##Size##_t>(V0)) & V1, test.Dst<Dqword>())        \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestPAndnXmmAddr(Dst, Value0, Value1, Type, Size)                      \
+  do { /* pandn with xmm Dst and a memory operand: ~Value0 & Value1. */        \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", pandn, " #Type ", " #Size  \
+        ")";                                                                   \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    /* Second operand; pandn reads it from memory at T1. */                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+    /* Code under test: load Value0 into Dst, then apply pandn. */             \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ pandn(IceType_i##Size, Encoded_Xmm_##Dst(), dwordAddress(T1));          \
+    /* Assemble, seed both memory operands, then execute. */                   \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+    /* Dst must equal the reference ~packedAs<>(V0) & V1. */                   \
+    ASSERT_EQ((~packedAs<Type##Size##_t>(V0)) & V1, test.Dst<Dqword>())        \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestPArithSize(Dst, Src, Size)                                         \
+  do { /* Runs every packed-integer check for one element size. */             \
+    static_assert(Size == 8 || Size == 16 || Size == 32, "Invalid size.");     \
+    if (Size != 8) { /* Shifts and multiplies are not run for Size == 8. */    \
+      TestPArithXmmXmm(                                                        \
+          Dst,                                                                 \
+          (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),  \
+          Src, (uint64_t(3u), uint64_t(0u)), psra, >>, int, Size);             \
+      TestPArithXmmAddr(Dst, (uint64_t(0x8040201008040201ull),                 \
+                              uint64_t(0x8080404002020101ull)),                \
+                        (uint64_t(3u), uint64_t(0u)), psra, >>, int, Size);    \
+      TestPArithXmmImm(Dst, (uint64_t(0x8040201008040201ull),                  \
+                             uint64_t(0x8080404002020101ull)),                 \
+                       3u, psra, >>, int, Size);                               \
+      TestPArithXmmXmm(                                                        \
+          Dst,                                                                 \
+          (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),  \
+          Src, (uint64_t(3u), uint64_t(0u)), psrl, >>, uint, Size);            \
+      TestPArithXmmAddr(Dst, (uint64_t(0x8040201008040201ull),                 \
+                              uint64_t(0x8080404002020101ull)),                \
+                        (uint64_t(3u), uint64_t(0u)), psrl, >>, uint, Size);   \
+      TestPArithXmmImm(Dst, (uint64_t(0x8040201008040201ull),                  \
+                             uint64_t(0x8080404002020101ull)),                 \
+                       3u, psrl, >>, uint, Size);                              \
+      TestPArithXmmXmm(                                                        \
+          Dst,                                                                 \
+          (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),  \
+          Src, (uint64_t(3u), uint64_t(0u)), psll, <<, uint, Size);            \
+      TestPArithXmmAddr(Dst, (uint64_t(0x8040201008040201ull),                 \
+                              uint64_t(0x8080404002020101ull)),                \
+                        (uint64_t(3u), uint64_t(0u)), psll, <<, uint, Size);   \
+      TestPArithXmmImm(Dst, (uint64_t(0x8040201008040201ull),                  \
+                             uint64_t(0x8080404002020101ull)),                 \
+                       3u, psll, <<, uint, Size);                              \
+      /* Packed multiplies. */                                                 \
+      TestPArithXmmXmm(Dst, (uint64_t(0x8040201008040201ull),                  \
+                             uint64_t(0x8080404002020101ull)),                 \
+                       Src, (uint64_t(0xFFFFFFFF00000000ull),                  \
+                             uint64_t(0x0123456789ABCDEull)),                  \
+                       pmull, *, int, Size);                                   \
+      TestPArithXmmAddr(                                                       \
+          Dst,                                                                 \
+          (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),  \
+          (uint64_t(0xFFFFFFFF00000000ull), uint64_t(0x0123456789ABCDEull)),   \
+          pmull, *, int, Size);                                                \
+      if (Size != 16) { /* pmuludq runs for Size == 32 only. */                \
+        TestPArithXmmXmm(Dst, (uint64_t(0x8040201008040201ull),                \
+                               uint64_t(0x8080404002020101ull)),               \
+                         Src, (uint64_t(0xFFFFFFFF00000000ull),                \
+                               uint64_t(0x0123456789ABCDEull)),                \
+                         pmuludq, *, uint, Size);                              \
+        TestPArithXmmAddr(                                                     \
+            Dst, (uint64_t(0x8040201008040201ull),                             \
+                  uint64_t(0x8080404002020101ull)),                            \
+            (uint64_t(0xFFFFFFFF00000000ull), uint64_t(0x0123456789ABCDEull)), \
+            pmuludq, *, uint, Size);                                           \
+      }                                                                        \
+    }                                                                          \
+    TestPArithXmmXmm(Dst, (uint64_t(0x8040201008040201ull),                    \
+                           uint64_t(0x8080404002020101ull)),                   \
+                     Src, (uint64_t(0xFFFFFFFF00000000ull),                    \
+                           uint64_t(0x0123456789ABCDEull)),                    \
+                     padd, +, int, Size);                                      \
+    TestPArithXmmAddr(                                                         \
+        Dst,                                                                   \
+        (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),    \
+        (uint64_t(0xFFFFFFFF00000000ull), uint64_t(0x0123456789ABCDEull)),     \
+        padd, +, int, Size);                                                   \
+    TestPArithXmmXmm(Dst, (uint64_t(0x8040201008040201ull),                    \
+                           uint64_t(0x8080404002020101ull)),                   \
+                     Src, (uint64_t(0xFFFFFFFF00000000ull),                    \
+                           uint64_t(0x0123456789ABCDEull)),                    \
+                     psub, -, int, Size);                                      \
+    TestPArithXmmAddr(                                                         \
+        Dst,                                                                   \
+        (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),    \
+        (uint64_t(0xFFFFFFFF00000000ull), uint64_t(0x0123456789ABCDEull)),     \
+        psub, -, int, Size);                                                   \
+    TestPArithXmmXmm(Dst, (uint64_t(0x8040201008040201ull),                    \
+                           uint64_t(0x8080404002020101ull)),                   \
+                     Src, (uint64_t(0xFFFFFFFF00000000ull),                    \
+                           uint64_t(0x0123456789ABCDEull)),                    \
+                     pand, &, int, Size);                                      \
+    TestPArithXmmAddr(                                                         \
+        Dst,                                                                   \
+        (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),    \
+        (uint64_t(0xFFFFFFFF00000000ull), uint64_t(0x0123456789ABCDEull)),     \
+        pand, &, int, Size);                                                   \
+    /* pandn needs its own macros: the reference is ~V0 & V1. */               \
+    TestPAndnXmmXmm(Dst, (uint64_t(0x8040201008040201ull),                     \
+                          uint64_t(0x8080404002020101ull)),                    \
+                    Src, (uint64_t(0xFFFFFFFF00000000ull),                     \
+                          uint64_t(0x0123456789ABCDEull)),                     \
+                    int, Size);                                                \
+    TestPAndnXmmAddr(                                                          \
+        Dst,                                                                   \
+        (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),    \
+        (uint64_t(0xFFFFFFFF00000000ull), uint64_t(0x0123456789ABCDEull)),     \
+        int, Size);                                                            \
+    /* Remaining bitwise operations. */                                        \
+    TestPArithXmmXmm(Dst, (uint64_t(0x8040201008040201ull),                    \
+                           uint64_t(0x8080404002020101ull)),                   \
+                     Src, (uint64_t(0xFFFFFFFF00000000ull),                    \
+                           uint64_t(0x0123456789ABCDEull)),                    \
+                     por, |, int, Size);                                       \
+    TestPArithXmmAddr(                                                         \
+        Dst,                                                                   \
+        (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),    \
+        (uint64_t(0xFFFFFFFF00000000ull), uint64_t(0x0123456789ABCDEull)),     \
+        por, |, int, Size);                                                    \
+    TestPArithXmmXmm(Dst, (uint64_t(0x8040201008040201ull),                    \
+                           uint64_t(0x8080404002020101ull)),                   \
+                     Src, (uint64_t(0xFFFFFFFF00000000ull),                    \
+                           uint64_t(0x0123456789ABCDEull)),                    \
+                     pxor, ^, int, Size);                                      \
+    TestPArithXmmAddr(                                                         \
+        Dst,                                                                   \
+        (uint64_t(0x8040201008040201ull), uint64_t(0x8080404002020101ull)),    \
+        (uint64_t(0xFFFFFFFF00000000ull), uint64_t(0x0123456789ABCDEull)),     \
+        pxor, ^, int, Size);                                                   \
+  } while (0)
+
+#define TestPArith(Dst, Src)                                                   \
+  do { /* First argument is the destination register, second the source. */    \
+    TestPArithSize(Dst, Src, 8);                                               \
+    TestPArithSize(Dst, Src, 16);                                              \
+    TestPArithSize(Dst, Src, 32);                                              \
+  } while (0)
+
+  TestPArith(xmm0, xmm1); // The first register is used as the destination.
+  TestPArith(xmm1, xmm2);
+  TestPArith(xmm2, xmm3);
+  TestPArith(xmm3, xmm4);
+  TestPArith(xmm4, xmm5);
+  TestPArith(xmm5, xmm6);
+  TestPArith(xmm6, xmm7);
+  TestPArith(xmm7, xmm8);
+  TestPArith(xmm8, xmm9);
+  TestPArith(xmm9, xmm10);
+  TestPArith(xmm10, xmm11);
+  TestPArith(xmm11, xmm12);
+  TestPArith(xmm12, xmm13);
+  TestPArith(xmm13, xmm14);
+  TestPArith(xmm14, xmm15);
+  TestPArith(xmm15, xmm0); // Wraps around so xmm0 is also a second operand.
+
+#undef TestPArith // Undefine the helpers so they do not leak past this test.
+#undef TestPArithSize
+#undef TestPAndnXmmAddr
+#undef TestPAndnXmmXmm
+#undef TestPArithXmmImm
+#undef TestPArithXmmAddr
+#undef TestPArithXmmXmm
+}
+
+// Packed floating-point arithmetic, logic and min/max instructions (ps and
+// pd forms), checked against the packedAs<> reference for every register
+// pair (xmmN, xmmN+1), wrapping from xmm15 back to xmm0.
+TEST_F(AssemblerX8664Test, ArithPS) {
+#define TestArithPSXmmXmm(Dst, Value0, Src, Value1, Inst, Op, Type)            \
+  do { /* Typed (IceType_f32) Inst on two xmm registers. */                    \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Inst ", " #Op       \
+        ", " #Type ")";                                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+    /* Code under test: load both registers, then Inst Dst, Src. */            \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T1));                          \
+    __ Inst(IceType_f32, Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());            \
+    /* Assemble, seed both memory operands, then execute. */                   \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+    /* Dst must equal the reference packedAs<Type>(V0) Op V1. */               \
+    ASSERT_EQ(packedAs<Type>(V0) Op V1, test.Dst<Dqword>()) << TestString;     \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+#define TestArithPSXmmXmmUntyped(Dst, Value0, Src, Value1, Inst, Op, Type)     \
+  do { /* Inst without a type argument, on two xmm registers. */               \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Inst ", " #Op       \
+        ", " #Type ")";                                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+    /* Code under test: load both registers, then Inst Dst, Src. */            \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T1));                          \
+    __ Inst(Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());                         \
+    /* Assemble, seed both memory operands, then execute. */                   \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+    /* Dst must equal the reference packedAs<Type>(V0) Op V1. */               \
+    ASSERT_EQ(packedAs<Type>(V0) Op V1, test.Dst<Dqword>()) << TestString;     \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+#define TestArithPSXmmAddrUntyped(Dst, Value0, Value1, Inst, Op, Type)         \
+  do { /* Inst without a type argument, xmm Dst and memory operand. */         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", " #Inst ", " #Op           \
+        ", " #Type ")";                                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+    /* Code under test: load Value0 into Dst, then apply Inst. */              \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ Inst(Encoded_Xmm_##Dst(), dwordAddress(T1));                            \
+    /* Assemble, seed both memory operands, then execute. */                   \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+    /* Dst must equal the reference packedAs<Type>(V0) Op V1. */               \
+    ASSERT_EQ(packedAs<Type>(V0) Op V1, test.Dst<Dqword>()) << TestString;     \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+#define TestMinMaxPS(Dst, Value0, Src, Value1, Inst, Type)                     \
+  do { /* Min/max Inst; the reference is packedAs<Type>(V0).Inst(V1). */       \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Inst ", " #Type     \
+        ")";                                                                   \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+    /* Code under test: load both registers, then Inst Dst, Src. */            \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T1));                          \
+    __ Inst(Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());                         \
+    /* Assemble, seed both memory operands, then execute. */                   \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+    /* The reference helper is named after the instruction. */                 \
+    ASSERT_EQ(packedAs<Type>(V0).Inst(V1), test.Dst<Dqword>()) << TestString;  \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+#define TestArithPSXmmAddr(Dst, Value0, Value1, Inst, Op, Type)                \
+  do { /* Typed (IceType_f32) Inst, xmm Dst and memory operand. */             \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", " #Inst ", " #Op           \
+        ", " #Type ")";                                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+    /* Code under test: load Value0 into Dst, then apply Inst. */              \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ Inst(IceType_f32, Encoded_Xmm_##Dst(), dwordAddress(T1));               \
+    /* Assemble, seed both memory operands, then execute. */                   \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+    /* Dst must equal the reference packedAs<Type>(V0) Op V1. */               \
+    ASSERT_EQ(packedAs<Type>(V0) Op V1, test.Dst<Dqword>()) << TestString;     \
+                                                                               \
+    reset();                                                                   \
+  } while (0)
+
+#define TestArithPS(Dst, Src)                                                  \
+  do { /* All checks for one (Dst, Src) register pair. */                      \
+    TestArithPSXmmXmm(Dst, (1.0, 100.0, -1000.0, 20.0), Src,                   \
+                      (0.55, 0.43, 0.23, 1.21), addps, +, float);              \
+    TestArithPSXmmAddr(Dst, (1.0, 100.0, -1000.0, 20.0),                       \
+                       (0.55, 0.43, 0.23, 1.21), addps, +, float);             \
+    TestArithPSXmmXmm(Dst, (1.0, 100.0, -1000.0, 20.0), Src,                   \
+                      (0.55, 0.43, 0.23, 1.21), subps, -, float);              \
+    TestArithPSXmmAddr(Dst, (1.0, 100.0, -1000.0, 20.0),                       \
+                       (0.55, 0.43, 0.23, 1.21), subps, -, float);             \
+    TestArithPSXmmXmm(Dst, (1.0, 100.0, -1000.0, 20.0), Src,                   \
+                      (0.55, 0.43, 0.23, 1.21), mulps, *, float);              \
+    TestArithPSXmmAddr(Dst, (1.0, 100.0, -1000.0, 20.0),                       \
+                       (0.55, 0.43, 0.23, 1.21), mulps, *, float);             \
+    TestArithPSXmmXmm(Dst, (1.0, 100.0, -1000.0, 20.0), Src,                   \
+                      (0.55, 0.43, 0.23, 1.21), divps, /, float);              \
+    TestArithPSXmmAddr(Dst, (1.0, 100.0, -1000.0, 20.0),                       \
+                       (0.55, 0.43, 0.23, 1.21), divps, /, float);             \
+    TestArithPSXmmXmmUntyped(Dst, (1.0, 100.0, -1000.0, 20.0), Src,            \
+                             (0.55, 0.43, 0.23, 1.21), andps, &, float);       \
+    TestArithPSXmmAddrUntyped(Dst, (1.0, 100.0, -1000.0, 20.0),                \
+                              (0.55, 0.43, 0.23, 1.21), andps, &, float);      \
+    TestArithPSXmmXmmUntyped(Dst, (1.0, -1000.0), Src, (0.55, 1.21), andpd, &, \
+                             double);                                          \
+    TestArithPSXmmAddrUntyped(Dst, (1.0, -1000.0), (0.55, 1.21), andpd, &,     \
+                              double);                                         \
+    TestArithPSXmmXmmUntyped(Dst, (1.0, 100.0, -1000.0, 20.0), Src,            \
+                             (0.55, 0.43, 0.23, 1.21), orps, |, float);        \
+    TestArithPSXmmXmmUntyped(Dst, (1.0, -1000.0), Src, (0.55, 1.21), orpd, |,  \
+                             double);                                          \
+    TestMinMaxPS(Dst, (1.0, 100.0, -1000.0, 20.0), Src,                        \
+                 (0.55, 0.43, 0.23, 1.21), minps, float);                      \
+    TestMinMaxPS(Dst, (1.0, 100.0, -1000.0, 20.0), Src,                        \
+                 (0.55, 0.43, 0.23, 1.21), maxps, float);                      \
+    TestMinMaxPS(Dst, (1.0, -1000.0), Src, (0.55, 1.21), minpd, double);       \
+    TestMinMaxPS(Dst, (1.0, -1000.0), Src, (0.55, 1.21), maxpd, double);       \
+    TestArithPSXmmXmmUntyped(Dst, (1.0, 100.0, -1000.0, 20.0), Src,            \
+                             (0.55, 0.43, 0.23, 1.21), xorps, ^, float);       \
+    TestArithPSXmmAddrUntyped(Dst, (1.0, 100.0, -1000.0, 20.0),                \
+                              (0.55, 0.43, 0.23, 1.21), xorps, ^, float);      \
+    TestArithPSXmmXmmUntyped(Dst, (1.0, -1000.0), Src, (0.55, 1.21), xorpd, ^, \
+                             double);                                          \
+    TestArithPSXmmAddrUntyped(Dst, (1.0, -1000.0), (0.55, 1.21), xorpd, ^,     \
+                              double);                                         \
+  } while (0)
+
+  TestArithPS(xmm0, xmm1);
+  TestArithPS(xmm1, xmm2);
+  TestArithPS(xmm2, xmm3);
+  TestArithPS(xmm3, xmm4);
+  TestArithPS(xmm4, xmm5);
+  TestArithPS(xmm5, xmm6);
+  TestArithPS(xmm6, xmm7);
+  TestArithPS(xmm7, xmm8);
+  TestArithPS(xmm8, xmm9);
+  TestArithPS(xmm9, xmm10);
+  TestArithPS(xmm10, xmm11);
+  TestArithPS(xmm11, xmm12);
+  TestArithPS(xmm12, xmm13);
+  TestArithPS(xmm13, xmm14);
+  TestArithPS(xmm14, xmm15);
+  TestArithPS(xmm15, xmm0);
+
+#undef TestArithPS // Undefine every helper so none leaks past this test.
+#undef TestMinMaxPS
+#undef TestArithPSXmmAddrUntyped
+#undef TestArithPSXmmXmmUntyped
+#undef TestArithPSXmmAddr
+#undef TestArithPSXmmXmm
+}
+
+TEST_F(AssemblerX8664Test, Blending) {
+  using f32 = float;
+  using i8 = uint8_t;
+
+#define TestBlendingXmmXmm(Dst, Value0, Src, Value1, M /*ask*/, Inst, Type)    \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #M ", " #Inst        \
+        ", " #Type ")";                                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+    const uint32_t Mask = allocateDqword();                                    \
+    const Dqword MaskValue M;                                                  \
+                                                                               \
+    __ movups(Encoded_Xmm_xmm0(), dwordAddress(Mask));                         \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T1));                          \
+    __ Inst(IceType_##Type, Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());         \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.setDqwordTo(Mask, MaskValue);                                         \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<Type>(V0).blendWith(V1, MaskValue), test.Dst<Dqword>()) \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestBlendingXmmAddr(Dst, Value0, Value1, M /*ask*/, Inst, Type)        \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", " #M ", " #Inst ", " #Type \
+        ")";                                                                   \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+    const uint32_t Mask = allocateDqword();                                    \
+    const Dqword MaskValue M;                                                  \
+                                                                               \
+    __ movups(Encoded_Xmm_xmm0(), dwordAddress(Mask));                         \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ Inst(IceType_##Type, Encoded_Xmm_##Dst(), dwordAddress(T1));            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.setDqwordTo(Mask, MaskValue);                                         \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<Type>(V0).blendWith(V1, MaskValue), test.Dst<Dqword>()) \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestBlending(Src, Dst)                                                 \
+  do {                                                                         \
+    TestBlendingXmmXmm(                                                        \
+        Dst, (1.0, 2.0, 1.0, 2.0), Src, (-1.0, -2.0, -1.0, -2.0),              \
+        (uint64_t(0x8000000000000000ull), uint64_t(0x0000000080000000ull)),    \
+        blendvps, f32);                                                        \
+    TestBlendingXmmAddr(                                                       \
+        Dst, (1.0, 2.0, 1.0, 2.0), (-1.0, -2.0, -1.0, -2.0),                   \
+        (uint64_t(0x8000000000000000ull), uint64_t(0x0000000080000000ull)),    \
+        blendvps, f32);                                                        \
+    TestBlendingXmmXmm(                                                        \
+        Dst,                                                                   \
+        (uint64_t(0xFFFFFFFFFFFFFFFFull), uint64_t(0xBBBBBBBBBBBBBBBBull)),    \
+        Src,                                                                   \
+        (uint64_t(0xAAAAAAAAAAAAAAAAull), uint64_t(0xEEEEEEEEEEEEEEEEull)),    \
+        (uint64_t(0x8000000000000080ull), uint64_t(0x8080808000000000ull)),    \
+        pblendvb, i8);                                                         \
+    TestBlendingXmmAddr(                                                       \
+        Dst,                                                                   \
+        (uint64_t(0xFFFFFFFFFFFFFFFFull), uint64_t(0xBBBBBBBBBBBBBBBBull)),    \
+        (uint64_t(0xAAAAAAAAAAAAAAAAull), uint64_t(0xEEEEEEEEEEEEEEEEull)),    \
+        (uint64_t(0x8000000000000080ull), uint64_t(0x8080808000000000ull)),    \
+        pblendvb, i8);                                                         \
+  } while (0)
+
+  /* xmm0 is taken: the macros above load the blend mask into it. */
+  TestBlending(xmm1, xmm2);
+  TestBlending(xmm2, xmm3);
+  TestBlending(xmm3, xmm4);
+  TestBlending(xmm4, xmm5);
+  TestBlending(xmm5, xmm6);
+  TestBlending(xmm6, xmm7);
+  TestBlending(xmm7, xmm8);
+  TestBlending(xmm8, xmm9);
+  TestBlending(xmm9, xmm10);
+  TestBlending(xmm10, xmm11);
+  TestBlending(xmm11, xmm12);
+  TestBlending(xmm12, xmm13);
+  TestBlending(xmm13, xmm14);
+  TestBlending(xmm14, xmm15);
+  TestBlending(xmm15, xmm1);
+
+#undef TestBlending
+#undef TestBlendingXmmAddr
+#undef TestBlendingXmmXmm
+}
+
+TEST_F(AssemblerX8664Test, Cmpps) {
+  /* Checks cmpps with a register and with a memory second operand, using each
+     xmm register once as the destination. Every helper macro assembles one
+     snippet, runs it, and compares the destination register read back from
+     the run with a reference value computed through packedAs<float>. */
+
+  /* Register-register form of predicate C. Both inputs hold the same four
+     floats; the reference value is `packedAs<float>(V0) Op V1`. */
+#define TestCmppsXmmXmm(Dst, Src, C, Op)                                       \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Src ", " #C ", " #Op ")";                               \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0(-1.0, 1.0, 3.14, 1024.5);                                  \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1(-1.0, 1.0, 3.14, 1024.5);                                  \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T1));                          \
+    __ cmpps(Encoded_Xmm_##Dst(), Encoded_Xmm_##Src(), Cond::Cmpps_##C);       \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<float>(V0) Op V1, test.Dst<Dqword>()) << TestString;    \
+    reset();                                                                   \
+  } while (0)
+
+  /* Same check with the second operand given as dwordAddress(T1) instead of a
+     register. */
+#define TestCmppsXmmAddr(Dst, C, Op)                                           \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", Addr, " #C ", " #Op ")";  \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0(-1.0, 1.0, 3.14, 1024.5);                                  \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1(-1.0, 1.0, 3.14, 1024.5);                                  \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ cmpps(Encoded_Xmm_##Dst(), dwordAddress(T1), Cond::Cmpps_##C);          \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<float>(V0) Op V1, test.Dst<Dqword>()) << TestString;    \
+    reset();                                                                   \
+  } while (0)
+
+  /* Register-register form for predicates checked through a named member,
+     `packedAs<float>(V0).C(V1)`. The four lanes pair number/number,
+     number/NaN, NaN/number and NaN/NaN. */
+#define TestCmppsOrdUnordXmmXmm(Dst, Src, C)                                   \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Src ", " #C ")";       \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0(1.0, 1.0, std::numeric_limits<float>::quiet_NaN(),         \
+                    std::numeric_limits<float>::quiet_NaN());                  \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1(1.0, std::numeric_limits<float>::quiet_NaN(), 1.0,         \
+                    std::numeric_limits<float>::quiet_NaN());                  \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T1));                          \
+    __ cmpps(Encoded_Xmm_##Dst(), Encoded_Xmm_##Src(), Cond::Cmpps_##C);       \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<float>(V0).C(V1), test.Dst<Dqword>()) << TestString;    \
+    reset();                                                                   \
+  } while (0)
+
+  /* Memory-operand counterpart of TestCmppsOrdUnordXmmXmm; the failure label
+     carries "Addr" so the two forms can be told apart. */
+#define TestCmppsOrdUnordXmmAddr(Dst, C)                                       \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", Addr, " #C ")";           \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0(1.0, 1.0, std::numeric_limits<float>::quiet_NaN(),         \
+                    std::numeric_limits<float>::quiet_NaN());                  \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1(1.0, std::numeric_limits<float>::quiet_NaN(), 1.0,         \
+                    std::numeric_limits<float>::quiet_NaN());                  \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ cmpps(Encoded_Xmm_##Dst(), dwordAddress(T1), Cond::Cmpps_##C);          \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(packedAs<float>(V0).C(V1), test.Dst<Dqword>()) << TestString;    \
+    reset();                                                                   \
+  } while (0)
+
+  /* NOTE(review): only the eq and unord predicates are exercised. The eq pair
+     appears six times and the unord pair twice, which looks like placeholders
+     for further predicates (the list has eight slots) -- confirm the intent
+     before extending it. */
+#define TestCmpps(Dst, Src)                                                    \
+  do {                                                                         \
+    TestCmppsXmmXmm(Dst, Src, eq, == );                                        \
+    TestCmppsXmmAddr(Dst, eq, == );                                            \
+    TestCmppsXmmXmm(Dst, Src, eq, == );                                        \
+    TestCmppsXmmAddr(Dst, eq, == );                                            \
+    TestCmppsXmmXmm(Dst, Src, eq, == );                                        \
+    TestCmppsXmmAddr(Dst, eq, == );                                            \
+    TestCmppsOrdUnordXmmXmm(Dst, Src, unord);                                  \
+    TestCmppsOrdUnordXmmAddr(Dst, unord);                                      \
+    TestCmppsXmmXmm(Dst, Src, eq, == );                                        \
+    TestCmppsXmmAddr(Dst, eq, == );                                            \
+    TestCmppsXmmXmm(Dst, Src, eq, == );                                        \
+    TestCmppsXmmAddr(Dst, eq, == );                                            \
+    TestCmppsXmmXmm(Dst, Src, eq, == );                                        \
+    TestCmppsXmmAddr(Dst, eq, == );                                            \
+    TestCmppsOrdUnordXmmXmm(Dst, Src, unord);                                  \
+    TestCmppsOrdUnordXmmAddr(Dst, unord);                                      \
+  } while (0)
+
+  TestCmpps(xmm0, xmm1);
+  TestCmpps(xmm1, xmm2);
+  TestCmpps(xmm2, xmm3);
+  TestCmpps(xmm3, xmm4);
+  TestCmpps(xmm4, xmm5);
+  TestCmpps(xmm5, xmm6);
+  TestCmpps(xmm6, xmm7);
+  TestCmpps(xmm7, xmm8);
+  TestCmpps(xmm8, xmm9);
+  TestCmpps(xmm9, xmm10);
+  TestCmpps(xmm10, xmm11);
+  TestCmpps(xmm11, xmm12);
+  TestCmpps(xmm12, xmm13);
+  TestCmpps(xmm13, xmm14);
+  TestCmpps(xmm14, xmm15);
+  TestCmpps(xmm15, xmm0);
+
+#undef TestCmpps
+#undef TestCmppsOrdUnordXmmAddr
+#undef TestCmppsOrdUnordXmmXmm
+#undef TestCmppsXmmAddr
+#undef TestCmppsXmmXmm
+}
+
+TEST_F(AssemblerX8664Test, Sqrtps_Rsqrtps_Reciprocalps_Sqrtpd) {
+  /* Applies each single-operand packed instruction in place to every xmm
+     register and compares the register contents with a fixed bit pattern. */
+
+  /* One case: Dst is loaded with (1.0, 4.0, 20.0, 3.14), Inst is applied to
+     Dst alone, and Dst is then compared with the Dqword built from Expect.
+     Expect must be a parenthesized constructor argument list. */
+#define TestImplSingle(Dst, Inst, Expect)                                      \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Inst ")";              \
+    const uint32_t InputSlot = allocateDqword();                               \
+    const Dqword Input(1.0, 4.0, 20.0, 3.14);                                  \
+    const Dqword Wanted Expect;                                                \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(InputSlot));                   \
+    __ Inst(Encoded_Xmm_##Dst());                                              \
+                                                                               \
+    AssembledTest Assembled = assemble();                                      \
+    Assembled.setDqwordTo(InputSlot, Input);                                   \
+    Assembled.run();                                                           \
+    ASSERT_EQ(Wanted, Assembled.Dst<Dqword>()) << TestString;                  \
+    reset();                                                                   \
+  } while (0)
+
+  /* Expected results for the shared input. The first sqrtps argument packs
+     2.0f (0x40000000) above 1.0f (0x3F800000), the roots of 4.0 and 1.0.
+     NOTE(review): rsqrtps and reciprocalps are approximation instructions, so
+     their exact bit patterns may depend on the CPU running the test --
+     confirm. The sqrtpd pattern differs from sqrtps because the same input
+     bits are presumably treated as two doubles rather than four floats. */
+#define TestImpl(Dst)                                                          \
+  do {                                                                         \
+    TestImplSingle(Dst, sqrtps, (uint64_t(0x400000003F800000ull),              \
+                                 uint64_t(0x3FE2D10B408F1BBDull)));            \
+    TestImplSingle(Dst, rsqrtps, (uint64_t(0x3EFFF0003F7FF000ull),             \
+                                  uint64_t(0x3F1078003E64F000ull)));           \
+    TestImplSingle(Dst, reciprocalps, (uint64_t(0x3E7FF0003F7FF000ull),        \
+                                       uint64_t(0x3EA310003D4CC000ull)));      \
+    TestImplSingle(Dst, sqrtpd, (uint64_t(0x4036A09E9365F5F3ull),              \
+                                 uint64_t(0x401C42FAE40282A8ull)));            \
+  } while (0)
+
+  TestImpl(xmm0);
+  TestImpl(xmm1);
+  TestImpl(xmm2);
+  TestImpl(xmm3);
+  TestImpl(xmm4);
+  TestImpl(xmm5);
+  TestImpl(xmm6);
+  TestImpl(xmm7);
+  TestImpl(xmm8);
+  TestImpl(xmm9);
+  TestImpl(xmm10);
+  TestImpl(xmm11);
+  TestImpl(xmm12);
+  TestImpl(xmm13);
+  TestImpl(xmm14);
+  TestImpl(xmm15);
+
+#undef TestImpl
+#undef TestImplSingle
+}
+
+TEST_F(AssemblerX8664Test, Unpck) {
+  /* Operand images: DstInput is loaded into the destination register and
+     SrcInput into the source register of every unpack instruction. */
+  const Dqword DstInput(uint64_t(0xAAAAAAAABBBBBBBBull),
+                        uint64_t(0xCCCCCCCCDDDDDDDDull));
+  const Dqword SrcInput(uint64_t(0xEEEEEEEEFFFFFFFFull),
+                        uint64_t(0x9999999988888888ull));
+
+  /* Expected destination contents; the macro below selects one by pasting
+     <Inst>Expected. The "l" forms are built from the first quadword of each
+     input and the "h" forms from the second. The ps forms interleave the
+     32-bit halves of those quadwords, the pd forms keep them whole. */
+  const Dqword unpcklpsExpected(uint64_t(0xFFFFFFFFBBBBBBBBull),
+                                uint64_t(0xEEEEEEEEAAAAAAAAull));
+  const Dqword unpcklpdExpected(uint64_t(0xAAAAAAAABBBBBBBBull),
+                                uint64_t(0xEEEEEEEEFFFFFFFFull));
+  const Dqword unpckhpsExpected(uint64_t(0x88888888DDDDDDDDull),
+                                uint64_t(0x99999999CCCCCCCCull));
+  const Dqword unpckhpdExpected(uint64_t(0xCCCCCCCCDDDDDDDDull),
+                                uint64_t(0x9999999988888888ull));
+
+  /* One case: load both registers, emit `Inst Dst, Src`, run the snippet and
+     compare Dst with the matching expectation above. */
+#define TestImplSingle(Dst, Src, Inst)                                         \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")";    \
+    const uint32_t DstSlot = allocateDqword();                                 \
+    const uint32_t SrcSlot = allocateDqword();                                 \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(DstSlot));                     \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(SrcSlot));                     \
+    __ Inst(Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());                         \
+                                                                               \
+    AssembledTest Assembled = assemble();                                      \
+    Assembled.setDqwordTo(DstSlot, DstInput);                                  \
+    Assembled.setDqwordTo(SrcSlot, SrcInput);                                  \
+    Assembled.run();                                                           \
+                                                                               \
+    ASSERT_EQ(Inst##Expected, Assembled.Dst<Dqword>()) << TestString;          \
+    reset();                                                                   \
+  } while (0)
+
+  /* All four unpack variants for one destination/source register pair. */
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplSingle(Dst, Src, unpcklps);                                        \
+    TestImplSingle(Dst, Src, unpcklpd);                                        \
+    TestImplSingle(Dst, Src, unpckhps);                                        \
+    TestImplSingle(Dst, Src, unpckhpd);                                        \
+  } while (0)
+
+  TestImpl(xmm0, xmm1);
+  TestImpl(xmm1, xmm2);
+  TestImpl(xmm2, xmm3);
+  TestImpl(xmm3, xmm4);
+  TestImpl(xmm4, xmm5);
+  TestImpl(xmm5, xmm6);
+  TestImpl(xmm6, xmm7);
+  TestImpl(xmm7, xmm8);
+  TestImpl(xmm8, xmm9);
+  TestImpl(xmm9, xmm10);
+  TestImpl(xmm10, xmm11);
+  TestImpl(xmm11, xmm12);
+  TestImpl(xmm12, xmm13);
+  TestImpl(xmm13, xmm14);
+  TestImpl(xmm14, xmm15);
+  TestImpl(xmm15, xmm0);
+
+#undef TestImpl
+#undef TestImplSingle
+}
+
+TEST_F(AssemblerX8664Test, Shufp) {
+  /* Inputs: V0 is loaded into the destination register; V1 is the second
+     operand, supplied either in a register or through dwordAddress(T1). */
+  const Dqword V0(uint64_t(0x1111111122222222ull),
+                  uint64_t(0x5555555577777777ull));
+  const Dqword V1(uint64_t(0xAAAAAAAABBBBBBBBull),
+                  uint64_t(0xCCCCCCCCDDDDDDDDull));
+
+  /* Per-instruction immediate and expected destination contents; the macros
+     select them by pasting <Inst>Imm and <Inst>Expected. pshufdExpected is
+     made of V1's 32-bit pieces only, while shufpsExpected combines pieces of
+     V0 and V1. */
+  const uint8_t pshufdImm = 0x63;
+  const Dqword pshufdExpected(uint64_t(0xBBBBBBBBCCCCCCCCull),
+                              uint64_t(0xAAAAAAAADDDDDDDDull));
+
+  const uint8_t shufpsImm = 0xf9;
+  const Dqword shufpsExpected(uint64_t(0x7777777711111111ull),
+                              uint64_t(0xCCCCCCCCCCCCCCCCull));
+
+  /* Register-register form: `Inst Dst, Src, imm`, emitted with IceType_f32. */
+#define TestImplSingleXmmXmm(Dst, Src, Inst)                                   \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")";    \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T1));                          \
+    __ Inst(IceType_f32, Encoded_Xmm_##Dst(), Encoded_Xmm_##Src(),             \
+            Immediate(Inst##Imm));                                             \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Inst##Expected, test.Dst<Dqword>()) << TestString;               \
+    reset();                                                                   \
+  } while (0)
+
+  /* Memory-operand form: the second operand is dwordAddress(T1). */
+#define TestImplSingleXmmAddr(Dst, Inst)                                       \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", Addr, " #Inst ")";        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ Inst(IceType_f32, Encoded_Xmm_##Dst(), dwordAddress(T1),                \
+            Immediate(Inst##Imm));                                             \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Inst##Expected, test.Dst<Dqword>()) << TestString;               \
+    reset();                                                                   \
+  } while (0)
+
+  /* Both forms of both instructions for one register pair. Only the typed
+     (IceType_f32) calls are exercised by this test. */
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplSingleXmmXmm(Dst, Src, pshufd);                                    \
+    TestImplSingleXmmAddr(Dst, pshufd);                                        \
+    TestImplSingleXmmXmm(Dst, Src, shufps);                                    \
+    TestImplSingleXmmAddr(Dst, shufps);                                        \
+  } while (0)
+
+  TestImpl(xmm0, xmm1);
+  TestImpl(xmm1, xmm2);
+  TestImpl(xmm2, xmm3);
+  TestImpl(xmm3, xmm4);
+  TestImpl(xmm4, xmm5);
+  TestImpl(xmm5, xmm6);
+  TestImpl(xmm6, xmm7);
+  TestImpl(xmm7, xmm8);
+  TestImpl(xmm8, xmm9);
+  TestImpl(xmm9, xmm10);
+  TestImpl(xmm10, xmm11);
+  TestImpl(xmm11, xmm12);
+  TestImpl(xmm12, xmm13);
+  TestImpl(xmm13, xmm14);
+  TestImpl(xmm14, xmm15);
+  TestImpl(xmm15, xmm0);
+
+#undef TestImpl
+#undef TestImplSingleXmmAddr
+#undef TestImplSingleXmmXmm
+}
+
+TEST_F(AssemblerX8664Test, Cvt) {
+  const Dqword dq2ps32DstValue(-1.0f, -1.0f, -1.0f, -1.0f);
+  const Dqword dq2ps32SrcValue(-5, 3, 100, 200);
+  const Dqword dq2ps32Expected(-5.0f, 3.0f, 100.0, 200.0);
+
+  const Dqword dq2ps64DstValue(0.0f, 0.0f, -1.0f, -1.0f);
+  const Dqword dq2ps64SrcValue(-5, 3, 100, 200);
+  const Dqword dq2ps64Expected(-5.0f, 3.0f, 100.0, 200.0);
+
+  const Dqword tps2dq32DstValue(-1.0f, -1.0f, -1.0f, -1.0f);
+  const Dqword tps2dq32SrcValue(-5.0f, 3.0f, 100.0, 200.0);
+  const Dqword tps2dq32Expected(-5, 3, 100, 200);
+
+  const Dqword tps2dq64DstValue(-1.0f, -1.0f, -1.0f, -1.0f);
+  const Dqword tps2dq64SrcValue(-5.0f, 3.0f, 100.0, 200.0);
+  const Dqword tps2dq64Expected(-5, 3, 100, 200);
+
+  const Dqword si2ss32DstValue(-1.0f, -1.0f, -1.0f, -1.0f);
+  const int32_t si2ss32SrcValue = 5;
+  const Dqword si2ss32Expected(5.0f, -1.0f, -1.0f, -1.0f);
+
+  const Dqword si2ss64DstValue(-1.0, -1.0);
+  const int32_t si2ss64SrcValue = 5;
+  const Dqword si2ss64Expected(5.0, -1.0);
+
+  const int32_t tss2si32DstValue = 0xF00F0FF0;
+  const Dqword tss2si32SrcValue(-5.0f, -1.0f, -1.0f, -1.0f);
+  const int32_t tss2si32Expected = -5;
+
+  const int32_t tss2si64DstValue = 0xF00F0FF0;
+  const Dqword tss2si64SrcValue(-5.0, -1.0);
+  const int32_t tss2si64Expected = -5;
+
+  const Dqword float2float32DstValue(-1.0, -1.0);
+  const Dqword float2float32SrcValue(-5.0, 3, 100, 200);
+  const Dqword float2float32Expected(-5.0, -1.0);
+
+  const Dqword float2float64DstValue(-1.0, -1.0, -1.0, -1.0);
+  const Dqword float2float64SrcValue(-5.0, 3.0);
+  const Dqword float2float64Expected(-5.0, -1.0, -1.0, -1.0);
+
+#define TestImplPXmmXmm(Dst, Src, Inst, Size)                                  \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Src ", cvt" #Inst ", f" #Size ")";                      \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T1));                          \
+    __ cvt##Inst(IceType_f##Size, Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, Inst##Size##DstValue);                                \
+    test.setDqwordTo(T1, Inst##Size##SrcValue);                                \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Inst##Size##Expected, test.Dst<Dqword>()) << TestString;         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplSXmmReg(Dst, GPR, Inst, Size)                                  \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #GPR ", cvt" #Inst ", f" #Size ")";                      \
+    const uint32_t T0 = allocateDqword();                                      \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ mov(IceType_i32, Encoded_GPR_##GPR(), Immediate(Inst##Size##SrcValue)); \
+    __ cvt##Inst(IceType_f##Size, Encoded_Xmm_##Dst(), Encoded_GPR_##GPR());   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, Inst##Size##DstValue);                                \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Inst##Size##Expected, test.Dst<Dqword>()) << TestString;         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplSRegXmm(GPR, Src, Inst, Size)                                  \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #GPR ", " #Src ", cvt" #Inst ", f" #Size ")";                      \
+    const uint32_t T0 = allocateDqword();                                      \
+                                                                               \
+    __ mov(IceType_i32, Encoded_GPR_##GPR(), Immediate(Inst##Size##DstValue)); \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T0));                          \
+    __ cvt##Inst(IceType_f##Size, Encoded_GPR_##GPR(), Encoded_Xmm_##Src());   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, Inst##Size##SrcValue);                                \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Inst##Size##Expected), test.GPR())         \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplPXmmAddr(Dst, Inst, Size)                                      \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", Addr, cvt" #Inst ", f" #Size ")";                          \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ cvt##Inst(IceType_f##Size, Encoded_Xmm_##Dst(), dwordAddress(T1));      \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, Inst##Size##DstValue);                                \
+    test.setDqwordTo(T1, Inst##Size##SrcValue);                                \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Inst##Size##Expected, test.Dst<Dqword>()) << TestString;         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplSXmmAddr(Dst, Inst, Size)                                      \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", Addr, cvt" #Inst ", f" #Size ")";                          \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDword();                                       \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ cvt##Inst(IceType_f##Size, Encoded_Xmm_##Dst(), dwordAddress(T1));      \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, Inst##Size##DstValue);                                \
+    test.setDwordTo(T1, Inst##Size##SrcValue);                                 \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Inst##Size##Expected, test.Dst<Dqword>()) << TestString;         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplSRegAddr(GPR, Inst, Size)                                      \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #GPR ", Addr, cvt" #Inst ", f" #Size ")";                          \
+    const uint32_t T0 = allocateDqword();                                      \
+                                                                               \
+    __ mov(IceType_i32, Encoded_GPR_##GPR(), Immediate(Inst##Size##DstValue)); \
+    __ cvt##Inst(IceType_f##Size, Encoded_GPR_##GPR(), dwordAddress(T0));      \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, Inst##Size##SrcValue);                                \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(static_cast<uint32_t>(Inst##Size##Expected), test.GPR())         \
+        << TestString;                                                         \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplSize(Dst, Src, GPR, Size)                                      \
+  do {                                                                         \
+    TestImplPXmmXmm(Dst, Src, dq2ps, Size);                                    \
+    TestImplPXmmAddr(Src, dq2ps, Size);                                        \
+    TestImplPXmmXmm(Dst, Src, tps2dq, Size);                                   \
+    TestImplPXmmAddr(Src, tps2dq, Size);                                       \
+    TestImplSXmmReg(Dst, GPR, si2ss, Size);                                    \
+    TestImplSXmmAddr(Dst, si2ss, Size);                                        \
+    TestImplSRegXmm(GPR, Src, tss2si, Size);                                   \
+    TestImplSRegAddr(GPR, tss2si, Size);                                       \
+    TestImplPXmmXmm(Dst, Src, float2float, Size);                              \
+    TestImplPXmmAddr(Src, float2float, Size);                                  \
+  } while (0)
+
+#define TestImpl(Dst, Src, GPR)                                                \
+  do {                                                                         \
+    TestImplSize(Dst, Src, GPR, 32);                                           \
+    TestImplSize(Dst, Src, GPR, 64);                                           \
+  } while (0)
+
+  TestImpl(xmm0, xmm1, r1);
+  TestImpl(xmm1, xmm2, r2);
+  TestImpl(xmm2, xmm3, r3);
+  TestImpl(xmm3, xmm4, r4);
+  TestImpl(xmm4, xmm5, r5);
+  TestImpl(xmm5, xmm6, r6);
+  TestImpl(xmm6, xmm7, r7);
+  TestImpl(xmm7, xmm8, r8);
+  TestImpl(xmm8, xmm9, r10);
+  TestImpl(xmm9, xmm10, r11);
+  TestImpl(xmm10, xmm11, r12);
+  TestImpl(xmm11, xmm12, r13);
+  TestImpl(xmm12, xmm13, r14);
+  TestImpl(xmm13, xmm14, r15);
+  TestImpl(xmm14, xmm15, r1);
+  TestImpl(xmm15, xmm0, r2);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplSRegAddr
+#undef TestImplSXmmAddr
+#undef TestImplPXmmAddr
+#undef TestImplSRegXmm
+#undef TestImplSXmmReg
+#undef TestImplPXmmXmm
+}
+
+TEST_F(AssemblerX8664Test, Ucomiss) { // ucomiss on f32/f64, reg and mem src
+  static constexpr float qnan32 = std::numeric_limits<float>::quiet_NaN();
+  static constexpr double qnan64 = std::numeric_limits<double>::quiet_NaN();
+  // Each case rewrites element 0; the NaN arguments presumably fill the rest.
+  Dqword test32DstValue(0.0, qnan32, qnan32, qnan32);
+  Dqword test32SrcValue(0.0, qnan32, qnan32, qnan32);
+
+  Dqword test64DstValue(0.0, qnan64);
+  Dqword test64SrcValue(0.0, qnan64);
+
+#define TestImplXmmXmm(Dst, Value0, Src, Value1, Size, CompType, BParity,      \
+                       BOther)                                                 \
+  do {                                                                         \
+    static constexpr char NearBranch = AssemblerX8664::kNearJump;              \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Size ", " #CompType \
+        ", " #BParity ", " #BOther ")";                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    test##Size##DstValue.F##Size[0] = Value0;                                  \
+    const uint32_t T1 = allocateDqword();                                      \
+    test##Size##SrcValue.F##Size[0] = Value1;                                  \
+    const uint32_t ImmIfTrue = 0xBEEF;                                         \
+    const uint32_t ImmIfFalse = 0xC0FFE;                                       \
+    /* Preset eax to ImmIfFalse; only the fall-through path changes it. */     \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T1));                          \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(ImmIfFalse));  \
+    __ ucomiss(IceType_f##Size, Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());     \
+    Label Done;                                                                \
+    __ j(Cond::Br_##BParity, &Done, NearBranch);                               \
+    __ j(Cond::Br_##BOther, &Done, NearBranch);                                \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(ImmIfTrue));   \
+    __ bind(&Done);                                                            \
+    /* Assemble, then seed both operand slots before running. */               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, test##Size##DstValue);                                \
+    test.setDqwordTo(T1, test##Size##SrcValue);                                \
+    test.run();                                                                \
+    /* Passing means neither BParity nor BOther was taken. */                  \
+    ASSERT_EQ(ImmIfTrue, test.eax()) << TestString;                            \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplXmmAddr(Dst, Value0, Value1, Size, CompType, BParity, BOther)  \
+  do {                                                                         \
+    static constexpr char NearBranch = AssemblerX8664::kNearJump;              \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", " #Size ", " #CompType     \
+        ", " #BParity ", " #BOther ")";                                        \
+    const uint32_t T0 = allocateDqword();                                      \
+    test##Size##DstValue.F##Size[0] = Value0;                                  \
+    const uint32_t T1 = allocateDqword();                                      \
+    test##Size##SrcValue.F##Size[0] = Value1;                                  \
+    const uint32_t ImmIfTrue = 0xBEEF;                                         \
+    const uint32_t ImmIfFalse = 0xC0FFE;                                       \
+    /* Same scheme as the reg-reg form, but the source is read from T1. */     \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(ImmIfFalse));  \
+    __ ucomiss(IceType_f##Size, Encoded_Xmm_##Dst(), dwordAddress(T1));        \
+    Label Done;                                                                \
+    __ j(Cond::Br_##BParity, &Done, NearBranch);                               \
+    __ j(Cond::Br_##BOther, &Done, NearBranch);                                \
+    __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(ImmIfTrue));   \
+    __ bind(&Done);                                                            \
+    /* Assemble, then seed both operand slots before running. */               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, test##Size##DstValue);                                \
+    test.setDqwordTo(T1, test##Size##SrcValue);                                \
+    test.run();                                                                \
+    /* Passing means neither BParity nor BOther was taken. */                  \
+    ASSERT_EQ(ImmIfTrue, test.eax()) << TestString;                            \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplCond(Dst, Value0, Src, Value1, Size, CompType, BParity,        \
+                     BOther)                                                   \
+  do {                                                                         \
+    TestImplXmmXmm(Dst, Value0, Src, Value1, Size, CompType, BParity, BOther); \
+    TestImplXmmAddr(Dst, Value0, Value1, Size, CompType, BParity, BOther);     \
+  } while (0)
+
+#define TestImplSize(Dst, Src, Size)                                           \
+  do {                                                                         \
+    TestImplCond(Dst, 1.0, Src, 1.0, Size, isEq, p, ne);                       \
+    TestImplCond(Dst, 1.0, Src, 2.0, Size, isNe, p, e);                        \
+    TestImplCond(Dst, 1.0, Src, 2.0, Size, isLe, p, a);                        \
+    TestImplCond(Dst, 1.0, Src, 1.0, Size, isLe, p, a);                        \
+    TestImplCond(Dst, 1.0, Src, 2.0, Size, isLt, p, ae);                       \
+    TestImplCond(Dst, 2.0, Src, 1.0, Size, isGe, p, b);                        \
+    TestImplCond(Dst, 1.0, Src, 1.0, Size, isGe, p, b);                        \
+    TestImplCond(Dst, 2.0, Src, 1.0, Size, isGt, p, be);                       \
+    TestImplCond(Dst, qnan##Size, Src, 1.0, Size, isUnord, np, o);             \
+    TestImplCond(Dst, 1.0, Src, qnan##Size, Size, isUnord, np, s);             \
+    TestImplCond(Dst, qnan##Size, Src, qnan##Size, Size, isUnord, np, s);      \
+  } while (0)
+
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplSize(Dst, Src, 32);                                                \
+    TestImplSize(Dst, Src, 64);                                                \
+  } while (0)
+
+  TestImpl(xmm0, xmm1);
+  TestImpl(xmm1, xmm2);
+  TestImpl(xmm2, xmm3);
+  TestImpl(xmm3, xmm4);
+  TestImpl(xmm4, xmm5);
+  TestImpl(xmm5, xmm6);
+  TestImpl(xmm6, xmm7);
+  TestImpl(xmm7, xmm8);
+  TestImpl(xmm8, xmm9);
+  TestImpl(xmm9, xmm10);
+  TestImpl(xmm10, xmm11);
+  TestImpl(xmm11, xmm12);
+  TestImpl(xmm12, xmm13);
+  TestImpl(xmm13, xmm14);
+  TestImpl(xmm14, xmm15);
+  TestImpl(xmm15, xmm0);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplCond
+#undef TestImplXmmAddr
+#undef TestImplXmmXmm
+}
+
+TEST_F(AssemblerX8664Test, Sqrtss) { // scalar sqrt, f32/f64, reg and mem src
+  Dqword test32DstValue(-1.0, -1.0, -1.0, -1.0);
+  Dqword test32SrcValue(-100.0, -100.0, -100.0, -100.0);
+
+  Dqword test64DstValue(-1.0, -1.0);
+  Dqword test64SrcValue(-100.0, -100.0);
+
+#define TestSqrtssXmmXmm(Dst, Src, Input, Root, Size)                          \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Src ", " #Input ", " #Root ", " #Size ")";              \
+    const uint32_t SrcAddr = allocateDqword();                                 \
+    const uint32_t DstAddr = allocateDqword();                                 \
+    test##Size##SrcValue.F##Size[0] = Input;                                   \
+    /* Only element 0 of the destination is expected to become Root. */        \
+    Dqword Expected = test##Size##DstValue;                                    \
+    Expected.F##Size[0] = Root;                                                \
+                                                                               \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(SrcAddr));                     \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(DstAddr));                     \
+    __ sqrtss(IceType_f##Size, Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());      \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(SrcAddr, test##Size##SrcValue);                           \
+    test.setDqwordTo(DstAddr, test##Size##DstValue);                           \
+    test.run();                                                                \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestSqrtssXmmAddr(Dst, Input, Root, Size)                              \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", Addr, " #Input ", " #Root ", " #Size ")";                  \
+    const uint32_t SrcAddr = allocateDqword();                                 \
+    const uint32_t DstAddr = allocateDqword();                                 \
+    test##Size##SrcValue.F##Size[0] = Input;                                   \
+    /* Only element 0 of the destination is expected to become Root. */        \
+    Dqword Expected = test##Size##DstValue;                                    \
+    Expected.F##Size[0] = Root;                                                \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(DstAddr));                     \
+    __ sqrtss(IceType_f##Size, Encoded_Xmm_##Dst(), dwordAddress(SrcAddr));    \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(SrcAddr, test##Size##SrcValue);                           \
+    test.setDqwordTo(DstAddr, test##Size##DstValue);                           \
+    test.run();                                                                \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestSqrtssSize(Dst, Src, Size)                                         \
+  do {                                                                         \
+    TestSqrtssXmmXmm(Dst, Src, 4.0, 2.0, Size);                                \
+    TestSqrtssXmmAddr(Dst, 4.0, 2.0, Size);                                    \
+    TestSqrtssXmmXmm(Dst, Src, 9.0, 3.0, Size);                                \
+    TestSqrtssXmmAddr(Dst, 9.0, 3.0, Size);                                    \
+    TestSqrtssXmmXmm(Dst, Src, 100.0, 10.0, Size);                             \
+    TestSqrtssXmmAddr(Dst, 100.0, 10.0, Size);                                 \
+  } while (0)
+
+#define TestSqrtss(Dst, Src)                                                   \
+  do {                                                                         \
+    TestSqrtssSize(Dst, Src, 32);                                              \
+    TestSqrtssSize(Dst, Src, 64);                                              \
+  } while (0)
+
+  TestSqrtss(xmm0, xmm1);
+  TestSqrtss(xmm1, xmm2);
+  TestSqrtss(xmm2, xmm3);
+  TestSqrtss(xmm3, xmm4);
+  TestSqrtss(xmm4, xmm5);
+  TestSqrtss(xmm5, xmm6);
+  TestSqrtss(xmm6, xmm7);
+  TestSqrtss(xmm7, xmm8);
+  TestSqrtss(xmm8, xmm9);
+  TestSqrtss(xmm9, xmm10);
+  TestSqrtss(xmm10, xmm11);
+  TestSqrtss(xmm11, xmm12);
+  TestSqrtss(xmm12, xmm13);
+  TestSqrtss(xmm13, xmm14);
+  TestSqrtss(xmm14, xmm15);
+  TestSqrtss(xmm15, xmm0);
+
+#undef TestSqrtss
+#undef TestSqrtssSize
+#undef TestSqrtssXmmAddr
+#undef TestSqrtssXmmXmm
+}
+
+TEST_F(AssemblerX8664Test, Insertps) { // insertps, xmm and memory sources
+#define TestInsertpsXmmXmmImm(Dst, Value0, Src, Value1, Imm, Expected)         \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Imm ", " #Expected  \
+        ")";                                                                   \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+    /* Load both registers, then emit the reg-reg form under test. */          \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T1));                          \
+    __ insertps(IceType_v4f32, Encoded_Xmm_##Dst(), Encoded_Xmm_##Src(),       \
+                Immediate(Imm));                                               \
+    /* Assemble, then seed both operand slots before running. */               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+    /* Expected is a parenthesized Dqword constructor argument list. */        \
+    ASSERT_EQ(Dqword Expected, test.Dst<Dqword>()) << TestString;              \
+    reset();                                                                   \
+  } while (0)
+
+#define TestInsertpsXmmAddrImm(Dst, Value0, Value1, Imm, Expected)             \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", " #Imm ", " #Expected ")"; \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+    /* Only Dst is loaded; the source operand is read from T1. */              \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ insertps(IceType_v4f32, Encoded_Xmm_##Dst(), dwordAddress(T1),          \
+                Immediate(Imm));                                               \
+    /* Assemble, then seed both operand slots before running. */               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+    /* Expected is a parenthesized Dqword constructor argument list. */        \
+    ASSERT_EQ(Dqword Expected, test.Dst<Dqword>()) << TestString;              \
+    reset();                                                                   \
+  } while (0)
+
+#define TestInsertps(Dst, Src)                                                 \
+  do {                                                                         \
+    TestInsertpsXmmXmmImm(                                                     \
+        Dst, (uint64_t(-1), uint64_t(-1)), Src,                                \
+        (uint64_t(0xAAAAAAAABBBBBBBBull), uint64_t(0xCCCCCCCCDDDDDDDDull)),    \
+        0x99, /* imm8: src lane 2, dst lane 1, zero mask 0b1001 */             \
+        (uint64_t(0xDDDDDDDD00000000ull), uint64_t(0x00000000FFFFFFFFull)));   \
+    TestInsertpsXmmAddrImm(                                                    \
+        Dst, (uint64_t(-1), uint64_t(-1)),                                     \
+        (uint64_t(0xAAAAAAAABBBBBBBBull), uint64_t(0xCCCCCCCCDDDDDDDDull)),    \
+        0x99, /* mem form inserts the dword at T1; src-lane bits unused */     \
+        (uint64_t(0xBBBBBBBB00000000ull), uint64_t(0x00000000FFFFFFFFull)));   \
+    TestInsertpsXmmXmmImm(                                                     \
+        Dst, (uint64_t(-1), uint64_t(-1)), Src,                                \
+        (uint64_t(0xAAAAAAAABBBBBBBBull), uint64_t(0xCCCCCCCCDDDDDDDDull)),    \
+        0x9D, /* imm8: src lane 2, dst lane 1, zero mask 0b1101 */             \
+        (uint64_t(0xDDDDDDDD00000000ull), uint64_t(0x0000000000000000ull)));   \
+    TestInsertpsXmmAddrImm(                                                    \
+        Dst, (uint64_t(-1), uint64_t(-1)),                                     \
+        (uint64_t(0xAAAAAAAABBBBBBBBull), uint64_t(0xCCCCCCCCDDDDDDDDull)),    \
+        0x9D, /* mem form, zero mask 0b1101 */                                 \
+        (uint64_t(0xBBBBBBBB00000000ull), uint64_t(0x0000000000000000ull)));   \
+  } while (0)
+
+  TestInsertps(xmm0, xmm1);
+  TestInsertps(xmm1, xmm2);
+  TestInsertps(xmm2, xmm3);
+  TestInsertps(xmm3, xmm4);
+  TestInsertps(xmm4, xmm5);
+  TestInsertps(xmm5, xmm6);
+  TestInsertps(xmm6, xmm7);
+  TestInsertps(xmm7, xmm8);
+  TestInsertps(xmm8, xmm9);
+  TestInsertps(xmm9, xmm10);
+  TestInsertps(xmm10, xmm11);
+  TestInsertps(xmm11, xmm12);
+  TestInsertps(xmm12, xmm13);
+  TestInsertps(xmm13, xmm14);
+  TestInsertps(xmm14, xmm15);
+  TestInsertps(xmm15, xmm0);
+
+#undef TestInsertps
+#undef TestInsertpsXmmAddrImm
+#undef TestInsertpsXmmXmmImm
+}
+
+TEST_F(AssemblerX8664Test, Pinsr) { // pinsr from GPR and memory, i8/i16/i32
+  static constexpr uint8_t Mask8 = 0x0F;
+  static constexpr uint8_t Mask16 = 0x07;
+  static constexpr uint8_t Mask32 = 0x03;
+
+#define TestPinsrXmmGPRImm(Dst, Init, GPR, Elem, Imm, Size)                    \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Init ", " #GPR ", " #Elem ", " #Imm ", " #Size ")";     \
+    const uint32_t DstAddr = allocateDqword();                                 \
+    const Dqword Before Init;                                                  \
+    /* Only the element selected by the immediate may change. */               \
+    constexpr uint8_t Lane = (Imm) & Mask##Size;                               \
+    Dqword Expected = Before;                                                  \
+    Expected.U##Size[Lane] = Elem;                                             \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(DstAddr));                     \
+    __ mov(IceType_i32, Encoded_GPR_##GPR(), Immediate(Elem));                 \
+    __ pinsr(IceType_i##Size, Encoded_Xmm_##Dst(), Encoded_GPR_##GPR(),        \
+             Immediate(Imm));                                                  \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(DstAddr, Before);                                         \
+    test.run();                                                                \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestPinsrXmmAddrImm(Dst, Init, Elem, Imm, Size)                        \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Init ", Addr, " #Elem ", " #Imm ", " #Size ")";         \
+    const uint32_t DstAddr = allocateDqword();                                 \
+    const uint32_t SrcAddr = allocateDword();                                  \
+    const Dqword Before Init;                                                  \
+    const uint32_t Inserted = Elem;                                            \
+    /* Only the element selected by the immediate may change. */               \
+    constexpr uint8_t Lane = (Imm) & Mask##Size;                               \
+    Dqword Expected = Before;                                                  \
+    Expected.U##Size[Lane] = Elem;                                             \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(DstAddr));                     \
+    __ pinsr(IceType_i##Size, Encoded_Xmm_##Dst(), dwordAddress(SrcAddr),      \
+             Immediate(Imm));                                                  \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(DstAddr, Before);                                         \
+    test.setDwordTo(SrcAddr, Inserted);                                        \
+    test.run();                                                                \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestPinsrSize(Dst, GPR, Elem, Imm, Size)                               \
+  do {                                                                         \
+    TestPinsrXmmGPRImm(Dst, (uint64_t(0xAAAAAAAABBBBBBBBull),                  \
+                             uint64_t(0xFFFFFFFFDDDDDDDDull)),                 \
+                       GPR, Elem, Imm, Size);                                  \
+    TestPinsrXmmAddrImm(Dst, (uint64_t(0xAAAAAAAABBBBBBBBull),                 \
+                              uint64_t(0xFFFFFFFFDDDDDDDDull)),                \
+                        Elem, Imm, Size);                                      \
+  } while (0)
+
+#define TestPinsr(Dst, GPR)                                                    \
+  do {                                                                         \
+    TestPinsrSize(Dst, GPR, 0xEE, 0x03, 8);                                    \
+    TestPinsrSize(Dst, GPR, 0xFFEE, 0x03, 16);                                 \
+    TestPinsrSize(Dst, GPR, 0xC0FFEE, 0x03, 32);                               \
+  } while (0)
+
+  TestPinsr(xmm0, r1);
+  TestPinsr(xmm1, r2);
+  TestPinsr(xmm2, r3);
+  TestPinsr(xmm3, r4);
+  TestPinsr(xmm4, r5);
+  TestPinsr(xmm5, r6);
+  TestPinsr(xmm6, r7);
+  TestPinsr(xmm7, r8);
+  TestPinsr(xmm8, r10);
+  TestPinsr(xmm9, r11);
+  TestPinsr(xmm10, r12);
+  TestPinsr(xmm11, r13);
+  TestPinsr(xmm12, r14);
+  TestPinsr(xmm13, r15);
+  TestPinsr(xmm14, r1);
+  TestPinsr(xmm15, r2);
+
+#undef TestPinsr
+#undef TestPinsrSize
+#undef TestPinsrXmmAddrImm
+#undef TestPinsrXmmGPRImm
+}
+
+TEST_F(AssemblerX8664Test, Pextr) { // pextr from xmm into GPR, i8/i16/i32
+  static constexpr uint8_t Mask32 = 0x03;
+  static constexpr uint8_t Mask16 = 0x07;
+  static constexpr uint8_t Mask8 = 0x0F;
+
+#define TestPextrGPRXmmImm(GPR, Src, Value1, Imm, Size)                        \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #GPR ", " #Src ", " #Value1 ", " #Imm ", " #Size ")";              \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value1;                                                    \
+    /* Load the source vector, then extract the element selected by Imm. */    \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T0));                          \
+    __ pextr(IceType_i##Size, Encoded_GPR_##GPR(), Encoded_Xmm_##Src(),        \
+             Immediate(Imm));                                                  \
+    /* Assemble, then seed the source slot before running. */                  \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.run();                                                                \
+    /* The mask keeps the selector within the element count for Size. */       \
+    constexpr uint8_t sel = (Imm)&Mask##Size;                                  \
+    ASSERT_EQ(V0.U##Size[sel], test.GPR()) << TestString;                      \
+    reset();                                                                   \
+  } while (0)
+
+#define TestPextrSize(GPR, Src, Value1 /* unused */, Imm, Size)                \
+  do {                                                                         \
+    TestPextrGPRXmmImm(GPR, Src, (uint64_t(0xAAAAAAAABBBBBBBBull),             \
+                                  uint64_t(0xFFFFFFFFDDDDDDDDull)),            \
+                       Imm, Size);                                             \
+  } while (0)
+
+#define TestPextr(GPR, Src)                                                    \
+  do {                                                                         \
+    TestPextrSize(GPR, Src, 0xEE, 0x03, 8);                                    \
+    TestPextrSize(GPR, Src, 0xFFEE, 0x03, 16);                                 \
+    TestPextrSize(GPR, Src, 0xC0FFEE, 0x03, 32);                               \
+  } while (0)
+
+  TestPextr(r1, xmm0);
+  TestPextr(r2, xmm1);
+  TestPextr(r3, xmm2);
+  TestPextr(r4, xmm3);
+  TestPextr(r5, xmm4);
+  TestPextr(r6, xmm5);
+  TestPextr(r7, xmm6);
+  TestPextr(r8, xmm7);
+  TestPextr(r10, xmm8);
+  TestPextr(r11, xmm9);
+  TestPextr(r12, xmm10);
+  TestPextr(r13, xmm11);
+  TestPextr(r14, xmm12);
+  TestPextr(r15, xmm13);
+  TestPextr(r1, xmm14);
+  TestPextr(r2, xmm15);
+
+#undef TestPextr
+#undef TestPextrSize
+#undef TestPextrGPRXmmImm
+}
+
+TEST_F(AssemblerX8664Test, Pcmpeq_Pcmpgt) {
+#define TestPcmpXmmXmm(Dst, Value0, Src, Value1, Size, Inst, Op)               \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", " #Src ", " #Value1 ", " #Size ", " #Op ")";  \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T1));                          \
+    __ Inst(IceType_i##Size, Encoded_Xmm_##Dst(), Encoded_Xmm_##Src());        \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    Dqword Expected(uint64_t(0), uint64_t(0));                                 \
+    static constexpr uint8_t ArraySize =                                       \
+        sizeof(Dqword) / sizeof(uint##Size##_t);                               \
+    for (uint8_t i = 0; i < ArraySize; ++i) {                                  \
+      Expected.I##Size[i] = (V1.I##Size[i] Op V0.I##Size[i]) ? -1 : 0;         \
+    }                                                                          \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestPcmpXmmAddr(Dst, Value0, Value1, Size, Inst, Op)                   \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Value0 ", Addr, " #Value1 ", " #Size ", " #Op ")";      \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0 Value0;                                                    \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1 Value1;                                                    \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ Inst(IceType_i##Size, Encoded_Xmm_##Dst(), dwordAddress(T1));           \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    Dqword Expected(uint64_t(0), uint64_t(0));                                 \
+    static constexpr uint8_t ArraySize =                                       \
+        sizeof(Dqword) / sizeof(uint##Size##_t);                               \
+    for (uint8_t i = 0; i < ArraySize; ++i) {                                  \
+      Expected.I##Size[i] = (V1.I##Size[i] Op V0.I##Size[i]) ? -1 : 0;         \
+    }                                                                          \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestPcmpValues(Dst, Value0, Src, Value1, Size)                         \
+  do {                                                                         \
+    TestPcmpXmmXmm(Dst, Value0, Src, Value1, Size, pcmpeq, == );               \
+    TestPcmpXmmAddr(Dst, Value0, Value1, Size, pcmpeq, == );                   \
+    TestPcmpXmmXmm(Dst, Value0, Src, Value1, Size, pcmpgt, < );                \
+    TestPcmpXmmAddr(Dst, Value0, Value1, Size, pcmpgt, < );                    \
+  } while (0)
+
+#define TestPcmpSize(Dst, Src, Size)                                           \
+  do {                                                                         \
+    TestPcmpValues(Dst, (uint64_t(0x8888888888888888ull),                      \
+                         uint64_t(0x0000000000000000ull)),                     \
+                   Src, (uint64_t(0x0000008800008800ull),                      \
+                         uint64_t(0xFFFFFFFFFFFFFFFFull)),                     \
+                   Size);                                                      \
+    TestPcmpValues(Dst, (uint64_t(0x123567ABAB55DE01ull),                      \
+                         uint64_t(0x12345abcde12345Aull)),                     \
+                   Src, (uint64_t(0x0000008800008800ull),                      \
+                         uint64_t(0xAABBCCDD1234321Aull)),                     \
+                   Size);                                                      \
+  } while (0)
+
+#define TestPcmp(Dst, Src)                                                     \
+  do { /* Run the pcmp checks on (Dst, Src) for 8/16/32-bit elements. */       \
+    TestPcmpSize(Dst, Src, 8);                                                 \
+    TestPcmpSize(Dst, Src, 16);                                                \
+    TestPcmpSize(Dst, Src, 32);                                                \
+  } while (0)
+
+  TestPcmp(xmm0, xmm1);
+  TestPcmp(xmm1, xmm2);
+  TestPcmp(xmm2, xmm3);
+  TestPcmp(xmm3, xmm4);
+  TestPcmp(xmm4, xmm5);
+  TestPcmp(xmm5, xmm6);
+  TestPcmp(xmm6, xmm7);
+  TestPcmp(xmm7, xmm8);
+  TestPcmp(xmm8, xmm9);
+  TestPcmp(xmm9, xmm10);
+  TestPcmp(xmm10, xmm11);
+  TestPcmp(xmm11, xmm12);
+  TestPcmp(xmm12, xmm13);
+  TestPcmp(xmm13, xmm14);
+  TestPcmp(xmm14, xmm15);
+  TestPcmp(xmm15, xmm0);
+
+#undef TestPcmp
+#undef TestPcmpSize
+#undef TestPcmpValues
+#undef TestPcmpXmmAddr
+#undef TestPcmpXmmXmm
+}
+
+TEST_F(AssemblerX8664Test, Roundsd) {
+#define TestRoundsdXmmXmm(Dst, Src, Mode, Input, RN)                           \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Src ", " #Mode ", " #Input ", " #RN ")";                \
+    const uint32_t T0 = allocateDqword();                                      \
+    const Dqword V0(-3.0, -3.0);                                               \
+    const uint32_t T1 = allocateDqword();                                      \
+    const Dqword V1(double(Input), -123.4);                                    \
+                                                                               \
+    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
+    __ movups(Encoded_Xmm_##Src(), dwordAddress(T1));                          \
+    __ roundsd(Encoded_Xmm_##Dst(), Encoded_Xmm_##Src(),                       \
+               AssemblerX8664::k##Mode);                                       \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    const Dqword Expected(double(RN), -3.0);                                   \
+    EXPECT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestRoundsd(Dst, Src)                                                  \
+  do {                                                                         \
+    TestRoundsdXmmXmm(Dst, Src, RoundToNearest, 5.51, 6);                      \
+    TestRoundsdXmmXmm(Dst, Src, RoundToNearest, 5.49, 5);                      \
+    TestRoundsdXmmXmm(Dst, Src, RoundDown, 5.51, 5);                           \
+    TestRoundsdXmmXmm(Dst, Src, RoundUp, 5.49, 6);                             \
+    TestRoundsdXmmXmm(Dst, Src, RoundToZero, 5.49, 5);                         \
+    TestRoundsdXmmXmm(Dst, Src, RoundToZero, 5.51, 5);                         \
+  } while (0)
+
+  TestRoundsd(xmm0, xmm1);
+  TestRoundsd(xmm1, xmm2);
+  TestRoundsd(xmm2, xmm3);
+  TestRoundsd(xmm3, xmm4);
+  TestRoundsd(xmm4, xmm5);
+  TestRoundsd(xmm5, xmm6);
+  TestRoundsd(xmm6, xmm7);
+  TestRoundsd(xmm7, xmm8);
+  TestRoundsd(xmm8, xmm9);
+  TestRoundsd(xmm9, xmm10);
+  TestRoundsd(xmm10, xmm11);
+  TestRoundsd(xmm11, xmm12);
+  TestRoundsd(xmm12, xmm13);
+  TestRoundsd(xmm13, xmm14);
+  TestRoundsd(xmm14, xmm15);
+  TestRoundsd(xmm15, xmm0);
+
+#undef TestRoundsd
+#undef TestRoundsdXmmXmm
+}
+
+TEST_F(AssemblerX8664Test, Set1ps) {
+#define TestImpl(Xmm, Src, Imm)                                                \
+  do {                                                                         \
+    __ set1ps(Encoded_Xmm_##Xmm(), Encoded_GPR_##Src(), Immediate(Imm));       \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    const Dqword Expected((uint64_t(Imm) << 32) | uint32_t(Imm),               \
+                          (uint64_t(Imm) << 32) | uint32_t(Imm));              \
+    ASSERT_EQ(Expected, test.Xmm<Dqword>())                                    \
+        << "(" #Xmm ", " #Src ", " #Imm ")";                                   \
+    reset();                                                                   \
+  } while (0)
+
+  TestImpl(xmm0, r1, 1);
+  TestImpl(xmm1, r2, 12);
+  TestImpl(xmm2, r3, 22);
+  TestImpl(xmm3, r4, 54);
+  TestImpl(xmm4, r5, 80);
+  TestImpl(xmm5, r6, 32);
+  TestImpl(xmm6, r7, 55);
+  TestImpl(xmm7, r8, 44);
+  TestImpl(xmm8, r10, 10);
+  TestImpl(xmm9, r11, 155);
+  TestImpl(xmm10, r12, 165);
+  TestImpl(xmm11, r13, 170);
+  TestImpl(xmm12, r14, 200);
+  TestImpl(xmm13, r15, 124);
+  TestImpl(xmm14, r1, 101);
+  TestImpl(xmm15, r2, 166);
+
+#undef TestImpl
+}
+
+} // end of anonymous namespace
+} // end of namespace Test
+} // end of namespace X8664
+} // end of namespace Ice
diff --git a/unittest/IceAssemblerX8632Test.cpp b/unittest/IceAssemblerX8632Test.cpp
deleted file mode 100644
index 3e3d7f3..0000000
--- a/unittest/IceAssemblerX8632Test.cpp
+++ /dev/null
@@ -1,719 +0,0 @@
-//===- subzero/unittest/IceAssemblerX8632.cpp - X8632 Assembler tests -----===//
-//
-//                        The Subzero Code Generator
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "IceAssemblerX8632.h"
-
-#include "IceDefs.h"
-
-#include "gtest/gtest.h"
-
-#include <cstring>
-#include <errno.h>
-#include <iostream>
-#include <memory>
-#include <sys/mman.h>
-#include <type_traits>
-
-namespace Ice {
-namespace X8632 {
-namespace {
-
-class AssemblerX8632TestBase : public ::testing::Test {
-protected:
-  using Address = AssemblerX8632::Traits::Address;
-  using Cond = AssemblerX8632::Traits::Cond;
-  using GPRRegister = AssemblerX8632::Traits::GPRRegister;
-  using XmmRegister = AssemblerX8632::Traits::XmmRegister;
-  using X87STRegister = AssemblerX8632::Traits::X87STRegister;
-
-  AssemblerX8632TestBase() { reset(); }
-
-  void reset() { Assembler.reset(new AssemblerX8632()); }
-
-  AssemblerX8632 *assembler() const { return Assembler.get(); }
-
-  size_t codeBytesSize() const { return Assembler->getBufferView().size(); }
-
-  const uint8_t *codeBytes() const {
-    return static_cast<const uint8_t *>(
-        static_cast<const void *>(Assembler->getBufferView().data()));
-  }
-
-private:
-  std::unique_ptr<AssemblerX8632> Assembler;
-};
-
-// __ is a helper macro. It allows test cases to emit X8632 assembly
-// instructions with
-//
-//   __ mov(GPRRegister::Reg_Eax, 1);
-//   __ ret();
-//
-// and so on. The idea of having this was "stolen" from dart's unit tests.
-#define __ (this->assembler())->
-
-// AssemblerX8632LowLevelTest verify that the "basic" instructions the tests
-// rely on are encoded correctly. Therefore, instead of executing the assembled
-// code, these tests will verify that the assembled bytes are sane.
-class AssemblerX8632LowLevelTest : public AssemblerX8632TestBase {
-protected:
-  // verifyBytes is a template helper that takes a Buffer, and a variable number
-  // of bytes. As the name indicates, it is used to verify the bytes for an
-  // instruction encoding.
-  template <int N, int I> static void verifyBytes(const uint8_t *) {
-    static_assert(I == N, "Invalid template instantiation.");
-  }
-
-  template <int N, int I = 0, typename... Args>
-  static void verifyBytes(const uint8_t *Buffer, uint8_t Byte,
-                          Args... OtherBytes) {
-    static_assert(I < N, "Invalid template instantiation.");
-    EXPECT_EQ(Byte, Buffer[I]) << "Byte " << (I + 1) << " of " << N;
-    verifyBytes<N, I + 1>(Buffer, OtherBytes...);
-    assert(Buffer[I] == Byte);
-  }
-};
-
-TEST_F(AssemblerX8632LowLevelTest, Ret) {
-  __ ret();
-
-  constexpr size_t ByteCount = 1;
-  ASSERT_EQ(ByteCount, codeBytesSize());
-
-  verifyBytes<ByteCount>(codeBytes(), 0xc3);
-}
-
-TEST_F(AssemblerX8632LowLevelTest, CallImm4) {
-  __ call(Immediate(4));
-
-  constexpr size_t ByteCount = 5;
-  ASSERT_EQ(ByteCount, codeBytesSize());
-
-  verifyBytes<ByteCount>(codeBytes(), 0xe8, 0x00, 0x00, 0x00, 0x00);
-}
-
-TEST_F(AssemblerX8632LowLevelTest, PopRegs) {
-  __ popl(GPRRegister::Encoded_Reg_eax);
-  __ popl(GPRRegister::Encoded_Reg_ebx);
-  __ popl(GPRRegister::Encoded_Reg_ecx);
-  __ popl(GPRRegister::Encoded_Reg_edx);
-  __ popl(GPRRegister::Encoded_Reg_edi);
-  __ popl(GPRRegister::Encoded_Reg_esi);
-  __ popl(GPRRegister::Encoded_Reg_ebp);
-
-  constexpr size_t ByteCount = 7;
-  ASSERT_EQ(ByteCount, codeBytesSize());
-
-  constexpr uint8_t PopOpcode = 0x58;
-  verifyBytes<ByteCount>(codeBytes(), PopOpcode | GPRRegister::Encoded_Reg_eax,
-                         PopOpcode | GPRRegister::Encoded_Reg_ebx,
-                         PopOpcode | GPRRegister::Encoded_Reg_ecx,
-                         PopOpcode | GPRRegister::Encoded_Reg_edx,
-                         PopOpcode | GPRRegister::Encoded_Reg_edi,
-                         PopOpcode | GPRRegister::Encoded_Reg_esi,
-                         PopOpcode | GPRRegister::Encoded_Reg_ebp);
-}
-
-TEST_F(AssemblerX8632LowLevelTest, PushRegs) {
-  __ pushl(GPRRegister::Encoded_Reg_eax);
-  __ pushl(GPRRegister::Encoded_Reg_ebx);
-  __ pushl(GPRRegister::Encoded_Reg_ecx);
-  __ pushl(GPRRegister::Encoded_Reg_edx);
-  __ pushl(GPRRegister::Encoded_Reg_edi);
-  __ pushl(GPRRegister::Encoded_Reg_esi);
-  __ pushl(GPRRegister::Encoded_Reg_ebp);
-
-  constexpr size_t ByteCount = 7;
-  ASSERT_EQ(ByteCount, codeBytesSize());
-
-  constexpr uint8_t PushOpcode = 0x50;
-  verifyBytes<ByteCount>(codeBytes(), PushOpcode | GPRRegister::Encoded_Reg_eax,
-                         PushOpcode | GPRRegister::Encoded_Reg_ebx,
-                         PushOpcode | GPRRegister::Encoded_Reg_ecx,
-                         PushOpcode | GPRRegister::Encoded_Reg_edx,
-                         PushOpcode | GPRRegister::Encoded_Reg_edi,
-                         PushOpcode | GPRRegister::Encoded_Reg_esi,
-                         PushOpcode | GPRRegister::Encoded_Reg_ebp);
-}
-
-TEST_F(AssemblerX8632LowLevelTest, MovRegisterZero) {
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(0x00));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ebx, Immediate(0x00));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ecx, Immediate(0x00));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edx, Immediate(0x00));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edi, Immediate(0x00));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_esi, Immediate(0x00));
-
-  constexpr size_t MovReg32BitImmBytes = 5;
-  constexpr size_t ByteCount = 6 * MovReg32BitImmBytes;
-  ASSERT_EQ(ByteCount, codeBytesSize());
-
-  constexpr uint8_t MovOpcode = 0xb8;
-  verifyBytes<ByteCount>(
-      codeBytes(), MovOpcode | GPRRegister::Encoded_Reg_eax, 0x00, 0x00, 0x00,
-      0x00, MovOpcode | GPRRegister::Encoded_Reg_ebx, 0x00, 0x00, 0x00, 0x00,
-      MovOpcode | GPRRegister::Encoded_Reg_ecx, 0x00, 0x00, 0x00, 0x00,
-      MovOpcode | GPRRegister::Encoded_Reg_edx, 0x00, 0x00, 0x00, 0x00,
-      MovOpcode | GPRRegister::Encoded_Reg_edi, 0x00, 0x00, 0x00, 0x00,
-      MovOpcode | GPRRegister::Encoded_Reg_esi, 0x00, 0x00, 0x00, 0x00);
-}
-
-TEST_F(AssemblerX8632LowLevelTest, CmpRegReg) {
-  __ cmp(IceType_i32, GPRRegister::Encoded_Reg_eax,
-         GPRRegister::Encoded_Reg_ebx);
-  __ cmp(IceType_i32, GPRRegister::Encoded_Reg_ebx,
-         GPRRegister::Encoded_Reg_ecx);
-  __ cmp(IceType_i32, GPRRegister::Encoded_Reg_ecx,
-         GPRRegister::Encoded_Reg_edx);
-  __ cmp(IceType_i32, GPRRegister::Encoded_Reg_edx,
-         GPRRegister::Encoded_Reg_edi);
-  __ cmp(IceType_i32, GPRRegister::Encoded_Reg_edi,
-         GPRRegister::Encoded_Reg_esi);
-  __ cmp(IceType_i32, GPRRegister::Encoded_Reg_esi,
-         GPRRegister::Encoded_Reg_eax);
-
-  const size_t CmpRegRegBytes = 2;
-  const size_t ByteCount = 6 * CmpRegRegBytes;
-  ASSERT_EQ(ByteCount, codeBytesSize());
-
-  constexpr size_t CmpOpcode = 0x3b;
-  constexpr size_t ModRm = 0xC0 /* Register Addressing */;
-  verifyBytes<ByteCount>(
-      codeBytes(), CmpOpcode, ModRm | (GPRRegister::Encoded_Reg_eax << 3) |
-                                  GPRRegister::Encoded_Reg_ebx,
-      CmpOpcode, ModRm | (GPRRegister::Encoded_Reg_ebx << 3) |
-                     GPRRegister::Encoded_Reg_ecx,
-      CmpOpcode, ModRm | (GPRRegister::Encoded_Reg_ecx << 3) |
-                     GPRRegister::Encoded_Reg_edx,
-      CmpOpcode, ModRm | (GPRRegister::Encoded_Reg_edx << 3) |
-                     GPRRegister::Encoded_Reg_edi,
-      CmpOpcode, ModRm | (GPRRegister::Encoded_Reg_edi << 3) |
-                     GPRRegister::Encoded_Reg_esi,
-      CmpOpcode, ModRm | (GPRRegister::Encoded_Reg_esi << 3) |
-                     GPRRegister::Encoded_Reg_eax);
-}
-
-// After these tests we should have a sane environment; we know the following
-// work:
-//
-//  (*) zeroing eax, ebx, ecx, edx, edi, and esi;
-//  (*) call $4 instruction (used for ip materialization);
-//  (*) register push and pop;
-//  (*) cmp reg, reg; and
-//  (*) returning from functions.
-//
-// We can now dive into testing each emitting method in AssemblerX8632. Each
-// test will emit some instructions for performing the test. The assembled
-// instructions will operate in a "safe" environment. All x86-32 registers are
-// spilled to the program stack, and the registers are then zeroed out, with the
-// exception of %esp and %ebp.
-//
-// The jitted code and the unittest code will share the same stack. Therefore,
-// test harnesses need to ensure it does not leave anything it pushed on the
-// stack.
-//
-// %ebp is initialized with a pointer for rIP-based addressing. This pointer is
-// used for position-independent access to a scratchpad area for use in tests.
-// This mechanism is used because the test framework needs to generate addresses
-// that work on both x86-32 and x86-64 hosts, but are encodable using our x86-32
-// assembler. This is made possible because the encoding for
-//
-//    pushq %rax (x86-64 only)
-//
-// is the same as the one for
-//
-//    pushl %eax (x86-32 only; not encodable in x86-64)
-//
-// Likewise, the encodings for
-//
-//    movl offset(%ebp), %reg (32-bit only)
-//    movl <src>, offset(%ebp) (32-bit only)
-//
-// and
-//
-//    movl offset(%rbp), %reg (64-bit only)
-//    movl <src>, offset(%rbp) (64-bit only)
-//
-// are also the same.
-//
-// We use a call instruction in order to generate a natural sized address on the
-// stack. Said address is then removed from the stack with a pop %rBP, which can
-// then be used to address memory safely in either x86-32 or x86-64, as long as
-// the test code does not perform any arithmetic operation that writes to %rBP.
-// This PC materialization technique is very common in x86-32 PIC.
-//
-// %rBP is used to provide the tests with a scratchpad area that can safely and
-// portably be written to and read from. This scratchpad area is also used to
-// store the "final" values in eax, ebx, ecx, edx, esi, and edi, allowing the
-// harnesses access to 6 "return values" instead of the usual single return
-// value supported by C++.
-//
-// The jitted code will look like the following:
-//
-// test:
-//       push %eax
-//       push %ebx
-//       push %ecx
-//       push %edx
-//       push %edi
-//       push %esi
-//       push %ebp
-//       call test$materialize_ip
-// test$materialize_ip:                           <<------- %eBP will point here
-//       pop  %ebp
-//       mov  $0, %eax
-//       mov  $0, %ebx
-//       mov  $0, %ecx
-//       mov  $0, %edx
-//       mov  $0, %edi
-//       mov  $0, %esi
-//
-//       << test code goes here >>
-//
-//       mov %eax, { 0 + $ScratchpadOffset}(%ebp)
-//       mov %ebx, { 4 + $ScratchpadOffset}(%ebp)
-//       mov %ecx, { 8 + $ScratchpadOffset}(%ebp)
-//       mov %edx, {12 + $ScratchpadOffset}(%ebp)
-//       mov %edi, {16 + $ScratchpadOffset}(%ebp)
-//       mov %esi, {20 + $ScratchpadOffset}(%ebp)
-//
-//       pop %ebp
-//       pop %esi
-//       pop %edi
-//       pop %edx
-//       pop %ecx
-//       pop %ebx
-//       pop %eax
-//       ret
-//
-//      << ... >>
-//
-// scratchpad:                              <<------- accessed via $Offset(%ebp)
-//
-//      << test scratch area >>
-//
-// TODO(jpp): test the
-//
-//    mov %reg, $Offset(%ebp)
-//
-// encodings using the low level assembler test ensuring that the register
-// values can be written to the scratchpad area.
-class AssemblerX8632Test : public AssemblerX8632TestBase {
-protected:
-  AssemblerX8632Test() { reset(); }
-
-  void reset() {
-    AssemblerX8632TestBase::reset();
-
-    NeedsEpilogue = true;
-    // 6 dwords are allocated for saving the GPR state after the jitted code
-    // runs.
-    NumAllocatedDwords = 6;
-    addPrologue();
-  }
-
-  // AssembledBuffer is a wrapper around a PROT_EXEC mmap'ed buffer. This buffer
-  // contains both the test code as well as prologue/epilogue, and the
-  // scratchpad area that tests may use -- all tests use this scratchpad area
-  // for storing the processor's registers after the tests executed. This class
-  // also exposes helper methods for reading the register state after test
-  // execution, as well as for reading the scratchpad area.
-  class AssembledBuffer {
-    AssembledBuffer() = delete;
-    AssembledBuffer(const AssembledBuffer &) = delete;
-    AssembledBuffer &operator=(const AssembledBuffer &) = delete;
-
-  public:
-    static constexpr uint32_t MaximumCodeSize = 1 << 20;
-    static constexpr uint32_t EaxSlot = 0;
-    static constexpr uint32_t EbxSlot = 1;
-    static constexpr uint32_t EcxSlot = 2;
-    static constexpr uint32_t EdxSlot = 3;
-    static constexpr uint32_t EdiSlot = 4;
-    static constexpr uint32_t EsiSlot = 5;
-
-    AssembledBuffer(const uint8_t *Data, const size_t MySize,
-                    const size_t ExtraStorageDwords)
-        : Size(MaximumCodeSize + 4 * ExtraStorageDwords) {
-      // MaxCodeSize is needed because EXPECT_LT needs a symbol with a name --
-      // probably a compiler bug?
-      uint32_t MaxCodeSize = MaximumCodeSize;
-      EXPECT_LT(MySize, MaxCodeSize);
-      assert(MySize < MaximumCodeSize);
-      ExecutableData = mmap(nullptr, Size, PROT_WRITE | PROT_READ | PROT_EXEC,
-                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-      EXPECT_NE(MAP_FAILED, ExecutableData) << strerror(errno);
-      assert(MAP_FAILED != ExecutableData);
-      std::memcpy(ExecutableData, Data, MySize);
-    }
-
-    // We allow AssembledBuffer to be moved so that we can return objects of
-    // this type.
-    AssembledBuffer(AssembledBuffer &&Buffer)
-        : ExecutableData(Buffer.ExecutableData), Size(Buffer.Size) {
-      Buffer.ExecutableData = nullptr;
-      Buffer.Size = 0;
-    }
-
-    AssembledBuffer &operator=(AssembledBuffer &&Buffer) {
-      ExecutableData = Buffer.ExecutableData;
-      Buffer.ExecutableData = nullptr;
-      Size = Buffer.Size;
-      Buffer.Size = 0;
-      return *this;
-    }
-
-    ~AssembledBuffer() {
-      if (ExecutableData != nullptr) {
-        munmap(ExecutableData, Size);
-        ExecutableData = nullptr;
-      }
-    }
-
-    void run() const { reinterpret_cast<void (*)()>(ExecutableData)(); }
-
-    uint32_t eax() const { return contentsOfDword(AssembledBuffer::EaxSlot); }
-
-    uint32_t ebx() const { return contentsOfDword(AssembledBuffer::EbxSlot); }
-
-    uint32_t ecx() const { return contentsOfDword(AssembledBuffer::EcxSlot); }
-
-    uint32_t edx() const { return contentsOfDword(AssembledBuffer::EdxSlot); }
-
-    uint32_t edi() const { return contentsOfDword(AssembledBuffer::EdiSlot); }
-
-    uint32_t esi() const { return contentsOfDword(AssembledBuffer::EsiSlot); }
-
-    // contentsOfDword is used for reading the values in the scratchpad area.
-    // Valid arguments are the dword ids returned by
-    // AssemblerX8632Test::allocateDword() -- other inputs are considered
-    // invalid, and are not guaranteed to work if the implementation changes.
-    uint32_t contentsOfDword(uint32_t Dword) const {
-      return *reinterpret_cast<uint32_t *>(
-                 static_cast<uint8_t *>(ExecutableData) + dwordOffset(Dword));
-    }
-
-  private:
-    static uint32_t dwordOffset(uint32_t Index) {
-      return MaximumCodeSize + (Index * 4);
-    }
-
-    void *ExecutableData = nullptr;
-    size_t Size;
-  };
-
-  // assemble created an AssembledBuffer with the jitted code. The first time
-  // assemble is executed it will add the epilogue to the jitted code (which is
-  // the reason why this method is not const qualified.
-  AssembledBuffer assemble() {
-    if (NeedsEpilogue) {
-      addEpilogue();
-    }
-
-    NeedsEpilogue = false;
-    return AssembledBuffer(codeBytes(), codeBytesSize(), NumAllocatedDwords);
-  }
-
-  // Allocates a new dword slot in the test's scratchpad area.
-  uint32_t allocateDword() { return NumAllocatedDwords++; }
-
-  Address dwordAddress(uint32_t Dword) {
-    return Address(GPRRegister::Encoded_Reg_ebp, dwordDisp(Dword));
-  }
-
-private:
-  // e??SlotAddress returns an AssemblerX8632::Traits::Address that can be used
-  // by the test cases to encode an address operand for accessing the slot for
-  // the specified register. These are all private for, when jitting the test
-  // code, tests should not tamper with these values. Besides, during the test
-  // execution these slots' contents are undefined and should not be accessed.
-  Address eaxSlotAddress() { return dwordAddress(AssembledBuffer::EaxSlot); }
-  Address ebxSlotAddress() { return dwordAddress(AssembledBuffer::EbxSlot); }
-  Address ecxSlotAddress() { return dwordAddress(AssembledBuffer::EcxSlot); }
-  Address edxSlotAddress() { return dwordAddress(AssembledBuffer::EdxSlot); }
-  Address ediSlotAddress() { return dwordAddress(AssembledBuffer::EdiSlot); }
-  Address esiSlotAddress() { return dwordAddress(AssembledBuffer::EsiSlot); }
-
-  // Returns the displacement that should be used when accessing the specified
-  // Dword in the scratchpad area. It needs to adjust for the initial
-  // instructions that are emitted before the call that materializes the IP
-  // register.
-  uint32_t dwordDisp(uint32_t Dword) const {
-    EXPECT_LT(Dword, NumAllocatedDwords);
-    assert(Dword < NumAllocatedDwords);
-    static constexpr uint8_t PushBytes = 1;
-    static constexpr uint8_t CallImmBytes = 5;
-    return AssembledBuffer::MaximumCodeSize + (Dword * 4) -
-           (7 * PushBytes + CallImmBytes);
-  }
-
-  void addPrologue() {
-    __ pushl(GPRRegister::Encoded_Reg_eax);
-    __ pushl(GPRRegister::Encoded_Reg_ebx);
-    __ pushl(GPRRegister::Encoded_Reg_ecx);
-    __ pushl(GPRRegister::Encoded_Reg_edx);
-    __ pushl(GPRRegister::Encoded_Reg_edi);
-    __ pushl(GPRRegister::Encoded_Reg_esi);
-    __ pushl(GPRRegister::Encoded_Reg_ebp);
-
-    __ call(Immediate(4));
-    __ popl(GPRRegister::Encoded_Reg_ebp);
-    __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(0x00));
-    __ mov(IceType_i32, GPRRegister::Encoded_Reg_ebx, Immediate(0x00));
-    __ mov(IceType_i32, GPRRegister::Encoded_Reg_ecx, Immediate(0x00));
-    __ mov(IceType_i32, GPRRegister::Encoded_Reg_edx, Immediate(0x00));
-    __ mov(IceType_i32, GPRRegister::Encoded_Reg_edi, Immediate(0x00));
-    __ mov(IceType_i32, GPRRegister::Encoded_Reg_esi, Immediate(0x00));
-  }
-
-  void addEpilogue() {
-    __ mov(IceType_i32, eaxSlotAddress(), GPRRegister::Encoded_Reg_eax);
-    __ mov(IceType_i32, ebxSlotAddress(), GPRRegister::Encoded_Reg_ebx);
-    __ mov(IceType_i32, ecxSlotAddress(), GPRRegister::Encoded_Reg_ecx);
-    __ mov(IceType_i32, edxSlotAddress(), GPRRegister::Encoded_Reg_edx);
-    __ mov(IceType_i32, ediSlotAddress(), GPRRegister::Encoded_Reg_edi);
-    __ mov(IceType_i32, esiSlotAddress(), GPRRegister::Encoded_Reg_esi);
-
-    __ popl(GPRRegister::Encoded_Reg_ebp);
-    __ popl(GPRRegister::Encoded_Reg_esi);
-    __ popl(GPRRegister::Encoded_Reg_edi);
-    __ popl(GPRRegister::Encoded_Reg_edx);
-    __ popl(GPRRegister::Encoded_Reg_ecx);
-    __ popl(GPRRegister::Encoded_Reg_ebx);
-    __ popl(GPRRegister::Encoded_Reg_eax);
-
-    __ ret();
-  }
-
-  bool NeedsEpilogue;
-  uint32_t NumAllocatedDwords;
-};
-
-TEST_F(AssemblerX8632Test, MovRegImm) {
-  constexpr uint32_t ExpectedEax = 0x000000FFul;
-  constexpr uint32_t ExpectedEbx = 0x0000FF00ul;
-  constexpr uint32_t ExpectedEcx = 0x00FF0000ul;
-  constexpr uint32_t ExpectedEdx = 0xFF000000ul;
-  constexpr uint32_t ExpectedEdi = 0x6AAA0006ul;
-  constexpr uint32_t ExpectedEsi = 0x6000AAA6ul;
-
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(ExpectedEax));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ebx, Immediate(ExpectedEbx));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ecx, Immediate(ExpectedEcx));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edx, Immediate(ExpectedEdx));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edi, Immediate(ExpectedEdi));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_esi, Immediate(ExpectedEsi));
-
-  AssembledBuffer test = assemble();
-  test.run();
-  EXPECT_EQ(ExpectedEax, test.eax());
-  EXPECT_EQ(ExpectedEbx, test.ebx());
-  EXPECT_EQ(ExpectedEcx, test.ecx());
-  EXPECT_EQ(ExpectedEdx, test.edx());
-  EXPECT_EQ(ExpectedEdi, test.edi());
-  EXPECT_EQ(ExpectedEsi, test.esi());
-}
-
-TEST_F(AssemblerX8632Test, MovMemImm) {
-  const uint32_t T0 = allocateDword();
-  constexpr uint32_t ExpectedT0 = 0x00111100ul;
-  const uint32_t T1 = allocateDword();
-  constexpr uint32_t ExpectedT1 = 0x00222200ul;
-  const uint32_t T2 = allocateDword();
-  constexpr uint32_t ExpectedT2 = 0x03333000ul;
-  const uint32_t T3 = allocateDword();
-  constexpr uint32_t ExpectedT3 = 0x00444400ul;
-
-  __ mov(IceType_i32, dwordAddress(T0), Immediate(ExpectedT0));
-  __ mov(IceType_i32, dwordAddress(T1), Immediate(ExpectedT1));
-  __ mov(IceType_i32, dwordAddress(T2), Immediate(ExpectedT2));
-  __ mov(IceType_i32, dwordAddress(T3), Immediate(ExpectedT3));
-
-  AssembledBuffer test = assemble();
-  test.run();
-  EXPECT_EQ(0ul, test.eax());
-  EXPECT_EQ(0ul, test.ebx());
-  EXPECT_EQ(0ul, test.ecx());
-  EXPECT_EQ(0ul, test.edx());
-  EXPECT_EQ(0ul, test.edi());
-  EXPECT_EQ(0ul, test.esi());
-  EXPECT_EQ(ExpectedT0, test.contentsOfDword(T0));
-  EXPECT_EQ(ExpectedT1, test.contentsOfDword(T1));
-  EXPECT_EQ(ExpectedT2, test.contentsOfDword(T2));
-  EXPECT_EQ(ExpectedT3, test.contentsOfDword(T3));
-}
-
-TEST_F(AssemblerX8632Test, MovMemReg) {
-  const uint32_t T0 = allocateDword();
-  constexpr uint32_t ExpectedT0 = 0x00111100ul;
-  const uint32_t T1 = allocateDword();
-  constexpr uint32_t ExpectedT1 = 0x00222200ul;
-  const uint32_t T2 = allocateDword();
-  constexpr uint32_t ExpectedT2 = 0x00333300ul;
-  const uint32_t T3 = allocateDword();
-  constexpr uint32_t ExpectedT3 = 0x00444400ul;
-  const uint32_t T4 = allocateDword();
-  constexpr uint32_t ExpectedT4 = 0x00555500ul;
-  const uint32_t T5 = allocateDword();
-  constexpr uint32_t ExpectedT5 = 0x00666600ul;
-
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(ExpectedT0));
-  __ mov(IceType_i32, dwordAddress(T0), GPRRegister::Encoded_Reg_eax);
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ebx, Immediate(ExpectedT1));
-  __ mov(IceType_i32, dwordAddress(T1), GPRRegister::Encoded_Reg_ebx);
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ecx, Immediate(ExpectedT2));
-  __ mov(IceType_i32, dwordAddress(T2), GPRRegister::Encoded_Reg_ecx);
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edx, Immediate(ExpectedT3));
-  __ mov(IceType_i32, dwordAddress(T3), GPRRegister::Encoded_Reg_edx);
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edi, Immediate(ExpectedT4));
-  __ mov(IceType_i32, dwordAddress(T4), GPRRegister::Encoded_Reg_edi);
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_esi, Immediate(ExpectedT5));
-  __ mov(IceType_i32, dwordAddress(T5), GPRRegister::Encoded_Reg_esi);
-
-  AssembledBuffer test = assemble();
-  test.run();
-  EXPECT_EQ(ExpectedT0, test.contentsOfDword(T0));
-  EXPECT_EQ(ExpectedT1, test.contentsOfDword(T1));
-  EXPECT_EQ(ExpectedT2, test.contentsOfDword(T2));
-  EXPECT_EQ(ExpectedT3, test.contentsOfDword(T3));
-  EXPECT_EQ(ExpectedT4, test.contentsOfDword(T4));
-  EXPECT_EQ(ExpectedT5, test.contentsOfDword(T5));
-}
-
-TEST_F(AssemblerX8632Test, MovRegReg) {
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, Immediate(0x20));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ebx,
-         GPRRegister::Encoded_Reg_eax);
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ecx,
-         GPRRegister::Encoded_Reg_ebx);
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edx,
-         GPRRegister::Encoded_Reg_ecx);
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edi,
-         GPRRegister::Encoded_Reg_edx);
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_esi,
-         GPRRegister::Encoded_Reg_edi);
-
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_esi, Immediate(0x55000000ul));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax,
-         GPRRegister::Encoded_Reg_esi);
-
-  AssembledBuffer test = assemble();
-  test.run();
-  EXPECT_EQ(0x55000000ul, test.eax());
-  EXPECT_EQ(0x20ul, test.ebx());
-  EXPECT_EQ(0x20ul, test.ecx());
-  EXPECT_EQ(0x20ul, test.edx());
-  EXPECT_EQ(0x20ul, test.edi());
-  EXPECT_EQ(0x55000000ul, test.esi());
-}
-
-TEST_F(AssemblerX8632Test, MovRegMem) {
-  const uint32_t T0 = allocateDword();
-  constexpr uint32_t ExpectedT0 = 0x00111100ul;
-  const uint32_t T1 = allocateDword();
-  constexpr uint32_t ExpectedT1 = 0x00222200ul;
-  const uint32_t T2 = allocateDword();
-  constexpr uint32_t ExpectedT2 = 0x00333300ul;
-  const uint32_t T3 = allocateDword();
-  constexpr uint32_t ExpectedT3 = 0x00444400ul;
-  const uint32_t T4 = allocateDword();
-  constexpr uint32_t ExpectedT4 = 0x00555500ul;
-  const uint32_t T5 = allocateDword();
-  constexpr uint32_t ExpectedT5 = 0x00666600ul;
-
-  __ mov(IceType_i32, dwordAddress(T0), Immediate(ExpectedT0));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_eax, dwordAddress(T0));
-
-  __ mov(IceType_i32, dwordAddress(T1), Immediate(ExpectedT1));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ebx, dwordAddress(T1));
-
-  __ mov(IceType_i32, dwordAddress(T2), Immediate(ExpectedT2));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_ecx, dwordAddress(T2));
-
-  __ mov(IceType_i32, dwordAddress(T3), Immediate(ExpectedT3));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edx, dwordAddress(T3));
-
-  __ mov(IceType_i32, dwordAddress(T4), Immediate(ExpectedT4));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_edi, dwordAddress(T4));
-
-  __ mov(IceType_i32, dwordAddress(T5), Immediate(ExpectedT5));
-  __ mov(IceType_i32, GPRRegister::Encoded_Reg_esi, dwordAddress(T5));
-
-  AssembledBuffer test = assemble();
-  test.run();
-  EXPECT_EQ(ExpectedT0, test.eax());
-  EXPECT_EQ(ExpectedT1, test.ebx());
-  EXPECT_EQ(ExpectedT2, test.ecx());
-  EXPECT_EQ(ExpectedT3, test.edx());
-  EXPECT_EQ(ExpectedT4, test.edi());
-  EXPECT_EQ(ExpectedT5, test.esi());
-}
-
-TEST_F(AssemblerX8632Test, J) {
-#define TestJ(C, Near, Src0, Value0, Src1, Value1, Dest)                       \
-  do {                                                                         \
-    const bool NearJmp = std::strcmp(#Near, "Near") == 0;                      \
-    Label ShouldBeTaken;                                                       \
-    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Src0, Immediate(Value0));   \
-    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Src1, Immediate(Value1));   \
-    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Dest, Immediate(0xBEEF));   \
-    __ cmp(IceType_i32, GPRRegister::Encoded_Reg_##Src0,                       \
-           GPRRegister::Encoded_Reg_##Src1);                                   \
-    __ j(Cond::Br_##C, &ShouldBeTaken, NearJmp);                               \
-    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Dest, Immediate(0xC0FFEE)); \
-    __ bind(&ShouldBeTaken);                                                   \
-    AssembledBuffer test = assemble();                                         \
-    test.run();                                                                \
-    EXPECT_EQ(Value0, test.Src0()) << "Br_" #C ", " #Near;                     \
-    EXPECT_EQ(Value1, test.Src1()) << "Br_" #C ", " #Near;                     \
-    EXPECT_EQ(0xBEEFul, test.Dest()) << "Br_" #C ", " #Near;                   \
-    reset();                                                                   \
-  } while (0)
-
-  TestJ(o, Near, eax, 0x80000000ul, ebx, 0x1ul, ecx);
-  TestJ(o, Far, ebx, 0x80000000ul, ecx, 0x1ul, edx);
-  TestJ(no, Near, ecx, 0x1ul, edx, 0x1ul, edi);
-  TestJ(no, Far, edx, 0x1ul, edi, 0x1ul, esi);
-  TestJ(b, Near, edi, 0x1ul, esi, 0x80000000ul, eax);
-  TestJ(b, Far, esi, 0x1ul, eax, 0x80000000ul, ebx);
-  TestJ(ae, Near, eax, 0x80000000ul, ebx, 0x1ul, ecx);
-  TestJ(ae, Far, ebx, 0x80000000ul, ecx, 0x1ul, edx);
-  TestJ(e, Near, ecx, 0x80000000ul, edx, 0x80000000ul, edi);
-  TestJ(e, Far, edx, 0x80000000ul, edi, 0x80000000ul, esi);
-  TestJ(ne, Near, edi, 0x80000000ul, esi, 0x1ul, eax);
-  TestJ(ne, Far, esi, 0x80000000ul, eax, 0x1ul, ebx);
-  TestJ(be, Near, eax, 0x1ul, ebx, 0x80000000ul, ecx);
-  TestJ(be, Far, ebx, 0x1ul, ecx, 0x80000000ul, edx);
-  TestJ(a, Near, ecx, 0x80000000ul, edx, 0x1ul, edi);
-  TestJ(a, Far, edx, 0x80000000ul, edi, 0x1ul, esi);
-  TestJ(s, Near, edi, 0x1ul, esi, 0x80000000ul, eax);
-  TestJ(s, Far, esi, 0x1ul, eax, 0x80000000ul, ebx);
-  TestJ(ns, Near, eax, 0x80000000ul, ebx, 0x1ul, ecx);
-  TestJ(ns, Far, ebx, 0x80000000ul, ecx, 0x1ul, edx);
-  TestJ(p, Near, ecx, 0x80000000ul, edx, 0x1ul, edi);
-  TestJ(p, Far, edx, 0x80000000ul, edi, 0x1ul, esi);
-  TestJ(np, Near, edi, 0x1ul, esi, 0x80000000ul, eax);
-  TestJ(np, Far, esi, 0x1ul, eax, 0x80000000ul, ebx);
-  TestJ(l, Near, eax, 0x80000000ul, ebx, 0x1ul, ecx);
-  TestJ(l, Far, ebx, 0x80000000ul, ecx, 0x1ul, edx);
-  TestJ(ge, Near, ecx, 0x1ul, edx, 0x80000000ul, edi);
-  TestJ(ge, Far, edx, 0x1ul, edi, 0x80000000ul, esi);
-  TestJ(le, Near, edi, 0x80000000ul, esi, 0x1ul, eax);
-  TestJ(le, Far, esi, 0x80000000ul, eax, 0x1ul, ebx);
-  TestJ(g, Near, eax, 0x1ul, ebx, 0x80000000ul, ecx);
-  TestJ(g, Far, ebx, 0x1ul, ecx, 0x80000000ul, edx);
-
-#undef TestJ
-}
-
-#undef __
-
-} // end of anonymous namespace
-} // end of namespace X8632
-} // end of namespace Ice