Subzero. X86. Uses pshufb for shufflevector lowering.

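This CL enables Subzero to lower 16- and 8-element shufflevector instructions
on x86 using pshufb. The pshufb path is taken only when the target instruction
set is SSE 4.1 or later; otherwise the lowering of these types is unchanged.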

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4136
BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4077
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1917863004 .
diff --git a/src/IceAssemblerX86Base.h b/src/IceAssemblerX86Base.h
index bd56dbc..e9bc6bc 100644
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -429,6 +429,8 @@
 
   void sqrtpd(XmmRegister dst);
 
+  void pshufb(Type Ty, XmmRegister dst, XmmRegister src);
+  void pshufb(Type Ty, XmmRegister dst, const Address &src);
   void pshufd(Type Ty, XmmRegister dst, XmmRegister src, const Immediate &mask);
   void pshufd(Type Ty, XmmRegister dst, const Address &src,
               const Immediate &mask);
diff --git a/src/IceAssemblerX86BaseImpl.h b/src/IceAssemblerX86BaseImpl.h
index a986515..554f533 100644
--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -1535,6 +1535,31 @@
 }
 
 template <typename TraitsType>
+void AssemblerX86Base<TraitsType>::pshufb(Type /* Ty */, XmmRegister dst,
+                                          XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitUint8(0x66); // Prefix byte; emitted ahead of the REX byte.
+  emitRexRB(RexTypeIrrelevant, dst, src);
+  emitUint8(0x0F); // 0F 38 00: three-byte pshufb opcode.
+  emitUint8(0x38);
+  emitUint8(0x00);
+  emitXmmRegisterOperand(dst, src); // Register-register form: dst, src.
+}
+
+template <typename TraitsType>
+void AssemblerX86Base<TraitsType>::pshufb(Type /* Ty */, XmmRegister dst,
+                                          const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitUint8(0x66); // Prefix byte; emitted ahead of the REX byte.
+  emitAddrSizeOverridePrefix();
+  emitRex(RexTypeIrrelevant, src, dst);
+  emitUint8(0x0F); // 0F 38 00: three-byte pshufb opcode.
+  emitUint8(0x38);
+  emitUint8(0x00);
+  emitOperand(gprEncoding(dst), src); // Register-memory form: dst, [src].
+}
+
+template <typename TraitsType>
 void AssemblerX86Base<TraitsType>::pshufd(Type /* Ty */, XmmRegister dst,
                                           XmmRegister src,
                                           const Immediate &imm) {
diff --git a/src/IceInstX86Base.h b/src/IceInstX86Base.h
index c29538a..655b38d 100644
--- a/src/IceInstX86Base.h
+++ b/src/IceInstX86Base.h
@@ -142,6 +142,7 @@
       Pmuludq,
       Pop,
       Por,
+      Pshufb,
       Pshufd,
       Punpckl,
       Psll,
@@ -2844,6 +2845,22 @@
     InstX86IacaEnd(Cfg *Func);
   };
 
+  class InstX86Pshufb // Two-operand XMM instruction; Opcode is "pshufb".
+      : public InstX86BaseBinopXmm<InstX86Base::Pshufb, false,
+                                   InstX86Base::SseSuffix::None> {
+  public:
+    static InstX86Pshufb *create(Cfg *Func, Variable *Dest, Operand *Source) {
+      return new (Func->allocate<InstX86Pshufb>())
+          InstX86Pshufb(Func, Dest, Source);
+    }
+
+  private: // Instances are only built through create().
+    InstX86Pshufb(Cfg *Func, Variable *Dest, Operand *Source)
+        : InstX86BaseBinopXmm<InstX86Base::Pshufb, false,
+                              InstX86Base::SseSuffix::None>(Func, Dest,
+                                                            Source) {}
+  };
+
   class InstX86Punpckl
       : public InstX86BaseBinopXmm<InstX86Base::Punpckl, false,
                                    InstX86Base::SseSuffix::Unpack> {
@@ -2982,6 +2999,7 @@
   using IacaStart = typename InstImpl<TraitsType>::InstX86IacaStart;
   using IacaEnd = typename InstImpl<TraitsType>::InstX86IacaEnd;
 
+  using Pshufb = typename InstImpl<TraitsType>::InstX86Pshufb;
   using Punpckl = typename InstImpl<TraitsType>::InstX86Punpckl;
 };
 
@@ -3214,6 +3232,9 @@
   const char *InstImpl<TraitsType>::InstX86Pshufd::Base::Opcode = "pshufd";    \
   template <>                                                                  \
   template <>                                                                  \
+  const char *InstImpl<TraitsType>::InstX86Pshufb::Base::Opcode = "pshufb";    \
+  template <>                                                                  \
+  template <>                                                                  \
   const char *InstImpl<TraitsType>::InstX86Punpckl::Base::Opcode = "punpckl";  \
   /* Inplace GPR ops */                                                        \
   template <>                                                                  \
@@ -3579,6 +3600,12 @@
   template <>                                                                  \
   template <>                                                                  \
   const InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                       \
+      InstImpl<TraitsType>::InstX86Pshufb::Base::Emitter = {                   \
+          &InstImpl<TraitsType>::Assembler::pshufb,                            \
+          &InstImpl<TraitsType>::Assembler::pshufb};                           \
+  template <>                                                                  \
+  template <>                                                                  \
+  const InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                       \
       InstImpl<TraitsType>::InstX86Punpckl::Base::Emitter = {                  \
           &InstImpl<TraitsType>::Assembler::punpckldq,                         \
           &InstImpl<TraitsType>::Assembler::punpckldq};                        \
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 71b824f..d465423 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -805,6 +805,10 @@
     AutoMemorySandboxer<> _(this, &Dest, &Src0);
     Context.insert<typename Traits::Insts::Punpckl>(Dest, Src0);
   }
+  void _pshufb(Variable *Dest, Operand *Src0) { // Inserts Pshufb(Dest, Src0).
+    AutoMemorySandboxer<> _(this, &Dest, &Src0);
+    Context.insert<typename Traits::Insts::Pshufb>(Dest, Src0);
+  }
   void _pshufd(Variable *Dest, Operand *Src0, Operand *Src1) {
     AutoMemorySandboxer<> _(this, &Dest, &Src0, &Src1);
     Context.insert<typename Traits::Insts::Pshufd>(Dest, Src0, Src1);
@@ -1101,6 +1105,21 @@
                                                       SizeT Index0,
                                                       Variable *Src1,
                                                       SizeT Index1);
+  static constexpr SizeT CLEAR_ALL_BITS = 0x80;
+  SizeT PshufbMaskCount = 0;
+  GlobalString lowerShuffleVector_NewMaskName();
+  ConstantRelocatable *lowerShuffleVector_CreatePshufbMask(
+      int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
+      int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
+      int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
+      int8_t Idx15);
+  void lowerShuffleVector_UsingPshufb(Variable *Dest, Operand *Src0,
+                                      Operand *Src1, int8_t Idx0, int8_t Idx1,
+                                      int8_t Idx2, int8_t Idx3, int8_t Idx4,
+                                      int8_t Idx5, int8_t Idx6, int8_t Idx7,
+                                      int8_t Idx8, int8_t Idx9, int8_t Idx10,
+                                      int8_t Idx11, int8_t Idx12, int8_t Idx13,
+                                      int8_t Idx14, int8_t Idx15);
   /// @}
 
   static FixupKind PcRelFixup;
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 35d7ea0..79b5477 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -5675,6 +5675,99 @@
 }
 
 template <typename TraitsType>
+GlobalString TargetX86Base<TraitsType>::lowerShuffleVector_NewMaskName() {
+  GlobalString FuncName = Func->getFunctionName();
+  // Every mask requested through this lowering gets the next counter value.
+  const std::string Id = std::to_string(PshufbMaskCount++);
+  // Spell out the function name only when dumping is built in and the name is
+  // string-backed; otherwise identify the function by its numeric ID.
+  const bool Readable = BuildDefs::dump() && FuncName.hasStdString();
+  return GlobalString::createWithString(
+      Ctx, Readable ? "Pshufb$" + FuncName + "$" + Id
+                    : "$PS" + std::to_string(FuncName.getID()) + "_" + Id);
+}
+
+/// Adds a constant global with the 16 pshufb control bytes; returns its symbol.
+template <typename TraitsType>
+ConstantRelocatable *
+TargetX86Base<TraitsType>::lowerShuffleVector_CreatePshufbMask(
+    int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
+    int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
+    int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
+    int8_t Idx15) {
+  static constexpr uint8_t NumElements = 16;
+  // Built as int8_t: a braced char initializer narrows if char is unsigned.
+  const int8_t Indexes[NumElements] = {
+      Idx0, Idx1, Idx2,  Idx3,  Idx4,  Idx5,  Idx6,  Idx7,
+      Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15};
+  const auto *Initializer = reinterpret_cast<const char *>(Indexes);
+  const uint32_t MaskAlignment = typeWidthInBytesOnStack(IceType_v4i32);
+  auto *Mask = VariableDeclaration::create(Func->getGlobalPool());
+  GlobalString MaskName = lowerShuffleVector_NewMaskName();
+  Mask->setIsConstant(true);
+  Mask->addInitializer(VariableDeclaration::DataInitializer::create(
+      Func->getGlobalPool(), Initializer, NumElements));
+  Mask->setName(MaskName);
+  // Mask needs to be 16-byte aligned, or pshufb will seg fault.
+  Mask->setAlignment(MaskAlignment);
+  Func->addGlobal(Mask);
+
+  constexpr RelocOffsetT Offset = 0;
+  return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName));
+}
+
+/// Lowers a byte shuffle of Src0 and Src1 into Dest using two pshufb and a por.
+template <typename TraitsType>
+void TargetX86Base<TraitsType>::lowerShuffleVector_UsingPshufb(
+    Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1,
+    int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6,
+    int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11,
+    int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) {
+  const Type DestTy = Dest->getType();
+  static constexpr bool NotRebased = false;
+  static constexpr Variable *NoBase = nullptr;
+  // The mask operands are typed void rather than DestTy: with a vector type the
+  // X86 Inst layer rejects them as possibly under-aligned memory operands. The
+  // mask globals themselves are declared aligned, so using them as XMM memory
+  // operands is fine.
+  static constexpr Type MaskType = IceType_void;
+#define IDX_IN_SRC(N, S)                                                       \
+  ((((N) & (1 << 4)) == ((S) << 4)) ? ((N)&0xf) : CLEAR_ALL_BITS)
+  auto *Mask0M = X86OperandMem::create(
+      Func, MaskType, NoBase,
+      lowerShuffleVector_CreatePshufbMask(
+          IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0),
+          IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0),
+          IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0),
+          IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0),
+          IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0),
+          IDX_IN_SRC(Idx15, 0)),
+      NotRebased);
+  auto *Mask1M = X86OperandMem::create(
+      Func, MaskType, NoBase,
+      lowerShuffleVector_CreatePshufbMask(
+          IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1),
+          IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1),
+          IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1),
+          IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1),
+          IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1),
+          IDX_IN_SRC(Idx15, 1)),
+      NotRebased);
+#undef IDX_IN_SRC
+  auto *T0 = makeReg(DestTy);
+  auto *T1 = makeReg(DestTy);
+  auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+  _movp(T0, Src0RM);
+  auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+  _movp(T1, Src1RM);
+  // Each mask zeroes the lanes owned by the other source, so por merges them.
+  _pshufb(T1, Mask1M);
+  _pshufb(T0, Mask0M);
+  _por(T1, T0);
+  _movp(Dest, T1);
+}
+
+template <typename TraitsType>
 void TargetX86Base<TraitsType>::lowerShuffleVector(
     const InstShuffleVector *Instr) {
   auto *Dest = Instr->getDest();
@@ -5687,9 +5780,68 @@
 
   switch (DestTy) {
   default:
-    break;
-  // TODO(jpp): figure out how to properly lower the remaining cases without
-  // scalarization.
+    llvm::report_fatal_error("Unexpected vector type.");
+  case IceType_v16i1:
+  case IceType_v16i8: {
+    if (InstructionSet < Traits::SSE4_1) {
+      // TODO(jpp): figure out how to lower with sse2.
+      break;
+    }
+    static constexpr SizeT ExpectedNumElements = 16;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    (void)ExpectedNumElements;
+    const SizeT Index0 = Instr->getIndex(0)->getValue();
+    const SizeT Index1 = Instr->getIndex(1)->getValue();
+    const SizeT Index2 = Instr->getIndex(2)->getValue();
+    const SizeT Index3 = Instr->getIndex(3)->getValue();
+    const SizeT Index4 = Instr->getIndex(4)->getValue();
+    const SizeT Index5 = Instr->getIndex(5)->getValue();
+    const SizeT Index6 = Instr->getIndex(6)->getValue();
+    const SizeT Index7 = Instr->getIndex(7)->getValue();
+    const SizeT Index8 = Instr->getIndex(8)->getValue();
+    const SizeT Index9 = Instr->getIndex(9)->getValue();
+    const SizeT Index10 = Instr->getIndex(10)->getValue();
+    const SizeT Index11 = Instr->getIndex(11)->getValue();
+    const SizeT Index12 = Instr->getIndex(12)->getValue();
+    const SizeT Index13 = Instr->getIndex(13)->getValue();
+    const SizeT Index14 = Instr->getIndex(14)->getValue();
+    const SizeT Index15 = Instr->getIndex(15)->getValue();
+    lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
+                                   Index3, Index4, Index5, Index6, Index7,
+                                   Index8, Index9, Index10, Index11, Index12,
+                                   Index13, Index14, Index15);
+    return;
+  }
+  case IceType_v8i1:
+  case IceType_v8i16: {
+    if (InstructionSet < Traits::SSE4_1) {
+      // TODO(jpp): figure out how to lower with sse2.
+      break;
+    }
+    static constexpr SizeT ExpectedNumElements = 8;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    (void)ExpectedNumElements;
+    const SizeT Index0 = Instr->getIndex(0)->getValue();
+    const SizeT Index1 = Instr->getIndex(1)->getValue();
+    const SizeT Index2 = Instr->getIndex(2)->getValue();
+    const SizeT Index3 = Instr->getIndex(3)->getValue();
+    const SizeT Index4 = Instr->getIndex(4)->getValue();
+    const SizeT Index5 = Instr->getIndex(5)->getValue();
+    const SizeT Index6 = Instr->getIndex(6)->getValue();
+    const SizeT Index7 = Instr->getIndex(7)->getValue();
+#define TO_BYTE_INDEX(I) ((I) << 1)
+    lowerShuffleVector_UsingPshufb(
+        Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
+        TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2),
+        TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3),
+        TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4),
+        TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5),
+        TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6),
+        TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7),
+        TO_BYTE_INDEX(Index7) + 1);
+#undef TO_BYTE_INDEX
+    return;
+  }
   case IceType_v4i1:
   case IceType_v4i32:
   case IceType_v4f32: {
diff --git a/unittest/AssemblerX8632/XmmArith.cpp b/unittest/AssemblerX8632/XmmArith.cpp
index d19fcb8..1d6572c 100644
--- a/unittest/AssemblerX8632/XmmArith.cpp
+++ b/unittest/AssemblerX8632/XmmArith.cpp
@@ -1084,6 +1084,73 @@
 #undef TestImplXmmXmm
 }
 
+TEST_F(AssemblerX8632Test, Pshufb) {
+  const Dqword V0(uint64_t(0x1122334455667788ull),
+                  uint64_t(0x99aabbccddeeff32ull));
+  const Dqword V1(uint64_t(0x0204050380060708ull),
+                  uint64_t(0x010306080a8b0c0dull));
+  // Each V1 byte picks a V0 byte by its low 4 bits; 0x80 and 0x8b yield zero.
+  const Dqword Expected(uint64_t(0x6644335500221132ull),
+                        uint64_t(0x77552232ee00ccbbull));
+
+#define TestImplXmmXmm(Dst, Src, Inst)                                         \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")";    \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst,                      \
+            XmmRegister::Encoded_Reg_##Src);                                   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplXmmAddr(Dst, Inst)                                             \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", Addr, " #Inst ")";        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplXmmXmm(Dst, Src, pshufb);                                          \
+    TestImplXmmAddr(Dst, pshufb);                                              \
+  } while (0)
+
+  TestImpl(xmm0, xmm1);
+  TestImpl(xmm1, xmm2);
+  TestImpl(xmm2, xmm3);
+  TestImpl(xmm3, xmm4);
+  TestImpl(xmm4, xmm5);
+  TestImpl(xmm5, xmm6);
+  TestImpl(xmm6, xmm7);
+  TestImpl(xmm7, xmm0);
+
+#undef TestImpl
+#undef TestImplXmmAddr
+#undef TestImplXmmXmm
+}
+
 TEST_F(AssemblerX8632Test, Cvt) {
   const Dqword dq2ps32DstValue(-1.0f, -1.0f, -1.0f, -1.0f);
   const Dqword dq2ps32SrcValue(-5, 3, 100, 200);
diff --git a/unittest/AssemblerX8664/XmmArith.cpp b/unittest/AssemblerX8664/XmmArith.cpp
index c037520..837b717 100644
--- a/unittest/AssemblerX8664/XmmArith.cpp
+++ b/unittest/AssemblerX8664/XmmArith.cpp
@@ -1158,6 +1158,81 @@
 #undef TestImplXmmXmm
 }
 
+TEST_F(AssemblerX8664Test, Pshufb) {
+  const Dqword V0(uint64_t(0x1122334455667788ull),
+                  uint64_t(0x99aabbccddeeff32ull));
+  const Dqword V1(uint64_t(0x0204050380060708ull),
+                  uint64_t(0x010306080a8b0c0dull));
+  // Each V1 byte picks a V0 byte by its low 4 bits; 0x80 and 0x8b yield zero.
+  const Dqword Expected(uint64_t(0x6644335500221132ull),
+                        uint64_t(0x77552232ee00ccbbull));
+
+#define TestImplXmmXmm(Dst, Src, Inst)                                         \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")";    \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst,                      \
+            XmmRegister::Encoded_Reg_##Src);                                   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplXmmAddr(Dst, Inst)                                             \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", Addr, " #Inst ")";        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplXmmXmm(Dst, Src, pshufb);                                          \
+    TestImplXmmAddr(Dst, pshufb);                                              \
+  } while (0)
+
+  TestImpl(xmm0, xmm1);
+  TestImpl(xmm1, xmm2);
+  TestImpl(xmm2, xmm3);
+  TestImpl(xmm3, xmm4);
+  TestImpl(xmm4, xmm5);
+  TestImpl(xmm5, xmm6);
+  TestImpl(xmm6, xmm7);
+  TestImpl(xmm7, xmm8);
+  TestImpl(xmm8, xmm9);
+  TestImpl(xmm9, xmm10);
+  TestImpl(xmm10, xmm11);
+  TestImpl(xmm11, xmm12);
+  TestImpl(xmm12, xmm13);
+  TestImpl(xmm13, xmm14);
+  TestImpl(xmm14, xmm15);
+  TestImpl(xmm15, xmm0);
+
+#undef TestImpl
+#undef TestImplXmmAddr
+#undef TestImplXmmXmm
+}
+
 TEST_F(AssemblerX8664Test, Cvt) {
   const Dqword dq2ps32DstValue(-1.0f, -1.0f, -1.0f, -1.0f);
   const Dqword dq2ps32SrcValue(-5, 3, 100, 200);