Optimize lowering of x86 byte and word vector unpack.

BUG=swiftshader:15

Change-Id: Id0d3bed46d00336fc31501c41a26ebe2d4ddd697
Reviewed-on: https://chromium-review.googlesource.com/392626
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/IceAssemblerX86Base.h b/src/IceAssemblerX86Base.h
index f0e87df..c9baf23 100644
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -435,8 +435,8 @@
   void pshufd(Type Ty, XmmRegister dst, XmmRegister src, const Immediate &mask);
   void pshufd(Type Ty, XmmRegister dst, const Address &src,
               const Immediate &mask);
-  void punpckldq(Type, XmmRegister Dst, XmmRegister Src);
-  void punpckldq(Type, XmmRegister Dst, const Address &Src);
+  void punpckl(Type Ty, XmmRegister Dst, XmmRegister Src);
+  void punpckl(Type Ty, XmmRegister Dst, const Address &Src);
   void shufps(Type Ty, XmmRegister dst, XmmRegister src, const Immediate &mask);
   void shufps(Type Ty, XmmRegister dst, const Address &src,
               const Immediate &mask);
diff --git a/src/IceAssemblerX86BaseImpl.h b/src/IceAssemblerX86BaseImpl.h
index 85d9302..e5819ce 100644
--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -1589,25 +1589,41 @@
 }
 
 template <typename TraitsType>
-void AssemblerX86Base<TraitsType>::punpckldq(Type, XmmRegister Dst,
-                                             XmmRegister Src) {
+void AssemblerX86Base<TraitsType>::punpckl(Type Ty, XmmRegister Dst,
+                                           XmmRegister Src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
   emitRexRB(RexTypeIrrelevant, Dst, Src);
   emitUint8(0x0F);
-  emitUint8(0x62);
+  if (Ty == IceType_v4i32 || Ty == IceType_v4f32) {
+    emitUint8(0x62);
+  } else if (Ty == IceType_v8i16) {
+    emitUint8(0x61);
+  } else if (Ty == IceType_v16i8) {
+    emitUint8(0x60);
+  } else {
+    assert(false && "Unexpected vector unpack operand type");
+  }
   emitXmmRegisterOperand(Dst, Src);
 }
 
 template <typename TraitsType>
-void AssemblerX86Base<TraitsType>::punpckldq(Type, XmmRegister Dst,
-                                             const Address &Src) {
+void AssemblerX86Base<TraitsType>::punpckl(Type Ty, XmmRegister Dst,
+                                           const Address &Src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
   emitAddrSizeOverridePrefix();
   emitRex(RexTypeIrrelevant, Src, Dst);
   emitUint8(0x0F);
-  emitUint8(0x62);
+  if (Ty == IceType_v4i32 || Ty == IceType_v4f32) {
+    emitUint8(0x62);
+  } else if (Ty == IceType_v8i16) {
+    emitUint8(0x61);
+  } else if (Ty == IceType_v16i8) {
+    emitUint8(0x60);
+  } else {
+    assert(false && "Unexpected vector unpack operand type");
+  }
   emitOperand(gprEncoding(Dst), Src);
 }
 
diff --git a/src/IceInstX86Base.h b/src/IceInstX86Base.h
index 230df8f..1ce6d6f 100644
--- a/src/IceInstX86Base.h
+++ b/src/IceInstX86Base.h
@@ -3658,8 +3658,8 @@
   template <>                                                                  \
   const InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                       \
       InstImpl<TraitsType>::InstX86Punpckl::Base::Emitter = {                  \
-          &InstImpl<TraitsType>::Assembler::punpckldq,                         \
-          &InstImpl<TraitsType>::Assembler::punpckldq};                        \
+          &InstImpl<TraitsType>::Assembler::punpckl,                           \
+          &InstImpl<TraitsType>::Assembler::punpckl};                          \
   }                                                                            \
   }
 
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index d0a2aa3..9dbf1a9 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -5978,10 +5978,6 @@
     llvm::report_fatal_error("Unexpected vector type.");
   case IceType_v16i1:
   case IceType_v16i8: {
-    if (InstructionSet < Traits::SSE4_1) {
-      // TODO(jpp): figure out how to lower with sse2.
-      break;
-    }
     static constexpr SizeT ExpectedNumElements = 16;
     assert(ExpectedNumElements == Instr->getNumIndexes());
     (void)ExpectedNumElements;
@@ -6001,6 +5997,25 @@
     const SizeT Index13 = Instr->getIndex(13)->getValue();
     const SizeT Index14 = Instr->getIndex(14)->getValue();
     const SizeT Index15 = Instr->getIndex(15)->getValue();
+
+    if (Index0 == 0 && Index1 == 0 && Index2 == 1 && Index3 == 1 &&
+        Index4 == 2 && Index5 == 2 && Index6 == 3 && Index7 == 3 &&
+        Index8 == 4 && Index9 == 4 && Index10 == 5 && Index11 == 5 &&
+        Index12 == 6 && Index13 == 6 && Index14 == 7 && Index15 == 7) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      // All matched indexes are < 16, i.e. Src0 lanes: unpack Src0 with itself.
+      _movp(T, Src0RM);
+      _punpckl(T, Src0RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (InstructionSet < Traits::SSE4_1) {
+      // TODO(jpp): figure out how to lower with sse2.
+      break;
+    }
+
     lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
                                    Index3, Index4, Index5, Index6, Index7,
                                    Index8, Index9, Index10, Index11, Index12,
@@ -6009,10 +6024,6 @@
   }
   case IceType_v8i1:
   case IceType_v8i16: {
-    if (InstructionSet < Traits::SSE4_1) {
-      // TODO(jpp): figure out how to lower with sse2.
-      break;
-    }
     static constexpr SizeT ExpectedNumElements = 8;
     assert(ExpectedNumElements == Instr->getNumIndexes());
     (void)ExpectedNumElements;
@@ -6024,6 +6035,23 @@
     const SizeT Index5 = Instr->getIndex(5)->getValue();
     const SizeT Index6 = Instr->getIndex(6)->getValue();
     const SizeT Index7 = Instr->getIndex(7)->getValue();
+
+    if (Index0 == 0 && Index1 == 0 && Index2 == 1 && Index3 == 1 &&
+        Index4 == 2 && Index5 == 2 && Index6 == 3 && Index7 == 3) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      // All matched indexes are < 8, i.e. Src0 lanes: unpack Src0 with itself.
+      _movp(T, Src0RM);
+      _punpckl(T, Src0RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (InstructionSet < Traits::SSE4_1) {
+      // TODO(jpp): figure out how to lower with sse2.
+      break;
+    }
+
 #define TO_BYTE_INDEX(I) ((I) << 1)
     lowerShuffleVector_UsingPshufb(
         Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
diff --git a/unittest/AssemblerX8632/XmmArith.cpp b/unittest/AssemblerX8632/XmmArith.cpp
index 1d6572c..1571874 100644
--- a/unittest/AssemblerX8632/XmmArith.cpp
+++ b/unittest/AssemblerX8632/XmmArith.cpp
@@ -1017,57 +1017,76 @@
 #undef TestImplSingleXmmXmm
 }
 
-TEST_F(AssemblerX8632Test, Punpckldq) {
-  const Dqword V0(uint64_t(0x1111111122222222ull),
-                  uint64_t(0x5555555577777777ull));
-  const Dqword V1(uint64_t(0xAAAAAAAABBBBBBBBull),
-                  uint64_t(0xCCCCCCCCDDDDDDDDull));
+TEST_F(AssemblerX8632Test, Punpckl) {
+  const Dqword V0_v4i32(uint64_t(0x1111111122222222ull),
+                        uint64_t(0x5555555577777777ull));
+  const Dqword V1_v4i32(uint64_t(0xAAAAAAAABBBBBBBBull),
+                        uint64_t(0xCCCCCCCCDDDDDDDDull));
+  const Dqword Expected_v4i32(uint64_t(0xBBBBBBBB22222222ull),
+                              uint64_t(0xAAAAAAAA11111111ull));
 
-  const Dqword Expected(uint64_t(0xBBBBBBBB22222222ull),
-                        uint64_t(0xAAAAAAAA11111111ull));
+  const Dqword V0_v8i16(uint64_t(0x1111222233334444ull),
+                        uint64_t(0x5555666677778888ull));
+  const Dqword V1_v8i16(uint64_t(0xAAAABBBBCCCCDDDDull),
+                        uint64_t(0xEEEEFFFF00009999ull));
+  const Dqword Expected_v8i16(uint64_t(0xCCCC3333DDDD4444ull),
+                              uint64_t(0xAAAA1111BBBB2222ull));
 
-#define TestImplXmmXmm(Dst, Src, Inst)                                         \
+  const Dqword V0_v16i8(uint64_t(0x1122334455667788ull),
+                        uint64_t(0x99AABBCCDDEEFF00ull));
+  const Dqword V1_v16i8(uint64_t(0xFFEEDDCCBBAA9900ull),
+                        uint64_t(0xBAADF00DFEEDFACEull));
+  const Dqword Expected_v16i8(uint64_t(0xBB55AA6699770088ull),
+                              uint64_t(0xFF11EE22DD33CC44ull));
+
+#define TestImplXmmXmm(Dst, Src, Inst, Ty)                                     \
   do {                                                                         \
-    static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")";    \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Src ", " #Inst ", " #Ty ")";                            \
     const uint32_t T0 = allocateDqword();                                      \
     const uint32_t T1 = allocateDqword();                                      \
                                                                                \
     __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
     __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
-    __ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst,                      \
+    __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst,                      \
             XmmRegister::Encoded_Reg_##Src);                                   \
                                                                                \
     AssembledTest test = assemble();                                           \
-    test.setDqwordTo(T0, V0);                                                  \
-    test.setDqwordTo(T1, V1);                                                  \
+    test.setDqwordTo(T0, V0_##Ty);                                             \
+    test.setDqwordTo(T1, V1_##Ty);                                             \
     test.run();                                                                \
                                                                                \
-    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString;                \
     reset();                                                                   \
   } while (0)
 
-#define TestImplXmmAddr(Dst, Inst)                                             \
+#define TestImplXmmAddr(Dst, Inst, Ty)                                         \
   do {                                                                         \
-    static constexpr char TestString[] = "(" #Dst ", Addr, " #Inst ")";        \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", Addr, " #Inst ", " #Ty ")";                                \
     const uint32_t T0 = allocateDqword();                                      \
     const uint32_t T1 = allocateDqword();                                      \
                                                                                \
     __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
-    __ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));   \
+    __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));   \
                                                                                \
     AssembledTest test = assemble();                                           \
-    test.setDqwordTo(T0, V0);                                                  \
-    test.setDqwordTo(T1, V1);                                                  \
+    test.setDqwordTo(T0, V0_##Ty);                                             \
+    test.setDqwordTo(T1, V1_##Ty);                                             \
     test.run();                                                                \
                                                                                \
-    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString;                \
     reset();                                                                   \
   } while (0)
 
 #define TestImpl(Dst, Src)                                                     \
   do {                                                                         \
-    TestImplXmmXmm(Dst, Src, punpckldq);                                       \
-    TestImplXmmAddr(Dst, punpckldq);                                           \
+    TestImplXmmXmm(Dst, Src, punpckl, v4i32);                                  \
+    TestImplXmmAddr(Dst, punpckl, v4i32);                                      \
+    TestImplXmmXmm(Dst, Src, punpckl, v8i16);                                  \
+    TestImplXmmAddr(Dst, punpckl, v8i16);                                      \
+    TestImplXmmXmm(Dst, Src, punpckl, v16i8);                                  \
+    TestImplXmmAddr(Dst, punpckl, v16i8);                                      \
   } while (0)
 
   TestImpl(xmm0, xmm1);
diff --git a/unittest/AssemblerX8664/XmmArith.cpp b/unittest/AssemblerX8664/XmmArith.cpp
index 837b717..cadb88e 100644
--- a/unittest/AssemblerX8664/XmmArith.cpp
+++ b/unittest/AssemblerX8664/XmmArith.cpp
@@ -1083,57 +1083,76 @@
 #undef TestImplSingleXmmXmm
 }
 
-TEST_F(AssemblerX8664Test, Punpckldq) {
-  const Dqword V0(uint64_t(0x1111111122222222ull),
-                  uint64_t(0x5555555577777777ull));
-  const Dqword V1(uint64_t(0xAAAAAAAABBBBBBBBull),
-                  uint64_t(0xCCCCCCCCDDDDDDDDull));
+TEST_F(AssemblerX8664Test, Punpckl) {
+  const Dqword V0_v4i32(uint64_t(0x1111111122222222ull),
+                        uint64_t(0x5555555577777777ull));
+  const Dqword V1_v4i32(uint64_t(0xAAAAAAAABBBBBBBBull),
+                        uint64_t(0xCCCCCCCCDDDDDDDDull));
+  const Dqword Expected_v4i32(uint64_t(0xBBBBBBBB22222222ull),
+                              uint64_t(0xAAAAAAAA11111111ull));
 
-  const Dqword Expected(uint64_t(0xBBBBBBBB22222222ull),
-                        uint64_t(0xAAAAAAAA11111111ull));
+  const Dqword V0_v8i16(uint64_t(0x1111222233334444ull),
+                        uint64_t(0x5555666677778888ull));
+  const Dqword V1_v8i16(uint64_t(0xAAAABBBBCCCCDDDDull),
+                        uint64_t(0xEEEEFFFF00009999ull));
+  const Dqword Expected_v8i16(uint64_t(0xCCCC3333DDDD4444ull),
+                              uint64_t(0xAAAA1111BBBB2222ull));
 
-#define TestImplXmmXmm(Dst, Src, Inst)                                         \
+  const Dqword V0_v16i8(uint64_t(0x1122334455667788ull),
+                        uint64_t(0x99AABBCCDDEEFF00ull));
+  const Dqword V1_v16i8(uint64_t(0xFFEEDDCCBBAA9900ull),
+                        uint64_t(0xBAADF00DFEEDFACEull));
+  const Dqword Expected_v16i8(uint64_t(0xBB55AA6699770088ull),
+                              uint64_t(0xFF11EE22DD33CC44ull));
+
+#define TestImplXmmXmm(Dst, Src, Inst, Ty)                                     \
   do {                                                                         \
-    static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")";    \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Src ", " #Inst ", " #Ty ")";                            \
     const uint32_t T0 = allocateDqword();                                      \
     const uint32_t T1 = allocateDqword();                                      \
                                                                                \
     __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
     __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
-    __ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst,                      \
+    __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst,                      \
             XmmRegister::Encoded_Reg_##Src);                                   \
                                                                                \
     AssembledTest test = assemble();                                           \
-    test.setDqwordTo(T0, V0);                                                  \
-    test.setDqwordTo(T1, V1);                                                  \
+    test.setDqwordTo(T0, V0_##Ty);                                             \
+    test.setDqwordTo(T1, V1_##Ty);                                             \
     test.run();                                                                \
                                                                                \
-    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString;                \
     reset();                                                                   \
   } while (0)
 
-#define TestImplXmmAddr(Dst, Inst)                                             \
+#define TestImplXmmAddr(Dst, Inst, Ty)                                         \
   do {                                                                         \
-    static constexpr char TestString[] = "(" #Dst ", Addr, " #Inst ")";        \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", Addr, " #Inst ", " #Ty ")";                                \
     const uint32_t T0 = allocateDqword();                                      \
     const uint32_t T1 = allocateDqword();                                      \
                                                                                \
     __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
-    __ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));   \
+    __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));   \
                                                                                \
     AssembledTest test = assemble();                                           \
-    test.setDqwordTo(T0, V0);                                                  \
-    test.setDqwordTo(T1, V1);                                                  \
+    test.setDqwordTo(T0, V0_##Ty);                                             \
+    test.setDqwordTo(T1, V1_##Ty);                                             \
     test.run();                                                                \
                                                                                \
-    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString;                \
     reset();                                                                   \
   } while (0)
 
 #define TestImpl(Dst, Src)                                                     \
   do {                                                                         \
-    TestImplXmmXmm(Dst, Src, punpckldq);                                       \
-    TestImplXmmAddr(Dst, punpckldq);                                           \
+    TestImplXmmXmm(Dst, Src, punpckl, v4i32);                                  \
+    TestImplXmmAddr(Dst, punpckl, v4i32);                                      \
+    TestImplXmmXmm(Dst, Src, punpckl, v8i16);                                  \
+    TestImplXmmAddr(Dst, punpckl, v8i16);                                      \
+    TestImplXmmXmm(Dst, Src, punpckl, v16i8);                                  \
+    TestImplXmmAddr(Dst, punpckl, v16i8);                                      \
   } while (0)
 
   TestImpl(xmm0, xmm1);
@@ -1143,15 +1162,7 @@
   TestImpl(xmm4, xmm5);
   TestImpl(xmm5, xmm6);
   TestImpl(xmm6, xmm7);
-  TestImpl(xmm7, xmm8);
-  TestImpl(xmm8, xmm9);
-  TestImpl(xmm9, xmm10);
-  TestImpl(xmm10, xmm11);
-  TestImpl(xmm11, xmm12);
-  TestImpl(xmm12, xmm13);
-  TestImpl(xmm13, xmm14);
-  TestImpl(xmm14, xmm15);
-  TestImpl(xmm15, xmm0);
+  TestImpl(xmm7, xmm0);
 
 #undef TestImpl
 #undef TestImplXmmAddr