Optimize shuffles corresponding to x86 punpckh instructions.

BUG=swiftshader:15

Change-Id: I04a7c4206f3936c604ec623e43834c2a153fd3cb
Reviewed-on: https://chromium-review.googlesource.com/399379
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
diff --git a/src/IceAssemblerX86Base.h b/src/IceAssemblerX86Base.h
index be0ff00..7ba22f2 100644
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -437,6 +437,8 @@
               const Immediate &mask);
   void punpckl(Type Ty, XmmRegister Dst, XmmRegister Src);
   void punpckl(Type Ty, XmmRegister Dst, const Address &Src);
+  void punpckh(Type Ty, XmmRegister Dst, XmmRegister Src);
+  void punpckh(Type Ty, XmmRegister Dst, const Address &Src);
   void packss(Type Ty, XmmRegister Dst, XmmRegister Src);
   void packss(Type Ty, XmmRegister Dst, const Address &Src);
   void packus(Type Ty, XmmRegister Dst, XmmRegister Src);
diff --git a/src/IceAssemblerX86BaseImpl.h b/src/IceAssemblerX86BaseImpl.h
index 11da946..adf9105 100644
--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -1628,6 +1628,45 @@
 }
 
 template <typename TraitsType>
+void AssemblerX86Base<TraitsType>::punpckh(Type Ty, XmmRegister Dst,
+                                           XmmRegister Src) { // reg, reg
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitUint8(0x66); // 66 prefix
+  emitRexRB(RexTypeIrrelevant, Dst, Src);
+  emitUint8(0x0F); // two-byte opcode escape
+  if (Ty == IceType_v4i32 || Ty == IceType_v4f32) {
+    emitUint8(0x6A); // punpckhdq: 32-bit lanes
+  } else if (Ty == IceType_v8i16) {
+    emitUint8(0x69); // punpckhwd: 16-bit lanes
+  } else if (Ty == IceType_v16i8) {
+    emitUint8(0x68); // punpckhbw: 8-bit lanes
+  } else {
+    assert(false && "Unexpected vector unpack operand type");
+  }
+  emitXmmRegisterOperand(Dst, Src);
+}
+
+template <typename TraitsType>
+void AssemblerX86Base<TraitsType>::punpckh(Type Ty, XmmRegister Dst,
+                                           const Address &Src) { // reg, mem
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitUint8(0x66); // 66 prefix
+  emitAddrSizeOverridePrefix();
+  emitRex(RexTypeIrrelevant, Src, Dst);
+  emitUint8(0x0F); // two-byte opcode escape
+  if (Ty == IceType_v4i32 || Ty == IceType_v4f32) {
+    emitUint8(0x6A); // punpckhdq: 32-bit lanes
+  } else if (Ty == IceType_v8i16) {
+    emitUint8(0x69); // punpckhwd: 16-bit lanes
+  } else if (Ty == IceType_v16i8) {
+    emitUint8(0x68); // punpckhbw: 8-bit lanes
+  } else {
+    assert(false && "Unexpected vector unpack operand type");
+  }
+  emitOperand(gprEncoding(Dst), Src);
+}
+
+template <typename TraitsType>
 void AssemblerX86Base<TraitsType>::packss(Type Ty, XmmRegister Dst,
                                           XmmRegister Src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
diff --git a/src/IceInst.h b/src/IceInst.h
index d038d53..26119ea 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -997,6 +997,37 @@
     return Indexes[Pos];
   }
 
+  inline bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3,
+                         int32_t i4, int32_t i5, int32_t i6, int32_t i7) const {
+    static constexpr SizeT ExpectedNumElements = 8;
+    assert(ExpectedNumElements == getNumIndexes()); // 8-index shuffles only
+    (void)ExpectedNumElements; // unused when asserts are compiled out
+    // True iff shuffle index k equals argument ik for every k in [0, 7].
+    return getIndex(0)->getValue() == i0 && getIndex(1)->getValue() == i1 &&
+           getIndex(2)->getValue() == i2 && getIndex(3)->getValue() == i3 &&
+           getIndex(4)->getValue() == i4 && getIndex(5)->getValue() == i5 &&
+           getIndex(6)->getValue() == i6 && getIndex(7)->getValue() == i7;
+  }
+
+  inline bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3,
+                         int32_t i4, int32_t i5, int32_t i6, int32_t i7,
+                         int32_t i8, int32_t i9, int32_t i10, int32_t i11,
+                         int32_t i12, int32_t i13, int32_t i14,
+                         int32_t i15) const {
+    static constexpr SizeT ExpectedNumElements = 16;
+    assert(ExpectedNumElements == getNumIndexes()); // 16-index shuffles only
+    (void)ExpectedNumElements; // unused when asserts are compiled out
+    // True iff shuffle index k equals argument ik for every k in [0, 15].
+    return getIndex(0)->getValue() == i0 && getIndex(1)->getValue() == i1 &&
+           getIndex(2)->getValue() == i2 && getIndex(3)->getValue() == i3 &&
+           getIndex(4)->getValue() == i4 && getIndex(5)->getValue() == i5 &&
+           getIndex(6)->getValue() == i6 && getIndex(7)->getValue() == i7 &&
+           getIndex(8)->getValue() == i8 && getIndex(9)->getValue() == i9 &&
+           getIndex(10)->getValue() == i10 && getIndex(11)->getValue() == i11 &&
+           getIndex(12)->getValue() == i12 && getIndex(13)->getValue() == i13 &&
+           getIndex(14)->getValue() == i14 && getIndex(15)->getValue() == i15;
+  }
+
   bool isMemoryWrite() const override { return false; }
   void dump(const Cfg *Func) const override;
   static bool classof(const Inst *Instr) {
diff --git a/src/IceInstX86Base.h b/src/IceInstX86Base.h
index 7e96de5..09025aa 100644
--- a/src/IceInstX86Base.h
+++ b/src/IceInstX86Base.h
@@ -145,6 +145,7 @@
       Pshufb,
       Pshufd,
       Punpckl,
+      Punpckh,
       Packss,
       Packus,
       Psll,
@@ -2932,6 +2933,22 @@
                                                               Source) {}
   };
 
+  class InstX86Punpckh // punpckh modeled as a binary xmm instruction
+      : public InstX86BaseBinopXmm<InstX86Base::Punpckh, false,
+                                   InstX86Base::SseSuffix::Unpack> {
+  public: // factory
+    static InstX86Punpckh *create(Cfg *Func, Variable *Dest, Operand *Source) {
+      return new (Func->allocate<InstX86Punpckh>()) // placement new
+          InstX86Punpckh(Func, Dest, Source);
+    }
+
+  private: // construct only through create()
+    InstX86Punpckh(Cfg *Func, Variable *Dest, Operand *Source)
+        : InstX86BaseBinopXmm<InstX86Base::Punpckh, false,
+                              InstX86Base::SseSuffix::Unpack>(Func, Dest,
+                                                              Source) {}
+  };
+
   class InstX86Packss
       : public InstX86BaseBinopXmm<InstX86Base::Packss, false,
                                    InstX86Base::SseSuffix::Pack> {
@@ -3089,6 +3106,7 @@
 
   using Pshufb = typename InstImpl<TraitsType>::InstX86Pshufb;
   using Punpckl = typename InstImpl<TraitsType>::InstX86Punpckl;
+  using Punpckh = typename InstImpl<TraitsType>::InstX86Punpckh;
   using Packss = typename InstImpl<TraitsType>::InstX86Packss;
   using Packus = typename InstImpl<TraitsType>::InstX86Packus;
 };
@@ -3328,6 +3346,9 @@
   const char *InstImpl<TraitsType>::InstX86Punpckl::Base::Opcode = "punpckl";  \
   template <>                                                                  \
   template <>                                                                  \
+  const char *InstImpl<TraitsType>::InstX86Punpckh::Base::Opcode = "punpckh";  \
+  template <>                                                                  \
+  template <>                                                                  \
   const char *InstImpl<TraitsType>::InstX86Packss::Base::Opcode = "packss";    \
   template <>                                                                  \
   template <>                                                                  \
@@ -3708,6 +3729,12 @@
   template <>                                                                  \
   template <>                                                                  \
   const InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                       \
+      InstImpl<TraitsType>::InstX86Punpckh::Base::Emitter = {                  \
+          &InstImpl<TraitsType>::Assembler::punpckh,                           \
+          &InstImpl<TraitsType>::Assembler::punpckh};                          \
+  template <>                                                                  \
+  template <>                                                                  \
+  const InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                       \
       InstImpl<TraitsType>::InstX86Packss::Base::Emitter = {                   \
           &InstImpl<TraitsType>::Assembler::packss,                            \
           &InstImpl<TraitsType>::Assembler::packss};                           \
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 0f31dd4..7b1e730 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -815,6 +815,10 @@
     AutoMemorySandboxer<> _(this, &Dest, &Src0);
     Context.insert<typename Traits::Insts::Punpckl>(Dest, Src0);
   }
+  void _punpckh(Variable *Dest, Operand *Src0) { // insert a Punpckh
+    AutoMemorySandboxer<> _(this, &Dest, &Src0);
+    Context.insert<typename Traits::Insts::Punpckh>(Dest, Src0);
+  }
   void _packss(Variable *Dest, Operand *Src0) {
     AutoMemorySandboxer<> _(this, &Dest, &Src0);
     Context.insert<typename Traits::Insts::Packss>(Dest, Src0);
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index eb0f67d..87bbfde 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -6002,6 +6002,53 @@
     static constexpr SizeT ExpectedNumElements = 16;
     assert(ExpectedNumElements == Instr->getNumIndexes());
     (void)ExpectedNumElements;
+
+    if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckl(T, Src0RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
+                          23)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckl(T, Src1RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
+                          15, 15)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckh(T, Src0RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30,
+                          15, 31)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckh(T, Src1RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (InstructionSet < Traits::SSE4_1) {
+      // TODO(jpp): figure out how to lower with sse2.
+      break;
+    }
+
     const SizeT Index0 = Instr->getIndex(0)->getValue();
     const SizeT Index1 = Instr->getIndex(1)->getValue();
     const SizeT Index2 = Instr->getIndex(2)->getValue();
@@ -6019,23 +6066,6 @@
     const SizeT Index14 = Instr->getIndex(14)->getValue();
     const SizeT Index15 = Instr->getIndex(15)->getValue();
 
-    if (Index0 == 0 && Index1 == 0 && Index2 == 1 && Index3 == 1 &&
-        Index4 == 2 && Index5 == 2 && Index6 == 3 && Index7 == 3 &&
-        Index8 == 4 && Index9 == 4 && Index10 == 5 && Index11 == 5 &&
-        Index12 == 6 && Index13 == 6 && Index14 == 7 && Index15 == 7) {
-      auto *T = makeReg(DestTy);
-      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      _movp(T, Src0RM);
-      _punpckl(T, Src0RM);
-      _movp(Dest, T);
-      return;
-    }
-
-    if (InstructionSet < Traits::SSE4_1) {
-      // TODO(jpp): figure out how to lower with sse2.
-      break;
-    }
-
     lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
                                    Index3, Index4, Index5, Index6, Index7,
                                    Index8, Index9, Index10, Index11, Index12,
@@ -6047,17 +6077,8 @@
     static constexpr SizeT ExpectedNumElements = 8;
     assert(ExpectedNumElements == Instr->getNumIndexes());
     (void)ExpectedNumElements;
-    const SizeT Index0 = Instr->getIndex(0)->getValue();
-    const SizeT Index1 = Instr->getIndex(1)->getValue();
-    const SizeT Index2 = Instr->getIndex(2)->getValue();
-    const SizeT Index3 = Instr->getIndex(3)->getValue();
-    const SizeT Index4 = Instr->getIndex(4)->getValue();
-    const SizeT Index5 = Instr->getIndex(5)->getValue();
-    const SizeT Index6 = Instr->getIndex(6)->getValue();
-    const SizeT Index7 = Instr->getIndex(7)->getValue();
 
-    if (Index0 == 0 && Index1 == 0 && Index2 == 1 && Index3 == 1 &&
-        Index4 == 2 && Index5 == 2 && Index6 == 3 && Index7 == 3) {
+    if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
       auto *T = makeReg(DestTy);
       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
       _movp(T, Src0RM);
@@ -6066,11 +6087,49 @@
       return;
     }
 
+    if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) { // low-half interleave
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM); // T starts as Src0 (even result lanes).
+      _punpckl(T, Src1RM); // Indexes 8..11 select from Src1, not Src0.
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(4, 4, 5, 5, 6, 6, 7, 7)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckh(T, Src0RM);
+      _movp(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(4, 12, 5, 13, 6, 14, 7, 15)) {
+      auto *T = makeReg(DestTy);
+      auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+      _movp(T, Src0RM);
+      _punpckh(T, Src1RM);
+      _movp(Dest, T);
+      return;
+    }
+
     if (InstructionSet < Traits::SSE4_1) {
       // TODO(jpp): figure out how to lower with sse2.
       break;
     }
 
+    const SizeT Index0 = Instr->getIndex(0)->getValue();
+    const SizeT Index1 = Instr->getIndex(1)->getValue();
+    const SizeT Index2 = Instr->getIndex(2)->getValue();
+    const SizeT Index3 = Instr->getIndex(3)->getValue();
+    const SizeT Index4 = Instr->getIndex(4)->getValue();
+    const SizeT Index5 = Instr->getIndex(5)->getValue();
+    const SizeT Index6 = Instr->getIndex(6)->getValue();
+    const SizeT Index7 = Instr->getIndex(7)->getValue();
+
 #define TO_BYTE_INDEX(I) ((I) << 1)
     lowerShuffleVector_UsingPshufb(
         Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,