Optimize lowering of x86 byte and word vector unpack.
BUG=swiftshader:15
Change-Id: Id0d3bed46d00336fc31501c41a26ebe2d4ddd697
Reviewed-on: https://chromium-review.googlesource.com/392626
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/IceAssemblerX86Base.h b/src/IceAssemblerX86Base.h
index f0e87df..c9baf23 100644
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -435,8 +435,8 @@
void pshufd(Type Ty, XmmRegister dst, XmmRegister src, const Immediate &mask);
void pshufd(Type Ty, XmmRegister dst, const Address &src,
const Immediate &mask);
- void punpckldq(Type, XmmRegister Dst, XmmRegister Src);
- void punpckldq(Type, XmmRegister Dst, const Address &Src);
+ void punpckl(Type Ty, XmmRegister Dst, XmmRegister Src);
+ void punpckl(Type Ty, XmmRegister Dst, const Address &Src);
void shufps(Type Ty, XmmRegister dst, XmmRegister src, const Immediate &mask);
void shufps(Type Ty, XmmRegister dst, const Address &src,
const Immediate &mask);
diff --git a/src/IceAssemblerX86BaseImpl.h b/src/IceAssemblerX86BaseImpl.h
index 85d9302..e5819ce 100644
--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -1589,25 +1589,41 @@
}
template <typename TraitsType>
-void AssemblerX86Base<TraitsType>::punpckldq(Type, XmmRegister Dst,
- XmmRegister Src) {
+void AssemblerX86Base<TraitsType>::punpckl(Type Ty, XmmRegister Dst,
+ XmmRegister Src) {
AssemblerBuffer::EnsureCapacity ensured(&Buffer);
emitUint8(0x66);
emitRexRB(RexTypeIrrelevant, Dst, Src);
emitUint8(0x0F);
- emitUint8(0x62);
+ if (Ty == IceType_v4i32 || Ty == IceType_v4f32) {
+ emitUint8(0x62);
+ } else if (Ty == IceType_v8i16) {
+ emitUint8(0x61);
+ } else if (Ty == IceType_v16i8) {
+ emitUint8(0x60);
+ } else {
+ assert(false && "Unexpected vector unpack operand type");
+ }
emitXmmRegisterOperand(Dst, Src);
}
template <typename TraitsType>
-void AssemblerX86Base<TraitsType>::punpckldq(Type, XmmRegister Dst,
- const Address &Src) {
+void AssemblerX86Base<TraitsType>::punpckl(Type Ty, XmmRegister Dst,
+ const Address &Src) {
AssemblerBuffer::EnsureCapacity ensured(&Buffer);
emitUint8(0x66);
emitAddrSizeOverridePrefix();
emitRex(RexTypeIrrelevant, Src, Dst);
emitUint8(0x0F);
- emitUint8(0x62);
+ if (Ty == IceType_v4i32 || Ty == IceType_v4f32) {
+ emitUint8(0x62);
+ } else if (Ty == IceType_v8i16) {
+ emitUint8(0x61);
+ } else if (Ty == IceType_v16i8) {
+ emitUint8(0x60);
+ } else {
+ assert(false && "Unexpected vector unpack operand type");
+ }
emitOperand(gprEncoding(Dst), Src);
}
diff --git a/src/IceInstX86Base.h b/src/IceInstX86Base.h
index 230df8f..1ce6d6f 100644
--- a/src/IceInstX86Base.h
+++ b/src/IceInstX86Base.h
@@ -3658,8 +3658,8 @@
template <> \
const InstImpl<TraitsType>::Assembler::XmmEmitterRegOp \
InstImpl<TraitsType>::InstX86Punpckl::Base::Emitter = { \
- &InstImpl<TraitsType>::Assembler::punpckldq, \
- &InstImpl<TraitsType>::Assembler::punpckldq}; \
+ &InstImpl<TraitsType>::Assembler::punpckl, \
+ &InstImpl<TraitsType>::Assembler::punpckl}; \
} \
}
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index d0a2aa3..9dbf1a9 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -5978,10 +5978,6 @@
llvm::report_fatal_error("Unexpected vector type.");
case IceType_v16i1:
case IceType_v16i8: {
- if (InstructionSet < Traits::SSE4_1) {
- // TODO(jpp): figure out how to lower with sse2.
- break;
- }
static constexpr SizeT ExpectedNumElements = 16;
assert(ExpectedNumElements == Instr->getNumIndexes());
(void)ExpectedNumElements;
@@ -6001,6 +5997,25 @@
const SizeT Index13 = Instr->getIndex(13)->getValue();
const SizeT Index14 = Instr->getIndex(14)->getValue();
const SizeT Index15 = Instr->getIndex(15)->getValue();
+
+ if (Index0 == 0 && Index1 == 0 && Index2 == 1 && Index3 == 1 &&
+ Index4 == 2 && Index5 == 2 && Index6 == 3 && Index7 == 3 &&
+ Index8 == 4 && Index9 == 4 && Index10 == 5 && Index11 == 5 &&
+ Index12 == 6 && Index13 == 6 && Index14 == 7 && Index15 == 7) {
+ auto *T = makeReg(DestTy);
+ auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+ // Every index is < 16, i.e. selects Src0: unpack Src0 with itself.
+ _movp(T, Src0RM);
+ _punpckl(T, Src0RM);
+ _movp(Dest, T);
+ return;
+ }
+
+ if (InstructionSet < Traits::SSE4_1) {
+ // TODO(jpp): figure out how to lower with sse2.
+ break;
+ }
+
lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
Index3, Index4, Index5, Index6, Index7,
Index8, Index9, Index10, Index11, Index12,
@@ -6009,10 +6024,6 @@
}
case IceType_v8i1:
case IceType_v8i16: {
- if (InstructionSet < Traits::SSE4_1) {
- // TODO(jpp): figure out how to lower with sse2.
- break;
- }
static constexpr SizeT ExpectedNumElements = 8;
assert(ExpectedNumElements == Instr->getNumIndexes());
(void)ExpectedNumElements;
@@ -6024,6 +6035,23 @@
const SizeT Index5 = Instr->getIndex(5)->getValue();
const SizeT Index6 = Instr->getIndex(6)->getValue();
const SizeT Index7 = Instr->getIndex(7)->getValue();
+
+ if (Index0 == 0 && Index1 == 0 && Index2 == 1 && Index3 == 1 &&
+ Index4 == 2 && Index5 == 2 && Index6 == 3 && Index7 == 3) {
+ auto *T = makeReg(DestTy);
+ auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+ // Every index is < 8, i.e. selects Src0: unpack Src0 with itself.
+ _movp(T, Src0RM);
+ _punpckl(T, Src0RM);
+ _movp(Dest, T);
+ return;
+ }
+
+ if (InstructionSet < Traits::SSE4_1) {
+ // TODO(jpp): figure out how to lower with sse2.
+ break;
+ }
+
#define TO_BYTE_INDEX(I) ((I) << 1)
lowerShuffleVector_UsingPshufb(
Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
diff --git a/unittest/AssemblerX8632/XmmArith.cpp b/unittest/AssemblerX8632/XmmArith.cpp
index 1d6572c..1571874 100644
--- a/unittest/AssemblerX8632/XmmArith.cpp
+++ b/unittest/AssemblerX8632/XmmArith.cpp
@@ -1017,57 +1017,76 @@
#undef TestImplSingleXmmXmm
}
-TEST_F(AssemblerX8632Test, Punpckldq) {
- const Dqword V0(uint64_t(0x1111111122222222ull),
- uint64_t(0x5555555577777777ull));
- const Dqword V1(uint64_t(0xAAAAAAAABBBBBBBBull),
- uint64_t(0xCCCCCCCCDDDDDDDDull));
+TEST_F(AssemblerX8632Test, Punpckl) {
+ const Dqword V0_v4i32(uint64_t(0x1111111122222222ull),
+ uint64_t(0x5555555577777777ull));
+ const Dqword V1_v4i32(uint64_t(0xAAAAAAAABBBBBBBBull),
+ uint64_t(0xCCCCCCCCDDDDDDDDull));
+ const Dqword Expected_v4i32(uint64_t(0xBBBBBBBB22222222ull),
+ uint64_t(0xAAAAAAAA11111111ull));
- const Dqword Expected(uint64_t(0xBBBBBBBB22222222ull),
- uint64_t(0xAAAAAAAA11111111ull));
+ const Dqword V0_v8i16(uint64_t(0x1111222233334444ull),
+ uint64_t(0x5555666677778888ull));
+ const Dqword V1_v8i16(uint64_t(0xAAAABBBBCCCCDDDDull),
+ uint64_t(0xEEEEFFFF00009999ull));
+ const Dqword Expected_v8i16(uint64_t(0xCCCC3333DDDD4444ull),
+ uint64_t(0xAAAA1111BBBB2222ull));
-#define TestImplXmmXmm(Dst, Src, Inst) \
+ const Dqword V0_v16i8(uint64_t(0x1122334455667788ull),
+ uint64_t(0x99AABBCCDDEEFF00ull));
+ const Dqword V1_v16i8(uint64_t(0xFFEEDDCCBBAA9900ull),
+ uint64_t(0xBAADF00DFEEDFACEull));
+ const Dqword Expected_v16i8(uint64_t(0xBB55AA6699770088ull),
+ uint64_t(0xFF11EE22DD33CC44ull));
+
+#define TestImplXmmXmm(Dst, Src, Inst, Ty) \
do { \
- static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")"; \
+ static constexpr char TestString[] = \
+ "(" #Dst ", " #Src ", " #Inst ", " #Ty ")"; \
const uint32_t T0 = allocateDqword(); \
const uint32_t T1 = allocateDqword(); \
\
__ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0)); \
__ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1)); \
- __ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, \
+ __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, \
XmmRegister::Encoded_Reg_##Src); \
\
AssembledTest test = assemble(); \
- test.setDqwordTo(T0, V0); \
- test.setDqwordTo(T1, V1); \
+ test.setDqwordTo(T0, V0_##Ty); \
+ test.setDqwordTo(T1, V1_##Ty); \
test.run(); \
\
- ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString; \
+ ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString; \
reset(); \
} while (0)
-#define TestImplXmmAddr(Dst, Inst) \
+#define TestImplXmmAddr(Dst, Inst, Ty) \
do { \
- static constexpr char TestString[] = "(" #Dst ", Addr, " #Inst ")"; \
+ static constexpr char TestString[] = \
+ "(" #Dst ", Addr, " #Inst ", " #Ty ")"; \
const uint32_t T0 = allocateDqword(); \
const uint32_t T1 = allocateDqword(); \
\
__ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0)); \
- __ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1)); \
+ __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1)); \
\
AssembledTest test = assemble(); \
- test.setDqwordTo(T0, V0); \
- test.setDqwordTo(T1, V1); \
+ test.setDqwordTo(T0, V0_##Ty); \
+ test.setDqwordTo(T1, V1_##Ty); \
test.run(); \
\
- ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString; \
+ ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString; \
reset(); \
} while (0)
#define TestImpl(Dst, Src) \
do { \
- TestImplXmmXmm(Dst, Src, punpckldq); \
- TestImplXmmAddr(Dst, punpckldq); \
+ TestImplXmmXmm(Dst, Src, punpckl, v4i32); \
+ TestImplXmmAddr(Dst, punpckl, v4i32); \
+ TestImplXmmXmm(Dst, Src, punpckl, v8i16); \
+ TestImplXmmAddr(Dst, punpckl, v8i16); \
+ TestImplXmmXmm(Dst, Src, punpckl, v16i8); \
+ TestImplXmmAddr(Dst, punpckl, v16i8); \
} while (0)
TestImpl(xmm0, xmm1);
diff --git a/unittest/AssemblerX8664/XmmArith.cpp b/unittest/AssemblerX8664/XmmArith.cpp
index 837b717..cadb88e 100644
--- a/unittest/AssemblerX8664/XmmArith.cpp
+++ b/unittest/AssemblerX8664/XmmArith.cpp
@@ -1083,57 +1083,76 @@
#undef TestImplSingleXmmXmm
}
-TEST_F(AssemblerX8664Test, Punpckldq) {
- const Dqword V0(uint64_t(0x1111111122222222ull),
- uint64_t(0x5555555577777777ull));
- const Dqword V1(uint64_t(0xAAAAAAAABBBBBBBBull),
- uint64_t(0xCCCCCCCCDDDDDDDDull));
+TEST_F(AssemblerX8664Test, Punpckl) {
+ const Dqword V0_v4i32(uint64_t(0x1111111122222222ull),
+ uint64_t(0x5555555577777777ull));
+ const Dqword V1_v4i32(uint64_t(0xAAAAAAAABBBBBBBBull),
+ uint64_t(0xCCCCCCCCDDDDDDDDull));
+ const Dqword Expected_v4i32(uint64_t(0xBBBBBBBB22222222ull),
+ uint64_t(0xAAAAAAAA11111111ull));
- const Dqword Expected(uint64_t(0xBBBBBBBB22222222ull),
- uint64_t(0xAAAAAAAA11111111ull));
+ const Dqword V0_v8i16(uint64_t(0x1111222233334444ull),
+ uint64_t(0x5555666677778888ull));
+ const Dqword V1_v8i16(uint64_t(0xAAAABBBBCCCCDDDDull),
+ uint64_t(0xEEEEFFFF00009999ull));
+ const Dqword Expected_v8i16(uint64_t(0xCCCC3333DDDD4444ull),
+ uint64_t(0xAAAA1111BBBB2222ull));
-#define TestImplXmmXmm(Dst, Src, Inst) \
+ const Dqword V0_v16i8(uint64_t(0x1122334455667788ull),
+ uint64_t(0x99AABBCCDDEEFF00ull));
+ const Dqword V1_v16i8(uint64_t(0xFFEEDDCCBBAA9900ull),
+ uint64_t(0xBAADF00DFEEDFACEull));
+ const Dqword Expected_v16i8(uint64_t(0xBB55AA6699770088ull),
+ uint64_t(0xFF11EE22DD33CC44ull));
+
+#define TestImplXmmXmm(Dst, Src, Inst, Ty) \
do { \
- static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")"; \
+ static constexpr char TestString[] = \
+ "(" #Dst ", " #Src ", " #Inst ", " #Ty ")"; \
const uint32_t T0 = allocateDqword(); \
const uint32_t T1 = allocateDqword(); \
\
__ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0)); \
__ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1)); \
- __ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, \
+ __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, \
XmmRegister::Encoded_Reg_##Src); \
\
AssembledTest test = assemble(); \
- test.setDqwordTo(T0, V0); \
- test.setDqwordTo(T1, V1); \
+ test.setDqwordTo(T0, V0_##Ty); \
+ test.setDqwordTo(T1, V1_##Ty); \
test.run(); \
\
- ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString; \
+ ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString; \
reset(); \
} while (0)
-#define TestImplXmmAddr(Dst, Inst) \
+#define TestImplXmmAddr(Dst, Inst, Ty) \
do { \
- static constexpr char TestString[] = "(" #Dst ", Addr, " #Inst ")"; \
+ static constexpr char TestString[] = \
+ "(" #Dst ", Addr, " #Inst ", " #Ty ")"; \
const uint32_t T0 = allocateDqword(); \
const uint32_t T1 = allocateDqword(); \
\
__ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0)); \
- __ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1)); \
+ __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1)); \
\
AssembledTest test = assemble(); \
- test.setDqwordTo(T0, V0); \
- test.setDqwordTo(T1, V1); \
+ test.setDqwordTo(T0, V0_##Ty); \
+ test.setDqwordTo(T1, V1_##Ty); \
test.run(); \
\
- ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString; \
+ ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString; \
reset(); \
} while (0)
#define TestImpl(Dst, Src) \
do { \
- TestImplXmmXmm(Dst, Src, punpckldq); \
- TestImplXmmAddr(Dst, punpckldq); \
+ TestImplXmmXmm(Dst, Src, punpckl, v4i32); \
+ TestImplXmmAddr(Dst, punpckl, v4i32); \
+ TestImplXmmXmm(Dst, Src, punpckl, v8i16); \
+ TestImplXmmAddr(Dst, punpckl, v8i16); \
+ TestImplXmmXmm(Dst, Src, punpckl, v16i8); \
+ TestImplXmmAddr(Dst, punpckl, v16i8); \
} while (0)
TestImpl(xmm0, xmm1);
@@ -1143,15 +1162,7 @@
TestImpl(xmm4, xmm5);
TestImpl(xmm5, xmm6);
TestImpl(xmm6, xmm7);
- TestImpl(xmm7, xmm8);
- TestImpl(xmm8, xmm9);
- TestImpl(xmm9, xmm10);
- TestImpl(xmm10, xmm11);
- TestImpl(xmm11, xmm12);
- TestImpl(xmm12, xmm13);
- TestImpl(xmm13, xmm14);
- TestImpl(xmm14, xmm15);
- TestImpl(xmm15, xmm0);
+ TestImpl(xmm7, xmm0);
#undef TestImpl
#undef TestImplXmmAddr