Optimize shuffles corresponding to x86 punpckh instructions.
BUG=swiftshader:15
Change-Id: I04a7c4206f3936c604ec623e43834c2a153fd3cb
Reviewed-on: https://chromium-review.googlesource.com/399379
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
diff --git a/src/IceAssemblerX86Base.h b/src/IceAssemblerX86Base.h
index be0ff00..7ba22f2 100644
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -437,6 +437,8 @@
const Immediate &mask);
void punpckl(Type Ty, XmmRegister Dst, XmmRegister Src);
void punpckl(Type Ty, XmmRegister Dst, const Address &Src);
+ void punpckh(Type Ty, XmmRegister Dst, XmmRegister Src);
+ void punpckh(Type Ty, XmmRegister Dst, const Address &Src);
void packss(Type Ty, XmmRegister Dst, XmmRegister Src);
void packss(Type Ty, XmmRegister Dst, const Address &Src);
void packus(Type Ty, XmmRegister Dst, XmmRegister Src);
diff --git a/src/IceAssemblerX86BaseImpl.h b/src/IceAssemblerX86BaseImpl.h
index 11da946..adf9105 100644
--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -1628,6 +1628,45 @@
}
template <typename TraitsType>
+void AssemblerX86Base<TraitsType>::punpckh(Type Ty, XmmRegister Dst, // Encodes punpckh Dst, Src (XMM register source); opcode byte chosen by Ty.
+ XmmRegister Src) {
+ AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+ emitUint8(0x66); // 0x66 prefix: selects the 128-bit XMM form of the 0F 68..6A opcodes
+ emitRexRB(RexTypeIrrelevant, Dst, Src);
+ emitUint8(0x0F); // two-byte opcode escape
+ if (Ty == IceType_v4i32 || Ty == IceType_v4f32) {
+ emitUint8(0x6A); // 66 0F 6A: PUNPCKHDQ (32-bit elements)
+ } else if (Ty == IceType_v8i16) {
+ emitUint8(0x69); // 66 0F 69: PUNPCKHWD (16-bit elements)
+ } else if (Ty == IceType_v16i8) {
+ emitUint8(0x68); // 66 0F 68: PUNPCKHBW (8-bit elements)
+ } else {
+ assert(false && "Unexpected vector unpack operand type"); // NOTE: if assert() is compiled out, no opcode byte is emitted for other types
+ }
+ emitXmmRegisterOperand(Dst, Src);
+}
+
+template <typename TraitsType>
+void AssemblerX86Base<TraitsType>::punpckh(Type Ty, XmmRegister Dst, // Encodes punpckh Dst, [Src] (memory source); opcode byte chosen by Ty.
+ const Address &Src) {
+ AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+ emitUint8(0x66); // 0x66 prefix: selects the 128-bit XMM form of the 0F 68..6A opcodes
+ emitAddrSizeOverridePrefix();
+ emitRex(RexTypeIrrelevant, Src, Dst);
+ emitUint8(0x0F); // two-byte opcode escape
+ if (Ty == IceType_v4i32 || Ty == IceType_v4f32) {
+ emitUint8(0x6A); // 66 0F 6A: PUNPCKHDQ (32-bit elements)
+ } else if (Ty == IceType_v8i16) {
+ emitUint8(0x69); // 66 0F 69: PUNPCKHWD (16-bit elements)
+ } else if (Ty == IceType_v16i8) {
+ emitUint8(0x68); // 66 0F 68: PUNPCKHBW (8-bit elements)
+ } else {
+ assert(false && "Unexpected vector unpack operand type"); // NOTE: if assert() is compiled out, no opcode byte is emitted for other types
+ }
+ emitOperand(gprEncoding(Dst), Src);
+}
+
+template <typename TraitsType>
void AssemblerX86Base<TraitsType>::packss(Type Ty, XmmRegister Dst,
XmmRegister Src) {
AssemblerBuffer::EnsureCapacity ensured(&Buffer);
diff --git a/src/IceInst.h b/src/IceInst.h
index d038d53..26119ea 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -997,6 +997,37 @@
return Indexes[Pos];
}
+ inline bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, // 8-index overload: true iff getIndex(k)->getValue() == ik for every k in 0..7
+ int32_t i4, int32_t i5, int32_t i6, int32_t i7) const {
+ static constexpr SizeT ExpectedNumElements = 8;
+ assert(ExpectedNumElements == getNumIndexes()); // this overload is only valid on shuffles with exactly 8 indexes
+ (void)ExpectedNumElements; // keeps the constant referenced when assert() is compiled out
+
+ return getIndex(0)->getValue() == i0 && getIndex(1)->getValue() == i1 &&
+ getIndex(2)->getValue() == i2 && getIndex(3)->getValue() == i3 &&
+ getIndex(4)->getValue() == i4 && getIndex(5)->getValue() == i5 &&
+ getIndex(6)->getValue() == i6 && getIndex(7)->getValue() == i7;
+ }
+
+ inline bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, // 16-index overload: true iff getIndex(k)->getValue() == ik for every k in 0..15
+ int32_t i4, int32_t i5, int32_t i6, int32_t i7,
+ int32_t i8, int32_t i9, int32_t i10, int32_t i11,
+ int32_t i12, int32_t i13, int32_t i14,
+ int32_t i15) const {
+ static constexpr SizeT ExpectedNumElements = 16;
+ assert(ExpectedNumElements == getNumIndexes()); // this overload is only valid on shuffles with exactly 16 indexes
+ (void)ExpectedNumElements; // keeps the constant referenced when assert() is compiled out
+
+ return getIndex(0)->getValue() == i0 && getIndex(1)->getValue() == i1 &&
+ getIndex(2)->getValue() == i2 && getIndex(3)->getValue() == i3 &&
+ getIndex(4)->getValue() == i4 && getIndex(5)->getValue() == i5 &&
+ getIndex(6)->getValue() == i6 && getIndex(7)->getValue() == i7 &&
+ getIndex(8)->getValue() == i8 && getIndex(9)->getValue() == i9 &&
+ getIndex(10)->getValue() == i10 && getIndex(11)->getValue() == i11 &&
+ getIndex(12)->getValue() == i12 && getIndex(13)->getValue() == i13 &&
+ getIndex(14)->getValue() == i14 && getIndex(15)->getValue() == i15;
+ }
+
bool isMemoryWrite() const override { return false; }
void dump(const Cfg *Func) const override;
static bool classof(const Inst *Instr) {
diff --git a/src/IceInstX86Base.h b/src/IceInstX86Base.h
index 7e96de5..09025aa 100644
--- a/src/IceInstX86Base.h
+++ b/src/IceInstX86Base.h
@@ -145,6 +145,7 @@
Pshufb,
Pshufd,
Punpckl,
+ Punpckh,
Packss,
Packus,
Psll,
@@ -2932,6 +2933,22 @@
Source) {}
};
+ class InstX86Punpckh // Instruction class for the Punpckh kind; a binary XMM op using the Unpack SSE suffix.
+ : public InstX86BaseBinopXmm<InstX86Base::Punpckh, false,
+ InstX86Base::SseSuffix::Unpack> {
+ public:
+ static InstX86Punpckh *create(Cfg *Func, Variable *Dest, Operand *Source) { // factory: the only way to construct, since the constructor is private
+ return new (Func->allocate<InstX86Punpckh>()) // placement-new into storage obtained from Func->allocate
+ InstX86Punpckh(Func, Dest, Source);
+ }
+
+ private:
+ InstX86Punpckh(Cfg *Func, Variable *Dest, Operand *Source) // forwards all arguments to the InstX86BaseBinopXmm base
+ : InstX86BaseBinopXmm<InstX86Base::Punpckh, false,
+ InstX86Base::SseSuffix::Unpack>(Func, Dest,
+ Source) {}
+ };
+
class InstX86Packss
: public InstX86BaseBinopXmm<InstX86Base::Packss, false,
InstX86Base::SseSuffix::Pack> {
@@ -3089,6 +3106,7 @@
using Pshufb = typename InstImpl<TraitsType>::InstX86Pshufb;
using Punpckl = typename InstImpl<TraitsType>::InstX86Punpckl;
+ using Punpckh = typename InstImpl<TraitsType>::InstX86Punpckh;
using Packss = typename InstImpl<TraitsType>::InstX86Packss;
using Packus = typename InstImpl<TraitsType>::InstX86Packus;
};
@@ -3328,6 +3346,9 @@
const char *InstImpl<TraitsType>::InstX86Punpckl::Base::Opcode = "punpckl"; \
template <> \
template <> \
+ const char *InstImpl<TraitsType>::InstX86Punpckh::Base::Opcode = "punpckh"; \
+ template <> \
+ template <> \
const char *InstImpl<TraitsType>::InstX86Packss::Base::Opcode = "packss"; \
template <> \
template <> \
@@ -3708,6 +3729,12 @@
template <> \
template <> \
const InstImpl<TraitsType>::Assembler::XmmEmitterRegOp \
+ InstImpl<TraitsType>::InstX86Punpckh::Base::Emitter = { \
+ &InstImpl<TraitsType>::Assembler::punpckh, \
+ &InstImpl<TraitsType>::Assembler::punpckh}; \
+ template <> \
+ template <> \
+ const InstImpl<TraitsType>::Assembler::XmmEmitterRegOp \
InstImpl<TraitsType>::InstX86Packss::Base::Emitter = { \
&InstImpl<TraitsType>::Assembler::packss, \
&InstImpl<TraitsType>::Assembler::packss}; \
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 0f31dd4..7b1e730 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -815,6 +815,10 @@
AutoMemorySandboxer<> _(this, &Dest, &Src0);
Context.insert<typename Traits::Insts::Punpckl>(Dest, Src0);
}
+ void _punpckh(Variable *Dest, Operand *Src0) { // Lowering helper: inserts a Punpckh instruction (Dest, Src0) via Context.insert.
+ AutoMemorySandboxer<> _(this, &Dest, &Src0); // same guard object the sibling _punpckl/_packss helpers create before inserting
+ Context.insert<typename Traits::Insts::Punpckh>(Dest, Src0);
+ }
void _packss(Variable *Dest, Operand *Src0) {
AutoMemorySandboxer<> _(this, &Dest, &Src0);
Context.insert<typename Traits::Insts::Packss>(Dest, Src0);
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index eb0f67d..87bbfde 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -6002,6 +6002,53 @@
static constexpr SizeT ExpectedNumElements = 16;
assert(ExpectedNumElements == Instr->getNumIndexes());
(void)ExpectedNumElements;
+
+ if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) { // low 8 elements of Src0, each duplicated: punpckl of Src0 with itself
+ auto *T = makeReg(DestTy);
+ auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+ _movp(T, Src0RM);
+ _punpckl(T, Src0RM);
+ _movp(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
+ 23)) { // low 8 elements of Src0 interleaved with low 8 of Src1 (indexes 16..23): punpckl Src0, Src1
+ auto *T = makeReg(DestTy);
+ auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+ auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+ _movp(T, Src0RM);
+ _punpckl(T, Src1RM);
+ _movp(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
+ 15, 15)) { // high 8 elements of Src0, each duplicated: punpckh of Src0 with itself
+ auto *T = makeReg(DestTy);
+ auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+ _movp(T, Src0RM);
+ _punpckh(T, Src0RM);
+ _movp(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30,
+ 15, 31)) { // high 8 elements of Src0 interleaved with high 8 of Src1 (indexes 24..31): punpckh Src0, Src1
+ auto *T = makeReg(DestTy);
+ auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+ auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+ _movp(T, Src0RM);
+ _punpckh(T, Src1RM);
+ _movp(Dest, T);
+ return;
+ }
+
+ if (InstructionSet < Traits::SSE4_1) {
+ // TODO(jpp): figure out how to lower with sse2.
+ break;
+ }
+
const SizeT Index0 = Instr->getIndex(0)->getValue();
const SizeT Index1 = Instr->getIndex(1)->getValue();
const SizeT Index2 = Instr->getIndex(2)->getValue();
@@ -6019,23 +6066,6 @@
const SizeT Index14 = Instr->getIndex(14)->getValue();
const SizeT Index15 = Instr->getIndex(15)->getValue();
- if (Index0 == 0 && Index1 == 0 && Index2 == 1 && Index3 == 1 &&
- Index4 == 2 && Index5 == 2 && Index6 == 3 && Index7 == 3 &&
- Index8 == 4 && Index9 == 4 && Index10 == 5 && Index11 == 5 &&
- Index12 == 6 && Index13 == 6 && Index14 == 7 && Index15 == 7) {
- auto *T = makeReg(DestTy);
- auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
- _movp(T, Src0RM);
- _punpckl(T, Src0RM);
- _movp(Dest, T);
- return;
- }
-
- if (InstructionSet < Traits::SSE4_1) {
- // TODO(jpp): figure out how to lower with sse2.
- break;
- }
-
lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
Index3, Index4, Index5, Index6, Index7,
Index8, Index9, Index10, Index11, Index12,
@@ -6047,17 +6077,8 @@
static constexpr SizeT ExpectedNumElements = 8;
assert(ExpectedNumElements == Instr->getNumIndexes());
(void)ExpectedNumElements;
- const SizeT Index0 = Instr->getIndex(0)->getValue();
- const SizeT Index1 = Instr->getIndex(1)->getValue();
- const SizeT Index2 = Instr->getIndex(2)->getValue();
- const SizeT Index3 = Instr->getIndex(3)->getValue();
- const SizeT Index4 = Instr->getIndex(4)->getValue();
- const SizeT Index5 = Instr->getIndex(5)->getValue();
- const SizeT Index6 = Instr->getIndex(6)->getValue();
- const SizeT Index7 = Instr->getIndex(7)->getValue();
- if (Index0 == 0 && Index1 == 0 && Index2 == 1 && Index3 == 1 &&
- Index4 == 2 && Index5 == 2 && Index6 == 3 && Index7 == 3) {
+ if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
auto *T = makeReg(DestTy);
auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
_movp(T, Src0RM);
@@ -6066,11 +6087,49 @@
return;
}
+ if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) { // low 4 elements of Src0 interleaved with low 4 of Src1 (indexes 8..11): punpckl Src0, Src1
+ auto *T = makeReg(DestTy);
+ auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+ auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+ _movp(T, Src0RM); // T starts as a copy of Src0 (the first interleave operand)
+ _punpckl(T, Src1RM); // second operand must be Src1; passing Src0 would produce the 0,0,1,1,... pattern instead
+ _movp(Dest, T);
+ return;
+ }
+ }
+
+ if (Instr->indexesAre(4, 4, 5, 5, 6, 6, 7, 7)) { // high 4 elements of Src0, each duplicated: punpckh of Src0 with itself
+ auto *T = makeReg(DestTy);
+ auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+ _movp(T, Src0RM);
+ _punpckh(T, Src0RM);
+ _movp(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(4, 12, 5, 13, 6, 14, 7, 15)) { // high 4 elements of Src0 interleaved with high 4 of Src1 (indexes 12..15): punpckh Src0, Src1
+ auto *T = makeReg(DestTy);
+ auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+ auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+ _movp(T, Src0RM);
+ _punpckh(T, Src1RM);
+ _movp(Dest, T);
+ return;
+ }
+
if (InstructionSet < Traits::SSE4_1) {
// TODO(jpp): figure out how to lower with sse2.
break;
}
+ const SizeT Index0 = Instr->getIndex(0)->getValue();
+ const SizeT Index1 = Instr->getIndex(1)->getValue();
+ const SizeT Index2 = Instr->getIndex(2)->getValue();
+ const SizeT Index3 = Instr->getIndex(3)->getValue();
+ const SizeT Index4 = Instr->getIndex(4)->getValue();
+ const SizeT Index5 = Instr->getIndex(5)->getValue();
+ const SizeT Index6 = Instr->getIndex(6)->getValue();
+ const SizeT Index7 = Instr->getIndex(7)->getValue();
+
#define TO_BYTE_INDEX(I) ((I) << 1)
lowerShuffleVector_UsingPshufb(
Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,