Optimize common vector shuffle patterns for ARM32. Use VDUP for replicating a single element. Use VZIP for interleaving vectors. Use VMOV Dd, Dm for rearranging quadword vectors. Bug b/67106219 Change-Id: I0de1457454c1db6d467bf870288b7af7cb59ac09 Reviewed-on: https://chromium-review.googlesource.com/695004 Reviewed-by: Jim Stichnoth <stichnot@chromium.org> Reviewed-on: https://swiftshader-review.googlesource.com/12968 Reviewed-by: Nicolas Capens <nicolascapens@google.com> Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/third_party/subzero/src/IceAssemblerARM32.cpp b/third_party/subzero/src/IceAssemblerARM32.cpp index 502668c..4b1fcb9 100644 --- a/third_party/subzero/src/IceAssemblerARM32.cpp +++ b/third_party/subzero/src/IceAssemblerARM32.cpp
@@ -3418,6 +3418,97 @@
   emitSIMDBase(VpaddOpcode, Dd, Dd, Dd + 1, UseQRegs, IsFloatTy);
 }
 
+void AssemblerARM32::vdup(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
+                          IValueT Idx) {
+  // VDUP (scalar) - ARMv7-A/R section A8.6.302, encoding A1:
+  //   VDUP<c>.<size> <Qd>, <Dm[x]>
+  //
+  // 111100111D11iiiiddd011000QM0mmmm where Dddd=<Qd>, Mmmmm=<Dm>, and
+  // iiii=imm4 encodes <size> and [x].
+  constexpr const char *Vdup = "vdup";
+
+  const IValueT VdupOpcode = B25 | B24 | B23 | B21 | B20 | B11 | B10;
+
+  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vdup));
+  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vdup));
+
+  constexpr bool UseQRegs = true;
+  constexpr bool IsFloatTy = false;
+
+  IValueT Imm4 = 0;
+  bool Lower = true;
+  switch (ElmtTy) {
+  case IceType_i8:
+    assert(Idx < 16);
+    Lower = Idx < 8;
+    Imm4 = 1 | ((Idx & 0x7) << 1);
+    break;
+  case IceType_i16:
+    assert(Idx < 8);
+    Lower = Idx < 4;
+    Imm4 = 2 | ((Idx & 0x3) << 2);
+    break;
+  case IceType_i32:
+  case IceType_f32:
+    assert(Idx < 4);
+    Lower = Idx < 2;
+    Imm4 = 4 | ((Idx & 0x1) << 3);
+    break;
+  default:
+    assert(false && "vdup only supports 8, 16, and 32-bit elements");
+    break;
+  }
+
+  emitSIMDBase(VdupOpcode, Dd, Imm4, Dn + (Lower ? 0 : 1), UseQRegs, IsFloatTy);
+}
+
+void AssemblerARM32::vzip(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
+                          const Operand *OpQm) {
+  // Pseudo-instruction which interleaves the elements of the lower halves of
+  // two quadword registers.
+
+  // Vzip - ARMv7-A/R section A8.6.410, encoding A1:
+  //   VZIP<c>.<size> <Dd>, <Dm>
+  //
+  // 111100111D11ss10dddd00011QM0mmmm where Ddddd=<Dd>, Mmmmm=<Dm>, and
+  // ss=<size>
+  assert(ElmtTy != IceType_i64 && "vzip on i64 vector not allowed");
+
+  constexpr const char *Vzip = "vzip";
+  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vzip));
+  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vzip));
+  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vzip));
+
+  constexpr bool UseQRegs = false;
+  constexpr bool IsFloatTy = false;
+
+  // VMOV Dd, Dm
+  // 111100100D10mmmmdddd0001MQM1mmmm
+  constexpr IValueT VmovOpcode = B25 | B21 | B8 | B4;
+
+  // Copy lower half of second source to upper half of destination.
+  emitSIMDBase(VmovOpcode, Dd + 1, Dm, Dm, UseQRegs, IsFloatTy);
+
+  // Copy lower half of first source to lower half of destination.
+  if (Dd != Dn)
+    emitSIMDBase(VmovOpcode, Dd, Dn, Dn, UseQRegs, IsFloatTy);
+
+  constexpr IValueT ElmtShift = 18;
+  const IValueT ElmtSize = encodeElmtType(ElmtTy);
+  assert(Utils::IsUint(2, ElmtSize));
+
+  if (ElmtTy != IceType_i32 && ElmtTy != IceType_f32) {
+    constexpr IValueT VzipOpcode = B25 | B24 | B23 | B21 | B20 | B17 | B8 | B7;
+    // Zip the lower and upper half of destination.
+    emitSIMDBase(VzipOpcode | (ElmtSize << ElmtShift), Dd, 0, Dd + 1, UseQRegs,
+                 IsFloatTy);
+  } else {
+    constexpr IValueT VtrnOpcode = B25 | B24 | B23 | B21 | B20 | B17 | B7;
+    emitSIMDBase(VtrnOpcode | (ElmtSize << ElmtShift), Dd, 0, Dd + 1, UseQRegs,
+                 IsFloatTy);
+  }
+}
+
 void AssemblerARM32::vmulqf(const Operand *OpQd, const Operand *OpQn,
                             const Operand *OpQm) {
   // VMUL (floating-point) - ARM section A8.8.351, encoding A1:
@@ -3448,6 +3539,110 @@
                mapQRegToDReg(Qm), UseQRegs, IsFloat);
 }
 
+void AssemblerARM32::vmovlq(const Operand *OpQd, const Operand *OpQn,
+                            const Operand *OpQm) {
+  // Pseudo-instruction: Qd.lower = Qm.lower and Qd.upper = Qn.upper. A half
+  // that is already in place is not copied.
+
+  // VMOV (register) - ARMv7-A/R section A8.6.327, encoding A1:
+  //   VMOV<c> <Dd>, <Dm>
+  //
+  // 111100100D10mmmmdddd0001MQM1mmmm where Ddddd=<Dd>, Mmmmm=<Dm>, and Q=0.
+
+  constexpr const char *Vmov = "vmov";
+  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
+  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
+  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
+
+  constexpr bool UseQRegs = false;
+  constexpr bool IsFloat = false;
+
+  const IValueT VmovOpcode = B25 | B21 | B8 | B4;
+
+  if (Dd != Dm)
+    emitSIMDBase(VmovOpcode, Dd, Dm, Dm, UseQRegs, IsFloat);
+  if (Dd + 1 != Dn + 1)
+    emitSIMDBase(VmovOpcode, Dd + 1, Dn + 1, Dn + 1, UseQRegs, IsFloat);
+}
+
+void AssemblerARM32::vmovhq(const Operand *OpQd, const Operand *OpQn,
+                            const Operand *OpQm) {
+  // Pseudo-instruction: Qd.lower = Qn.lower and Qd.upper = Qm.upper. A half
+  // that is already in place is not copied.
+
+  // VMOV (register) - ARMv7-A/R section A8.6.327, encoding A1:
+  //   VMOV<c> <Dd>, <Dm>
+  //
+  // 111100100D10mmmmdddd0001MQM1mmmm where Ddddd=<Dd>, Mmmmm=<Dm>, and Q=0.
+
+  constexpr const char *Vmov = "vmov";
+  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
+  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
+  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
+
+  constexpr bool UseQRegs = false;
+  constexpr bool IsFloat = false;
+
+  const IValueT VmovOpcode = B25 | B21 | B8 | B4;
+
+  if (Dd != Dn)
+    emitSIMDBase(VmovOpcode, Dd, Dn, Dn, UseQRegs, IsFloat);
+  if (Dd + 1 != Dm + 1)
+    emitSIMDBase(VmovOpcode, Dd + 1, Dm + 1, Dm + 1, UseQRegs, IsFloat);
+}
+
+void AssemblerARM32::vmovhlq(const Operand *OpQd, const Operand *OpQn,
+                             const Operand *OpQm) {
+  // Pseudo-instruction: Qd.lower = Qm.upper and Qd.upper = Qn.upper. A half
+  // that is already in place is not copied.
+
+  // VMOV (register) - ARMv7-A/R section A8.6.327, encoding A1:
+  //   VMOV<c> <Dd>, <Dm>
+  //
+  // 111100100D10mmmmdddd0001MQM1mmmm where Ddddd=<Dd>, Mmmmm=<Dm>, and Q=0.
+
+  constexpr const char *Vmov = "vmov";
+  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
+  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
+  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
+
+  constexpr bool UseQRegs = false;
+  constexpr bool IsFloat = false;
+
+  const IValueT VmovOpcode = B25 | B21 | B8 | B4;
+
+  if (Dd != Dm + 1)
+    emitSIMDBase(VmovOpcode, Dd, Dm + 1, Dm + 1, UseQRegs, IsFloat);
+  if (Dd + 1 != Dn + 1)
+    emitSIMDBase(VmovOpcode, Dd + 1, Dn + 1, Dn + 1, UseQRegs, IsFloat);
+}
+
+void AssemblerARM32::vmovlhq(const Operand *OpQd, const Operand *OpQn,
+                             const Operand *OpQm) {
+  // Pseudo-instruction: Qd.lower = Qn.lower and Qd.upper = Qm.lower. A half
+  // that is already in place is not copied.
+
+  // VMOV (register) - ARMv7-A/R section A8.6.327, encoding A1:
+  //   VMOV<c> <Dd>, <Dm>
+  //
+  // 111100100D10mmmmdddd0001MQM1mmmm where Ddddd=<Dd>, Mmmmm=<Dm>, and Q=0.
+
+  constexpr const char *Vmov = "vmov";
+  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
+  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
+  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
+
+  constexpr bool UseQRegs = false;
+  constexpr bool IsFloat = false;
+
+  const IValueT VmovOpcode = B25 | B21 | B8 | B4;
+
+  if (Dd + 1 != Dm)
+    emitSIMDBase(VmovOpcode, Dd + 1, Dm, Dm, UseQRegs, IsFloat);
+  if (Dd != Dn)
+    emitSIMDBase(VmovOpcode, Dd, Dn, Dn, UseQRegs, IsFloat);
+}
+
 void AssemblerARM32::vnegqs(Type ElmtTy, const Operand *OpQd,
                             const Operand *OpQm) {
   // VNEG - ARM section A8.8.355, encoding A1:
diff --git a/third_party/subzero/src/IceAssemblerARM32.h b/third_party/subzero/src/IceAssemblerARM32.h index 1f80043..43c3f56 100644 --- a/third_party/subzero/src/IceAssemblerARM32.h +++ b/third_party/subzero/src/IceAssemblerARM32.h
@@ -546,6 +546,13 @@ void vmlap(Type ElmtTy, const Operand *OpQd, const Operand *OpQn, const Operand *OpQm); + // Vector element replication. + void vdup(Type ElmtTy, const Operand *OpQd, const Operand *OpQn, IValueT Idx); + + // Vector interleave lower halves. + void vzip(Type ElmtTy, const Operand *OpQd, const Operand *OpQn, + const Operand *OpQm); + // Float vector multiply. void vmulqf(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm); @@ -554,6 +561,11 @@ void vmvnq(const Operand *OpQd, const Operand *OpQm); + void vmovlq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm); + void vmovhq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm); + void vmovhlq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm); + void vmovlhq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm); + void vnegqs(const Operand *OpQd, const Operand *OpQm); void vnegqs(Type ElmtTy, const Operand *OpQd, const Operand *OpQm);
diff --git a/third_party/subzero/src/IceInst.h b/third_party/subzero/src/IceInst.h index 889ead5..187c16d 100644 --- a/third_party/subzero/src/IceInst.h +++ b/third_party/subzero/src/IceInst.h
@@ -997,35 +997,45 @@ return Indexes[Pos]; } - inline bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, - int32_t i4, int32_t i5, int32_t i6, int32_t i7) const { + int32_t getIndexValue(SizeT Pos) const { return getIndex(Pos)->getValue(); } + + bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3) const { + static constexpr SizeT ExpectedNumElements = 4; + assert(ExpectedNumElements == getNumIndexes()); + (void)ExpectedNumElements; + + return getIndexValue(0) == i0 && getIndexValue(1) == i1 && + getIndexValue(2) == i2 && getIndexValue(3) == i3; + } + + bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, + int32_t i5, int32_t i6, int32_t i7) const { static constexpr SizeT ExpectedNumElements = 8; assert(ExpectedNumElements == getNumIndexes()); (void)ExpectedNumElements; - return getIndex(0)->getValue() == i0 && getIndex(1)->getValue() == i1 && - getIndex(2)->getValue() == i2 && getIndex(3)->getValue() == i3 && - getIndex(4)->getValue() == i4 && getIndex(5)->getValue() == i5 && - getIndex(6)->getValue() == i6 && getIndex(7)->getValue() == i7; + return getIndexValue(0) == i0 && getIndexValue(1) == i1 && + getIndexValue(2) == i2 && getIndexValue(3) == i3 && + getIndexValue(4) == i4 && getIndexValue(5) == i5 && + getIndexValue(6) == i6 && getIndexValue(7) == i7; } - inline bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, - int32_t i4, int32_t i5, int32_t i6, int32_t i7, - int32_t i8, int32_t i9, int32_t i10, int32_t i11, - int32_t i12, int32_t i13, int32_t i14, - int32_t i15) const { + bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, + int32_t i5, int32_t i6, int32_t i7, int32_t i8, int32_t i9, + int32_t i10, int32_t i11, int32_t i12, int32_t i13, + int32_t i14, int32_t i15) const { static constexpr SizeT ExpectedNumElements = 16; assert(ExpectedNumElements == getNumIndexes()); (void)ExpectedNumElements; - return getIndex(0)->getValue() == i0 && getIndex(1)->getValue() == i1 && - 
getIndex(2)->getValue() == i2 && getIndex(3)->getValue() == i3 && - getIndex(4)->getValue() == i4 && getIndex(5)->getValue() == i5 && - getIndex(6)->getValue() == i6 && getIndex(7)->getValue() == i7 && - getIndex(8)->getValue() == i8 && getIndex(9)->getValue() == i9 && - getIndex(10)->getValue() == i10 && getIndex(11)->getValue() == i11 && - getIndex(12)->getValue() == i12 && getIndex(13)->getValue() == i13 && - getIndex(14)->getValue() == i14 && getIndex(15)->getValue() == i15; + return getIndexValue(0) == i0 && getIndexValue(1) == i1 && + getIndexValue(2) == i2 && getIndexValue(3) == i3 && + getIndexValue(4) == i4 && getIndexValue(5) == i5 && + getIndexValue(6) == i6 && getIndexValue(7) == i7 && + getIndexValue(8) == i8 && getIndexValue(9) == i9 && + getIndexValue(10) == i10 && getIndexValue(11) == i11 && + getIndexValue(12) == i12 && getIndexValue(13) == i13 && + getIndexValue(14) == i14 && getIndexValue(15) == i15; } bool isMemoryWrite() const override { return false; }
diff --git a/third_party/subzero/src/IceInstARM32.cpp b/third_party/subzero/src/IceInstARM32.cpp index 2f12b85..646730f 100644 --- a/third_party/subzero/src/IceInstARM32.cpp +++ b/third_party/subzero/src/IceInstARM32.cpp
@@ -903,6 +903,82 @@ } } +template <> void InstARM32Vmovl::emitIAS(const Cfg *Func) const { + auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>(); + const Variable *Dest = getDest(); + switch (Dest->getType()) { + default: + llvm::report_fatal_error("Vmovlq not defined on type " + + typeStdString(Dest->getType())); + case IceType_v4i1: + case IceType_v8i1: + case IceType_v16i1: + case IceType_v16i8: + case IceType_v8i16: + case IceType_v4i32: + case IceType_v4f32: { + Asm->vmovlq(Dest, getSrc(0), getSrc(1)); + } break; + } +} + +template <> void InstARM32Vmovh::emitIAS(const Cfg *Func) const { + auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>(); + const Variable *Dest = getDest(); + switch (Dest->getType()) { + default: + llvm::report_fatal_error("Vmovhq not defined on type " + + typeStdString(Dest->getType())); + case IceType_v4i1: + case IceType_v8i1: + case IceType_v16i1: + case IceType_v16i8: + case IceType_v8i16: + case IceType_v4i32: + case IceType_v4f32: { + Asm->vmovhq(Dest, getSrc(0), getSrc(1)); + } break; + } +} + +template <> void InstARM32Vmovhl::emitIAS(const Cfg *Func) const { + auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>(); + const Variable *Dest = getDest(); + switch (Dest->getType()) { + default: + llvm::report_fatal_error("Vmovhlq not defined on type " + + typeStdString(Dest->getType())); + case IceType_v4i1: + case IceType_v8i1: + case IceType_v16i1: + case IceType_v16i8: + case IceType_v8i16: + case IceType_v4i32: + case IceType_v4f32: { + Asm->vmovhlq(Dest, getSrc(0), getSrc(1)); + } break; + } +} + +template <> void InstARM32Vmovlh::emitIAS(const Cfg *Func) const { + auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>(); + const Variable *Dest = getDest(); + switch (Dest->getType()) { + default: + llvm::report_fatal_error("Vmovlhq not defined on type " + + typeStdString(Dest->getType())); + case IceType_v4i1: + case IceType_v8i1: + case IceType_v16i1: + case IceType_v16i8: + case IceType_v8i16: + case IceType_v4i32: + 
case IceType_v4f32: { + Asm->vmovlhq(Dest, getSrc(0), getSrc(1)); + } break; + } +} + template <> void InstARM32Vneg::emitIAS(const Cfg *Func) const { auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>(); const Variable *Dest = getDest(); @@ -1168,6 +1244,15 @@ assert(!Asm->needsTextFixup()); } +template <> void InstARM32Vzip::emitIAS(const Cfg *Func) const { + auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>(); + const Operand *Src0 = getSrc(0); + const Operand *Src1 = getSrc(1); + Type DestTy = Dest->getType(); + Asm->vzip(typeElementType(DestTy), Dest, Src0, Src1); + assert(!Asm->needsTextFixup()); +} + template <> void InstARM32Vmul::emitIAS(const Cfg *Func) const { auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>(); const Variable *Dest = getDest(); @@ -1425,6 +1510,12 @@ this->Size = Size; } +InstARM32Vdup::InstARM32Vdup(Cfg *Func, Variable *Dest, Variable *Src, + IValueT Idx) + : InstARM32Pred(Func, InstARM32::Vdup, 1, Dest, CondARM32::AL), Idx(Idx) { + addSource(Src); +} + InstARM32Trap::InstARM32Trap(Cfg *Func) : InstARM32(Func, InstARM32::Trap, 0, nullptr) {} @@ -1775,6 +1866,10 @@ template <> const char *InstARM32Vmls::Opcode = "vmls"; template <> const char *InstARM32Vmul::Opcode = "vmul"; template <> const char *InstARM32Vmvn::Opcode = "vmvn"; +template <> const char *InstARM32Vmovl::Opcode = "vmovl"; +template <> const char *InstARM32Vmovh::Opcode = "vmovh"; +template <> const char *InstARM32Vmovhl::Opcode = "vmovhl"; +template <> const char *InstARM32Vmovlh::Opcode = "vmovlh"; template <> const char *InstARM32Vorr::Opcode = "vorr"; template <> const char *InstARM32UnaryopFP<InstARM32::Vneg>::Opcode = "vneg"; template <> const char *InstARM32ThreeAddrFP<InstARM32::Vshl>::Opcode = "vshl"; @@ -1790,6 +1885,7 @@ const char *InstARM32ThreeAddrFP<InstARM32::Vmulh>::Opcode = "vmulh"; template <> const char *InstARM32ThreeAddrFP<InstARM32::Vmlap>::Opcode = "vmlap"; +template <> const char *InstARM32ThreeAddrFP<InstARM32::Vzip>::Opcode = "vzip"; 
// Four-addr ops
 template <> const char *InstARM32Mla::Opcode = "mla";
 template <> const char *InstARM32Mls::Opcode = "mls";
@@ -2805,6 +2901,50 @@
   getSrc(0)->dump(Func);
 }
 
+/// Emits the textual assembly for vdup: destination register, source
+/// register, then the index of the source lane being replicated.
+void InstARM32Vdup::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  // The constructor registers a single source operand; the destination is
+  // reached through getDest(), not through the source list.
+  assert(getSrcSize() == 1);
+  Type Ty = getDest()->getType();
+  const char *Opcode = "vdup";
+  Str << "\t" << Opcode;
+  Str << getPredicate() << "." << getWidthString(Ty) << getVecElmtBitsize(Ty);
+  Str << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  getSrc(0)->emit(Func);
+  Str << ", " << Idx;
+}
+
+/// Encodes vdup through the integrated assembler, passing the element type of
+/// the destination, the destination, the single source, and the lane index.
+void InstARM32Vdup::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 1);
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Operand *Dest = getDest();
+  const Operand *Src = getSrc(0);
+  Type DestTy = Dest->getType();
+  Asm->vdup(typeElementType(DestTy), Dest, Src, Idx);
+}
+
+/// Dumps the destination, the predicated opcode, the sources and the lane index.
+void InstARM32Vdup::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  dumpDest(Func);
+  Str << " = ";
+  dumpOpcodePred(Str, "vdup", getDest()->getType());
+  Str << " ";
+  dumpSources(Func);
+  Str << ", " << Idx;
+}
+
 void InstARM32Trap::emit(const Cfg *Func) const {
   if (!BuildDefs::dump())
     return;
@@ -3386,6 +3526,7 @@
 template class InstARM32LoadBase<InstARM32::Ldrex>;
 template class InstARM32LoadBase<InstARM32::Vldr1d>;
 template class InstARM32LoadBase<InstARM32::Vldr1q>;
+template class InstARM32ThreeAddrFP<InstARM32::Vzip>;
 
 template class InstARM32TwoAddrGPR<InstARM32::Movt>;
 template class InstARM32UnaryopGPR<InstARM32::Movw, false>;
diff --git a/third_party/subzero/src/IceInstARM32.h b/third_party/subzero/src/IceInstARM32.h index 593d96d..e31aabc 100644 --- a/third_party/subzero/src/IceInstARM32.h +++ b/third_party/subzero/src/IceInstARM32.h
@@ -434,12 +434,17 @@
     Vcmp,
     Vcvt,
     Vdiv,
+    Vdup,
     Veor,
     Vldr1d,
     Vldr1q,
     Vmla,
     Vmlap,
     Vmls,
+    Vmovl,
+    Vmovh,
+    Vmovhl,
+    Vmovlh,
     Vmrs,
     Vmul,
     Vmulh,
@@ -453,7 +458,8 @@
     Vshr,
     Vsqrt,
     Vstr1,
-    Vsub
+    Vsub,
+    Vzip
   };
 
   static constexpr size_t InstSize = sizeof(uint32_t);
@@ -1020,6 +1026,10 @@
 using InstARM32Veor = InstARM32ThreeAddrFP<InstARM32::Veor>;
 using InstARM32Vmla = InstARM32FourAddrFP<InstARM32::Vmla>;
 using InstARM32Vmls = InstARM32FourAddrFP<InstARM32::Vmls>;
+using InstARM32Vmovl = InstARM32ThreeAddrFP<InstARM32::Vmovl>;
+using InstARM32Vmovh = InstARM32ThreeAddrFP<InstARM32::Vmovh>;
+using InstARM32Vmovhl = InstARM32ThreeAddrFP<InstARM32::Vmovhl>;
+using InstARM32Vmovlh = InstARM32ThreeAddrFP<InstARM32::Vmovlh>;
 using InstARM32Vmul = InstARM32ThreeAddrFP<InstARM32::Vmul>;
 using InstARM32Vmvn = InstARM32UnaryopFP<InstARM32::Vmvn>;
 using InstARM32Vneg = InstARM32UnaryopSignAwareFP<InstARM32::Vneg>;
@@ -1036,6 +1046,7 @@
 using InstARM32Ldrex = InstARM32LoadBase<InstARM32::Ldrex>;
 using InstARM32Vldr1d = InstARM32LoadBase<InstARM32::Vldr1d>;
 using InstARM32Vldr1q = InstARM32LoadBase<InstARM32::Vldr1q>;
+using InstARM32Vzip = InstARM32ThreeAddrFP<InstARM32::Vzip>;
 /// MovT leaves the bottom bits alone so dest is also a source. This helps
 /// indicate that a previous MovW setting dest is not dead code.
 using InstARM32Movt = InstARM32TwoAddrGPR<InstARM32::Movt>;
@@ -1374,6 +1385,30 @@
   SizeT Size;
 };
 
+/// Vector element replication: one source vector plus the lane index Idx.
+class InstARM32Vdup final : public InstARM32Pred {
+  InstARM32Vdup() = delete;
+  InstARM32Vdup(const InstARM32Vdup &) = delete;
+  InstARM32Vdup &operator=(const InstARM32Vdup &) = delete;
+
+public:
+  /// Src must be a register; Idx is the lane index handed to the assembler.
+  static InstARM32Vdup *create(Cfg *Func, Variable *Dest, Variable *Src,
+                               IValueT Idx) {
+    return new (Func->allocate<InstARM32Vdup>())
+        InstARM32Vdup(Func, Dest, Src, Idx);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) { return isClassof(Instr, Vdup); }
+
+private:
+  InstARM32Vdup(Cfg *Func, Variable *Dest, Variable *Src, IValueT Idx);
+
+  const IValueT Idx;
+};
+
 class InstARM32Trap : public InstARM32 {
   InstARM32Trap() = delete;
   InstARM32Trap(const InstARM32Trap &) = delete;
diff --git a/third_party/subzero/src/IceTargetLoweringARM32.cpp b/third_party/subzero/src/IceTargetLoweringARM32.cpp index 9856f7a..d820bca 100644 --- a/third_party/subzero/src/IceTargetLoweringARM32.cpp +++ b/third_party/subzero/src/IceTargetLoweringARM32.cpp
@@ -5357,7 +5357,7 @@
       Func->setError("Unexpected size for LoadSubVector");
       return;
     }
-    _mov(Dest, T); // FIXME: necessary?
+    _mov(Dest, T);
     return;
   }
   case Intrinsics::StoreSubVector: {
@@ -5975,8 +5975,129 @@
   const Type DestTy = Dest->getType();
 
   auto *T = makeReg(DestTy);
+  auto *Src0 = Instr->getSrc(0);
+  auto *Src1 = Instr->getSrc(1);
+  const SizeT NumElements = typeNumElements(DestTy);
+  const Type ElementType = typeElementType(DestTy);
+
+  // A shuffle whose indexes are all equal broadcasts one element: use VDUP.
+  bool Replicate = true;
+  for (SizeT I = 1; Replicate && I < Instr->getNumIndexes(); ++I) {
+    if (Instr->getIndexValue(I) != Instr->getIndexValue(0)) {
+      Replicate = false;
+    }
+  }
+
+  if (Replicate) {
+    // Indexes below NumElements select from Src0, the others from Src1, so
+    // pick the matching source and rebase the index to a lane of that source.
+    const SizeT Index = Instr->getIndexValue(0);
+    const bool FromSrc0 = Index < NumElements;
+    Variable *SrcVar = legalizeToReg(FromSrc0 ? Src0 : Src1);
+    _vdup(T, SrcVar, FromSrc0 ? Index : Index - NumElements);
+    _mov(Dest, T);
+    return;
+  }
 
   switch (DestTy) {
+  case IceType_v8i1:
+  case IceType_v8i16: {
+    static constexpr SizeT ExpectedNumElements = 8;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    (void)ExpectedNumElements;
+
+    if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      _vzip(T, Src0R, Src0R);
+      _mov(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *Src1R = legalizeToReg(Src1);
+      _vzip(T, Src0R, Src1R);
+      _mov(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 2, 4, 6, 0, 2, 4, 6)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      _vqmovn2(T, Src0R, Src0R, false, false);
+      _mov(Dest, T);
+      return;
+    }
+  } break;
+  case IceType_v16i1:
+  case IceType_v16i8: {
+    static constexpr SizeT ExpectedNumElements = 16;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    (void)ExpectedNumElements;
+
+    if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      _vzip(T, Src0R, Src0R);
+      _mov(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
+                          23)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *Src1R = legalizeToReg(Src1);
+      _vzip(T, Src0R, Src1R);
+      _mov(Dest, T);
+      return;
+    }
+  } break;
+  case IceType_v4i1:
+  case IceType_v4i32:
+  case IceType_v4f32: {
+    static constexpr SizeT ExpectedNumElements = 4;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    (void)ExpectedNumElements;
+
+    if (Instr->indexesAre(0, 0, 1, 1)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      _vzip(T, Src0R, Src0R);
+      _mov(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 4, 1, 5)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *Src1R = legalizeToReg(Src1);
+      _vzip(T, Src0R, Src1R);
+      _mov(Dest, T);
+      return;
+    }
+
+    // Lower half of Src0 followed by the lower half of Src1.
+    if (Instr->indexesAre(0, 1, 4, 5)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *Src1R = legalizeToReg(Src1);
+      _vmovlh(T, Src0R, Src1R);
+      _mov(Dest, T);
+      return;
+    }
+
+    // Upper half of Src0 in both halves of the result.
+    if (Instr->indexesAre(2, 3, 2, 3)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      _vmovhl(T, Src0R, Src0R);
+      _mov(Dest, T);
+      return;
+    }
+
+    // Upper half of Src0 followed by the upper half of Src1.
+    if (Instr->indexesAre(2, 3, 6, 7)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *Src1R = legalizeToReg(Src1);
+      _vmovhl(T, Src1R, Src0R);
+      _mov(Dest, T);
+      return;
+    }
+  } break;
   default:
     break;
     // TODO(jpp): figure out how to properly lower this without scalarization.
@@ -5984,10 +6105,6 @@
 
   // Unoptimized shuffle. Perform a series of inserts and extracts.
   Context.insert<InstFakeDef>(T);
-  auto *Src0 = Instr->getSrc(0);
-  auto *Src1 = Instr->getSrc(1);
-  const SizeT NumElements = typeNumElements(DestTy);
-  const Type ElementType = typeElementType(DestTy);
   for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
     auto *Index = Instr->getIndex(I);
     const SizeT Elem = Index->getValue();
diff --git a/third_party/subzero/src/IceTargetLoweringARM32.h b/third_party/subzero/src/IceTargetLoweringARM32.h index a82337a..a629627 100644 --- a/third_party/subzero/src/IceTargetLoweringARM32.h +++ b/third_party/subzero/src/IceTargetLoweringARM32.h
@@ -885,6 +885,9 @@ CondARM32::Cond Pred = CondARM32::AL) { Context.insert<InstARM32Vcmp>(Src0, FpZero, Pred); } + void _vdup(Variable *Dest, Variable *Src, int Idx) { + Context.insert<InstARM32Vdup>(Dest, Src, Idx); + } void _veor(Variable *Dest, Variable *Src0, Variable *Src1) { Context.insert<InstARM32Veor>(Dest, Src0, Src1); } @@ -908,6 +911,18 @@ void _vmls(Variable *Dest, Variable *Src0, Variable *Src1) { Context.insert<InstARM32Vmls>(Dest, Src0, Src1); } + void _vmovl(Variable *Dest, Variable *Src0, Variable *Src1) { + Context.insert<InstARM32Vmovl>(Dest, Src0, Src1); + } + void _vmovh(Variable *Dest, Variable *Src0, Variable *Src1) { + Context.insert<InstARM32Vmovh>(Dest, Src0, Src1); + } + void _vmovhl(Variable *Dest, Variable *Src0, Variable *Src1) { + Context.insert<InstARM32Vmovhl>(Dest, Src0, Src1); + } + void _vmovlh(Variable *Dest, Variable *Src0, Variable *Src1) { + Context.insert<InstARM32Vmovlh>(Dest, Src0, Src1); + } void _vmul(Variable *Dest, Variable *Src0, Variable *Src1) { Context.insert<InstARM32Vmul>(Dest, Src0, Src1); } @@ -966,6 +981,9 @@ void _vsub(Variable *Dest, Variable *Src0, Variable *Src1) { Context.insert<InstARM32Vsub>(Dest, Src0, Src1); } + void _vzip(Variable *Dest, Variable *Src0, Variable *Src1) { + Context.insert<InstARM32Vzip>(Dest, Src0, Src1); + } // Iterates over the CFG and determines the maximum outgoing stack arguments // bytes. This information is later used during addProlog() to pre-allocate
diff --git a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h index c5eac33..f2fd83e 100644 --- a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h +++ b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
@@ -6304,22 +6304,22 @@ break; } - const SizeT Index0 = Instr->getIndex(0)->getValue(); - const SizeT Index1 = Instr->getIndex(1)->getValue(); - const SizeT Index2 = Instr->getIndex(2)->getValue(); - const SizeT Index3 = Instr->getIndex(3)->getValue(); - const SizeT Index4 = Instr->getIndex(4)->getValue(); - const SizeT Index5 = Instr->getIndex(5)->getValue(); - const SizeT Index6 = Instr->getIndex(6)->getValue(); - const SizeT Index7 = Instr->getIndex(7)->getValue(); - const SizeT Index8 = Instr->getIndex(8)->getValue(); - const SizeT Index9 = Instr->getIndex(9)->getValue(); - const SizeT Index10 = Instr->getIndex(10)->getValue(); - const SizeT Index11 = Instr->getIndex(11)->getValue(); - const SizeT Index12 = Instr->getIndex(12)->getValue(); - const SizeT Index13 = Instr->getIndex(13)->getValue(); - const SizeT Index14 = Instr->getIndex(14)->getValue(); - const SizeT Index15 = Instr->getIndex(15)->getValue(); + const SizeT Index0 = Instr->getIndexValue(0); + const SizeT Index1 = Instr->getIndexValue(1); + const SizeT Index2 = Instr->getIndexValue(2); + const SizeT Index3 = Instr->getIndexValue(3); + const SizeT Index4 = Instr->getIndexValue(4); + const SizeT Index5 = Instr->getIndexValue(5); + const SizeT Index6 = Instr->getIndexValue(6); + const SizeT Index7 = Instr->getIndexValue(7); + const SizeT Index8 = Instr->getIndexValue(8); + const SizeT Index9 = Instr->getIndexValue(9); + const SizeT Index10 = Instr->getIndexValue(10); + const SizeT Index11 = Instr->getIndexValue(11); + const SizeT Index12 = Instr->getIndexValue(12); + const SizeT Index13 = Instr->getIndexValue(13); + const SizeT Index14 = Instr->getIndexValue(14); + const SizeT Index15 = Instr->getIndexValue(15); lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2, Index3, Index4, Index5, Index6, Index7, @@ -6376,14 +6376,14 @@ break; } - const SizeT Index0 = Instr->getIndex(0)->getValue(); - const SizeT Index1 = Instr->getIndex(1)->getValue(); - const SizeT Index2 = 
Instr->getIndex(2)->getValue(); - const SizeT Index3 = Instr->getIndex(3)->getValue(); - const SizeT Index4 = Instr->getIndex(4)->getValue(); - const SizeT Index5 = Instr->getIndex(5)->getValue(); - const SizeT Index6 = Instr->getIndex(6)->getValue(); - const SizeT Index7 = Instr->getIndex(7)->getValue(); + const SizeT Index0 = Instr->getIndexValue(0); + const SizeT Index1 = Instr->getIndexValue(1); + const SizeT Index2 = Instr->getIndexValue(2); + const SizeT Index3 = Instr->getIndexValue(3); + const SizeT Index4 = Instr->getIndexValue(4); + const SizeT Index5 = Instr->getIndexValue(5); + const SizeT Index6 = Instr->getIndexValue(6); + const SizeT Index7 = Instr->getIndexValue(7); #define TO_BYTE_INDEX(I) ((I) << 1) lowerShuffleVector_UsingPshufb( @@ -6403,10 +6403,10 @@ case IceType_v4f32: { static constexpr SizeT ExpectedNumElements = 4; assert(ExpectedNumElements == Instr->getNumIndexes()); - const SizeT Index0 = Instr->getIndex(0)->getValue(); - const SizeT Index1 = Instr->getIndex(1)->getValue(); - const SizeT Index2 = Instr->getIndex(2)->getValue(); - const SizeT Index3 = Instr->getIndex(3)->getValue(); + const SizeT Index0 = Instr->getIndexValue(0); + const SizeT Index1 = Instr->getIndexValue(1); + const SizeT Index2 = Instr->getIndexValue(2); + const SizeT Index3 = Instr->getIndexValue(3); Variable *T = nullptr; switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) { #define CASE_SRCS_IN(S0, S1, S2, S3) \ @@ -6611,8 +6611,7 @@ InstExtractElement::create(Func, ExtElmt, Src0, Index)); } else { lowerExtractElement(InstExtractElement::create( - Func, ExtElmt, Src1, - Ctx->getConstantInt32(Index->getValue() - NumElements))); + Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements))); } auto *NewT = makeReg(DestTy); lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,