Optimize common vector shuffle patterns for ARM32.

Use VDUP for replicating a single element.
Use VZIP for interleaving vectors.
Use VMOV Dd, Dm for rearranging quadword vectors.

Bug: b/67106219
Change-Id: I0de1457454c1db6d467bf870288b7af7cb59ac09
Reviewed-on: https://chromium-review.googlesource.com/695004
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
Reviewed-on: https://swiftshader-review.googlesource.com/12968
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/third_party/subzero/src/IceTargetLoweringARM32.cpp b/third_party/subzero/src/IceTargetLoweringARM32.cpp
index 9856f7a..d820bca 100644
--- a/third_party/subzero/src/IceTargetLoweringARM32.cpp
+++ b/third_party/subzero/src/IceTargetLoweringARM32.cpp
@@ -5357,7 +5357,7 @@
Func->setError("Unexpected size for LoadSubVector");
return;
}
- _mov(Dest, T); // FIXME: necessary?
+ _mov(Dest, T);
return;
}
case Intrinsics::StoreSubVector: {
@@ -5975,8 +5975,121 @@
const Type DestTy = Dest->getType();
auto *T = makeReg(DestTy);
+ auto *Src0 = Instr->getSrc(0);
+ auto *Src1 = Instr->getSrc(1);
+ const SizeT NumElements = typeNumElements(DestTy);
+ const Type ElementType = typeElementType(DestTy);
+
+ bool Replicate = true;
+ for (SizeT I = 1; Replicate && I < Instr->getNumIndexes(); ++I) {
+ if (Instr->getIndexValue(I) != Instr->getIndexValue(0)) {
+ Replicate = false;
+ }
+ }
+
+ if (Replicate) {
+ Variable *Src0Var = legalizeToReg(Src0);
+ _vdup(T, Src0Var, Instr->getIndexValue(0));
+ _mov(Dest, T);
+ return;
+ }
switch (DestTy) {
+ case IceType_v8i1:
+ case IceType_v8i16: {
+ static constexpr SizeT ExpectedNumElements = 8;
+ assert(ExpectedNumElements == Instr->getNumIndexes());
+ (void)ExpectedNumElements;
+
+ if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ _vzip(T, Src0R, Src0R);
+ _mov(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ Variable *Src1R = legalizeToReg(Src1);
+ _vzip(T, Src0R, Src1R);
+ _mov(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(0, 2, 4, 6, 0, 2, 4, 6)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ _vqmovn2(T, Src0R, Src0R, false, false);
+ _mov(Dest, T);
+ return;
+ }
+ } break;
+ case IceType_v16i1:
+ case IceType_v16i8: {
+ static constexpr SizeT ExpectedNumElements = 16;
+ assert(ExpectedNumElements == Instr->getNumIndexes());
+ (void)ExpectedNumElements;
+
+ if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ _vzip(T, Src0R, Src0R);
+ _mov(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
+ 23)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ Variable *Src1R = legalizeToReg(Src1);
+ _vzip(T, Src0R, Src1R);
+ _mov(Dest, T);
+ return;
+ }
+ } break;
+ case IceType_v4i1:
+ case IceType_v4i32:
+ case IceType_v4f32: {
+ static constexpr SizeT ExpectedNumElements = 4;
+ assert(ExpectedNumElements == Instr->getNumIndexes());
+ (void)ExpectedNumElements;
+
+ if (Instr->indexesAre(0, 0, 1, 1)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ _vzip(T, Src0R, Src0R);
+ _mov(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(0, 4, 1, 5)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ Variable *Src1R = legalizeToReg(Src1);
+ _vzip(T, Src0R, Src1R);
+ _mov(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(0, 1, 4, 5)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ Variable *Src1R = legalizeToReg(Src1);
+ _vmovlh(T, Src0R, Src1R);
+ _mov(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(2, 3, 2, 3)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ _vmovhl(T, Src0R, Src0R);
+ _mov(Dest, T);
+ return;
+ }
+
+ if (Instr->indexesAre(2, 3, 6, 7)) {
+ Variable *Src0R = legalizeToReg(Src0);
+ Variable *Src1R = legalizeToReg(Src1);
+ _vmovhl(T, Src1R, Src0R);
+ _mov(Dest, T);
+ return;
+ }
+ } break;
default:
break;
// TODO(jpp): figure out how to properly lower this without scalarization.
@@ -5984,10 +6097,6 @@
// Unoptimized shuffle. Perform a series of inserts and extracts.
Context.insert<InstFakeDef>(T);
- auto *Src0 = Instr->getSrc(0);
- auto *Src1 = Instr->getSrc(1);
- const SizeT NumElements = typeNumElements(DestTy);
- const Type ElementType = typeElementType(DestTy);
for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
auto *Index = Instr->getIndex(I);
const SizeT Elem = Index->getValue();