Optimize common vector shuffle patterns for ARM32.

Use VDUP for replicating a single element.
Use VZIP for interleaving vectors.
Use VMOV Dd, Dm for rearranging the 64-bit halves of quadword vectors.

Bug b/67106219

Change-Id: I0de1457454c1db6d467bf870288b7af7cb59ac09
Reviewed-on: https://chromium-review.googlesource.com/695004
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
Reviewed-on: https://swiftshader-review.googlesource.com/12968
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/third_party/subzero/src/IceTargetLoweringARM32.cpp b/third_party/subzero/src/IceTargetLoweringARM32.cpp
index 9856f7a..d820bca 100644
--- a/third_party/subzero/src/IceTargetLoweringARM32.cpp
+++ b/third_party/subzero/src/IceTargetLoweringARM32.cpp
@@ -5357,7 +5357,7 @@
       Func->setError("Unexpected size for LoadSubVector");
       return;
     }
-    _mov(Dest, T); // FIXME: necessary?
+    _mov(Dest, T);
     return;
   }
   case Intrinsics::StoreSubVector: {
@@ -5975,8 +5975,121 @@
   const Type DestTy = Dest->getType();
 
   auto *T = makeReg(DestTy);
+  auto *Src0 = Instr->getSrc(0);
+  auto *Src1 = Instr->getSrc(1);
+  const SizeT NumElements = typeNumElements(DestTy);
+  const Type ElementType = typeElementType(DestTy);
+
+  bool Replicate = true;
+  for (SizeT I = 1; Replicate && I < Instr->getNumIndexes(); ++I) {
+    if (Instr->getIndexValue(I) != Instr->getIndexValue(0)) {
+      Replicate = false;
+    }
+  }
+
+  if (Replicate) {
+    Variable *Src0Var = legalizeToReg(Src0);
+    _vdup(T, Src0Var, Instr->getIndexValue(0));
+    _mov(Dest, T);
+    return;
+  }
 
   switch (DestTy) {
+  case IceType_v8i1:
+  case IceType_v8i16: {
+    static constexpr SizeT ExpectedNumElements = 8;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    (void)ExpectedNumElements;
+
+    if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      _vzip(T, Src0R, Src0R);
+      _mov(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *Src1R = legalizeToReg(Src1);
+      _vzip(T, Src0R, Src1R);
+      _mov(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 2, 4, 6, 0, 2, 4, 6)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      _vqmovn2(T, Src0R, Src0R, false, false);
+      _mov(Dest, T);
+      return;
+    }
+  } break;
+  case IceType_v16i1:
+  case IceType_v16i8: {
+    static constexpr SizeT ExpectedNumElements = 16;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    (void)ExpectedNumElements;
+
+    if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      _vzip(T, Src0R, Src0R);
+      _mov(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
+                          23)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *Src1R = legalizeToReg(Src1);
+      _vzip(T, Src0R, Src1R);
+      _mov(Dest, T);
+      return;
+    }
+  } break;
+  case IceType_v4i1:
+  case IceType_v4i32:
+  case IceType_v4f32: {
+    static constexpr SizeT ExpectedNumElements = 4;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    (void)ExpectedNumElements;
+
+    if (Instr->indexesAre(0, 0, 1, 1)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      _vzip(T, Src0R, Src0R);
+      _mov(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 4, 1, 5)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *Src1R = legalizeToReg(Src1);
+      _vzip(T, Src0R, Src1R);
+      _mov(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(0, 1, 4, 5)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *Src1R = legalizeToReg(Src1);
+      _vmovlh(T, Src0R, Src1R);
+      _mov(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(2, 3, 2, 3)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      _vmovhl(T, Src0R, Src0R);
+      _mov(Dest, T);
+      return;
+    }
+
+    if (Instr->indexesAre(2, 3, 6, 7)) {
+      Variable *Src0R = legalizeToReg(Src0);
+      Variable *Src1R = legalizeToReg(Src1);
+      _vmovhl(T, Src1R, Src0R);
+      _mov(Dest, T);
+      return;
+    }
+  } break;
   default:
     break;
     // TODO(jpp): figure out how to properly lower this without scalarization.
@@ -5984,10 +6097,6 @@
 
   // Unoptimized shuffle. Perform a series of inserts and extracts.
   Context.insert<InstFakeDef>(T);
-  auto *Src0 = Instr->getSrc(0);
-  auto *Src1 = Instr->getSrc(1);
-  const SizeT NumElements = typeNumElements(DestTy);
-  const Type ElementType = typeElementType(DestTy);
   for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
     auto *Index = Instr->getIndex(I);
     const SizeT Elem = Index->getValue();