Subzero. ARM32. Implements vector select.

Also piggy-backs necro-comments from cl 1878943009.

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4076
R=kschimpf@google.com

Review URL: https://codereview.chromium.org/1886263004 .
diff --git a/src/DartARM32/assembler_arm.h b/src/DartARM32/assembler_arm.h
index 4d205c7..c6e53df 100644
--- a/src/DartARM32/assembler_arm.h
+++ b/src/DartARM32/assembler_arm.h
@@ -1406,9 +1406,13 @@
   // ARM32::AssemblerARM32::uxt() (uxtb and uxth)
   // ARM32::AssemblerARM32::vpop()
   // ARM32::AssemblerARM32::vpush()
-  // ARM32::AssemblerARM32:rbit()
+  // ARM32::AssemblerARM32::rbit()
+  // ARM32::AssemblerARM32::vbslq()
   // ARM32::AssemblerARM32::veord()
   // ARM32::AssemblerARM32::vld1qr()
+  // ARM32::AssemblerARM32::vshlqc()
+  // ARM32::AssemblerARM32::vshrqic()
+  // ARM32::AssemblerARM32::vshrquc()
   // ARM32::AssemblerARM32::vst1qr()
   // ARM32::AssemblerARM32::vmorqi()
   // ARM32::AssemblerARM32::vmovqc()
diff --git a/src/IceAssemblerARM32.cpp b/src/IceAssemblerARM32.cpp
index d21bd1b..71e5c6a 100644
--- a/src/IceAssemblerARM32.cpp
+++ b/src/IceAssemblerARM32.cpp
@@ -2410,6 +2410,18 @@
   emitSIMDqqq(VandqOpcode, ElmtTy, OpQd, OpQm, OpQn, Vandq);
 }
 
+void AssemblerARM32::vbslq(const Operand *OpQd, const Operand *OpQm,
+                           const Operand *OpQn) {
+  // VBSL (register) - ARM section A8.8.290, encoding A1:
+  //   vbsl <Qd>, <Qn>, <Qm>
+  //
+  // 111100110D01nnn0ddd00001N1M1mmm0 where Dddd=OpQd, Nnnn=OpQm, and Mmmm=OpQn.
+  constexpr const char *Vbslq = "vbslq";
+  constexpr IValueT VbslqOpcode = B24 | B20 | B8 | B4;
+  constexpr Type ElmtTy = IceType_i8; // emits sz=0
+  emitSIMDqqq(VbslqOpcode, ElmtTy, OpQd, OpQm, OpQn, Vbslq);
+}
+
 void AssemblerARM32::vcmpd(const Operand *OpDd, const Operand *OpDm,
                            CondARM32::Cond Cond) {
   constexpr const char *Vcmpd = "vcmpd";
diff --git a/src/IceAssemblerARM32.h b/src/IceAssemblerARM32.h
index b1ca928..ae3b93e 100644
--- a/src/IceAssemblerARM32.h
+++ b/src/IceAssemblerARM32.h
@@ -343,6 +343,8 @@
 
   void vandq(const Operand *OpQd, const Operand *OpQm, const Operand *OpQn);
 
+  void vbslq(const Operand *OpQd, const Operand *OpQm, const Operand *OpQn);
+
   void vcmpd(const Operand *OpDd, const Operand *OpDm, CondARM32::Cond cond);
 
   // Second argument of compare is zero (+0.0).
diff --git a/src/IceInstARM32.cpp b/src/IceInstARM32.cpp
index e2498fe..7c3e288 100644
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -704,6 +704,24 @@
   assert(!Asm->needsTextFixup());
 }
 
+template <> void InstARM32Vbsl::emitIAS(const Cfg *Func) const {
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Variable *Dest = getDest();
+  switch (Dest->getType()) {
+  default:
+    llvm::report_fatal_error("Vbsl not defined on type " +
+                             typeStdString(Dest->getType()));
+  case IceType_v4i1:
+  case IceType_v8i1:
+  case IceType_v16i1:
+  case IceType_v16i8:
+  case IceType_v8i16:
+  case IceType_v4i32:
+    Asm->vbslq(Dest, getSrc(0), getSrc(1));
+  }
+  assert(!Asm->needsTextFixup());
+}
+
 template <> void InstARM32Vdiv::emitIAS(const Cfg *Func) const {
   auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
   const Variable *Dest = getDest();
@@ -1496,6 +1514,7 @@
 // FP
 template <> const char *InstARM32Vadd::Opcode = "vadd";
 template <> const char *InstARM32Vand::Opcode = "vand";
+template <> const char *InstARM32Vbsl::Opcode = "vbsl";
 template <> const char *InstARM32Vdiv::Opcode = "vdiv";
 template <> const char *InstARM32Veor::Opcode = "veor";
 template <> const char *InstARM32Vmla::Opcode = "vmla";
diff --git a/src/IceInstARM32.h b/src/IceInstARM32.h
index 3c65037..4a052c4 100644
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -427,6 +427,7 @@
     Vabs,
     Vadd,
     Vand,
+    Vbsl,
     Vcmp,
     Vcvt,
     Vdiv,
@@ -992,6 +993,7 @@
 using InstARM32Udiv = InstARM32ThreeAddrGPR<InstARM32::Udiv>;
 using InstARM32Vadd = InstARM32ThreeAddrFP<InstARM32::Vadd>;
 using InstARM32Vand = InstARM32ThreeAddrFP<InstARM32::Vand>;
+using InstARM32Vbsl = InstARM32ThreeAddrFP<InstARM32::Vbsl>;
 using InstARM32Vdiv = InstARM32ThreeAddrFP<InstARM32::Vdiv>;
 using InstARM32Veor = InstARM32ThreeAddrFP<InstARM32::Veor>;
 using InstARM32Vmla = InstARM32FourAddrFP<InstARM32::Vmla>;
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index b8fa3b6..3de321e 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -861,24 +861,6 @@
     }
     return;
   }
-  case Inst::Select: {
-    Variable *Dest = Instr->getDest();
-    const auto DestTy = Dest->getType();
-    if (isVectorType(DestTy)) {
-      auto *SelectInstr = llvm::cast<InstSelect>(Instr);
-      scalarizeInstruction(Dest,
-                           [this](Variable *Dest, Variable *Src0,
-                                  Variable *Src1, Variable *Src2) {
-                             return Context.insert<InstSelect>(Dest, Src0, Src1,
-                                                               Src2);
-                           },
-                           llvm::cast<Variable>(SelectInstr->getSrc(0)),
-                           llvm::cast<Variable>(SelectInstr->getSrc(1)),
-                           llvm::cast<Variable>(SelectInstr->getSrc(2)));
-      SelectInstr->setDeleted();
-    }
-    return;
-  }
   }
 }
 
@@ -5727,12 +5709,39 @@
   Operand *SrcF = Instr->getFalseOperand();
   Operand *Condition = Instr->getCondition();
 
-  if (isVectorType(DestTy)) {
-    UnimplementedLoweringError(this, Instr);
+  if (!isVectorType(DestTy)) {
+    lowerInt1ForSelect(Dest, Condition, legalizeUndef(SrcT),
+                       legalizeUndef(SrcF));
     return;
   }
 
-  lowerInt1ForSelect(Dest, Condition, legalizeUndef(SrcT), legalizeUndef(SrcF));
+  Type TType = DestTy;
+  switch (DestTy) {
+  default:
+    llvm::report_fatal_error("Unexpected type for vector select.");
+  case IceType_v4i1:
+    TType = IceType_v4i32;
+    break;
+  case IceType_v8i1:
+    TType = IceType_v8i16;
+    break;
+  case IceType_v16i1:
+    TType = IceType_v16i8;
+    break;
+  case IceType_v4f32:
+    TType = IceType_v4i32;
+    break;
+  case IceType_v4i32:
+  case IceType_v8i16:
+  case IceType_v16i8:
+    break;
+  }
+  auto *T = makeReg(TType);
+  lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
+  auto *SrcTR = legalizeToReg(SrcT);
+  auto *SrcFR = legalizeToReg(SrcF);
+  _vbsl(T, SrcTR, SrcFR)->setDestRedefined();
+  _mov(Dest, T);
 }
 
 void TargetARM32::lowerStore(const InstStore *Instr) {
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 9d2b760..2cfa945 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -854,6 +854,9 @@
   void _vand(Variable *Dest, Variable *Src0, Variable *Src1) {
     Context.insert<InstARM32Vand>(Dest, Src0, Src1);
   }
+  InstARM32Vbsl *_vbsl(Variable *Dest, Variable *Src0, Variable *Src1) {
+    return Context.insert<InstARM32Vbsl>(Dest, Src0, Src1);
+  }
   void _vcvt(Variable *Dest, Variable *Src, InstARM32Vcvt::VcvtVariant Variant,
              CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert<InstARM32Vcvt>(Dest, Src, Variant, Pred);
diff --git a/tests_lit/assembler/arm32/select-vec.ll b/tests_lit/assembler/arm32/select-vec.ll
index 8d6165e..415f936 100644
--- a/tests_lit/assembler/arm32/select-vec.ll
+++ b/tests_lit/assembler/arm32/select-vec.ll
@@ -1,152 +1,84 @@
 ; Test that we handle select on vectors.
 
-; TODO(eholk): This test will need to be updated once comparison is no longer
-; scalarized.
-
 ; REQUIRES: allow_dump
 
 ; Compile using standalone assembler.
 ; RUN: %p2i --filetype=asm -i %s --target=arm32 --args -O2 \
 ; RUN:   | FileCheck %s --check-prefix=ASM
 
+; Show bytes in assembled standalone code.
+; RUN: %p2i --filetype=asm -i %s --target=arm32 --assemble --disassemble \
+; RUN:   --args -O2 --reg-use=s20  | FileCheck %s --check-prefix=DIS
+
+; Compile using integrated assembler.
+; RUN: %p2i --filetype=iasm -i %s --target=arm32 --args -O2 \
+; RUN:   --reg-use=s20 \
+; RUN:   | FileCheck %s --check-prefix=IASM
+
+; Show bytes in assembled integrated code.
+; RUN: %p2i --filetype=iasm -i %s --target=arm32 --assemble --disassemble \
+; RUN:   --args -O2 --reg-use=s20 | FileCheck %s --check-prefix=DIS
+
 define internal <4 x float> @select4float(<4 x i1> %s, <4 x float> %a,
                                           <4 x float> %b) {
 ; ASM-LABEL:select4float:
 ; DIS-LABEL:00000000 <select4float>:
+; IASM-LABEL:select4float:
 
 entry:
   %res = select <4 x i1> %s, <4 x float> %a, <4 x float> %b
 
-; ASM:	# q3 = def.pseudo
-; ASM-NEXT:	vmov.s8	r0, d0[0]
-; ASM-NEXT:	vmov.f32	s16, s4
-; ASM-NEXT:	vmov.f32	s17, s8
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	vmovne.f32	s17, s16
-; ASM-NEXT:	vmov.f32	s12, s17
-; ASM-NEXT:	vmov.s8	r0, d0[4]
-; ASM-NEXT:	vmov.f32	s16, s5
-; ASM-NEXT:	vmov.f32	s17, s9
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	vmovne.f32	s17, s16
-; ASM-NEXT:	vmov.f32	s13, s17
-; ASM-NEXT:	vmov.s8	r0, d1[0]
-; ASM-NEXT:	vmov.f32	s16, s6
-; ASM-NEXT:	vmov.f32	s17, s10
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	vmovne.f32	s17, s16
-; ASM-NEXT:	vmov.f32	s14, s17
-; ASM-NEXT:	vmov.s8	r0, d1[4]
-; ASM-NEXT:	vmov.f32	s4, s7
-; ASM-NEXT:	vmov.f32	s8, s11
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	vmovne.f32	s8, s4
-; ASM-NEXT:	vmov.f32	s15, s8
-; ASM-NEXT:	vmov.f32	q0, q3
-; ASM-NEXT:	vpop	{s16, s17}
-; ASM-NEXT:	# s16 = def.pseudo
-; ASM-NEXT:	# s17 = def.pseudo
-; ASM-NEXT:	bx	lr
+; ASM:          vshl.u32 [[M:.*]], {{.*}}, #31
+; ASM-NEXT:     vshr.s32 [[M:.*]], {{.*}}, #31
+; ASM-NEXT:     vbsl.i32 [[M]], {{.*}}
+; DIS:       0: f2bf0550
+; DIS-NEXT:  4: f2a10050
+; DIS-NEXT:  8: f3120154
+; IASM-NOT:     vshl
+; IASM-NOT:     vshr
+; IASM-NOT:     vbsl
 
   ret <4 x float> %res
 }
 
 define internal <4 x i32> @select4i32(<4 x i1> %s, <4 x i32> %a, <4 x i32> %b) {
 ; ASM-LABEL:select4i32:
-; DIS-LABEL:00000000 <select4i32>:
+; DIS-LABEL:00000010 <select4i32>:
+; IASM-LABEL:select4i32:
 
 entry:
   %res = select <4 x i1> %s, <4 x i32> %a, <4 x i32> %b
 
-; ASM:	# q3 = def.pseudo
-; ASM-NEXT:	vmov.s8	r0, d0[0]
-; ASM-NEXT:	vmov.32	r1, d2[0]
-; ASM-NEXT:	vmov.32	r2, d4[0]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.32	d6[0], r2
-; ASM-NEXT:	vmov.s8	r0, d0[4]
-; ASM-NEXT:	vmov.32	r1, d2[1]
-; ASM-NEXT:	vmov.32	r2, d4[1]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.32	d6[1], r2
-; ASM-NEXT:	vmov.s8	r0, d1[0]
-; ASM-NEXT:	vmov.32	r1, d3[0]
-; ASM-NEXT:	vmov.32	r2, d5[0]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.32	d7[0], r2
-; ASM-NEXT:	vmov.s8	r0, d1[4]
-; ASM-NEXT:	vmov.32	r1, d3[1]
-; ASM-NEXT:	vmov.32	r2, d5[1]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.32	d7[1], r2
-; ASM-NEXT:	vmov.i32	q0, q3
-; ASM-NEXT:	bx	lr
+; ASM:          vshl.u32 [[M:.*]], {{.*}}, #31
+; ASM-NEXT:     vshr.s32 [[M:.*]], {{.*}}, #31
+; ASM-NEXT:     vbsl.i32 [[M]], {{.*}}
+; DIS:      10: f2bf0550
+; DIS-NEXT: 14: f2a10050
+; DIS-NEXT: 18: f3120154
+; IASM-NOT:     vshl
+; IASM-NOT:     vshr
+; IASM-NOT:     vbsl
 
   ret <4 x i32> %res
 }
 
 define internal <8 x i16> @select8i16(<8 x i1> %s, <8 x i16> %a, <8 x i16> %b) {
 ; ASM-LABEL:select8i16:
-; DIS-LABEL:00000000 <select8i16>:
+; DIS-LABEL:00000020 <select8i16>:
+; IASM-LABEL:select8i16:
 
 entry:
   %res = select <8 x i1> %s, <8 x i16> %a, <8 x i16> %b
 
-; ASM:	# q3 = def.pseudo
-; ASM-NEXT:	vmov.s8	r0, d0[0]
-; ASM-NEXT:	vmov.s16	r1, d2[0]
-; ASM-NEXT:	vmov.s16	r2, d4[0]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.16	d6[0], r2
-; ASM-NEXT:	vmov.s8	r0, d0[2]
-; ASM-NEXT:	vmov.s16	r1, d2[1]
-; ASM-NEXT:	vmov.s16	r2, d4[1]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.16	d6[1], r2
-; ASM-NEXT:	vmov.s8	r0, d0[4]
-; ASM-NEXT:	vmov.s16	r1, d2[2]
-; ASM-NEXT:	vmov.s16	r2, d4[2]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.16	d6[2], r2
-; ASM-NEXT:	vmov.s8	r0, d0[6]
-; ASM-NEXT:	vmov.s16	r1, d2[3]
-; ASM-NEXT:	vmov.s16	r2, d4[3]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.16	d6[3], r2
-; ASM-NEXT:	vmov.s8	r0, d1[0]
-; ASM-NEXT:	vmov.s16	r1, d3[0]
-; ASM-NEXT:	vmov.s16	r2, d5[0]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.16	d7[0], r2
-; ASM-NEXT:	vmov.s8	r0, d1[2]
-; ASM-NEXT:	vmov.s16	r1, d3[1]
-; ASM-NEXT:	vmov.s16	r2, d5[1]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.16	d7[1], r2
-; ASM-NEXT:	vmov.s8	r0, d1[4]
-; ASM-NEXT:	vmov.s16	r1, d3[2]
-; ASM-NEXT:	vmov.s16	r2, d5[2]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.16	d7[2], r2
-; ASM-NEXT:	vmov.s8	r0, d1[6]
-; ASM-NEXT:	vmov.s16	r1, d3[3]
-; ASM-NEXT:	vmov.s16	r2, d5[3]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.16	d7[3], r2
-; ASM-NEXT:	vmov.i16	q0, q3
-; ASM-NEXT:	bx	lr
+; ASM:          vshl.u16 [[M:.*]], {{.*}}, #15
+; ASM-NEXT:     vshr.s16 [[M:.*]], {{.*}}, #15
+; ASM-NEXT:     vbsl.i16 [[M]], {{.*}}
+; DIS:      20: f29f0550
+; DIS-NEXT: 24: f2910050
+; DIS-NEXT: 28: f3120154
+; IASM-NOT:     vshl
+; IASM-NOT:     vshr
+; IASM-NOT:     vbsl
 
   ret <8 x i16> %res
 }
@@ -154,110 +86,21 @@
 define internal <16 x i8> @select16i8(<16 x i1> %s, <16 x i8> %a,
                                       <16 x i8> %b) {
 ; ASM-LABEL:select16i8:
-; DIS-LABEL:00000000 <select16i8>:
+; DIS-LABEL:00000030 <select16i8>:
+; IASM-LABEL:select16i8:
 
 entry:
   %res = select <16 x i1> %s, <16 x i8> %a, <16 x i8> %b
 
-; ASM:	# q3 = def.pseudo
-; ASM-NEXT:	vmov.s8	r0, d0[0]
-; ASM-NEXT:	vmov.s8	r1, d2[0]
-; ASM-NEXT:	vmov.s8	r2, d4[0]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.8	d6[0], r2
-; ASM-NEXT:	vmov.s8	r0, d0[1]
-; ASM-NEXT:	vmov.s8	r1, d2[1]
-; ASM-NEXT:	vmov.s8	r2, d4[1]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.8	d6[1], r2
-; ASM-NEXT:	vmov.s8	r0, d0[2]
-; ASM-NEXT:	vmov.s8	r1, d2[2]
-; ASM-NEXT:	vmov.s8	r2, d4[2]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.8	d6[2], r2
-; ASM-NEXT:	vmov.s8	r0, d0[3]
-; ASM-NEXT:	vmov.s8	r1, d2[3]
-; ASM-NEXT:	vmov.s8	r2, d4[3]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.8	d6[3], r2
-; ASM-NEXT:	vmov.s8	r0, d0[4]
-; ASM-NEXT:	vmov.s8	r1, d2[4]
-; ASM-NEXT:	vmov.s8	r2, d4[4]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.8	d6[4], r2
-; ASM-NEXT:	vmov.s8	r0, d0[5]
-; ASM-NEXT:	vmov.s8	r1, d2[5]
-; ASM-NEXT:	vmov.s8	r2, d4[5]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.8	d6[5], r2
-; ASM-NEXT:	vmov.s8	r0, d0[6]
-; ASM-NEXT:	vmov.s8	r1, d2[6]
-; ASM-NEXT:	vmov.s8	r2, d4[6]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.8	d6[6], r2
-; ASM-NEXT:	vmov.s8	r0, d0[7]
-; ASM-NEXT:	vmov.s8	r1, d2[7]
-; ASM-NEXT:	vmov.s8	r2, d4[7]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.8	d6[7], r2
-; ASM-NEXT:	vmov.s8	r0, d1[0]
-; ASM-NEXT:	vmov.s8	r1, d3[0]
-; ASM-NEXT:	vmov.s8	r2, d5[0]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.8	d7[0], r2
-; ASM-NEXT:	vmov.s8	r0, d1[1]
-; ASM-NEXT:	vmov.s8	r1, d3[1]
-; ASM-NEXT:	vmov.s8	r2, d5[1]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.8	d7[1], r2
-; ASM-NEXT:	vmov.s8	r0, d1[2]
-; ASM-NEXT:	vmov.s8	r1, d3[2]
-; ASM-NEXT:	vmov.s8	r2, d5[2]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.8	d7[2], r2
-; ASM-NEXT:	vmov.s8	r0, d1[3]
-; ASM-NEXT:	vmov.s8	r1, d3[3]
-; ASM-NEXT:	vmov.s8	r2, d5[3]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.8	d7[3], r2
-; ASM-NEXT:	vmov.s8	r0, d1[4]
-; ASM-NEXT:	vmov.s8	r1, d3[4]
-; ASM-NEXT:	vmov.s8	r2, d5[4]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.8	d7[4], r2
-; ASM-NEXT:	vmov.s8	r0, d1[5]
-; ASM-NEXT:	vmov.s8	r1, d3[5]
-; ASM-NEXT:	vmov.s8	r2, d5[5]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.8	d7[5], r2
-; ASM-NEXT:	vmov.s8	r0, d1[6]
-; ASM-NEXT:	vmov.s8	r1, d3[6]
-; ASM-NEXT:	vmov.s8	r2, d5[6]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.8	d7[6], r2
-; ASM-NEXT:	vmov.s8	r0, d1[7]
-; ASM-NEXT:	vmov.s8	r1, d3[7]
-; ASM-NEXT:	vmov.s8	r2, d5[7]
-; ASM-NEXT:	tst	r0, #1
-; ASM-NEXT:	movne	r2, r1
-; ASM-NEXT:	vmov.8	d7[7], r2
-; ASM-NEXT:	vmov.i8	q0, q3
-; ASM-NEXT:	bx	lr
+; ASM:          vshl.u8 [[M:.*]], {{.*}}, #7
+; ASM-NEXT:     vshr.s8 [[M:.*]], {{.*}}, #7
+; ASM-NEXT:     vbsl.i8 [[M]], {{.*}}
+; DIS:      30: f28f0550
+; DIS-NEXT: 34: f2890050
+; DIS-NEXT: 38: f3120154
+; IASM-NOT:     vshl
+; IASM-NOT:     vshr
+; IASM-NOT:     vbsl
 
   ret <16 x i8> %res
 }
diff --git a/tests_lit/assembler/arm32/vcvt.f32.s32.ll b/tests_lit/assembler/arm32/vcvt.f32.s32.ll
index 92edf99..8481e87 100644
--- a/tests_lit/assembler/arm32/vcvt.f32.s32.ll
+++ b/tests_lit/assembler/arm32/vcvt.f32.s32.ll
@@ -46,7 +46,7 @@
 
 ; ASM:         vcvt.f32.s32    q0, q0
 ; DIS:     40: f3bb0640
-; IASM-NOT:    vcvt.f32.s32
+; IASM-NOT:    vcvt
 
   ret <4 x float> %v
 }
diff --git a/tests_lit/assembler/arm32/vcvt.f32.u32.ll b/tests_lit/assembler/arm32/vcvt.f32.u32.ll
index dee7f3b..56fd7c8 100644
--- a/tests_lit/assembler/arm32/vcvt.f32.u32.ll
+++ b/tests_lit/assembler/arm32/vcvt.f32.u32.ll
@@ -46,7 +46,7 @@
 
 ; ASM:         vcvt.f32.u32    q0, q0
 ; DIS:     40: f3bb06c0
-; IASM-NOT:    vcvt.f32.u32
+; IASM-NOT:    vcvt
 
   ret <4 x float> %v
 }
diff --git a/tests_lit/assembler/arm32/vcvt.s32.f32.ll b/tests_lit/assembler/arm32/vcvt.s32.f32.ll
index c38e752..656ba04 100644
--- a/tests_lit/assembler/arm32/vcvt.s32.f32.ll
+++ b/tests_lit/assembler/arm32/vcvt.s32.f32.ll
@@ -46,7 +46,7 @@
 
 ; ASM:         vcvt.s32.f32    q0, q0
 ; DIS:     40: f3bb0740
-; IASM-NOT:    vcvt.s32.f32
+; IASM-NOT:    vcvt
 
   ret <4 x i32> %v
 }
diff --git a/tests_lit/assembler/arm32/vcvt.u32.f32.ll b/tests_lit/assembler/arm32/vcvt.u32.f32.ll
index 6b08765..2fae0f5 100644
--- a/tests_lit/assembler/arm32/vcvt.u32.f32.ll
+++ b/tests_lit/assembler/arm32/vcvt.u32.f32.ll
@@ -45,7 +45,7 @@
 
 ; ASM:         vcvt.u32.f32    q0, q0
 ; DIS:     40: f3bb07c0
-; IASM-NOT:    vcvt.u32.f32
+; IASM-NOT:    vcvt
 
   ret <4 x i32> %v
 }
diff --git a/tests_lit/assembler/arm32/vec-sh-imm.ll b/tests_lit/assembler/arm32/vec-sh-imm.ll
index 0436824..c2c5ebc 100644
--- a/tests_lit/assembler/arm32/vec-sh-imm.ll
+++ b/tests_lit/assembler/arm32/vec-sh-imm.ll
@@ -34,8 +34,8 @@
 ; ASM-NEXT:    vshr.s32 {{.*}}, #31
 ; DIS:      0: f2bf0550
 ; DIS-NEXT: 4: f2a10050
-; IASM-NOT:    vshl.u32 {{.*}}, #31
-; IASM-NOT:    vshr.s32 {{.*}}, #31
+; IASM-NOT:    vshl
+; IASM-NOT:    vshr
 }
 
 define internal <8 x i16> @SextV8I1(<8 x i16> %a) {
@@ -46,12 +46,12 @@
   %trunc = trunc <8 x i16> %a to <8 x i1>
   %sext = sext <8 x i1> %trunc to <8 x i16>
   ret <8 x i16> %sext
-; ASM:      vshl.u16 {{.*}}, #15
-; ASM-NEXT: vshr.s16 {{.*}}, #15
+; ASM:          vshl.u16 {{.*}}, #15
+; ASM-NEXT:     vshr.s16 {{.*}}, #15
 ; DIS:      10: f29f0550
 ; DIS-NEXT: 14: f2910050
-; IASM-NOT: vshl.u16 {{.*}}, #15
-; IASM-NOT: vshr.s16 {{.*}}, #15
+; IASM-NOT:     vshl
+; IASM-NOT:     vshr
 }
 
 define internal <16 x i8> @SextV16I1(<16 x i8> %a) {
@@ -62,10 +62,10 @@
   %trunc = trunc <16 x i8> %a to <16 x i1>
   %sext = sext <16 x i1> %trunc to <16 x i8>
   ret <16 x i8> %sext
-; ASM:      vshl.u8 {{.*}}, #7
-; ASM-NEXT: vshr.s8 {{.*}}, #7
+; ASM:          vshl.u8 {{.*}}, #7
+; ASM-NEXT:     vshr.s8 {{.*}}, #7
 ; DIS:      20: f28f0550
 ; DIS-NEXT: 24: f2890050
-; IASM-NOT: vshl.u8 {{.*}}, #7
-; IASM-NOT: vshr.s8 {{.*}}, #7
+; IASM-NOT:     vshl
+; IASM-NOT:     vshr
 }