Subzero. ARM32. De-scalarizes icmp and fcmp for vectors.

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4076
R=eholk@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/1891243002 .
diff --git a/runtime/wasm-runtime.c b/runtime/wasm-runtime.c
index a7ce49b..2e1bd37 100644
--- a/runtime/wasm-runtime.c
+++ b/runtime/wasm-runtime.c
@@ -30,9 +30,7 @@
 
 void env$$_abort() { env$$abort(); }
 
-void env$$exit(int Status) {
-  exit(Status);
-}
+void env$$exit(int Status) { exit(Status); }
 void env$$_exit(int Status) { env$$exit(Status); }
 
 #define UNIMPLEMENTED(f)                                                       \
diff --git a/src/DartARM32/assembler_arm.cc b/src/DartARM32/assembler_arm.cc
index c2cdead..961299e 100644
--- a/src/DartARM32/assembler_arm.cc
+++ b/src/DartARM32/assembler_arm.cc
@@ -1323,11 +1323,11 @@
 void Assembler::vandq(QRegister qd, QRegister qn, QRegister qm) {
   EmitSIMDqqq(B8 | B4, kByte, qd, qn, qm);
 }
-#endif
 
 void Assembler::vmvnq(QRegister qd, QRegister qm) {
   EmitSIMDqqq(B25 | B24 | B23 | B10 | B8 | B7, kWordPair, qd, Q0, qm);
 }
+#endif
 
 
 void Assembler::vminqs(QRegister qd, QRegister qn, QRegister qm) {
@@ -1422,52 +1422,52 @@
 }
 
 
+#if 0
+// Moved to Arm32::AssemblerARM32::vceqqi().
 void Assembler::vceqqi(OperandSize sz,
                       QRegister qd, QRegister qn, QRegister qm) {
   EmitSIMDqqq(B24 | B11 | B4, sz, qd, qn, qm);
 }
 
-
+// Moved to Arm32::AssemblerARM32::vceqqs().
 void Assembler::vceqqs(QRegister qd, QRegister qn, QRegister qm) {
   EmitSIMDqqq(B11 | B10 | B9, kSWord, qd, qn, qm);
 }
 
-
+// Moved to Arm32::AssemblerARM32::vcgeqi().
 void Assembler::vcgeqi(OperandSize sz,
                       QRegister qd, QRegister qn, QRegister qm) {
   EmitSIMDqqq(B9 | B8 | B4, sz, qd, qn, qm);
 }
 
-
+// Moved to Arm32::AssemblerARM32::vcugeqi().
 void Assembler::vcugeqi(OperandSize sz,
                       QRegister qd, QRegister qn, QRegister qm) {
   EmitSIMDqqq(B24 | B9 | B8 | B4, sz, qd, qn, qm);
 }
 
-
+// Moved to Arm32::AssemblerARM32::vcgeqs().
 void Assembler::vcgeqs(QRegister qd, QRegister qn, QRegister qm) {
   EmitSIMDqqq(B24 | B11 | B10 | B9, kSWord, qd, qn, qm);
 }
 
-
+// Moved to Arm32::AssemblerARM32::vcgtqi().
 void Assembler::vcgtqi(OperandSize sz,
                       QRegister qd, QRegister qn, QRegister qm) {
   EmitSIMDqqq(B9 | B8, sz, qd, qn, qm);
 }
 
-
+// Moved to Arm32::AssemblerARM32::vcugtqi().
 void Assembler::vcugtqi(OperandSize sz,
                       QRegister qd, QRegister qn, QRegister qm) {
   EmitSIMDqqq(B24 | B9 | B8, sz, qd, qn, qm);
 }
 
-
+// Moved to Arm32::AssemblerARM32::vcgtqs().
 void Assembler::vcgtqs(QRegister qd, QRegister qn, QRegister qm) {
   EmitSIMDqqq(B24 | B21 | B11 | B10 | B9, kSWord, qd, qn, qm);
 }
 
-
-#if 0
 // Moved to ARM32::AssemblerARM32::bkpt()
 void Assembler::bkpt(uint16_t imm16) {
   Emit(BkptEncoding(imm16));
diff --git a/src/DartARM32/assembler_arm.h b/src/DartARM32/assembler_arm.h
index c6e53df..93de45f 100644
--- a/src/DartARM32/assembler_arm.h
+++ b/src/DartARM32/assembler_arm.h
@@ -727,19 +727,26 @@
 #if 0
   // Moved to Arm32::AssemblerARM32::vandq().
   void vandq(QRegister qd, QRegister qn, QRegister qm);
-#endif
+  // Moved to Arm32::AssemblerARM32::vmvnq().
   void vmvnq(QRegister qd, QRegister qm);
 
+  // Moved to Arm32::AssemblerARM32::vceqqi().
   void vceqqi(OperandSize sz, QRegister qd, QRegister qn, QRegister qm);
+  // Moved to Arm32::AssemblerARM32::vceqqs().
   void vceqqs(QRegister qd, QRegister qn, QRegister qm);
+  // Moved to Arm32::AssemblerARM32::vcgeqi().
   void vcgeqi(OperandSize sz, QRegister qd, QRegister qn, QRegister qm);
+  // Moved to Arm32::AssemblerARM32::vcugeqi().
   void vcugeqi(OperandSize sz, QRegister qd, QRegister qn, QRegister qm);
+  // Moved to Arm32::AssemblerARM32::vcgeqs().
   void vcgeqs(QRegister qd, QRegister qn, QRegister qm);
+  // Moved to Arm32::AssemblerARM32::vcgtqi().
   void vcgtqi(OperandSize sz, QRegister qd, QRegister qn, QRegister qm);
+  // Moved to Arm32::AssemblerARM32::vcugtqi().
   void vcugtqi(OperandSize sz, QRegister qd, QRegister qn, QRegister qm);
+  // Moved to Arm32::AssemblerARM32::vcgtqs().
   void vcgtqs(QRegister qd, QRegister qn, QRegister qm);
 
-#if 0
   // Moved to Arm32::AssemblerARM32::vabss().
   void vabss(SRegister sd, SRegister sm, Condition cond = AL);
   // Moved to Arm32::AssemblerARM32::vabsd().
diff --git a/src/IceAssemblerARM32.cpp b/src/IceAssemblerARM32.cpp
index 71e5c6a..2cf09ff 100644
--- a/src/IceAssemblerARM32.cpp
+++ b/src/IceAssemblerARM32.cpp
@@ -2422,6 +2422,102 @@
   emitSIMDqqq(VbslqOpcode, ElmtTy, OpQd, OpQm, OpQn, Vbslq);
 }
 
+void AssemblerARM32::vceqqi(const Type ElmtTy, const Operand *OpQd,
+                            const Operand *OpQm, const Operand *OpQn) {
+  // vceq (register) - ARM section A8.8.291, encoding A1:
+  //   vceq.<st> <Qd>, <Qn>, <Qm>
+  //
+  // 111100110Dssnnnndddd1000NQM1mmmm where Dddd=OpQd, Nnnn=OpQm, Mmmm=OpQn, and
+  // st in [i8, i16, i32] where ss is the index.
+  constexpr const char *Vceq = "vceq";
+  constexpr IValueT VceqOpcode = B24 | B11 | B4;
+  emitSIMDqqq(VceqOpcode, ElmtTy, OpQd, OpQm, OpQn, Vceq);
+}
+
+void AssemblerARM32::vceqqs(const Operand *OpQd, const Operand *OpQm,
+                            const Operand *OpQn) {
+  // vceq (register) - ARM section A8.8.291, encoding A2:
+  //   vceq.f32 <Qd>, <Qn>, <Qm>
+  //
+  // 111100100D00nnnndddd1110NQM0mmmm where Dddd=OpQd, Nnnn=OpQm, and Mmmm=OpQn.
+  constexpr const char *Vceq = "vceq";
+  constexpr IValueT VceqOpcode = B11 | B10 | B9;
+  constexpr Type ElmtTy = IceType_i8; // encoded as 0b00
+  emitSIMDqqq(VceqOpcode, ElmtTy, OpQd, OpQm, OpQn, Vceq);
+}
+
+void AssemblerARM32::vcgeqi(const Type ElmtTy, const Operand *OpQd,
+                            const Operand *OpQm, const Operand *OpQn) {
+  // vcge (register) - ARM section A8.8.293, encoding A1:
+  //   vcge.<st> <Qd>, <Qn>, <Qm>
+  //
+  // 1111001U0Dssnnnndddd0011NQM1mmmm where Dddd=OpQd, Nnnn=OpQm, Mmmm=OpQn,
+  // 0=U, and st in [s8, s16, s32] where ss is the index.
+  constexpr const char *Vcge = "vcge";
+  constexpr IValueT VcgeOpcode = B9 | B8 | B4;
+  emitSIMDqqq(VcgeOpcode, ElmtTy, OpQd, OpQm, OpQn, Vcge);
+}
+
+void AssemblerARM32::vcugeqi(const Type ElmtTy, const Operand *OpQd,
+                             const Operand *OpQm, const Operand *OpQn) {
+  // vcge (register) - ARM section A8.8.293, encoding A1:
+  //   vcge.<st> <Qd>, <Qn>, <Qm>
+  //
+  // 1111001U0Dssnnnndddd0011NQM1mmmm where Dddd=OpQd, Nnnn=OpQm, Mmmm=OpQn,
+  // 1=U, and st in [u8, u16, u32] where ss is the index.
+  constexpr const char *Vcge = "vcge";
+  constexpr IValueT VcgeOpcode = B24 | B9 | B8 | B4;
+  emitSIMDqqq(VcgeOpcode, ElmtTy, OpQd, OpQm, OpQn, Vcge);
+}
+
+void AssemblerARM32::vcgeqs(const Operand *OpQd, const Operand *OpQm,
+                            const Operand *OpQn) {
+  // vcge (register) - ARM section A8.8.293, encoding A2:
+  //   vcge.f32 <Qd>, <Qn>, <Qm>
+  //
+  // 111100110D00nnnndddd1110NQM0mmmm where Dddd=OpQd, Nnnn=OpQm, and Mmmm=OpQn.
+  constexpr const char *Vcge = "vcge";
+  constexpr IValueT VcgeOpcode = B24 | B11 | B10 | B9;
+  constexpr Type ElmtTy = IceType_i8; // encoded as 0b00.
+  emitSIMDqqq(VcgeOpcode, ElmtTy, OpQd, OpQm, OpQn, Vcge);
+}
+
+void AssemblerARM32::vcgtqi(const Type ElmtTy, const Operand *OpQd,
+                            const Operand *OpQm, const Operand *OpQn) {
+  // vcgt (register) - ARM section A8.8.295, encoding A1:
+  //   vcgt.<st> <Qd>, <Qn>, <Qm>
+  //
+  // 1111001U0Dssnnnndddd0011NQM0mmmm where Dddd=OpQd, Nnnn=OpQm, Mmmm=OpQn,
+  // 0=U, and st in [s8, s16, s32] where ss is the index.
+  constexpr const char *Vcge = "vcgt";
+  constexpr IValueT VcgeOpcode = B9 | B8;
+  emitSIMDqqq(VcgeOpcode, ElmtTy, OpQd, OpQm, OpQn, Vcge);
+}
+
+void AssemblerARM32::vcugtqi(const Type ElmtTy, const Operand *OpQd,
+                             const Operand *OpQm, const Operand *OpQn) {
+  // vcgt (register) - ARM section A8.8.295, encoding A1:
+  //   vcgt.<st> <Qd>, <Qn>, <Qm>
+  //
+  // 111100110Dssnnnndddd0011NQM0mmmm where Dddd=OpQd, Nnnn=OpQm, Mmmm=OpQn,
+  // 1=U, and st in [u8, u16, u32] where ss is the index.
+  constexpr const char *Vcge = "vcgt";
+  constexpr IValueT VcgeOpcode = B24 | B9 | B8;
+  emitSIMDqqq(VcgeOpcode, ElmtTy, OpQd, OpQm, OpQn, Vcge);
+}
+
+void AssemblerARM32::vcgtqs(const Operand *OpQd, const Operand *OpQm,
+                            const Operand *OpQn) {
+  // vcgt (register) - ARM section A8.8.295, encoding A2:
+  //   vcgt.f32 <Qd>, <Qn>, <Qm>
+  //
+  // 111100110D10nnnndddd1110NQM0mmmm where Dddd=OpQd, Nnnn=OpQm, and Mmmm=OpQn.
+  constexpr const char *Vcge = "vcgt";
+  constexpr IValueT VcgeOpcode = B24 | B21 | B11 | B10 | B9;
+  constexpr Type ElmtTy = IceType_i8; // encoded as 0b00.
+  emitSIMDqqq(VcgeOpcode, ElmtTy, OpQd, OpQm, OpQn, Vcge);
+}
+
 void AssemblerARM32::vcmpd(const Operand *OpDd, const Operand *OpDm,
                            CondARM32::Cond Cond) {
   constexpr const char *Vcmpd = "vcmpd";
@@ -3143,6 +3239,23 @@
   emitSIMDqqqBase(VmulqfOpcode, OpQd, OpQn, OpQm, IsFloatTy, Vmulqf);
 }
 
+void AssemblerARM32::vmvnq(const Operand *OpQd, const Operand *OpQm) {
+  // VMVN (integer) - ARM section A8.8.354, encoding A1:
+  //   vmvn <Qd>, <Qm>
+  //
+  // 111100111D110000dddd01011QM0mmmm where Dddd=Qd, Mmmm=Qm, and 1=Q.
+  // TODO(jpp) xxx: unify
+  constexpr const char *Vmvn = "vmvn";
+  constexpr IValueT VmvnOpcode = B24 | B23 | B21 | B20 | B10 | B8 | B7;
+  const IValueT Qd = encodeQRegister(OpQd, "Qd", Vmvn);
+  constexpr IValueT Qn = 0;
+  const IValueT Qm = encodeQRegister(OpQm, "Qm", Vmvn);
+  constexpr bool UseQRegs = true;
+  constexpr bool IsFloat = false;
+  emitSIMDBase(VmvnOpcode, mapQRegToDReg(Qd), mapQRegToDReg(Qn),
+               mapQRegToDReg(Qm), UseQRegs, IsFloat);
+}
+
 void AssemblerARM32::vnegqs(Type ElmtTy, const Operand *OpQd,
                             const Operand *OpQm) {
   // VNEG - ARM section A8.8.355, encoding A1:
diff --git a/src/IceAssemblerARM32.h b/src/IceAssemblerARM32.h
index ae3b93e..84ca973 100644
--- a/src/IceAssemblerARM32.h
+++ b/src/IceAssemblerARM32.h
@@ -345,6 +345,27 @@
 
   void vbslq(const Operand *OpQd, const Operand *OpQm, const Operand *OpQn);
 
+  void vceqqi(const Type ElmtTy, const Operand *OpQd, const Operand *OpQm,
+              const Operand *OpQn);
+
+  void vceqqs(const Operand *OpQd, const Operand *OpQm, const Operand *OpQn);
+
+  void vcgeqi(const Type ElmtTy, const Operand *OpQd, const Operand *OpQm,
+              const Operand *OpQn);
+
+  void vcugeqi(const Type ElmtTy, const Operand *OpQd, const Operand *OpQm,
+               const Operand *OpQn);
+
+  void vcgeqs(const Operand *OpQd, const Operand *OpQm, const Operand *OpQn);
+
+  void vcgtqi(const Type ElmtTy, const Operand *OpQd, const Operand *OpQm,
+              const Operand *OpQn);
+
+  void vcugtqi(const Type ElmtTy, const Operand *OpQd, const Operand *OpQm,
+               const Operand *OpQn);
+
+  void vcgtqs(const Operand *OpQd, const Operand *OpQm, const Operand *OpQn);
+
   void vcmpd(const Operand *OpDd, const Operand *OpDm, CondARM32::Cond cond);
 
   // Second argument of compare is zero (+0.0).
@@ -505,6 +526,10 @@
   void vmuls(const Operand *OpSd, const Operand *OpSn, const Operand *OpSm,
              CondARM32::Cond Cond);
 
+  void vmvnq(const Operand *OpQd, const Operand *OpQm);
+
+  void vnegqs(const Operand *OpQd, const Operand *OpQm);
+
   void vnegqs(Type ElmtTy, const Operand *OpQd, const Operand *OpQm);
 
   void vorrq(const Operand *OpQd, const Operand *OpQm, const Operand *OpQn);
diff --git a/src/IceInstARM32.cpp b/src/IceInstARM32.cpp
index 7c3e288..4bd530c 100644
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -228,13 +228,13 @@
 }
 
 void InstARM32::emitThreeAddrFP(const char *Opcode, FPSign SignType,
-                                const InstARM32 *Instr, const Cfg *Func) {
+                                const InstARM32 *Instr, const Cfg *Func,
+                                Type OpType) {
   if (!BuildDefs::dump())
     return;
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(Instr->getSrcSize() == 2);
-  Str << "\t" << Opcode
-      << getVWidthString(Instr->getDest()->getType(), SignType) << "\t";
+  Str << "\t" << Opcode << getVWidthString(OpType, SignType) << "\t";
   Instr->getDest()->emit(Func);
   Str << ", ";
   Instr->getSrc(0)->emit(Func);
@@ -704,6 +704,95 @@
   assert(!Asm->needsTextFixup());
 }
 
+template <> void InstARM32Vceq::emitIAS(const Cfg *Func) const {
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Variable *Dest = getDest();
+  const Type SrcTy = getSrc(0)->getType();
+  switch (SrcTy) {
+  default:
+    llvm::report_fatal_error("Vceq not defined on type " +
+                             typeStdString(SrcTy));
+  case IceType_v4i1:
+  case IceType_v8i1:
+  case IceType_v16i1:
+  case IceType_v16i8:
+  case IceType_v8i16:
+  case IceType_v4i32:
+    Asm->vceqqi(typeElementType(SrcTy), Dest, getSrc(0), getSrc(1));
+    break;
+  case IceType_v4f32:
+    Asm->vceqqs(Dest, getSrc(0), getSrc(1));
+    break;
+  }
+  assert(!Asm->needsTextFixup());
+}
+
+template <> void InstARM32Vcge::emitIAS(const Cfg *Func) const {
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Variable *Dest = getDest();
+  const Type SrcTy = getSrc(0)->getType();
+  switch (SrcTy) {
+  default:
+    llvm::report_fatal_error("Vcge not defined on type " +
+                             typeStdString(Dest->getType()));
+  case IceType_v4i1:
+  case IceType_v8i1:
+  case IceType_v16i1:
+  case IceType_v16i8:
+  case IceType_v8i16:
+  case IceType_v4i32: {
+    const Type ElmtTy = typeElementType(SrcTy);
+    assert(Sign != InstARM32::FS_None);
+    switch (Sign) {
+    case InstARM32::FS_None: // defaults to unsigned.
+      llvm_unreachable("Sign should not be FS_None.");
+    case InstARM32::FS_Unsigned:
+      Asm->vcugeqi(ElmtTy, Dest, getSrc(0), getSrc(1));
+      break;
+    case InstARM32::FS_Signed:
+      Asm->vcgeqi(ElmtTy, Dest, getSrc(0), getSrc(1));
+      break;
+    }
+  } break;
+  case IceType_v4f32:
+    Asm->vcgeqs(Dest, getSrc(0), getSrc(1));
+    break;
+  }
+}
+
+template <> void InstARM32Vcgt::emitIAS(const Cfg *Func) const {
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Variable *Dest = getDest();
+  const Type SrcTy = getSrc(0)->getType();
+  switch (SrcTy) {
+  default:
+    llvm::report_fatal_error("Vcgt not defined on type " +
+                             typeStdString(Dest->getType()));
+  case IceType_v4i1:
+  case IceType_v8i1:
+  case IceType_v16i1:
+  case IceType_v16i8:
+  case IceType_v8i16:
+  case IceType_v4i32: {
+    const Type ElmtTy = typeElementType(SrcTy);
+    assert(Sign != InstARM32::FS_None);
+    switch (Sign) {
+    case InstARM32::FS_None: // defaults to unsigned.
+      llvm_unreachable("Sign should not be FS_None.");
+    case InstARM32::FS_Unsigned:
+      Asm->vcugtqi(ElmtTy, Dest, getSrc(0), getSrc(1));
+      break;
+    case InstARM32::FS_Signed:
+      Asm->vcgtqi(ElmtTy, Dest, getSrc(0), getSrc(1));
+      break;
+    }
+  } break;
+  case IceType_v4f32:
+    Asm->vcgtqs(Dest, getSrc(0), getSrc(1));
+    break;
+  }
+}
+
 template <> void InstARM32Vbsl::emitIAS(const Cfg *Func) const {
   auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
   const Variable *Dest = getDest();
@@ -795,6 +884,25 @@
   }
 }
 
+template <> void InstARM32Vmvn::emitIAS(const Cfg *Func) const {
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Variable *Dest = getDest();
+  switch (Dest->getType()) {
+  default:
+    llvm::report_fatal_error("Vmvn not defined on type " +
+                             typeStdString(Dest->getType()));
+  case IceType_v4i1:
+  case IceType_v8i1:
+  case IceType_v16i1:
+  case IceType_v16i8:
+  case IceType_v8i16:
+  case IceType_v4i32:
+  case IceType_v4f32: {
+    Asm->vmvnq(Dest, getSrc(0));
+  } break;
+  }
+}
+
 template <> void InstARM32Vneg::emitIAS(const Cfg *Func) const {
   auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
   const Variable *Dest = getDest();
@@ -842,6 +950,8 @@
   default:
     llvm::report_fatal_error("Vshl not defined on type " +
                              typeStdString(Dest->getType()));
+  // TODO(jpp): handle i1 vectors in terms of element count instead of element
+  // type.
   case IceType_v4i1:
   case IceType_v8i1:
   case IceType_v16i1:
@@ -879,6 +989,8 @@
   default:
     llvm::report_fatal_error("Vshr not defined on type " +
                              typeStdString(Dest->getType()));
+  // TODO(jpp): handle i1 vectors in terms of element count instead of element
+  // type.
   case IceType_v4i1:
   case IceType_v8i1:
   case IceType_v16i1:
@@ -1515,11 +1627,15 @@
 template <> const char *InstARM32Vadd::Opcode = "vadd";
 template <> const char *InstARM32Vand::Opcode = "vand";
 template <> const char *InstARM32Vbsl::Opcode = "vbsl";
+template <> const char *InstARM32Vceq::Opcode = "vceq";
+template <> const char *InstARM32ThreeAddrFP<InstARM32::Vcge>::Opcode = "vcge";
+template <> const char *InstARM32ThreeAddrFP<InstARM32::Vcgt>::Opcode = "vcgt";
 template <> const char *InstARM32Vdiv::Opcode = "vdiv";
 template <> const char *InstARM32Veor::Opcode = "veor";
 template <> const char *InstARM32Vmla::Opcode = "vmla";
 template <> const char *InstARM32Vmls::Opcode = "vmls";
 template <> const char *InstARM32Vmul::Opcode = "vmul";
+template <> const char *InstARM32Vmvn::Opcode = "vmvn";
 template <> const char *InstARM32Vorr::Opcode = "vorr";
 template <> const char *InstARM32UnaryopFP<InstARM32::Vneg>::Opcode = "vneg";
 template <> const char *InstARM32ThreeAddrFP<InstARM32::Vshl>::Opcode = "vshl";
@@ -1758,6 +1874,7 @@
       }
     }
     break; // Error
+  // TODO(jpp): Remove vectors of i1.
   case IceType_v4i1:
   case IceType_v8i1:
   case IceType_v16i1:
@@ -2984,6 +3101,8 @@
 template class InstARM32ThreeAddrGPR<InstARM32::Udiv>;
 
 template class InstARM32ThreeAddrFP<InstARM32::Vadd>;
+template class InstARM32ThreeAddrSignAwareFP<InstARM32::Vcge>;
+template class InstARM32ThreeAddrSignAwareFP<InstARM32::Vcgt>;
 template class InstARM32ThreeAddrFP<InstARM32::Vdiv>;
 template class InstARM32ThreeAddrFP<InstARM32::Veor>;
 template class InstARM32FourAddrFP<InstARM32::Vmla>;
diff --git a/src/IceInstARM32.h b/src/IceInstARM32.h
index 4a052c4..89f894d 100644
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -428,6 +428,9 @@
     Vadd,
     Vand,
     Vbsl,
+    Vceq,
+    Vcge,
+    Vcgt,
     Vcmp,
     Vcvt,
     Vdiv,
@@ -436,6 +439,7 @@
     Vmls,
     Vmrs,
     Vmul,
+    Vmvn,
     Vneg,
     Vorr,
     Vshl,
@@ -464,7 +468,8 @@
   /// Shared emit routines for common forms of instructions.
   /// @{
   static void emitThreeAddrFP(const char *Opcode, FPSign Sign,
-                              const InstARM32 *Instr, const Cfg *Func);
+                              const InstARM32 *Instr, const Cfg *Func,
+                              Type OpType);
   static void emitFourAddrFP(const char *Opcode, FPSign Sign,
                              const InstARM32 *Instr, const Cfg *Func);
   /// @}
@@ -782,7 +787,8 @@
   void emit(const Cfg *Func) const override {
     if (!BuildDefs::dump())
       return;
-    emitThreeAddrFP(Opcode, Sign, this, Func);
+    const Type OpType = (isVectorCompare() ? getSrc(0) : getDest())->getType();
+    emitThreeAddrFP(Opcode, Sign, this, Func, OpType);
   }
   void emitIAS(const Cfg *Func) const override;
   void dump(const Cfg *Func) const override {
@@ -790,8 +796,8 @@
       return;
     Ostream &Str = Func->getContext()->getStrDump();
     dumpDest(Func);
-    Str << " = ";
-    Str << Opcode << "." << getDest()->getType() << " ";
+    const Type OpType = (isVectorCompare() ? getSrc(0) : getDest())->getType();
+    Str << " = " << Opcode << "." << OpType << " ";
     dumpSources(Func);
   }
   static bool classof(const Inst *Instr) { return isClassof(Instr, K); }
@@ -806,6 +812,11 @@
   }
 
   static const char *Opcode;
+
+private:
+  static constexpr bool isVectorCompare() {
+    return K == InstARM32::Vceq || K == InstARM32::Vcgt || K == InstARM32::Vcge;
+  }
 };
 
 template <InstARM32::InstKindARM32 K>
@@ -994,11 +1005,15 @@
 using InstARM32Vadd = InstARM32ThreeAddrFP<InstARM32::Vadd>;
 using InstARM32Vand = InstARM32ThreeAddrFP<InstARM32::Vand>;
 using InstARM32Vbsl = InstARM32ThreeAddrFP<InstARM32::Vbsl>;
+using InstARM32Vceq = InstARM32ThreeAddrFP<InstARM32::Vceq>;
+using InstARM32Vcge = InstARM32ThreeAddrSignAwareFP<InstARM32::Vcge>;
+using InstARM32Vcgt = InstARM32ThreeAddrSignAwareFP<InstARM32::Vcgt>;
 using InstARM32Vdiv = InstARM32ThreeAddrFP<InstARM32::Vdiv>;
 using InstARM32Veor = InstARM32ThreeAddrFP<InstARM32::Veor>;
 using InstARM32Vmla = InstARM32FourAddrFP<InstARM32::Vmla>;
 using InstARM32Vmls = InstARM32FourAddrFP<InstARM32::Vmls>;
 using InstARM32Vmul = InstARM32ThreeAddrFP<InstARM32::Vmul>;
+using InstARM32Vmvn = InstARM32UnaryopFP<InstARM32::Vmvn>;
 using InstARM32Vneg = InstARM32UnaryopSignAwareFP<InstARM32::Vneg>;
 using InstARM32Vorr = InstARM32ThreeAddrFP<InstARM32::Vorr>;
 using InstARM32Vshl = InstARM32ThreeAddrSignAwareFP<InstARM32::Vshl>;
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index a36fe99..fbfa914 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -124,7 +124,7 @@
 const struct TableIcmp32_ {
   CondARM32::Cond Mapping;
 } TableIcmp32[] = {
-#define X(val, is_signed, swapped64, C_32, C1_64, C2_64)                       \
+#define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V)    \
   { CondARM32::C_32 }                                                          \
   ,
     ICMPARM32_TABLE
@@ -140,7 +140,7 @@
   bool Swapped;
   CondARM32::Cond C1, C2;
 } TableIcmp64[] = {
-#define X(val, is_signed, swapped64, C_32, C1_64, C2_64)                       \
+#define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V)    \
   { is_signed, swapped64, CondARM32::C1_64, CondARM32::C2_64 }                 \
   ,
     ICMPARM32_TABLE
@@ -163,7 +163,8 @@
 namespace {
 // Define a temporary set of enum values based on low-level table entries.
 enum _icmp_ll_enum {
-#define X(val, signed, swapped64, C_32, C1_64, C2_64) _icmp_ll_##val,
+#define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V)    \
+  _icmp_ll_##val,
   ICMPARM32_TABLE
 #undef X
       _num
@@ -174,7 +175,7 @@
 #undef X
 // Define a set of constants based on low-level table entries, and ensure the
 // table entry keys are consistent.
-#define X(val, signed, swapped64, C_32, C1_64, C2_64)                          \
+#define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V)    \
   static_assert(                                                               \
       _icmp_ll_##val == _icmp_hl_##val,                                        \
       "Inconsistency between ICMPARM32_TABLE and ICEINSTICMP_TABLE: " #val);
@@ -829,38 +830,6 @@
     }
     llvm::report_fatal_error("Control flow should never have reached here.");
   }
-  case Inst::Icmp: {
-    Variable *Dest = Instr->getDest();
-    const Type DestTy = Dest->getType();
-    if (isVectorType(DestTy)) {
-      auto *CmpInstr = llvm::cast<InstIcmp>(Instr);
-      const auto Condition = CmpInstr->getCondition();
-      scalarizeInstruction(
-          Dest,
-          [this, Condition](Variable *Dest, Variable *Src0, Variable *Src1) {
-            return Context.insert<InstIcmp>(Condition, Dest, Src0, Src1);
-          },
-          CmpInstr->getSrc(0), CmpInstr->getSrc(1));
-      CmpInstr->setDeleted();
-    }
-    return;
-  }
-  case Inst::Fcmp: {
-    Variable *Dest = Instr->getDest();
-    const Type DestTy = Dest->getType();
-    if (isVectorType(DestTy)) {
-      auto *CmpInstr = llvm::cast<InstFcmp>(Instr);
-      const auto Condition = CmpInstr->getCondition();
-      scalarizeInstruction(
-          Dest,
-          [this, Condition](Variable *Dest, Variable *Src0, Variable *Src1) {
-            return Context.insert<InstFcmp>(Condition, Dest, Src0, Src1);
-          },
-          CmpInstr->getSrc(0), CmpInstr->getSrc(1));
-      CmpInstr->setDeleted();
-    }
-    return;
-  }
   }
 }
 
@@ -4251,7 +4220,7 @@
 // Validates FCMPARM32_TABLE's declaration w.r.t. InstFcmp::FCondition ordering
 // (and naming).
 enum {
-#define X(val, CC0, CC1) _fcmp_ll_##val,
+#define X(val, CC0, CC1, CC0_V, CC1_V, INV_V, NEG_V) _fcmp_ll_##val,
   FCMPARM32_TABLE
 #undef X
       _fcmp_ll_NUM
@@ -4277,7 +4246,7 @@
   CondARM32::Cond CC0;
   CondARM32::Cond CC1;
 } TableFcmp[] = {
-#define X(val, CC0, CC1)                                                       \
+#define X(val, CC0, CC1, CC0_V, CC1_V, INV_V, NEG_V)                           \
   { CondARM32::CC0, CondARM32::CC1 }                                           \
   ,
     FCMPARM32_TABLE
@@ -4322,8 +4291,80 @@
 
 void TargetARM32::lowerFcmp(const InstFcmp *Instr) {
   Variable *Dest = Instr->getDest();
-  if (isVectorType(Dest->getType())) {
-    UnimplementedLoweringError(this, Instr);
+  const Type DestTy = Dest->getType();
+
+  if (isVectorType(DestTy)) {
+    if (Instr->getCondition() == InstFcmp::False) {
+      constexpr Type SafeTypeForMovingConstant = IceType_v4i32;
+      auto *T = makeReg(SafeTypeForMovingConstant);
+      _mov(T, llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(0)));
+      _mov(Dest, T);
+      return;
+    }
+
+    if (Instr->getCondition() == InstFcmp::True) {
+      constexpr Type SafeTypeForMovingConstant = IceType_v4i32;
+      auto *T = makeReg(SafeTypeForMovingConstant);
+      _mov(T, llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(1)));
+      _mov(Dest, T);
+      return;
+    }
+
+    Variable *T0;
+    Variable *T1;
+    bool Negate = false;
+    auto *Src0 = legalizeToReg(Instr->getSrc(0));
+    auto *Src1 = legalizeToReg(Instr->getSrc(1));
+
+    switch (Instr->getCondition()) {
+    default:
+      llvm::report_fatal_error("Unhandled fp comparison.");
+#define _Vcnone(Tptr, S0, S1)                                                  \
+  do {                                                                         \
+    *(Tptr) = nullptr;                                                         \
+  } while (0)
+#define _Vceq(Tptr, S0, S1)                                                    \
+  do {                                                                         \
+    *(Tptr) = makeReg(DestTy);                                                 \
+    _vceq(*(Tptr), S0, S1);                                                    \
+  } while (0)
+#define _Vcge(Tptr, S0, S1)                                                    \
+  do {                                                                         \
+    *(Tptr) = makeReg(DestTy);                                                 \
+    _vcge(*(Tptr), S0, S1)->setSignType(InstARM32::FS_Signed);                 \
+  } while (0)
+#define _Vcgt(Tptr, S0, S1)                                                    \
+  do {                                                                         \
+    *(Tptr) = makeReg(DestTy);                                                 \
+    _vcgt(*(Tptr), S0, S1)->setSignType(InstARM32::FS_Signed);                 \
+  } while (0)
+#define X(val, CC0, CC1, CC0_V, CC1_V, INV_V, NEG_V)                           \
+  case InstFcmp::val: {                                                        \
+    _Vc##CC0_V(&T0, (INV_V) ? Src1 : Src0, (INV_V) ? Src0 : Src1);             \
+    _Vc##CC1_V(&T1, (INV_V) ? Src0 : Src1, (INV_V) ? Src1 : Src0);             \
+    Negate = NEG_V;                                                            \
+  } break;
+      FCMPARM32_TABLE
+#undef X
+#undef _Vcgt
+#undef _Vcge
+#undef _Vceq
+#undef _Vcnone
+    }
+    assert(T0 != nullptr);
+    Variable *T = T0;
+    if (T1 != nullptr) {
+      T = makeReg(DestTy);
+      _vorr(T, T0, T1);
+    }
+
+    if (Negate) {
+      auto *TNeg = makeReg(DestTy);
+      _vmvn(TNeg, T);
+      T = TNeg;
+    }
+
+    _mov(Dest, T);
     return;
   }
 
@@ -4621,9 +4662,78 @@
 
 void TargetARM32::lowerIcmp(const InstIcmp *Instr) {
   Variable *Dest = Instr->getDest();
+  const Type DestTy = Dest->getType();
 
-  if (isVectorType(Dest->getType())) {
-    UnimplementedLoweringError(this, Instr);
+  if (isVectorType(DestTy)) {
+    auto *T = makeReg(DestTy);
+    auto *Src0 = legalizeToReg(Instr->getSrc(0));
+    auto *Src1 = legalizeToReg(Instr->getSrc(1));
+    const Type SrcTy = Src0->getType();
+
+    bool NeedsShl = false;
+    Type NewTypeAfterShl;
+    SizeT ShAmt;
+    switch (SrcTy) {
+    default:
+      break;
+    case IceType_v16i1:
+      NeedsShl = true;
+      NewTypeAfterShl = IceType_v16i8;
+      ShAmt = 7;
+      break;
+    case IceType_v8i1:
+      NeedsShl = true;
+      NewTypeAfterShl = IceType_v8i16;
+      ShAmt = 15;
+      break;
+    case IceType_v4i1:
+      NeedsShl = true;
+      NewTypeAfterShl = IceType_v4i32;
+      ShAmt = 31;
+      break;
+    }
+
+    if (NeedsShl) {
+      auto *Imm = llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(ShAmt));
+      auto *Src0T = makeReg(NewTypeAfterShl);
+      auto *Src0Shl = makeReg(NewTypeAfterShl);
+      _mov(Src0T, Src0);
+      _vshl(Src0Shl, Src0T, Imm);
+      Src0 = Src0Shl;
+
+      auto *Src1T = makeReg(NewTypeAfterShl);
+      auto *Src1Shl = makeReg(NewTypeAfterShl);
+      _mov(Src1T, Src1);
+      _vshl(Src1Shl, Src1T, Imm);
+      Src1 = Src1Shl;
+    }
+
+    switch (Instr->getCondition()) {
+    default:
+      llvm::report_fatal_error("Unhandled integer comparison.");
+#define _Vceq(T, S0, S1, Signed) _vceq(T, S0, S1)
+#define _Vcge(T, S0, S1, Signed)                                               \
+  _vcge(T, S0, S1)                                                             \
+      ->setSignType(Signed ? InstARM32::FS_Signed : InstARM32::FS_Unsigned)
+#define _Vcgt(T, S0, S1, Signed)                                               \
+  _vcgt(T, S0, S1)                                                             \
+      ->setSignType(Signed ? InstARM32::FS_Signed : InstARM32::FS_Unsigned)
+#define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V)    \
+  case InstIcmp::val: {                                                        \
+    _Vc##C_V(T, (INV_V) ? Src1 : Src0, (INV_V) ? Src0 : Src1, is_signed);      \
+    if (NEG_V) {                                                               \
+      auto *TInv = makeReg(DestTy);                                            \
+      _vmvn(TInv, T);                                                          \
+      T = TInv;                                                                \
+    }                                                                          \
+  } break;
+      ICMPARM32_TABLE
+#undef X
+#undef _Vcgt
+#undef _Vcge
+#undef _Vceq
+    }
+    _mov(Dest, T);
     return;
   }
 
diff --git a/src/IceTargetLoweringARM32.def b/src/IceTargetLoweringARM32.def
index 95aa255..c1d29f8 100644
--- a/src/IceTargetLoweringARM32.def
+++ b/src/IceTargetLoweringARM32.def
@@ -18,44 +18,52 @@
 // Patterns for lowering fcmp. These are expected to be used in the following
 // manner:
 //
+// Scalar:
 //   mov reg, #0
 //   movCC0 reg, #1 /* only if CC0 != kNone */
 //   movCC1 reg, #1 /* only if CC1 != kNone */
 //
-// TODO(jpp): vector lowerings.
+// Vector:
+//   vcCC0_V Cmp0, Src0, Src1 /* only if CC0_V != none */
+//   vcCC1_V Cmp1, Src1, Src0 /* only if CC1_V != none */
+//   vorr    Cmp2, Cmp0, Cmp1 /* only if CC1_V != none */
+//   vmvn    Reg3, CmpN       /* only if NEG_V = true; CmpN is the last Cmp
+//                               computed (Cmp0 if CC1_V == none, else Cmp2) */
+//
+//   If INV_V = true, then Src0 and Src1 should be swapped before emitting
+//   the vcCC0_V/vcCC1_V instructions above.
+//
 #define FCMPARM32_TABLE                                                        \
-  /*  val, CC0,   CC1 */                                                       \
-  X(False, kNone, kNone)                                                       \
-  X(Oeq,   EQ,    kNone)                                                       \
-  X(Ogt,   GT,    kNone)                                                       \
-  X(Oge,   GE,    kNone)                                                       \
-  X(Olt,   MI,    kNone)                                                       \
-  X(Ole,   LS,    kNone)                                                       \
-  X(One,   MI,    GT)                                                          \
-  X(Ord,   VC,    kNone)                                                       \
-  X(Ueq,   EQ,    VS)                                                          \
-  X(Ugt,   HI,    kNone)                                                       \
-  X(Uge,   PL,    kNone)                                                       \
-  X(Ult,   LT,    kNone)                                                       \
-  X(Ule,   LE,    kNone)                                                       \
-  X(Une,   NE,    kNone)                                                       \
-  X(Uno,   VS,    kNone)                                                       \
-  X(True,  AL,    kNone)                                                       \
-//#define X(val, CC0, CC1)
+  /*val  , CC0  , CC1  , CC0_V, CC1_V, INV_V, NEG_V */                         \
+  X(False, kNone, kNone, none , none , false, false)                           \
+  X(Oeq  , EQ   , kNone, eq   , none , false, false)                           \
+  X(Ogt  , GT   , kNone, gt   , none , false, false)                           \
+  X(Oge  , GE   , kNone, ge   , none , false, false)                           \
+  X(Olt  , MI   , kNone, gt   , none , true , false)                           \
+  X(Ole  , LS   , kNone, ge   , none , true , false)                           \
+  X(One  , MI   , GT   , gt   , gt   , false, false)                           \
+  X(Ord  , VC   , kNone, ge   , gt   , false, false)                           \
+  X(Ueq  , EQ   , VS   , gt   , gt   , false, true)                            \
+  X(Ugt  , HI   , kNone, ge   , none , true , true)                            \
+  X(Uge  , PL   , kNone, gt   , none , true , true)                            \
+  X(Ult  , LT   , kNone, ge   , none , false, true)                            \
+  X(Ule  , LE   , kNone, gt   , none , false, true)                            \
+  X(Une  , NE   , kNone, eq   , none , false, true)                            \
+  X(Uno  , VS   , kNone, ge   , gt   , false, true)                            \
+  X(True , AL   , kNone, none , none , false, false)
+//#define X(val, CC0, CC1, CC0_V, CC1_V, INV_V, NEG_V)
 
 // Patterns for lowering icmp.
-#define ICMPARM32_TABLE                                                        \
-  /* val, is_signed, swapped64, C_32, C1_64, C2_64 */                          \
-  X(Eq,   false,     false,     EQ,   EQ,    NE)                               \
-  X(Ne,   false,     false,     NE,   NE,    EQ)                               \
-  X(Ugt,  false,     false,     HI,   HI,    LS)                               \
-  X(Uge,  false,     false,     CS,   CS,    CC)                               \
-  X(Ult,  false,     false,     CC,   CC,    CS)                               \
-  X(Ule,  false,     false,     LS,   LS,    HI)                               \
-  X(Sgt,  true,      true,      GT,   LT,    GE)                               \
-  X(Sge,  true,      false,     GE,   GE,    LT)                               \
-  X(Slt,  true,      false,     LT,   LT,    GE)                               \
-  X(Sle,  true,      true,      LE,   GE,    LT)                               \
-//#define X(val, is_signed, swapped64, C_32, C1_64, C2_64)
+#define ICMPARM32_TABLE                                                       \
+  /*val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V */       \
+  X(Eq , false    , false    , EQ,   EQ   , NE   , eq , false, false)         \
+  X(Ne , false    , false    , NE,   NE   , EQ   , eq , false, true)          \
+  X(Ugt, false    , false    , HI,   HI   , LS   , gt , false, false)         \
+  X(Uge, false    , false    , CS,   CS   , CC   , ge , false, false)         \
+  X(Ult, false    , false    , CC,   CC   , CS   , gt , true , false)         \
+  X(Ule, false    , false    , LS,   LS   , HI   , ge , true , false)         \
+  X(Sgt, true     , true     , GT,   LT   , GE   , gt , false, false)         \
+  X(Sge, true     , false    , GE,   GE   , LT   , ge , false, false)         \
+  X(Slt, true     , false    , LT,   LT   , GE   , gt , true , false)         \
+  X(Sle, true     , true     , LE,   GE   , LT   , ge , true , false)
+//#define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V)
 
 #endif // SUBZERO_SRC_ICETARGETLOWERINGARM32_DEF
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 408db29..18930be 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -858,6 +858,15 @@
   InstARM32Vbsl *_vbsl(Variable *Dest, Variable *Src0, Variable *Src1) {
     return Context.insert<InstARM32Vbsl>(Dest, Src0, Src1);
   }
+  void _vceq(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert<InstARM32Vceq>(Dest, Src0, Src1);
+  }
+  InstARM32Vcge *_vcge(Variable *Dest, Variable *Src0, Variable *Src1) {
+    return Context.insert<InstARM32Vcge>(Dest, Src0, Src1);
+  }
+  InstARM32Vcgt *_vcgt(Variable *Dest, Variable *Src0, Variable *Src1) {
+    return Context.insert<InstARM32Vcgt>(Dest, Src0, Src1);
+  }
   void _vcvt(Variable *Dest, Variable *Src, InstARM32Vcvt::VcvtVariant Variant,
              CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert<InstARM32Vcvt>(Dest, Src, Variant, Pred);
@@ -888,6 +897,9 @@
   void _vmul(Variable *Dest, Variable *Src0, Variable *Src1) {
     Context.insert<InstARM32Vmul>(Dest, Src0, Src1);
   }
+  void _vmvn(Variable *Dest, Variable *Src0) {
+    Context.insert<InstARM32Vmvn>(Dest, Src0, CondARM32::AL);
+  }
   void _vneg(Variable *Dest, Variable *Src0) {
     Context.insert<InstARM32Vneg>(Dest, Src0, CondARM32::AL)
         ->setSignType(InstARM32::FS_Signed);
diff --git a/src/WasmTranslator.cpp b/src/WasmTranslator.cpp
index 29f19a4..7c462d9 100644
--- a/src/WasmTranslator.cpp
+++ b/src/WasmTranslator.cpp
@@ -941,8 +941,8 @@
     auto MemBase = Ctx->getConstantSym(0, Ctx->getGlobalString("WASM_MEMORY"));
     if (!ConstZeroBase) {
       auto RealAddrV = Func->makeVariable(Ice::getPointerType());
-      Control()->appendInst(InstArithmetic::create(
-          Func, InstArithmetic::Add, RealAddrV, Base, MemBase));
+      Control()->appendInst(InstArithmetic::create(Func, InstArithmetic::Add,
+                                                   RealAddrV, Base, MemBase));
 
       RealAddr = RealAddrV;
     } else {
diff --git a/tests_lit/assembler/arm32/cmp-vec.ll b/tests_lit/assembler/arm32/cmp-vec.ll
index 86f5b6c..ab25cb3 100644
--- a/tests_lit/assembler/arm32/cmp-vec.ll
+++ b/tests_lit/assembler/arm32/cmp-vec.ll
@@ -1,52 +1,1438 @@
 ; Test that we handle icmp and fcmp on vectors.
 
-; TODO(eholk): This test will need to be updated once comparison is no
-; longer scalarized.
-
 ; REQUIRES: allow_dump
 
 ; Compile using standalone assembler.
-; RUN: %p2i --filetype=asm -i %s --target=arm32 --args -Om1 \
+; RUN: %p2i --filetype=asm -i %s --target=arm32 --args -O2 \
 ; RUN:   | FileCheck %s --check-prefix=ASM
 
 ; Show bytes in assembled standalone code.
 ; RUN: %p2i --filetype=asm -i %s --target=arm32 --assemble --disassemble \
-; RUN:   --args -Om1 | FileCheck %s --check-prefix=DIS
+; RUN:   --args -O2 | FileCheck %s --check-prefix=DIS
+
+; Compile using integrated assembler.
+; RUN: %p2i --filetype=iasm -i %s --target=arm32 --args -O2 \
+; RUN:   | FileCheck %s --check-prefix=IASM
 
 ; Show bytes in assembled integrated code.
 ; RUN: %p2i --filetype=iasm -i %s --target=arm32 --assemble --disassemble \
-; RUN:   --args -Om1 | FileCheck %s --check-prefix=DIS
+; RUN:   --args -O2 | FileCheck %s --check-prefix=DIS
 
-define internal <4 x i32> @cmpEq4I32(<4 x i32> %a, <4 x i32> %b) {
-; ASM-LABEL:cmpEq4I32:
-; DIS-LABEL:00000000 <cmpEq4I32>:
+define internal <4 x i32> @cmpEqV4I32(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpEqV4I32:
+; DIS-LABEL:00000000 <cmpEqV4I32>:
+; IASM-LABEL:cmpEqV4I32:
 
 entry:
   %cmp = icmp eq <4 x i32> %a, %b
 
-; ASM:        cmp     r1, r2
-; ASM:        cmp     r1, r2
-; ASM:        cmp     r1, r2
-; ASM:        cmp     r1, r2
-; DIS:  40:        e1510002
+; ASM:         vceq.i32 q0, q0, q1
+; DIS:      0: f3200852
+; IASM-NOT:    vceq
 
   %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
   ret <4 x i32> %cmp.ret_ext
 }
 
-define internal <4 x i32> @cmpEq4f32(<4 x float> %a, <4 x float> %b) {
-; ASM-LABEL:cmpEq4f32:
-; DIS-LABEL:00000180 <cmpEq4f32>:
+define internal <4 x i32> @cmpNeV4I32(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpNeV4I32:
+; DIS-LABEL:00000010 <cmpNeV4I32>:
+; IASM-LABEL:cmpNeV4I32:
+
+entry:
+  %cmp = icmp ne <4 x i32> %a, %b
+
+; ASM:          vceq.i32 q0, q0, q1
+; ASM-NEXT:     vmvn.i32 q0, q0
+; DIS:      10: f3200852
+; DIS-NEXT: 14: f3b005c0
+; IASM-NOT:     vceq
+; IASM-NOT:     vmvn
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpUgtV4I32(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpUgtV4I32:
+; DIS-LABEL:00000030 <cmpUgtV4I32>:
+; IASM-LABEL:cmpUgtV4I32:
+
+entry:
+  %cmp = icmp ugt <4 x i32> %a, %b
+
+; ASM:          vcgt.u32 q0, q0, q1
+; DIS:      30: f3200342
+; IASM-NOT:     vcgt
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpUgeV4I32(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpUgeV4I32:
+; DIS-LABEL:00000040 <cmpUgeV4I32>:
+; IASM-LABEL:cmpUgeV4I32:
+
+entry:
+  %cmp = icmp uge <4 x i32> %a, %b
+
+; ASM:          vcge.u32 q0, q0, q1
+; DIS:      40: f3200352
+; IASM-NOT:     vcge
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpUltV4I32(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpUltV4I32:
+; DIS-LABEL:00000050 <cmpUltV4I32>:
+; IASM-LABEL:cmpUltV4I32:
+
+entry:
+  %cmp = icmp ult <4 x i32> %a, %b
+
+; ASM:          vcgt.u32 q1, q1, q0
+; DIS:      50: f3222340
+; IASM-NOT:     vcgt
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpUleV4I32(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpUleV4I32:
+; DIS-LABEL:00000070 <cmpUleV4I32>:
+; IASM-LABEL:cmpUleV4I32:
+
+entry:
+  %cmp = icmp ule <4 x i32> %a, %b
+
+; ASM:          vcge.u32 q1, q1, q0
+; DIS:      70: f3222350
+; IASM-NOT:     vcge
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpSgtV4I32(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpSgtV4I32:
+; DIS-LABEL:00000090 <cmpSgtV4I32>:
+; IASM-LABEL:cmpSgtV4I32:
+
+entry:
+  %cmp = icmp sgt <4 x i32> %a, %b
+
+; ASM:          vcgt.s32 q0, q0, q1
+; DIS:      90: f2200342
+; IASM-NOT:     vcgt
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpSgeV4I32(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpSgeV4I32:
+; DIS-LABEL:000000a0 <cmpSgeV4I32>:
+; IASM-LABEL:cmpSgeV4I32:
+
+entry:
+  %cmp = icmp sge <4 x i32> %a, %b
+
+; ASM:          vcge.s32 q0, q0, q1
+; DIS:      a0: f2200352
+; IASM-NOT:     vcge
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpSltV4I32(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpSltV4I32:
+; DIS-LABEL:000000b0 <cmpSltV4I32>:
+; IASM-LABEL:cmpSltV4I32:
+
+entry:
+  %cmp = icmp slt <4 x i32> %a, %b
+
+; ASM:          vcgt.s32 q1, q1, q0
+; DIS:      b0: f2222340
+; IASM-NOT:     vcgt
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpSleV4I32(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpSleV4I32:
+; DIS-LABEL:000000d0 <cmpSleV4I32>:
+; IASM-LABEL:cmpSleV4I32:
+
+entry:
+  %cmp = icmp sle <4 x i32> %a, %b
+
+; ASM:          vcge.s32 q1, q1, q0
+; DIS:      d0: f2222350
+; IASM-NOT:     vcge
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpEqV4I1(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpEqV4I1:
+; DIS-LABEL:000000f0 <cmpEqV4I1>:
+; IASM-LABEL:cmpEqV4I1:
+
+entry:
+  %a1 = trunc <4 x i32> %a to <4 x i1>
+  %b1 = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp eq <4 x i1> %a1, %b1
+
+; ASM:          vshl.u32 q0, q0, #31
+; ASM-NEXT:     vshl.u32 q1, q1, #31
+; ASM-NEXT:     vceq.i32 q0, q0, q1
+; DIS:      f0: f2bf0550
+; DIS-NEXT: f4: f2bf2552
+; DIS-NEXT: f8: f3200852
+; IASM-NOT:     vshl
+; IASM-NOT:     vceq
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpNeV4I1(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpNeV4I1:
+; DIS-LABEL:00000110 <cmpNeV4I1>:
+; IASM-LABEL:cmpNeV4I1:
+
+entry:
+  %a1 = trunc <4 x i32> %a to <4 x i1>
+  %b1 = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp ne <4 x i1> %a1, %b1
+
+; ASM:           vshl.u32 q0, q0, #31
+; ASM-NEXT:      vshl.u32 q1, q1, #31
+; ASM-NEXT:      vceq.i32 q0, q0, q1
+; ASM-NEXT:      vmvn.i32 q0, q0
+; DIS:      110: f2bf0550
+; DIS-NEXT: 114: f2bf2552
+; DIS-NEXT: 118: f3200852
+; DIS-NEXT: 11c: f3b005c0
+; IASM-NOT:      vshl
+; IASM-NOT:      vceq
+; IASM-NOT:      vmvn
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpUgtV4I1(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpUgtV4I1:
+; DIS-LABEL:00000130 <cmpUgtV4I1>:
+; IASM-LABEL:cmpUgtV4I1:
+
+entry:
+  %a1 = trunc <4 x i32> %a to <4 x i1>
+  %b1 = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp ugt <4 x i1> %a1, %b1
+
+; ASM:           vshl.u32 q0, q0, #31
+; ASM-NEXT:      vshl.u32 q1, q1, #31
+; ASM-NEXT:      vcgt.u32 q0, q0, q1
+; DIS:      130: f2bf0550
+; DIS-NEXT: 134: f2bf2552
+; DIS-NEXT: 138: f3200342
+; IASM-NOT:      vshl
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpUgeV4I1(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpUgeV4I1:
+; DIS-LABEL:00000150 <cmpUgeV4I1>:
+; IASM-LABEL:cmpUgeV4I1:
+
+entry:
+  %a1 = trunc <4 x i32> %a to <4 x i1>
+  %b1 = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp uge <4 x i1> %a1, %b1
+
+; ASM:           vshl.u32 q0, q0, #31
+; ASM-NEXT:      vshl.u32 q1, q1, #31
+; ASM-NEXT:      vcge.u32 q0, q0, q1
+; DIS:      150: f2bf0550
+; DIS-NEXT: 154: f2bf2552
+; DIS-NEXT: 158: f3200352
+; IASM-NOT:      vshl
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpUltV4I1(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpUltV4I1:
+; DIS-LABEL:00000170 <cmpUltV4I1>:
+; IASM-LABEL:cmpUltV4I1:
+
+entry:
+  %a1 = trunc <4 x i32> %a to <4 x i1>
+  %b1 = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp ult <4 x i1> %a1, %b1
+
+; ASM:           vshl.u32 q0, q0, #31
+; ASM-NEXT:      vshl.u32 q1, q1, #31
+; ASM-NEXT:      vcgt.u32 q1, q1, q0
+; DIS:      170: f2bf0550
+; DIS-NEXT: 174: f2bf2552
+; DIS-NEXT: 178: f3222340
+; IASM-NOT:      vshl
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpUleV4I1(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpUleV4I1:
+; DIS-LABEL:00000190 <cmpUleV4I1>:
+; IASM-LABEL:cmpUleV4I1:
+
+entry:
+  %a1 = trunc <4 x i32> %a to <4 x i1>
+  %b1 = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp ule <4 x i1> %a1, %b1
+
+; ASM:           vshl.u32 q0, q0, #31
+; ASM-NEXT:      vshl.u32 q1, q1, #31
+; ASM-NEXT:      vcge.u32 q1, q1, q0
+; DIS:      190: f2bf0550
+; DIS-NEXT: 194: f2bf2552
+; DIS-NEXT: 198: f3222350
+; IASM-NOT:      vshl
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpSgtV4I1(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpSgtV4I1:
+; DIS-LABEL:000001b0 <cmpSgtV4I1>:
+; IASM-LABEL:cmpSgtV4I1:
+
+entry:
+  %a1 = trunc <4 x i32> %a to <4 x i1>
+  %b1 = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp sgt <4 x i1> %a1, %b1
+
+; ASM:           vshl.u32 q0, q0, #31
+; ASM-NEXT:      vshl.u32 q1, q1, #31
+; ASM-NEXT:      vcgt.s32 q0, q0, q1
+; DIS:      1b0: f2bf0550
+; DIS-NEXT: 1b4: f2bf2552
+; DIS-NEXT: 1b8: f2200342
+; IASM-NOT:      vshl
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpSgeV4I1(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpSgeV4I1:
+; DIS-LABEL:000001d0 <cmpSgeV4I1>:
+; IASM-LABEL:cmpSgeV4I1:
+
+entry:
+  %a1 = trunc <4 x i32> %a to <4 x i1>
+  %b1 = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp sge <4 x i1> %a1, %b1
+
+; ASM:           vshl.u32 q0, q0, #31
+; ASM-NEXT:      vshl.u32 q1, q1, #31
+; ASM-NEXT:      vcge.s32 q0, q0, q1
+; DIS:      1d0: f2bf0550
+; DIS-NEXT: 1d4: f2bf2552
+; DIS-NEXT: 1d8: f2200352
+; IASM-NOT:      vshl
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpSltV4I1(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpSltV4I1:
+; DIS-LABEL:000001f0 <cmpSltV4I1>:
+; IASM-LABEL:cmpSltV4I1:
+
+entry:
+  %a1 = trunc <4 x i32> %a to <4 x i1>
+  %b1 = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp slt <4 x i1> %a1, %b1
+
+; ASM:           vshl.u32 q0, q0, #31
+; ASM-NEXT:      vshl.u32 q1, q1, #31
+; ASM-NEXT:      vcgt.s32 q1, q1, q0
+; DIS:      1f0: f2bf0550
+; DIS-NEXT: 1f4: f2bf2552
+; DIS-NEXT: 1f8: f2222340
+; IASM-NOT:      vshl
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpSleV4I1(<4 x i32> %a, <4 x i32> %b) {
+; ASM-LABEL:cmpSleV4I1:
+; DIS-LABEL:00000210 <cmpSleV4I1>:
+; IASM-LABEL:cmpSleV4I1:
+
+entry:
+  %a1 = trunc <4 x i32> %a to <4 x i1>
+  %b1 = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp sle <4 x i1> %a1, %b1
+
+; ASM:           vshl.u32 q0, q0, #31
+; ASM-NEXT:      vshl.u32 q1, q1, #31
+; ASM-NEXT:      vcge.s32 q1, q1, q0
+; DIS:      210: f2bf0550
+; DIS-NEXT: 214: f2bf2552
+; DIS-NEXT: 218: f2222350
+; IASM-NOT:      vshl
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpEqV8I16(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpEqV8I16:
+; DIS-LABEL:00000230 <cmpEqV8I16>:
+; IASM-LABEL:cmpEqV8I16:
+
+entry:
+  %cmp = icmp eq <8 x i16> %a, %b
+
+; ASM:           vceq.i16 q0, q0, q1
+; DIS:      230: f3100852
+; IASM-NOT:      vceq
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpNeV8I16(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpNeV8I16:
+; DIS-LABEL:00000240 <cmpNeV8I16>:
+; IASM-LABEL:cmpNeV8I16:
+
+entry:
+  %cmp = icmp ne <8 x i16> %a, %b
+
+; ASM:           vceq.i16 q0, q0, q1
+; ASM-NEXT:      vmvn.i16 q0, q0
+; DIS:      240: f3100852
+; DIS-NEXT: 244: f3b005c0
+; IASM-NOT:      vceq
+; IASM-NOT:      vmvn
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpUgtV8I16(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpUgtV8I16:
+; DIS-LABEL:00000260 <cmpUgtV8I16>:
+; IASM-LABEL:cmpUgtV8I16:
+
+entry:
+  %cmp = icmp ugt <8 x i16> %a, %b
+
+; ASM:           vcgt.u16 q0, q0, q1
+; DIS:      260: f3100342
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpUgeV8I16(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpUgeV8I16:
+; DIS-LABEL:00000270 <cmpUgeV8I16>:
+; IASM-LABEL:cmpUgeV8I16:
+
+entry:
+  %cmp = icmp uge <8 x i16> %a, %b
+
+; ASM:           vcge.u16 q0, q0, q1
+; DIS:      270: f3100352
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpUltV8I16(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpUltV8I16:
+; DIS-LABEL:00000280 <cmpUltV8I16>:
+; IASM-LABEL:cmpUltV8I16:
+
+entry:
+  %cmp = icmp ult <8 x i16> %a, %b
+
+; ASM:           vcgt.u16 q1, q1, q0
+; DIS:      280: f3122340
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpUleV8I16(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpUleV8I16:
+; DIS-LABEL:000002a0 <cmpUleV8I16>:
+; IASM-LABEL:cmpUleV8I16:
+
+entry:
+  %cmp = icmp ule <8 x i16> %a, %b
+
+; ASM:           vcge.u16 q1, q1, q0
+; DIS:      2a0: f3122350
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpSgtV8I16(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpSgtV8I16:
+; DIS-LABEL:000002c0 <cmpSgtV8I16>:
+; IASM-LABEL:cmpSgtV8I16:
+
+entry:
+  %cmp = icmp sgt <8 x i16> %a, %b
+
+; ASM:           vcgt.s16 q0, q0, q1
+; DIS:      2c0: f2100342
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpSgeV8I16(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpSgeV8I16:
+; DIS-LABEL:000002d0 <cmpSgeV8I16>:
+; IASM-LABEL:cmpSgeV8I16:
+
+entry:
+  %cmp = icmp sge <8 x i16> %a, %b
+
+; ASM:           vcge.s16 q0, q0, q1
+; DIS:      2d0: f2100352
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpSltV8I16(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpSltV8I16:
+; DIS-LABEL:000002e0 <cmpSltV8I16>:
+; IASM-LABEL:cmpSltV8I16:
+
+entry:
+  %cmp = icmp slt <8 x i16> %a, %b
+
+; ASM:           vcgt.s16 q1, q1, q0
+; DIS:      2e0: f2122340
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpSleV8I16(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpSleV8I16:
+; DIS-LABEL:00000300 <cmpSleV8I16>:
+; IASM-LABEL:cmpSleV8I16:
+
+entry:
+  %cmp = icmp sle <8 x i16> %a, %b
+
+; ASM:           vcge.s16 q1, q1, q0
+; DIS:      300: f2122350
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpEqV8I1(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpEqV8I1:
+; DIS-LABEL:00000320 <cmpEqV8I1>:
+; IASM-LABEL:cmpEqV8I1:
+
+entry:
+  %a1 = trunc <8 x i16> %a to <8 x i1>
+  %b1 = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp eq <8 x i1> %a1, %b1
+
+; ASM:           vshl.u16 q0, q0, #15
+; ASM-NEXT:      vshl.u16 q1, q1, #15
+; ASM-NEXT:      vceq.i16 q0, q0, q1
+; DIS:      320: f29f0550
+; DIS-NEXT: 324: f29f2552
+; DIS-NEXT: 328: f3100852
+; IASM-NOT:      vshl
+; IASM-NOT:      vceq
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpNeV8I1(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpNeV8I1:
+; DIS-LABEL:00000340 <cmpNeV8I1>:
+; IASM-LABEL:cmpNeV8I1:
+
+entry:
+  %a1 = trunc <8 x i16> %a to <8 x i1>
+  %b1 = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp ne <8 x i1> %a1, %b1
+
+; ASM:           vshl.u16 q0, q0, #15
+; ASM-NEXT:      vshl.u16 q1, q1, #15
+; ASM-NEXT:      vceq.i16 q0, q0, q1
+; ASM-NEXT:      vmvn.i16 q0, q0
+; DIS:      340: f29f0550
+; DIS-NEXT: 344: f29f2552
+; DIS-NEXT: 348: f3100852
+; DIS-NEXT: 34c: f3b005c0
+; IASM-NOT:      vshl
+; IASM-NOT:      vceq
+; IASM-NOT:      vmvn
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpUgtV8I1(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpUgtV8I1:
+; DIS-LABEL:00000360 <cmpUgtV8I1>:
+; IASM-LABEL:cmpUgtV8I1:
+
+entry:
+  %a1 = trunc <8 x i16> %a to <8 x i1>
+  %b1 = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp ugt <8 x i1> %a1, %b1
+
+; ASM:           vshl.u16 q0, q0, #15
+; ASM-NEXT:      vshl.u16 q1, q1, #15
+; ASM-NEXT:      vcgt.u16 q0, q0, q1
+; DIS:      360: f29f0550
+; DIS-NEXT: 364: f29f2552
+; DIS-NEXT: 368: f3100342
+; IASM-NOT:      vshl
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpUgeV8I1(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpUgeV8I1:
+; DIS-LABEL:00000380 <cmpUgeV8I1>:
+; IASM-LABEL:cmpUgeV8I1:
+
+entry:
+  %a1 = trunc <8 x i16> %a to <8 x i1>
+  %b1 = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp uge <8 x i1> %a1, %b1
+
+; ASM:           vshl.u16 q0, q0, #15
+; ASM-NEXT:      vshl.u16 q1, q1, #15
+; ASM-NEXT:      vcge.u16 q0, q0, q1
+; DIS:      380: f29f0550
+; DIS-NEXT: 384: f29f2552
+; DIS-NEXT: 388: f3100352
+; IASM-NOT:      vshl
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpUltV8I1(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpUltV8I1:
+; DIS-LABEL:000003a0 <cmpUltV8I1>:
+; IASM-LABEL:cmpUltV8I1:
+
+entry:
+  %a1 = trunc <8 x i16> %a to <8 x i1>
+  %b1 = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp ult <8 x i1> %a1, %b1
+
+; ASM:           vshl.u16 q0, q0, #15
+; ASM-NEXT:      vshl.u16 q1, q1, #15
+; ASM-NEXT:      vcgt.u16 q1, q1, q0
+; DIS:      3a0: f29f0550
+; DIS-NEXT: 3a4: f29f2552
+; DIS-NEXT: 3a8: f3122340
+; IASM-NOT:      vshl
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpUleV8I1(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpUleV8I1:
+; DIS-LABEL:000003c0 <cmpUleV8I1>:
+; IASM-LABEL:cmpUleV8I1:
+
+entry:
+  %a1 = trunc <8 x i16> %a to <8 x i1>
+  %b1 = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp ule <8 x i1> %a1, %b1
+
+; ASM:           vshl.u16 q0, q0, #15
+; ASM-NEXT:      vshl.u16 q1, q1, #15
+; ASM-NEXT:      vcge.u16 q1, q1, q0
+; DIS:      3c0: f29f0550
+; DIS-NEXT: 3c4: f29f2552
+; DIS-NEXT: 3c8: f3122350
+; IASM-NOT:      vshl
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpSgtV8I1(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpSgtV8I1:
+; DIS-LABEL:000003e0 <cmpSgtV8I1>:
+; IASM-LABEL:cmpSgtV8I1:
+
+entry:
+  %a1 = trunc <8 x i16> %a to <8 x i1>
+  %b1 = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp sgt <8 x i1> %a1, %b1
+
+; ASM:           vshl.u16 q0, q0, #15
+; ASM-NEXT:      vshl.u16 q1, q1, #15
+; ASM-NEXT:      vcgt.s16 q0, q0, q1
+; DIS:      3e0: f29f0550
+; DIS-NEXT: 3e4: f29f2552
+; DIS-NEXT: 3e8: f2100342
+; IASM-NOT:      vshl
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpSgeV8I1(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpSgeV8I1:
+; DIS-LABEL:00000400 <cmpSgeV8I1>:
+; IASM-LABEL:cmpSgeV8I1:
+
+entry:
+  %a1 = trunc <8 x i16> %a to <8 x i1>
+  %b1 = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp sge <8 x i1> %a1, %b1
+
+; ASM:           vshl.u16 q0, q0, #15
+; ASM-NEXT:      vshl.u16 q1, q1, #15
+; ASM-NEXT:      vcge.s16 q0, q0, q1
+; DIS:      400: f29f0550
+; DIS-NEXT: 404: f29f2552
+; DIS-NEXT: 408: f2100352
+; IASM-NOT:      vshl
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpSltV8I1(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpSltV8I1:
+; DIS-LABEL:00000420 <cmpSltV8I1>:
+; IASM-LABEL:cmpSltV8I1:
+
+entry:
+  %a1 = trunc <8 x i16> %a to <8 x i1>
+  %b1 = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp slt <8 x i1> %a1, %b1
+
+; ASM:           vshl.u16 q0, q0, #15
+; ASM-NEXT:      vshl.u16 q1, q1, #15
+; ASM-NEXT:      vcgt.s16 q1, q1, q0
+; DIS:      420: f29f0550
+; DIS-NEXT: 424: f29f2552
+; DIS-NEXT: 428: f2122340
+; IASM-NOT:      vshl
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <8 x i16> @cmpSleV8I1(<8 x i16> %a, <8 x i16> %b) {
+; ASM-LABEL:cmpSleV8I1:
+; DIS-LABEL:00000440 <cmpSleV8I1>:
+; IASM-LABEL:cmpSleV8I1:
+
+entry:
+  %a1 = trunc <8 x i16> %a to <8 x i1>
+  %b1 = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp sle <8 x i1> %a1, %b1
+
+; ASM:           vshl.u16 q0, q0, #15
+; ASM-NEXT:      vshl.u16 q1, q1, #15
+; ASM-NEXT:      vcge.s16 q1, q1, q0
+; DIS:      440: f29f0550
+; DIS-NEXT: 444: f29f2552
+; DIS-NEXT: 448: f2122350
+; IASM-NOT:      vshl
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpEqV16I8(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpEqV16I8:
+; DIS-LABEL:00000460 <cmpEqV16I8>:
+; IASM-LABEL:cmpEqV16I8:
+
+entry:
+  %cmp = icmp eq <16 x i8> %a, %b
+
+; ASM:           vceq.i8 q0, q0, q1
+; DIS:      460: f3000852
+; IASM-NOT:      vceq
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpNeV16I8(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpNeV16I8:
+; DIS-LABEL:00000470 <cmpNeV16I8>:
+; IASM-LABEL:cmpNeV16I8:
+
+entry:
+  %cmp = icmp ne <16 x i8> %a, %b
+
+; ASM:           vceq.i8 q0, q0, q1
+; ASM-NEXT:      vmvn.i8 q0, q0
+; DIS:      470: f3000852
+; DIS-NEXT: 474: f3b005c0
+; IASM-NOT:      vceq
+; IASM-NOT:      vmvn
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpUgtV16I8(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpUgtV16I8:
+; DIS-LABEL:00000490 <cmpUgtV16I8>:
+; IASM-LABEL:cmpUgtV16I8:
+
+entry:
+  %cmp = icmp ugt <16 x i8> %a, %b
+
+; ASM:           vcgt.u8 q0, q0, q1
+; DIS:      490: f3000342
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpUgeV16I8(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpUgeV16I8:
+; DIS-LABEL:000004a0 <cmpUgeV16I8>:
+; IASM-LABEL:cmpUgeV16I8:
+
+entry:
+  %cmp = icmp uge <16 x i8> %a, %b
+
+; ASM:           vcge.u8 q0, q0, q1
+; DIS:      4a0: f3000352
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpUltV16I8(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpUltV16I8:
+; DIS-LABEL:000004b0 <cmpUltV16I8>:
+; IASM-LABEL:cmpUltV16I8:
+
+entry:
+  %cmp = icmp ult <16 x i8> %a, %b
+
+; ASM:           vcgt.u8 q1, q1, q0
+; DIS:      4b0: f3022340
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpUleV16I8(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpUleV16I8:
+; DIS-LABEL:000004d0 <cmpUleV16I8>:
+; IASM-LABEL:cmpUleV16I8:
+
+entry:
+  %cmp = icmp ule <16 x i8> %a, %b
+
+; ASM:           vcge.u8 q1, q1, q0
+; DIS:      4d0: f3022350
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpSgtV16I8(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpSgtV16I8:
+; DIS-LABEL:000004f0 <cmpSgtV16I8>:
+; IASM-LABEL:cmpSgtV16I8:
+
+entry:
+  %cmp = icmp sgt <16 x i8> %a, %b
+
+; ASM:           vcgt.s8 q0, q0, q1
+; DIS:      4f0: f2000342
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpSgeV16I8(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpSgeV16I8:
+; DIS-LABEL:00000500 <cmpSgeV16I8>:
+; IASM-LABEL:cmpSgeV16I8:
+
+entry:
+  %cmp = icmp sge <16 x i8> %a, %b
+
+; ASM:           vcge.s8 q0, q0, q1
+; DIS:      500: f2000352
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpSltV16I8(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpSltV16I8:
+; DIS-LABEL:00000510 <cmpSltV16I8>:
+; IASM-LABEL:cmpSltV16I8:
+
+entry:
+  %cmp = icmp slt <16 x i8> %a, %b
+
+; ASM:           vcgt.s8 q1, q1, q0
+; DIS:      510: f2022340
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpSleV16I8(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpSleV16I8:
+; DIS-LABEL:00000530 <cmpSleV16I8>:
+; IASM-LABEL:cmpSleV16I8:
+
+entry:
+  %cmp = icmp sle <16 x i8> %a, %b
+
+; ASM:           vcge.s8 q1, q1, q0
+; DIS:      530: f2022350
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpEqV16I1(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpEqV16I1:
+; DIS-LABEL:00000550 <cmpEqV16I1>:
+; IASM-LABEL:cmpEqV16I1:
+
+entry:
+  %a1 = trunc <16 x i8> %a to <16 x i1>
+  %b1 = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp eq <16 x i1> %a1, %b1
+
+; ASM:           vshl.u8 q0, q0, #7
+; ASM-NEXT:      vshl.u8 q1, q1, #7
+; ASM-NEXT:      vceq.i8 q0, q0, q1
+; DIS:      550: f28f0550
+; DIS-NEXT: 554: f28f2552
+; DIS-NEXT: 558: f3000852
+; IASM-NOT:      vshl
+; IASM-NOT:      vceq
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpNeV16I1(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpNeV16I1:
+; DIS-LABEL:00000570 <cmpNeV16I1>:
+; IASM-LABEL:cmpNeV16I1:
+
+entry:
+  %a1 = trunc <16 x i8> %a to <16 x i1>
+  %b1 = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp ne <16 x i1> %a1, %b1
+
+; ASM:           vshl.u8 q0, q0, #7
+; ASM-NEXT:      vshl.u8 q1, q1, #7
+; ASM-NEXT:      vceq.i8 q0, q0, q1
+; ASM-NEXT:      vmvn.i8 q0, q0
+; DIS:      570: f28f0550
+; DIS-NEXT: 574: f28f2552
+; DIS-NEXT: 578: f3000852
+; DIS-NEXT: 57c: f3b005c0
+; IASM-NOT:      vshl
+; IASM-NOT:      vceq
+; IASM-NOT:      vmvn
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpUgtV16I1(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpUgtV16I1:
+; DIS-LABEL:00000590 <cmpUgtV16I1>:
+; IASM-LABEL:cmpUgtV16I1:
+
+entry:
+  %a1 = trunc <16 x i8> %a to <16 x i1>
+  %b1 = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp ugt <16 x i1> %a1, %b1
+
+; ASM:           vshl.u8 q0, q0, #7
+; ASM-NEXT:      vshl.u8 q1, q1, #7
+; ASM-NEXT:      vcgt.u8 q0, q0, q1
+; DIS:      590: f28f0550
+; DIS-NEXT: 594: f28f2552
+; DIS-NEXT: 598: f3000342
+; IASM-NOT:      vshl
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpUgeV16I1(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpUgeV16I1:
+; DIS-LABEL:000005b0 <cmpUgeV16I1>:
+; IASM-LABEL:cmpUgeV16I1:
+
+entry:
+  %a1 = trunc <16 x i8> %a to <16 x i1>
+  %b1 = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp uge <16 x i1> %a1, %b1
+
+; ASM:           vshl.u8 q0, q0, #7
+; ASM-NEXT:      vshl.u8 q1, q1, #7
+; ASM-NEXT:      vcge.u8 q0, q0, q1
+; DIS:      5b0: f28f0550
+; DIS-NEXT: 5b4: f28f2552
+; DIS-NEXT: 5b8: f3000352
+; IASM-NOT:      vshl
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpUltV16I1(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpUltV16I1:
+; DIS-LABEL:000005d0 <cmpUltV16I1>:
+; IASM-LABEL:cmpUltV16I1:
+
+entry:
+  %a1 = trunc <16 x i8> %a to <16 x i1>
+  %b1 = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp ult <16 x i1> %a1, %b1
+
+; ASM:           vshl.u8 q0, q0, #7
+; ASM-NEXT:      vshl.u8 q1, q1, #7
+; ASM-NEXT:      vcgt.u8 q1, q1, q0
+; DIS:      5d0: f28f0550
+; DIS-NEXT: 5d4: f28f2552
+; DIS-NEXT: 5d8: f3022340
+; IASM-NOT:      vshl
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpUleV16I1(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpUleV16I1:
+; DIS-LABEL:000005f0 <cmpUleV16I1>:
+; IASM-LABEL:cmpUleV16I1:
+
+entry:
+  %a1 = trunc <16 x i8> %a to <16 x i1>
+  %b1 = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp ule <16 x i1> %a1, %b1
+
+; ASM:           vshl.u8 q0, q0, #7
+; ASM-NEXT:      vshl.u8 q1, q1, #7
+; ASM-NEXT:      vcge.u8 q1, q1, q0
+; DIS:      5f0: f28f0550
+; DIS-NEXT: 5f4: f28f2552
+; DIS-NEXT: 5f8: f3022350
+; IASM-NOT:      vshl
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpSgtV16I1(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpSgtV16I1:
+; DIS-LABEL:00000610 <cmpSgtV16I1>:
+; IASM-LABEL:cmpSgtV16I1:
+
+entry:
+  %a1 = trunc <16 x i8> %a to <16 x i1>
+  %b1 = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp sgt <16 x i1> %a1, %b1
+
+; ASM:           vshl.u8 q0, q0, #7
+; ASM-NEXT:      vshl.u8 q1, q1, #7
+; ASM-NEXT:      vcgt.s8 q0, q0, q1
+; DIS:      610: f28f0550
+; DIS-NEXT: 614: f28f2552
+; DIS-NEXT: 618: f2000342
+; IASM-NOT:      vshl
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpSgeV16I1(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpSgeV16I1:
+; DIS-LABEL:00000630 <cmpSgeV16I1>:
+; IASM-LABEL:cmpSgeV16I1:
+
+entry:
+  %a1 = trunc <16 x i8> %a to <16 x i1>
+  %b1 = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp sge <16 x i1> %a1, %b1
+
+; ASM:           vshl.u8 q0, q0, #7
+; ASM-NEXT:      vshl.u8 q1, q1, #7
+; ASM-NEXT:      vcge.s8 q0, q0, q1
+; DIS:      630: f28f0550
+; DIS-NEXT: 634: f28f2552
+; DIS-NEXT: 638: f2000352
+; IASM-NOT:      vshl
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpSltV16I1(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpSltV16I1:
+; DIS-LABEL:00000650 <cmpSltV16I1>:
+; IASM-LABEL:cmpSltV16I1:
+
+entry:
+  %a1 = trunc <16 x i8> %a to <16 x i1>
+  %b1 = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp slt <16 x i1> %a1, %b1
+
+; ASM:           vshl.u8 q0, q0, #7
+; ASM-NEXT:      vshl.u8 q1, q1, #7
+; ASM-NEXT:      vcgt.s8 q1, q1, q0
+; DIS:      650: f28f0550
+; DIS-NEXT: 654: f28f2552
+; DIS-NEXT: 658: f2022340
+; IASM-NOT:      vshl
+; IASM-NOT:      vcgt
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <16 x i8> @cmpSleV16I1(<16 x i8> %a, <16 x i8> %b) {
+; ASM-LABEL:cmpSleV16I1:
+; DIS-LABEL:00000670 <cmpSleV16I1>:
+; IASM-LABEL:cmpSleV16I1:
+
+entry:
+  %a1 = trunc <16 x i8> %a to <16 x i1>
+  %b1 = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp sle <16 x i1> %a1, %b1
+
+; ASM:           vshl.u8 q0, q0, #7
+; ASM-NEXT:      vshl.u8 q1, q1, #7
+; ASM-NEXT:      vcge.s8 q1, q1, q0
+; DIS:      670: f28f0550
+; DIS-NEXT: 674: f28f2552
+; DIS-NEXT: 678: f2022350
+; IASM-NOT:      vshl
+; IASM-NOT:      vcge
+
+  %cmp.ret_ext = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.ret_ext
+}
+
+define internal <4 x i32> @cmpFalseV4Float(<4 x float> %a, <4 x float> %b) {
+; ASM-LABEL:cmpFalseV4Float:
+; DIS-LABEL:00000690 <cmpFalseV4Float>:
+; IASM-LABEL:cmpFalseV4Float:
+
+entry:
+  %cmp = fcmp false <4 x float> %a, %b
+
+; ASM:           vmov.i32 q0, #0
+; DIS:      690: f2800050
+; IASM-NOT:      vmov
+
+  %zext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %zext
+}
+
+define internal <4 x i32> @cmpOeqV4Float(<4 x float> %a, <4 x float> %b) {
+; ASM-LABEL:cmpOeqV4Float:
+; DIS-LABEL:000006a0 <cmpOeqV4Float>:
+; IASM-LABEL:cmpOeqV4Float:
 
 entry:
   %cmp = fcmp oeq <4 x float> %a, %b
 
-; ASM:        vcmp.f32 s0, s1
-; ASM:        vcmp.f32 s0, s1
-; ASM:        vcmp.f32 s0, s1
-; ASM:        vcmp.f32 s0, s1
-; DIS:  1bc:  eeb40a60
+; ASM:           vceq.f32 q0, q0, q1
+; DIS:      6a0: f2000e42
+; IASM-NOT:      vceq
 
-  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
-  ret <4 x i32> %cmp.ret_ext
+  %zext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %zext
+}
+
+define internal <4 x i32> @cmpOgtV4Float(<4 x float> %a, <4 x float> %b) {
+; ASM-LABEL:cmpOgtV4Float:
+; DIS-LABEL:000006b0 <cmpOgtV4Float>:
+; IASM-LABEL:cmpOgtV4Float:
+
+entry:
+  %cmp = fcmp ogt <4 x float> %a, %b
+
+; ASM:           vcgt.f32 q0, q0, q1
+; DIS:      6b0: f3200e42
+; IASM-NOT:      vcgt
+
+  %zext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %zext
+}
+
+define internal <4 x i32> @cmpOgeV4Float(<4 x float> %a, <4 x float> %b) {
+; ASM-LABEL:cmpOgeV4Float:
+; DIS-LABEL:000006c0 <cmpOgeV4Float>:
+; IASM-LABEL:cmpOgeV4Float:
+
+entry:
+  %cmp = fcmp oge <4 x float> %a, %b
+
+; ASM:           vcge.f32 q0, q0, q1
+; DIS:      6c0: f3000e42
+; IASM-NOT:      vcge
+
+  %zext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %zext
+}
+
+define internal <4 x i32> @cmpOltV4Float(<4 x float> %a, <4 x float> %b) {
+; ASM-LABEL:cmpOltV4Float:
+; DIS-LABEL:000006d0 <cmpOltV4Float>:
+; IASM-LABEL:cmpOltV4Float:
+
+entry:
+  %cmp = fcmp olt <4 x float> %a, %b
+
+; ASM:           vcgt.f32 q1, q1, q0
+; DIS:      6d0: f3222e40
+; IASM-NOT:      vcgt
+
+  %zext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %zext
+}
+
+define internal <4 x i32> @cmpOleV4Float(<4 x float> %a, <4 x float> %b) {
+; ASM-LABEL:cmpOleV4Float:
+; DIS-LABEL:000006f0 <cmpOleV4Float>:
+; IASM-LABEL:cmpOleV4Float:
+
+entry:
+  %cmp = fcmp ole <4 x float> %a, %b
+
+; ASM:           vcge.f32 q1, q1, q0
+; DIS:      6f0: f3022e40
+; IASM-NOT:      vcge
+
+  %zext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %zext
+}
+
+define internal <4 x i32> @cmpOrdV4Float(<4 x float> %a, <4 x float> %b) {
+; ASM-LABEL:cmpOrdV4Float:
+; DIS-LABEL:00000710 <cmpOrdV4Float>:
+; IASM-LABEL:cmpOrdV4Float:
+
+entry:
+  %cmp = fcmp ord <4 x float> %a, %b
+
+; ASM:           vcge.f32 q2, q0, q1
+; ASM-NEXT:      vcgt.f32 q1, q1, q0
+; DIS:      710: f3004e42
+; DIS-NEXT: 714: f3222e40
+; IASM-NOT:      vcge
+; IASM-NOT:      vcgt
+
+  %zext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %zext
+}
+
+define internal <4 x i32> @cmpUeqV4Float(<4 x float> %a, <4 x float> %b) {
+; ASM-LABEL:cmpUeqV4Float:
+; DIS-LABEL:00000730 <cmpUeqV4Float>:
+; IASM-LABEL:cmpUeqV4Float:
+
+entry:
+  %cmp = fcmp ueq <4 x float> %a, %b
+
+; ASM:           vcgt.f32 q2, q0, q1
+; ASM-NEXT:      vcgt.f32 q1, q1, q0
+; ASM-NEXT:      vorr.i32 q2, q2, q1
+; ASM-NEXT:      vmvn.i32 q2, q2
+; DIS:      730: f3204e42
+; DIS-NEXT: 734: f3222e40
+; DIS-NEXT: 738: f2244152
+; DIS-NEXT: 73c: f3b045c4
+; IASM-NOT:      vcgt
+; IASM-NOT:      vorr
+; IASM-NOT:      vmvn
+
+  %zext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %zext
+}
+
+define internal <4 x i32> @cmpUgtV4Float(<4 x float> %a, <4 x float> %b) {
+; ASM-LABEL:cmpUgtV4Float:
+; DIS-LABEL:00000750 <cmpUgtV4Float>:
+; IASM-LABEL:cmpUgtV4Float:
+
+entry:
+  %cmp = fcmp ugt <4 x float> %a, %b
+
+; ASM:           vcge.f32 q1, q1, q0
+; ASM-NEXT:      vmvn.i32 q1, q1
+; DIS:      750: f3022e40
+; DIS-NEXT: 754: f3b025c2
+; IASM-NOT:      vcge
+; IASM-NOT:      vmvn
+
+  %zext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %zext
+}
+
+define internal <4 x i32> @cmpUgeV4Float(<4 x float> %a, <4 x float> %b) {
+; ASM-LABEL:cmpUgeV4Float:
+; DIS-LABEL:00000770 <cmpUgeV4Float>:
+; IASM-LABEL:cmpUgeV4Float:
+
+entry:
+  %cmp = fcmp uge <4 x float> %a, %b
+
+; ASM:           vcgt.f32 q1, q1, q0
+; ASM-NEXT:      vmvn.i32 q1, q1
+; DIS:      770: f3222e40
+; DIS-NEXT: 774: f3b025c2
+; IASM-NOT:      vcgt
+; IASM-NOT:      vmvn
+
+  %zext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %zext
+}
+
+define internal <4 x i32> @cmpUltV4Float(<4 x float> %a, <4 x float> %b) {
+; ASM-LABEL:cmpUltV4Float:
+; DIS-LABEL:00000790 <cmpUltV4Float>:
+; IASM-LABEL:cmpUltV4Float:
+
+entry:
+  %cmp = fcmp ult <4 x float> %a, %b
+
+; ASM:           vcge.f32 q0, q0, q1
+; ASM-NEXT:      vmvn.i32 q0, q0
+; DIS:      790: f3000e42
+; DIS-NEXT: 794: f3b005c0
+; IASM-NOT:      vcge
+; IASM-NOT:      vmvn
+
+  %zext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %zext
+}
+
+define internal <4 x i32> @cmpUleV4Float(<4 x float> %a, <4 x float> %b) {
+; ASM-LABEL:cmpUleV4Float:
+; DIS-LABEL:000007b0 <cmpUleV4Float>:
+; IASM-LABEL:cmpUleV4Float:
+
+entry:
+  %cmp = fcmp ule <4 x float> %a, %b
+
+; ASM:           vcgt.f32 q0, q0, q1
+; ASM-NEXT:      vmvn.i32 q0, q0
+; DIS:      7b0: f3200e42
+; DIS-NEXT: 7b4: f3b005c0
+; IASM-NOT:      vcgt
+; IASM-NOT:      vmvn
+
+  %zext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %zext
+}
+
+define internal <4 x i32> @cmpTrueV4Float(<4 x float> %a, <4 x float> %b) {
+; ASM-LABEL:cmpTrueV4Float:
+; DIS-LABEL:000007d0 <cmpTrueV4Float>:
+; IASM-LABEL:cmpTrueV4Float:
+
+entry:
+  %cmp = fcmp true <4 x float> %a, %b
+
+; ASM:           vmov.i32 q0, #1
+; DIS:      7d0: f2800051
+; IASM-NOT:      vmov
+
+  %zext = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %zext
 }