Lower icmp operations between vector values.

SSE2 only has signed integer comparison. Unsigned compares are
implemented by inverting the sign bits of the operands and doing a
signed compare.

A common pattern in clang generated IR is a vector compare which
generates an i1 vector followed by a sign extension of the result of the
compare. The x86 comparison instructions already generate sign extended
values, so we can eliminate unnecessary sext operations that follow
compares in the IR.

BUG=none
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/412593002
diff --git a/crosstest/runtests.sh b/crosstest/runtests.sh
index a5dc31f..06c43f4 100755
--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -67,7 +67,7 @@
     ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
         --dir="${OUTDIR}" \
         --llvm-bin-path="${LLVM_BIN_PATH}" \
-        --test=test_icmp.cpp \
+        --test=test_icmp.cpp --test=test_icmp_i1vec.ll \
         --driver=test_icmp_main.cpp \
         --output=test_icmp_O${optlevel}
 
diff --git a/crosstest/test_icmp.cpp b/crosstest/test_icmp.cpp
index f1b144d..b74abce 100644
--- a/crosstest/test_icmp.cpp
+++ b/crosstest/test_icmp.cpp
@@ -1,5 +1,16 @@
-// This aims to test the icmp bitcode instruction across all PNaCl
-// primitive integer types.
+//===- subzero/crosstest/test_icmp.cpp - Implementation for tests ---------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This aims to test the icmp bitcode instruction across all PNaCl primitive
+// and SIMD integer types.
+//
+//===----------------------------------------------------------------------===//
 
 #include <stdint.h>
 
@@ -9,13 +20,20 @@
   bool icmp##cmp(uint8_t a, uint8_t b) { return a op b; }                      \
   bool icmp##cmp(uint16_t a, uint16_t b) { return a op b; }                    \
   bool icmp##cmp(uint32_t a, uint32_t b) { return a op b; }                    \
-  bool icmp##cmp(uint64_t a, uint64_t b) { return a op b; }
+  bool icmp##cmp(uint64_t a, uint64_t b) { return a op b; }                    \
+  v4ui32 icmp##cmp(v4ui32 a, v4ui32 b) { return a op b; }                      \
+  v8ui16 icmp##cmp(v8ui16 a, v8ui16 b) { return a op b; }                      \
+  v16ui8 icmp##cmp(v16ui8 a, v16ui8 b) { return a op b; }
 ICMP_U_TABLE
 #undef X
+
 #define X(cmp, op)                                                             \
   bool icmp##cmp(int8_t a, int8_t b) { return a op b; }                        \
   bool icmp##cmp(int16_t a, int16_t b) { return a op b; }                      \
   bool icmp##cmp(int32_t a, int32_t b) { return a op b; }                      \
-  bool icmp##cmp(int64_t a, int64_t b) { return a op b; }
+  bool icmp##cmp(int64_t a, int64_t b) { return a op b; }                      \
+  v4si32 icmp##cmp(v4si32 a, v4si32 b) { return a op b; }                      \
+  v8si16 icmp##cmp(v8si16 a, v8si16 b) { return a op b; }                      \
+  v16si8 icmp##cmp(v16si8 a, v16si8 b) { return a op b; }
 ICMP_S_TABLE
 #undef X
diff --git a/crosstest/test_icmp.h b/crosstest/test_icmp.h
index d4ce9f1..52a9f22 100644
--- a/crosstest/test_icmp.h
+++ b/crosstest/test_icmp.h
@@ -1,10 +1,29 @@
+//===- subzero/crosstest/test_icmp.h - Test prototypes -------*- C++ -*----===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the function prototypes for crosstesting the icmp
+// bitcode instruction.
+//
+//===----------------------------------------------------------------------===//
+
 #include "test_icmp.def"
 
+#include "vectors.h"
+
 #define X(cmp, op)                                                             \
   bool icmp##cmp(uint8_t a, uint8_t b);                                        \
   bool icmp##cmp(uint16_t a, uint16_t b);                                      \
   bool icmp##cmp(uint32_t a, uint32_t b);                                      \
-  bool icmp##cmp(uint64_t a, uint64_t b);
+  bool icmp##cmp(uint64_t a, uint64_t b);                                      \
+  v4ui32 icmp##cmp(v4ui32 a, v4ui32 b);                                        \
+  v8ui16 icmp##cmp(v8ui16 a, v8ui16 b);                                        \
+  v16ui8 icmp##cmp(v16ui8 a, v16ui8 b);
 ICMP_U_TABLE
 #undef X
 
@@ -12,6 +31,17 @@
   bool icmp##cmp(int8_t a, int8_t b);                                          \
   bool icmp##cmp(int16_t a, int16_t b);                                        \
   bool icmp##cmp(int32_t a, int32_t b);                                        \
-  bool icmp##cmp(int64_t a, int64_t b);
+  bool icmp##cmp(int64_t a, int64_t b);                                        \
+  v4si32 icmp##cmp(v4si32 a, v4si32 b);                                        \
+  v8si16 icmp##cmp(v8si16 a, v8si16 b);                                        \
+  v16si8 icmp##cmp(v16si8 a, v16si8 b);
+ICMP_S_TABLE
+#undef X
+
+#define X(cmp, op)                                                             \
+  v4si32 icmpi1##cmp(v4si32 a, v4si32 b);                                      \
+  v8si16 icmpi1##cmp(v8si16 a, v8si16 b);                                      \
+  v16si8 icmpi1##cmp(v16si8 a, v16si8 b);
+ICMP_U_TABLE
 ICMP_S_TABLE
 #undef X
diff --git a/crosstest/test_icmp_i1vec.ll b/crosstest/test_icmp_i1vec.ll
new file mode 100644
index 0000000..bf9aa37
--- /dev/null
+++ b/crosstest/test_icmp_i1vec.ll
@@ -0,0 +1,271 @@
+target triple = "i686-pc-linux-gnu"
+
+define <16 x i8> @_Z8icmpi1EqDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %a.trunc = trunc <16 x i8> %a to <16 x i1>
+  %b.trunc = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp eq <16 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.sext
+}
+
+define <16 x i8> @_Z8icmpi1NeDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %a.trunc = trunc <16 x i8> %a to <16 x i1>
+  %b.trunc = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp ne <16 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.sext
+}
+
+define <16 x i8> @_Z9icmpi1UgtDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %a.trunc = trunc <16 x i8> %a to <16 x i1>
+  %b.trunc = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp ugt <16 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.sext
+}
+
+define <16 x i8> @_Z9icmpi1UgeDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %a.trunc = trunc <16 x i8> %a to <16 x i1>
+  %b.trunc = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp uge <16 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.sext
+}
+
+define <16 x i8> @_Z9icmpi1UltDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %a.trunc = trunc <16 x i8> %a to <16 x i1>
+  %b.trunc = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp ult <16 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.sext
+}
+
+define <16 x i8> @_Z9icmpi1UleDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %a.trunc = trunc <16 x i8> %a to <16 x i1>
+  %b.trunc = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp ule <16 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.sext
+}
+
+define <16 x i8> @_Z9icmpi1SgtDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %a.trunc = trunc <16 x i8> %a to <16 x i1>
+  %b.trunc = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp sgt <16 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.sext
+}
+
+define <16 x i8> @_Z9icmpi1SgeDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %a.trunc = trunc <16 x i8> %a to <16 x i1>
+  %b.trunc = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp sge <16 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.sext
+}
+
+define <16 x i8> @_Z9icmpi1SltDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %a.trunc = trunc <16 x i8> %a to <16 x i1>
+  %b.trunc = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp slt <16 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.sext
+}
+
+define <16 x i8> @_Z9icmpi1SleDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %a.trunc = trunc <16 x i8> %a to <16 x i1>
+  %b.trunc = trunc <16 x i8> %b to <16 x i1>
+  %cmp = icmp sle <16 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %cmp.sext
+}
+
+define <8 x i16> @_Z8icmpi1EqDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %a.trunc = trunc <8 x i16> %a to <8 x i1>
+  %b.trunc = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp eq <8 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.sext
+}
+
+define <8 x i16> @_Z8icmpi1NeDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %a.trunc = trunc <8 x i16> %a to <8 x i1>
+  %b.trunc = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp ne <8 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.sext
+}
+
+define <8 x i16> @_Z9icmpi1UgtDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %a.trunc = trunc <8 x i16> %a to <8 x i1>
+  %b.trunc = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp ugt <8 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.sext
+}
+
+define <8 x i16> @_Z9icmpi1UgeDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %a.trunc = trunc <8 x i16> %a to <8 x i1>
+  %b.trunc = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp uge <8 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.sext
+}
+
+define <8 x i16> @_Z9icmpi1UltDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %a.trunc = trunc <8 x i16> %a to <8 x i1>
+  %b.trunc = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp ult <8 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.sext
+}
+
+define <8 x i16> @_Z9icmpi1UleDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %a.trunc = trunc <8 x i16> %a to <8 x i1>
+  %b.trunc = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp ule <8 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.sext
+}
+
+define <8 x i16> @_Z9icmpi1SgtDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %a.trunc = trunc <8 x i16> %a to <8 x i1>
+  %b.trunc = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp sgt <8 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.sext
+}
+
+define <8 x i16> @_Z9icmpi1SgeDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %a.trunc = trunc <8 x i16> %a to <8 x i1>
+  %b.trunc = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp sge <8 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.sext
+}
+
+define <8 x i16> @_Z9icmpi1SltDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %a.trunc = trunc <8 x i16> %a to <8 x i1>
+  %b.trunc = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp slt <8 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.sext
+}
+
+define <8 x i16> @_Z9icmpi1SleDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %a.trunc = trunc <8 x i16> %a to <8 x i1>
+  %b.trunc = trunc <8 x i16> %b to <8 x i1>
+  %cmp = icmp sle <8 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %cmp.sext
+}
+
+define <4 x i32> @_Z8icmpi1EqDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %a.trunc = trunc <4 x i32> %a to <4 x i1>
+  %b.trunc = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp eq <4 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.sext
+}
+
+define <4 x i32> @_Z8icmpi1NeDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %a.trunc = trunc <4 x i32> %a to <4 x i1>
+  %b.trunc = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp ne <4 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.sext
+}
+
+define <4 x i32> @_Z9icmpi1UgtDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %a.trunc = trunc <4 x i32> %a to <4 x i1>
+  %b.trunc = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp ugt <4 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.sext
+}
+
+define <4 x i32> @_Z9icmpi1UgeDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %a.trunc = trunc <4 x i32> %a to <4 x i1>
+  %b.trunc = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp uge <4 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.sext
+}
+
+define <4 x i32> @_Z9icmpi1UltDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %a.trunc = trunc <4 x i32> %a to <4 x i1>
+  %b.trunc = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp ult <4 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.sext
+}
+
+define <4 x i32> @_Z9icmpi1UleDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %a.trunc = trunc <4 x i32> %a to <4 x i1>
+  %b.trunc = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp ule <4 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.sext
+}
+
+define <4 x i32> @_Z9icmpi1SgtDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %a.trunc = trunc <4 x i32> %a to <4 x i1>
+  %b.trunc = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp sgt <4 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.sext
+}
+
+define <4 x i32> @_Z9icmpi1SgeDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %a.trunc = trunc <4 x i32> %a to <4 x i1>
+  %b.trunc = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp sge <4 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.sext
+}
+
+define <4 x i32> @_Z9icmpi1SltDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %a.trunc = trunc <4 x i32> %a to <4 x i1>
+  %b.trunc = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp slt <4 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.sext
+}
+
+define <4 x i32> @_Z9icmpi1SleDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %a.trunc = trunc <4 x i32> %a to <4 x i1>
+  %b.trunc = trunc <4 x i32> %b to <4 x i1>
+  %cmp = icmp sle <4 x i1> %a.trunc, %b.trunc
+  %cmp.sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %cmp.sext
+}
diff --git a/crosstest/test_icmp_main.cpp b/crosstest/test_icmp_main.cpp
index 3981fcf..53597e8 100644
--- a/crosstest/test_icmp_main.cpp
+++ b/crosstest/test_icmp_main.cpp
@@ -1,6 +1,21 @@
-/* crosstest.py --test=test_icmp.cpp --driver=test_icmp_main.cpp \
-   --prefix=Subzero_ --output=test_icmp */
+//===- subzero/crosstest/test_icmp_main.cpp - Driver for tests. -----------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Driver for cross testing the icmp bitcode instruction.
+//
+//===----------------------------------------------------------------------===//
 
+/* crosstest.py --test=test_icmp.cpp --test=test_icmp_i1vec.ll \
+   --driver=test_icmp_main.cpp --prefix=Subzero_ --output=test_icmp */
+
+#include <climits> // CHAR_BIT
+#include <cstring> // memcmp, memset
 #include <stdint.h>
 #include <iostream>
 
@@ -35,7 +50,7 @@
         (FuncTypeUnsigned)Subzero_::icmp##cmp                                  \
   }                                                                            \
   ,
-      ICMP_U_TABLE
+        ICMP_U_TABLE
 #undef X
 #define X(cmp, op)                                                             \
   {                                                                            \
@@ -43,9 +58,9 @@
         (FuncTypeUnsigned)(FuncTypeSigned)Subzero_::icmp##cmp                  \
   }                                                                            \
   ,
-          ICMP_S_TABLE
+        ICMP_S_TABLE
 #undef X
-    };
+  };
   const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
 
   if (sizeof(TypeUnsigned) <= sizeof(uint32_t)) {
@@ -63,8 +78,9 @@
             ++Passes;
           } else {
             ++Failures;
-            std::cout << "icmp" << Funcs[f].Name << (8 * sizeof(TypeUnsigned))
-                      << "(" << Value1 << ", " << Value2 << "): sz=" << ResultSz
+            std::cout << "icmp" << Funcs[f].Name
+                      << (CHAR_BIT * sizeof(TypeUnsigned)) << "(" << Value1
+                      << ", " << Value2 << "): sz=" << ResultSz
                       << " llc=" << ResultLlc << std::endl;
           }
         }
@@ -90,8 +106,8 @@
               } else {
                 ++Failures;
                 std::cout << "icmp" << Funcs[f].Name
-                          << (8 * sizeof(TypeUnsigned)) << "(" << Value1 << ", "
-                          << Value2 << "): sz=" << ResultSz
+                          << (CHAR_BIT * sizeof(TypeUnsigned)) << "(" << Value1
+                          << ", " << Value2 << "): sz=" << ResultSz
                           << " llc=" << ResultLlc << std::endl;
               }
             }
@@ -102,6 +118,155 @@
   }
 }
 
+const static size_t MaxTestsPerFunc = 100000;
+
+template <typename TypeUnsignedLabel, typename TypeSignedLabel>
+void testsVecInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  typedef typename Vectors<TypeUnsignedLabel>::Ty TypeUnsigned;
+  typedef typename Vectors<TypeSignedLabel>::Ty TypeSigned;
+  typedef TypeUnsigned (*FuncTypeUnsigned)(TypeUnsigned, TypeUnsigned);
+  typedef TypeSigned (*FuncTypeSigned)(TypeSigned, TypeSigned);
+  static struct {
+    const char *Name;
+    FuncTypeUnsigned FuncLlc;
+    FuncTypeUnsigned FuncSz;
+  } Funcs[] = {
+#define X(cmp, op)                                                             \
+  {                                                                            \
+    STR(cmp), (FuncTypeUnsigned)icmp##cmp,                                     \
+        (FuncTypeUnsigned)Subzero_::icmp##cmp                                  \
+  }                                                                            \
+  ,
+        ICMP_U_TABLE
+#undef X
+#define X(cmp, op)                                                             \
+  {                                                                            \
+    STR(cmp), (FuncTypeUnsigned)(FuncTypeSigned)icmp##cmp,                     \
+        (FuncTypeUnsigned)(FuncTypeSigned)Subzero_::icmp##cmp                  \
+  }                                                                            \
+  ,
+        ICMP_S_TABLE
+#undef X
+  };
+  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+  const static size_t NumElementsInType = Vectors<TypeUnsigned>::NumElements;
+  for (size_t f = 0; f < NumFuncs; ++f) {
+    PRNG Index;
+    for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
+      // Initialize the test vectors.
+      TypeUnsigned Value1, Value2;
+      for (size_t j = 0; j < NumElementsInType;) {
+        Value1[j] = Values[Index() % NumValues];
+        Value2[j] = Values[Index() % NumValues];
+        ++j;
+      }
+      // Perform the test.
+      TypeUnsigned ResultSz = Funcs[f].FuncSz(Value1, Value2);
+      TypeUnsigned ResultLlc = Funcs[f].FuncLlc(Value1, Value2);
+      ++TotalTests;
+      if (!memcmp(&ResultSz, &ResultLlc, sizeof(ResultSz))) {
+        ++Passes;
+      } else {
+        ++Failures;
+        std::cout << "test" << Funcs[f].Name
+                  << Vectors<TypeUnsignedLabel>::TypeName << "("
+                  << vectAsString<TypeUnsignedLabel>(Value1) << ","
+                  << vectAsString<TypeUnsignedLabel>(Value2)
+                  << "): sz=" << vectAsString<TypeUnsignedLabel>(ResultSz)
+                  << " llc=" << vectAsString<TypeUnsignedLabel>(ResultLlc)
+                  << std::endl;
+      }
+    }
+  }
+}
+
+// Return true on wraparound
+template <typename T> bool incrementI1Vector(typename Vectors<T>::Ty &Vect) {
+  size_t Pos = 0;
+  const static size_t NumElements = Vectors<T>::NumElements;
+  for (Pos = 0; Pos < NumElements; ++Pos) {
+    if (Vect[Pos] == 0) {
+      Vect[Pos] = 1;
+      break;
+    }
+    Vect[Pos] = 0;
+  }
+  return (Pos == NumElements);
+}
+
+template <typename T>
+void testsVecI1(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  typedef typename Vectors<T>::Ty Ty;
+  typedef Ty (*FuncType)(Ty, Ty);
+  static struct {
+    const char *Name;
+    FuncType FuncLlc;
+    FuncType FuncSz;
+  } Funcs[] = {
+#define X(cmp, op)                                                             \
+  { STR(cmp), (FuncType)icmpi1##cmp, (FuncType)Subzero_::icmpi1##cmp }         \
+  ,
+        ICMP_U_TABLE
+        ICMP_S_TABLE
+  };
+  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+  const static size_t NumElements = Vectors<T>::NumElements;
+  const static size_t MAX_NUMBER_OF_ELEMENTS_FOR_EXHAUSTIVE_TESTING = 8;
+
+  // Check if the type is small enough to try all possible input pairs.
+  if (NumElements <= MAX_NUMBER_OF_ELEMENTS_FOR_EXHAUSTIVE_TESTING) {
+    for (size_t f = 0; f < NumFuncs; ++f) {
+      Ty Value1, Value2;
+      memset(&Value1, 0, sizeof(Value1));
+      for (bool IsValue1Done = false; !IsValue1Done;
+           IsValue1Done = incrementI1Vector<T>(Value1)) {
+        memset(&Value2, 0, sizeof(Value2));
+        for (bool IsValue2Done = false; !IsValue2Done;
+             IsValue2Done = incrementI1Vector<T>(Value2)) {
+          Ty ResultSz = Funcs[f].FuncSz(Value1, Value2);
+          Ty ResultLlc = Funcs[f].FuncLlc(Value1, Value2);
+          ++TotalTests;
+          if (!memcmp(&ResultSz, &ResultLlc, sizeof(ResultSz))) {
+            ++Passes;
+          } else {
+            ++Failures;
+            std::cout << "test" << Funcs[f].Name << Vectors<T>::TypeName << "("
+                      << vectAsString<T>(Value1) << ","
+                      << vectAsString<T>(Value2)
+                      << "): sz=" << vectAsString<T>(ResultSz)
+                      << " llc=" << vectAsString<T>(ResultLlc) << std::endl;
+          }
+        }
+      }
+    }
+  } else {
+    for (size_t f = 0; f < NumFuncs; ++f) {
+      PRNG Index;
+      for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
+        Ty Value1, Value2;
+        // Initialize the test vectors.
+        for (size_t j = 0; j < NumElements; ++j) {
+          Value1[j] = Index() % 2;
+          Value2[j] = Index() % 2;
+        }
+        // Perform the test.
+        Ty ResultSz = Funcs[f].FuncSz(Value1, Value2);
+        Ty ResultLlc = Funcs[f].FuncLlc(Value1, Value2);
+        ++TotalTests;
+        if (!memcmp(&ResultSz, &ResultLlc, sizeof(ResultSz))) {
+          ++Passes;
+        } else {
+          ++Failures;
+          std::cout << "test" << Funcs[f].Name << Vectors<T>::TypeName << "("
+                    << vectAsString<T>(Value1) << "," << vectAsString<T>(Value2)
+                    << "): sz=" << vectAsString<T>(ResultSz)
+                    << " llc=" << vectAsString<T>(ResultLlc) << std::endl;
+        }
+      }
+    }
+  }
+}
+
 int main(int argc, char **argv) {
   size_t TotalTests = 0;
   size_t Passes = 0;
@@ -111,6 +276,12 @@
   testsInt<uint16_t, int16_t>(TotalTests, Passes, Failures);
   testsInt<uint32_t, int32_t>(TotalTests, Passes, Failures);
   testsInt<uint64_t, int64_t>(TotalTests, Passes, Failures);
+  testsVecInt<v4ui32, v4si32>(TotalTests, Passes, Failures);
+  testsVecInt<v8ui16, v8si16>(TotalTests, Passes, Failures);
+  testsVecInt<v16ui8, v16si8>(TotalTests, Passes, Failures);
+  testsVecI1<v4i1>(TotalTests, Passes, Failures);
+  testsVecI1<v8i1>(TotalTests, Passes, Failures);
+  testsVecI1<v16i1>(TotalTests, Passes, Failures);
 
   std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
             << " Failures=" << Failures << "\n";
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 3808ecb..71b4c17 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -2261,6 +2261,124 @@
   Operand *Src1 = legalize(Inst->getSrc(1));
   Variable *Dest = Inst->getDest();
 
+  if (isVectorType(Dest->getType())) {
+    Type Ty = Src0->getType();
+    // Promote i1 vectors to 128-bit integer vector types.
+    if (typeElementType(Ty) == IceType_i1) {
+      Type NewTy = IceType_NUM;
+      switch (Ty) {
+      default:
+        llvm_unreachable("unexpected type");
+        break;
+      case IceType_v4i1:
+        NewTy = IceType_v4i32;
+        break;
+      case IceType_v8i1:
+        NewTy = IceType_v8i16;
+        break;
+      case IceType_v16i1:
+        NewTy = IceType_v16i8;
+        break;
+      }
+      Variable *NewSrc0 = Func->makeVariable(NewTy, Context.getNode());
+      Variable *NewSrc1 = Func->makeVariable(NewTy, Context.getNode());
+      lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
+      lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
+      Src0 = NewSrc0;
+      Src1 = NewSrc1;
+      Ty = NewTy;
+    }
+
+    InstIcmp::ICond Condition = Inst->getCondition();
+
+    // SSE2 only has signed comparison operations.  Transform unsigned
+    // inputs in a manner that allows for the use of signed comparison
+    // operations by flipping the high order bits.
+    if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
+        Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
+      Variable *T0 = makeReg(Ty);
+      Variable *T1 = makeReg(Ty);
+      Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
+      _movp(T0, Src0);
+      _pxor(T0, HighOrderBits);
+      _movp(T1, Src1);
+      _pxor(T1, HighOrderBits);
+      Src0 = T0;
+      Src1 = T1;
+    }
+
+    // TODO: ALIGNHACK: Both operands to compare instructions need to be
+    // in registers until stack alignment support is implemented.  Once
+    // there is support for stack alignment, LEGAL_HACK can be removed.
+#define LEGAL_HACK(Vect) legalizeToVar((Vect))
+    Variable *T = makeReg(Ty);
+    switch (Condition) {
+    default:
+      llvm_unreachable("unexpected condition");
+      break;
+    case InstIcmp::Eq: {
+      _movp(T, Src0);
+      _pcmpeq(T, LEGAL_HACK(Src1));
+    } break;
+    case InstIcmp::Ne: {
+      _movp(T, Src0);
+      _pcmpeq(T, LEGAL_HACK(Src1));
+      Variable *MinusOne = makeVectorOfMinusOnes(Ty);
+      _pxor(T, MinusOne);
+    } break;
+    case InstIcmp::Ugt:
+    case InstIcmp::Sgt: {
+      _movp(T, Src0);
+      _pcmpgt(T, LEGAL_HACK(Src1));
+    } break;
+    case InstIcmp::Uge:
+    case InstIcmp::Sge: {
+      // !(Src1 > Src0)
+      _movp(T, Src1);
+      _pcmpgt(T, LEGAL_HACK(Src0));
+      Variable *MinusOne = makeVectorOfMinusOnes(Ty);
+      _pxor(T, MinusOne);
+    } break;
+    case InstIcmp::Ult:
+    case InstIcmp::Slt: {
+      _movp(T, Src1);
+      _pcmpgt(T, LEGAL_HACK(Src0));
+    } break;
+    case InstIcmp::Ule:
+    case InstIcmp::Sle: {
+      // !(Src0 > Src1)
+      _movp(T, Src0);
+      _pcmpgt(T, LEGAL_HACK(Src1));
+      Variable *MinusOne = makeVectorOfMinusOnes(Ty);
+      _pxor(T, MinusOne);
+    } break;
+    }
+#undef LEGAL_HACK
+
+    _movp(Dest, T);
+
+    // The following pattern occurs often in lowered C and C++ code:
+    //
+    //   %cmp     = icmp pred <n x ty> %src0, %src1
+    //   %cmp.ext = sext <n x i1> %cmp to <n x ty>
+    //
+    // We can avoid the sext operation by copying the result from pcmpgt
+    // and pcmpeq, which is already sign extended, to the result of the
+    // sext operation.
+    if (InstCast *NextCast =
+            llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
+      if (NextCast->getCastKind() == InstCast::Sext &&
+          NextCast->getSrc(0) == Dest) {
+        _movp(NextCast->getDest(), T);
+        // Skip over the instruction.
+        NextCast->setDeleted();
+        Context.advanceNext();
+      }
+    }
+
+    return;
+  }
+
   // If Src1 is an immediate, or known to be a physical register, we can
   // allow Src0 to be a memory operand.  Otherwise, Src0 must be copied into
   // a physical register.  (Actually, either Src0 or Src1 can be chosen for
@@ -3398,9 +3516,14 @@
   lowerCall(Call);
 }
 
+// There is no support for loading or emitting vector constants, so the
+// vector values returned from makeVectorOfZeros, makeVectorOfOnes,
+// etc. are initialized with register operations.
+//
+// TODO(wala): Add limited support for vector constants so that
+// complex initialization in registers is unnecessary.
+
 Variable *TargetX8632::makeVectorOfZeros(Type Ty, int32_t RegNum) {
-  // There is no support for loading or emitting vector constants, so
-  // this value is initialized using register operations.
   Variable *Reg = makeReg(Ty, RegNum);
   // Insert a FakeDef, since otherwise the live range of Reg might
   // be overestimated.
@@ -3409,18 +3532,41 @@
   return Reg;
 }
 
+Variable *TargetX8632::makeVectorOfMinusOnes(Type Ty, int32_t RegNum) {
+  Variable *MinusOnes = makeReg(Ty, RegNum);
+  // Insert a FakeDef so the live range of MinusOnes is not overestimated.
+  Context.insert(InstFakeDef::create(Func, MinusOnes));
+  _pcmpeq(MinusOnes, MinusOnes);
+  return MinusOnes;
+}
+
 Variable *TargetX8632::makeVectorOfOnes(Type Ty, int32_t RegNum) {
-  // There is no support for loading or emitting vector constants, so
-  // this value is initialized using register operations.
   Variable *Dest = makeVectorOfZeros(Ty, RegNum);
-  Variable *MinusOne = makeReg(Ty);
-  // Insert a FakeDef so the live range of MinusOne is not overestimated.
-  Context.insert(InstFakeDef::create(Func, MinusOne));
-  _pcmpeq(MinusOne, MinusOne);
+  Variable *MinusOne = makeVectorOfMinusOnes(Ty);
   _psub(Dest, MinusOne);
   return Dest;
 }
 
+Variable *TargetX8632::makeVectorOfHighOrderBits(Type Ty, int32_t RegNum) {
+  assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
+         Ty == IceType_v16i8);
+  if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
+    Variable *Reg = makeVectorOfOnes(Ty, RegNum);
+    SizeT Shift = typeWidthInBytes(typeElementType(Ty)) * X86_CHAR_BIT - 1;
+    _psll(Reg, Ctx->getConstantInt(IceType_i8, Shift));
+    return Reg;
+  } else {
+    // SSE has no left shift operation for vectors of 8 bit integers.
+    const uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
+    Constant *ConstantMask =
+        Ctx->getConstantInt(IceType_i32, HIGH_ORDER_BITS_MASK);
+    Variable *Reg = makeReg(Ty, RegNum);
+    _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
+    _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
+    return Reg;
+  }
+}
+
 OperandX8632Mem *TargetX8632::getMemoryOperandForStackSlot(Type Ty,
                                                            Variable *Slot,
                                                            uint32_t Offset) {
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 4c0c245..864881f 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -153,6 +153,10 @@
   // Returns a vector in a register with the given constant entries.
   Variable *makeVectorOfZeros(Type Ty, int32_t RegNum = Variable::NoRegister);
   Variable *makeVectorOfOnes(Type Ty, int32_t RegNum = Variable::NoRegister);
+  Variable *makeVectorOfMinusOnes(Type Ty,
+                                  int32_t RegNum = Variable::NoRegister);
+  Variable *makeVectorOfHighOrderBits(Type Ty,
+                                      int32_t RegNum = Variable::NoRegister);
 
   // Return a memory operand corresponding to a stack allocated Variable.
   OperandX8632Mem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
diff --git a/tests_lit/llvm2ice_tests/vector-icmp.ll b/tests_lit/llvm2ice_tests/vector-icmp.ll
new file mode 100644
index 0000000..9b95f33
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/vector-icmp.ll
@@ -0,0 +1,508 @@
+; This file checks support for comparing vector values with the icmp
+; instruction.
+
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -Om1 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
+; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
+; RUN:                           | FileCheck --check-prefix=DUMP %s
+
+; Check that sext elimination occurs when the result of the comparison
+; instruction is already sign extended.  Sign extension to 4 x i32 uses
+; the pslld instruction.
+define <4 x i32> @test_sext_elimination(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %res.trunc = icmp eq <4 x i32> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: test_sext_elimination:
+; CHECK: pcmpeqd
+; CHECK-NOT: pslld
+}
+
+define <4 x i1> @test_icmp_v4i32_eq(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %res = icmp eq <4 x i32> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i32_eq:
+; CHECK: pcmpeqd
+}
+
+define <4 x i1> @test_icmp_v4i32_ne(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %res = icmp ne <4 x i32> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i32_ne:
+; CHECK: pcmpeqd
+; CHECK: pxor
+}
+
+define <4 x i1> @test_icmp_v4i32_sgt(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %res = icmp sgt <4 x i32> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i32_sgt:
+; CHECK: pcmpgtd
+}
+define <4 x i1> @test_icmp_v4i32_sle(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %res = icmp sle <4 x i32> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i32_sle:
+; CHECK: pcmpgtd
+; CHECK: pxor
+}
+
+define <4 x i1> @test_icmp_v4i32_slt(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %res = icmp slt <4 x i32> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i32_slt:
+; CHECK: pcmpgtd
+}
+
+define <4 x i1> @test_icmp_v4i32_uge(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %res = icmp uge <4 x i32> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i32_uge:
+; CHECK: pxor
+; CHECK: pcmpgtd
+; CHECK: pxor
+}
+
+define <4 x i1> @test_icmp_v4i32_ugt(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %res = icmp ugt <4 x i32> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i32_ugt:
+; CHECK: pxor
+; CHECK: pcmpgtd
+}
+
+define <4 x i1> @test_icmp_v4i32_ule(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %res = icmp ule <4 x i32> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i32_ule:
+; CHECK: pxor
+; CHECK: pcmpgtd
+; CHECK: pxor
+}
+
+define <4 x i1> @test_icmp_v4i32_ult(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %res = icmp ult <4 x i32> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i32_ult:
+; CHECK: pxor
+; CHECK: pcmpgtd
+}
+
+define <4 x i1> @test_icmp_v4i1_eq(<4 x i1> %a, <4 x i1> %b) {
+entry:
+  %res = icmp eq <4 x i1> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i1_eq:
+; CHECK: pcmpeqd
+}
+
+define <4 x i1> @test_icmp_v4i1_ne(<4 x i1> %a, <4 x i1> %b) {
+entry:
+  %res = icmp ne <4 x i1> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i1_ne:
+; CHECK: pcmpeqd
+; CHECK: pxor
+}
+
+define <4 x i1> @test_icmp_v4i1_sgt(<4 x i1> %a, <4 x i1> %b) {
+entry:
+  %res = icmp sgt <4 x i1> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i1_sgt:
+; CHECK: pcmpgtd
+}
+
+define <4 x i1> @test_icmp_v4i1_sle(<4 x i1> %a, <4 x i1> %b) {
+entry:
+  %res = icmp sle <4 x i1> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i1_sle:
+; CHECK: pcmpgtd
+; CHECK: pxor
+}
+
+define <4 x i1> @test_icmp_v4i1_slt(<4 x i1> %a, <4 x i1> %b) {
+entry:
+  %res = icmp slt <4 x i1> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i1_slt:
+; CHECK: pcmpgtd
+}
+
+define <4 x i1> @test_icmp_v4i1_uge(<4 x i1> %a, <4 x i1> %b) {
+entry:
+  %res = icmp uge <4 x i1> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i1_uge:
+; CHECK: pxor
+; CHECK: pcmpgtd
+; CHECK: pxor
+}
+
+define <4 x i1> @test_icmp_v4i1_ugt(<4 x i1> %a, <4 x i1> %b) {
+entry:
+  %res = icmp ugt <4 x i1> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i1_ugt:
+; CHECK: pxor
+; CHECK: pcmpgtd
+}
+
+define <4 x i1> @test_icmp_v4i1_ule(<4 x i1> %a, <4 x i1> %b) {
+entry:
+  %res = icmp ule <4 x i1> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i1_ule:
+; CHECK: pxor
+; CHECK: pcmpgtd
+; CHECK: pxor
+}
+
+define <4 x i1> @test_icmp_v4i1_ult(<4 x i1> %a, <4 x i1> %b) {
+entry:
+  %res = icmp ult <4 x i1> %a, %b
+  ret <4 x i1> %res
+; CHECK-LABEL: test_icmp_v4i1_ult:
+; CHECK: pxor
+; CHECK: pcmpgtd
+}
+
+define <8 x i1> @test_icmp_v8i16_eq(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %res = icmp eq <8 x i16> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i16_eq:
+; CHECK: pcmpeqw
+}
+
+define <8 x i1> @test_icmp_v8i16_ne(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %res = icmp ne <8 x i16> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i16_ne:
+; CHECK: pcmpeqw
+; CHECK: pxor
+}
+
+define <8 x i1> @test_icmp_v8i16_sgt(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %res = icmp sgt <8 x i16> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i16_sgt:
+; CHECK: pcmpgtw
+}
+
+define <8 x i1> @test_icmp_v8i16_sle(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %res = icmp sle <8 x i16> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i16_sle:
+; CHECK: pcmpgtw
+; CHECK: pxor
+}
+
+define <8 x i1> @test_icmp_v8i16_slt(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %res = icmp slt <8 x i16> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i16_slt:
+; CHECK: pcmpgtw
+}
+
+define <8 x i1> @test_icmp_v8i16_uge(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %res = icmp uge <8 x i16> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i16_uge:
+; CHECK: pxor
+; CHECK: pcmpgtw
+; CHECK: pxor
+}
+
+define <8 x i1> @test_icmp_v8i16_ugt(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %res = icmp ugt <8 x i16> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i16_ugt:
+; CHECK: pxor
+; CHECK: pcmpgtw
+}
+
+define <8 x i1> @test_icmp_v8i16_ule(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %res = icmp ule <8 x i16> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i16_ule:
+; CHECK: pxor
+; CHECK: pcmpgtw
+; CHECK: pxor
+}
+
+define <8 x i1> @test_icmp_v8i16_ult(<8 x i16> %a, <8 x i16> %b) {
+entry:
+  %res = icmp ult <8 x i16> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i16_ult:
+; CHECK: pxor
+; CHECK: pcmpgtw
+}
+
+define <8 x i1> @test_icmp_v8i1_eq(<8 x i1> %a, <8 x i1> %b) {
+entry:
+  %res = icmp eq <8 x i1> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i1_eq:
+; CHECK: pcmpeqw
+}
+
+define <8 x i1> @test_icmp_v8i1_ne(<8 x i1> %a, <8 x i1> %b) {
+entry:
+  %res = icmp ne <8 x i1> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i1_ne:
+; CHECK: pcmpeqw
+; CHECK: pxor
+}
+
+define <8 x i1> @test_icmp_v8i1_sgt(<8 x i1> %a, <8 x i1> %b) {
+entry:
+  %res = icmp sgt <8 x i1> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i1_sgt:
+; CHECK: pcmpgtw
+}
+
+define <8 x i1> @test_icmp_v8i1_sle(<8 x i1> %a, <8 x i1> %b) {
+entry:
+  %res = icmp sle <8 x i1> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i1_sle:
+; CHECK: pcmpgtw
+; CHECK: pxor
+}
+
+define <8 x i1> @test_icmp_v8i1_slt(<8 x i1> %a, <8 x i1> %b) {
+entry:
+  %res = icmp slt <8 x i1> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i1_slt:
+; CHECK: pcmpgtw
+}
+
+define <8 x i1> @test_icmp_v8i1_uge(<8 x i1> %a, <8 x i1> %b) {
+entry:
+  %res = icmp uge <8 x i1> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i1_uge:
+; CHECK: pxor
+; CHECK: pcmpgtw
+; CHECK: pxor
+}
+
+define <8 x i1> @test_icmp_v8i1_ugt(<8 x i1> %a, <8 x i1> %b) {
+entry:
+  %res = icmp ugt <8 x i1> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i1_ugt:
+; CHECK: pxor
+; CHECK: pcmpgtw
+}
+
+define <8 x i1> @test_icmp_v8i1_ule(<8 x i1> %a, <8 x i1> %b) {
+entry:
+  %res = icmp ule <8 x i1> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i1_ule:
+; CHECK: pxor
+; CHECK: pcmpgtw
+; CHECK: pxor
+}
+
+define <8 x i1> @test_icmp_v8i1_ult(<8 x i1> %a, <8 x i1> %b) {
+entry:
+  %res = icmp ult <8 x i1> %a, %b
+  ret <8 x i1> %res
+; CHECK-LABEL: test_icmp_v8i1_ult:
+; CHECK: pxor
+; CHECK: pcmpgtw
+}
+
+define <16 x i1> @test_icmp_v16i8_eq(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %res = icmp eq <16 x i8> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i8_eq:
+; CHECK: pcmpeqb
+}
+
+define <16 x i1> @test_icmp_v16i8_ne(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %res = icmp ne <16 x i8> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i8_ne:
+; CHECK: pcmpeqb
+; CHECK: pxor
+}
+
+define <16 x i1> @test_icmp_v16i8_sgt(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %res = icmp sgt <16 x i8> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i8_sgt:
+; CHECK: pcmpgtb
+}
+
+define <16 x i1> @test_icmp_v16i8_sle(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %res = icmp sle <16 x i8> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i8_sle:
+; CHECK: pcmpgtb
+; CHECK: pxor
+}
+
+define <16 x i1> @test_icmp_v16i8_slt(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %res = icmp slt <16 x i8> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i8_slt:
+; CHECK: pcmpgtb
+}
+
+define <16 x i1> @test_icmp_v16i8_uge(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %res = icmp uge <16 x i8> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i8_uge:
+; CHECK: pxor
+; CHECK: pcmpgtb
+; CHECK: pxor
+}
+
+define <16 x i1> @test_icmp_v16i8_ugt(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %res = icmp ugt <16 x i8> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i8_ugt:
+; CHECK: pxor
+; CHECK: pcmpgtb
+}
+
+define <16 x i1> @test_icmp_v16i8_ule(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %res = icmp ule <16 x i8> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i8_ule:
+; CHECK: pxor
+; CHECK: pcmpgtb
+; CHECK: pxor
+}
+
+define <16 x i1> @test_icmp_v16i8_ult(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %res = icmp ult <16 x i8> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i8_ult:
+; CHECK: pxor
+; CHECK: pcmpgtb
+}
+
+define <16 x i1> @test_icmp_v16i1_eq(<16 x i1> %a, <16 x i1> %b) {
+entry:
+  %res = icmp eq <16 x i1> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i1_eq:
+; CHECK: pcmpeqb
+}
+
+define <16 x i1> @test_icmp_v16i1_ne(<16 x i1> %a, <16 x i1> %b) {
+entry:
+  %res = icmp ne <16 x i1> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i1_ne:
+; CHECK: pcmpeqb
+; CHECK: pxor
+}
+
+define <16 x i1> @test_icmp_v16i1_sgt(<16 x i1> %a, <16 x i1> %b) {
+entry:
+  %res = icmp sgt <16 x i1> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i1_sgt:
+; CHECK: pcmpgtb
+}
+
+define <16 x i1> @test_icmp_v16i1_sle(<16 x i1> %a, <16 x i1> %b) {
+entry:
+  %res = icmp sle <16 x i1> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i1_sle:
+; CHECK: pcmpgtb
+; CHECK: pxor
+}
+
+define <16 x i1> @test_icmp_v16i1_slt(<16 x i1> %a, <16 x i1> %b) {
+entry:
+  %res = icmp slt <16 x i1> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i1_slt:
+; CHECK: pcmpgtb
+}
+
+define <16 x i1> @test_icmp_v16i1_uge(<16 x i1> %a, <16 x i1> %b) {
+entry:
+  %res = icmp uge <16 x i1> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i1_uge:
+; CHECK: pxor
+; CHECK: pcmpgtb
+; CHECK: pxor
+}
+
+define <16 x i1> @test_icmp_v16i1_ugt(<16 x i1> %a, <16 x i1> %b) {
+entry:
+  %res = icmp ugt <16 x i1> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i1_ugt:
+; CHECK: pxor
+; CHECK: pcmpgtb
+}
+
+define <16 x i1> @test_icmp_v16i1_ule(<16 x i1> %a, <16 x i1> %b) {
+entry:
+  %res = icmp ule <16 x i1> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i1_ule:
+; CHECK: pxor
+; CHECK: pcmpgtb
+; CHECK: pxor
+}
+
+define <16 x i1> @test_icmp_v16i1_ult(<16 x i1> %a, <16 x i1> %b) {
+entry:
+  %res = icmp ult <16 x i1> %a, %b
+  ret <16 x i1> %res
+; CHECK-LABEL: test_icmp_v16i1_ult:
+; CHECK: pxor
+; CHECK: pcmpgtb
+}
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ