Lower the select instruction when the operands are of vector type.

Select of vectors is implemented by appropriately masking and
combining the inputs with sign extend / bitwise operations
and without the use of branches.

BUG=none
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/417653004
diff --git a/crosstest/runtests.sh b/crosstest/runtests.sh
index 06c43f4..7b81df1 100755
--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -74,6 +74,13 @@
     ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
         --dir="${OUTDIR}" \
         --llvm-bin-path="${LLVM_BIN_PATH}" \
+        --test=test_select.ll \
+        --driver=test_select_main.cpp \
+        --output=test_select_O${optlevel}
+
+    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
+        --dir="${OUTDIR}" \
+        --llvm-bin-path="${LLVM_BIN_PATH}" \
         --test=test_stacksave.c \
         --driver=test_stacksave_main.c \
         --output=test_stacksave_O${optlevel}
@@ -107,6 +114,7 @@
     "${OUTDIR}"/test_fcmp_O${optlevel}
     "${OUTDIR}"/test_global_O${optlevel}
     "${OUTDIR}"/test_icmp_O${optlevel}
+    "${OUTDIR}"/test_select_O${optlevel}
     "${OUTDIR}"/test_stacksave_O${optlevel}
     "${OUTDIR}"/test_sync_atomic_O${optlevel}
     "${OUTDIR}"/test_vector_ops_O${optlevel}
diff --git a/crosstest/test_select.h b/crosstest/test_select.h
new file mode 100644
index 0000000..6e890a0
--- /dev/null
+++ b/crosstest/test_select.h
@@ -0,0 +1,26 @@
+//===- subzero/crosstest/test_select.h - Test prototypes -----*- C++ -*----===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
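+// This file declares the function prototypes for cross testing the select
+// bitcode instruction.  It must not gain an include guard: the test driver
+// includes it twice, at global scope and inside namespace Subzero_.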
+//===----------------------------------------------------------------------===//
+
+#include "vectors.h"
+
+v4f32 select(v4si32 cond, v4f32 val1, v4f32 val2);
+v4si32 select(v4si32 cond, v4si32 val1, v4si32 val2);
+v4ui32 select(v4si32 cond, v4ui32 val1, v4ui32 val2);
+v8si16 select(v8si16 cond, v8si16 val1, v8si16 val2);
+v8ui16 select(v8si16 cond, v8ui16 val1, v8ui16 val2);
+v16si8 select(v16si8 cond, v16si8 val1, v16si8 val2);
+v16ui8 select(v16si8 cond, v16ui8 val1, v16ui8 val2);
+v4si32 select_i1(v4si32 cond, v4si32 val1, v4si32 val2);
+v8si16 select_i1(v8si16 cond, v8si16 val1, v8si16 val2);
+v16si8 select_i1(v16si8 cond, v16si8 val1, v16si8 val2);
diff --git a/crosstest/test_select.ll b/crosstest/test_select.ll
new file mode 100644
index 0000000..e286b2d
--- /dev/null
+++ b/crosstest/test_select.ll
@@ -0,0 +1,80 @@
+target triple = "i686-pc-linux-gnu"
+; Implements v4f32 select(v4si32, v4f32, v4f32); cond lanes are cut to i1.
+define <4 x float> @_Z6selectDv4_iDv4_fS0_(<4 x i32> %cond.ext, <4 x float> %arg1, <4 x float> %arg2) {
+entry:
+  %cond = trunc <4 x i32> %cond.ext to <4 x i1>
+  %res = select <4 x i1> %cond, <4 x float> %arg1, <4 x float> %arg2
+  ret <4 x float> %res
+}
+; Implements v4si32 select(v4si32, v4si32, v4si32).
+define <4 x i32> @_Z6selectDv4_iS_S_(<4 x i32> %cond.ext, <4 x i32> %arg1, <4 x i32> %arg2) {
+entry:
+  %cond = trunc <4 x i32> %cond.ext to <4 x i1>
+  %res = select <4 x i1> %cond, <4 x i32> %arg1, <4 x i32> %arg2
+  ret <4 x i32> %res
+}
+; Implements v4ui32 select(v4si32, v4ui32, v4ui32).
+define <4 x i32> @_Z6selectDv4_iDv4_jS0_(<4 x i32> %cond.ext, <4 x i32> %arg1, <4 x i32> %arg2) {
+entry:
+  %cond = trunc <4 x i32> %cond.ext to <4 x i1>
+  %res = select <4 x i1> %cond, <4 x i32> %arg1, <4 x i32> %arg2
+  ret <4 x i32> %res
+}
+; Implements v8si16 select(v8si16, v8si16, v8si16).
+define <8 x i16> @_Z6selectDv8_sS_S_(<8 x i16> %cond.ext, <8 x i16> %arg1, <8 x i16> %arg2) {
+entry:
+  %cond = trunc <8 x i16> %cond.ext to <8 x i1>
+  %res = select <8 x i1> %cond, <8 x i16> %arg1, <8 x i16> %arg2
+  ret <8 x i16> %res
+}
+; Implements v8ui16 select(v8si16, v8ui16, v8ui16).
+define <8 x i16> @_Z6selectDv8_sDv8_tS0_(<8 x i16> %cond.ext, <8 x i16> %arg1, <8 x i16> %arg2) {
+entry:
+  %cond = trunc <8 x i16> %cond.ext to <8 x i1>
+  %res = select <8 x i1> %cond, <8 x i16> %arg1, <8 x i16> %arg2
+  ret <8 x i16> %res
+}
+; Implements v16si8 select(v16si8, v16si8, v16si8).
+define <16 x i8> @_Z6selectDv16_aS_S_(<16 x i8> %cond.ext, <16 x i8> %arg1, <16 x i8> %arg2) {
+entry:
+  %cond = trunc <16 x i8> %cond.ext to <16 x i1>
+  %res = select <16 x i1> %cond, <16 x i8> %arg1, <16 x i8> %arg2
+  ret <16 x i8> %res
+}
+; Implements v16ui8 select(v16si8, v16ui8, v16ui8).
+define <16 x i8> @_Z6selectDv16_aDv16_hS0_(<16 x i8> %cond.ext, <16 x i8> %arg1, <16 x i8> %arg2) {
+entry:
+  %cond = trunc <16 x i8> %cond.ext to <16 x i1>
+  %res = select <16 x i1> %cond, <16 x i8> %arg1, <16 x i8> %arg2
+  ret <16 x i8> %res
+}
+; Implements v4si32 select_i1(...): selects on i1 lanes, then sign extends.
+define <4 x i32> @_Z9select_i1Dv4_iS_S_(<4 x i32> %cond.ext, <4 x i32> %arg1.ext, <4 x i32> %arg2.ext) {
+entry:
+  %cond = trunc <4 x i32> %cond.ext to <4 x i1>
+  %arg1 = trunc <4 x i32> %arg1.ext to <4 x i1>
+  %arg2 = trunc <4 x i32> %arg2.ext to <4 x i1>
+  %res.trunc = select <4 x i1> %cond, <4 x i1> %arg1, <4 x i1> %arg2
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+}
+; Implements v8si16 select_i1(...): selects on i1 lanes, then sign extends.
+define <8 x i16> @_Z9select_i1Dv8_sS_S_(<8 x i16> %cond.ext, <8 x i16> %arg1.ext, <8 x i16> %arg2.ext) {
+entry:
+  %cond = trunc <8 x i16> %cond.ext to <8 x i1>
+  %arg1 = trunc <8 x i16> %arg1.ext to <8 x i1>
+  %arg2 = trunc <8 x i16> %arg2.ext to <8 x i1>
+  %res.trunc = select <8 x i1> %cond, <8 x i1> %arg1, <8 x i1> %arg2
+  %res = sext <8 x i1> %res.trunc to <8 x i16>
+  ret <8 x i16> %res
+}
+; Implements v16si8 select_i1(...): selects on i1 lanes, then sign extends.
+define <16 x i8> @_Z9select_i1Dv16_aS_S_(<16 x i8> %cond.ext, <16 x i8> %arg1.ext, <16 x i8> %arg2.ext) {
+entry:
+  %cond = trunc <16 x i8> %cond.ext to <16 x i1>
+  %arg1 = trunc <16 x i8> %arg1.ext to <16 x i1>
+  %arg2 = trunc <16 x i8> %arg2.ext to <16 x i1>
+  %res.trunc = select <16 x i1> %cond, <16 x i1> %arg1, <16 x i1> %arg2
+  %res = sext <16 x i1> %res.trunc to <16 x i8>
+  ret <16 x i8> %res
+}
diff --git a/crosstest/test_select_main.cpp b/crosstest/test_select_main.cpp
new file mode 100644
index 0000000..7456469
--- /dev/null
+++ b/crosstest/test_select_main.cpp
@@ -0,0 +1,168 @@
+//===- subzero/crosstest/test_select_main.cpp - Driver for tests ----------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Driver for crosstesting the select bitcode instruction
+//
+//===----------------------------------------------------------------------===//
+
+/* crosstest.py --test=test_select.ll  --driver=test_select_main.cpp \
+   --prefix=Subzero_ --output=test_select */
+
+#include <cfloat>
+#include <cstring>
+#include <iostream>
+
+#include "test_select.h"
+
+namespace Subzero_ {
+#include "test_select.h"
+}
+
+static const size_t MaxTestsPerFunc = 100000;
+
+template <typename T, typename TI1>  // Cross-checks select() for vector type T.
+void testSelect(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  typedef typename Vectors<T>::Ty Ty;      // type of the value operands
+  typedef typename Vectors<TI1>::Ty TyI1;  // type of the condition operand
+  volatile unsigned Values[] = {  // candidate lane values for Value1/Value2
+    0x0,        0x1,        0x7ffffffe, 0x7fffffff,
+    0x80000000, 0x80000001, 0xfffffffe, 0xffffffff,
+    0x7e,       0x7f,       0x80,       0x81,
+    0xfe,       0xff,       0x100,      0x101,
+    0x7ffe,     0x7fff,     0x8000,     0x8001,
+    0xfffe,     0xffff,     0x10000,    0x10001
+  };
+  static const size_t NumValues = sizeof(Values) / sizeof(*Values);
+  static const size_t NumElements = Vectors<T>::NumElements;
+  PRNG Index;  // drives both the condition bits and the indices into Values
+  for (size_t i = 0; i < MaxTestsPerFunc; ++i) {  // one random test per pass
+    TyI1 Cond;
+    Ty Value1, Value2;
+    for (size_t j = 0; j < NumElements; ++j) {
+      Cond[j] = Index() % 2;  // every condition lane is 0 or 1
+      Value1[j] = Values[Index() % NumValues];
+      Value2[j] = Values[Index() % NumValues];
+    }
+    Ty ResultLlc = select(Cond, Value1, Value2);  // global-scope select()
+    Ty ResultSz = Subzero_::select(Cond, Value1, Value2);  // Subzero_ select()
+    ++TotalTests;
+    if (!memcmp(&ResultLlc, &ResultSz, sizeof(ResultLlc))) {  // bytewise equal
+      ++Passes;
+    } else {
+      ++Failures;  // on a mismatch, print the inputs and both results
+      std::cout << "select<" << Vectors<T>::TypeName << ">(Cond=";
+      std::cout << vectAsString<TI1>(Cond)
+                << ", Value1=" << vectAsString<T>(Value1)
+                << ", Value2=" << vectAsString<T>(Value2) << ")\n";
+      std::cout << "llc=" << vectAsString<T>(ResultLlc) << "\n";
+      std::cout << "sz =" << vectAsString<T>(ResultSz) << "\n";
+    }
+  }
+}
+
+template<> void  // Explicit specialization: select() on float value lanes.
+testSelect<v4f32, v4i1>(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  static const float NegInf = -1.0 / 0.0;
+  static const float PosInf = 1.0 / 0.0;
+  static const float Nan = 0.0 / 0.0;
+  static const float NegNan = -0.0 / 0.0;
+  volatile float Values[] = {  // every entry is converted to float on init
+    0,                    1,                    0x7e,
+    0x7f,                 0x80,                 0x81,
+    0xfe,                 0xff,                 0x7ffe,
+    0x7fff,               0x8000,               0x8001,
+    0xfffe,               0xffff,               0x7ffffffe,
+    0x7fffffff,           0x80000000,           0x80000001,
+    0xfffffffe,           0xffffffff,           0x100000000ll,
+    0x100000001ll,        0x7ffffffffffffffell, 0x7fffffffffffffffll,
+    0x8000000000000000ll, 0x8000000000000001ll, 0xfffffffffffffffell,
+    0xffffffffffffffffll, NegInf,               PosInf,
+    Nan,                  NegNan,               -0.0,
+    FLT_MIN,              FLT_MAX,              DBL_MIN,
+    DBL_MAX
+  };
+  static const size_t NumValues = sizeof(Values) / sizeof(*Values);
+  static const size_t NumElements = 4;  // lanes filled per vector below
+  PRNG Index;  // drives both the condition bits and the indices into Values
+  for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
+    v4si32 Cond;
+    v4f32 Value1, Value2;
+    for (size_t j = 0; j < NumElements; ++j) {
+      Cond[j] = Index() % 2;  // every condition lane is 0 or 1
+      Value1[j] = Values[Index() % NumValues];
+      Value2[j] = Values[Index() % NumValues];
+    }
+    v4f32 ResultLlc = select(Cond, Value1, Value2);  // global-scope select()
+    v4f32 ResultSz = Subzero_::select(Cond, Value1, Value2);  // Subzero_ select()
+    ++TotalTests;
+    if (!memcmp(&ResultLlc, &ResultSz, sizeof(ResultLlc))) {  // bytewise, so equal NaN bits pass
+      ++Passes;
+    } else {
+      ++Failures;  // on a mismatch, print the inputs and both results
+      std::cout << "select<v4f32>(Cond=";
+      std::cout << vectAsString<v4i1>(Cond)
+                << ", Value1=" << vectAsString<v4f32>(Value1)
+                << ", Value2=" << vectAsString<v4f32>(Value2) << ")\n";
+      std::cout << "llc=" << vectAsString<v4f32>(ResultLlc) << "\n";
+      std::cout << "sz =" << vectAsString<v4f32>(ResultSz) << "\n";
+    }
+  }
+}
+
+// Compares select_i1() with Subzero_::select_i1() on random 0/1 vector lanes.
+template <typename T>
+void testSelectI1(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  typedef typename Vectors<T>::Ty Ty;
+  static const size_t NumElements = Vectors<T>::NumElements;
+  PRNG Index;  // drives the condition and both value operands
+  for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
+    Ty Cond, Value1, Value2;
+    for (size_t j = 0; j < NumElements; ++j) {
+      Cond[j] = Index() % 2;
+      Value1[j] = Index() % 2;
+      Value2[j] = Index() % 2;
+    }
+    Ty ResultLlc = select_i1(Cond, Value1, Value2);
+    Ty ResultSz = Subzero_::select_i1(Cond, Value1, Value2);
+    ++TotalTests;
+    if (!memcmp(&ResultLlc, &ResultSz, sizeof(ResultLlc))) {  // bytewise equal
+      ++Passes;
+    } else {
+      ++Failures;  // name the function under test: select_i1, not select
+      std::cout << "select_i1<" << Vectors<T>::TypeName
+                << ">(Cond=" << vectAsString<T>(Cond)
+                << ", Value1=" << vectAsString<T>(Value1)
+                << ", Value2=" << vectAsString<T>(Value2) << ")\n";
+      std::cout << "llc=" << vectAsString<T>(ResultLlc) << "\n";
+      std::cout << "sz =" << vectAsString<T>(ResultSz) << "\n";
+    }
+  }
+}
+
+// Runs every select cross test, prints the totals, and returns 1 on failure.
+int main(int argc, char *argv[]) {
+  size_t TotalTests = 0, Passes = 0, Failures = 0;
+
+  testSelect<v4f32, v4i1>(TotalTests, Passes, Failures);
+  testSelect<v4si32, v4i1>(TotalTests, Passes, Failures);
+  testSelect<v4ui32, v4i1>(TotalTests, Passes, Failures);
+  testSelect<v8si16, v8i1>(TotalTests, Passes, Failures);
+  testSelect<v8ui16, v8i1>(TotalTests, Passes, Failures);
+  testSelect<v16si8, v16i1>(TotalTests, Passes, Failures);
+  testSelect<v16ui8, v16i1>(TotalTests, Passes, Failures);
+  testSelectI1<v4i1>(TotalTests, Passes, Failures);
+  testSelectI1<v8i1>(TotalTests, Passes, Failures);
+  testSelectI1<v16i1>(TotalTests, Passes, Failures);
+
+  std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
+            << " Failures=" << Failures << "\n";
+  // An exit status keeps only its low 8 bits, so returning the raw count
+  // would report success whenever Failures is a multiple of 256.
+  return Failures != 0;
+}
diff --git a/src/IceInst.cpp b/src/IceInst.cpp
index 004b555..6893856 100644
--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -376,7 +376,7 @@
 InstSelect::InstSelect(Cfg *Func, Variable *Dest, Operand *Condition,
                        Operand *SourceTrue, Operand *SourceFalse)
     : Inst(Func, Inst::Select, 3, Dest) {
-  assert(Condition->getType() == IceType_i1);
+  assert(typeElementType(Condition->getType()) == IceType_i1);
   addSource(Condition);
   addSource(SourceTrue);
   addSource(SourceFalse);
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index f1a68da..93a872c 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -458,6 +458,7 @@
 template <> const char *InstX8632Psub::Opcode = "psub";
 template <> const char *InstX8632And::Opcode = "and";
 template <> const char *InstX8632Pand::Opcode = "pand";
+template <> const char *InstX8632Pandn::Opcode = "pandn";
 template <> const char *InstX8632Or::Opcode = "or";
 template <> const char *InstX8632Por::Opcode = "por";
 template <> const char *InstX8632Xor::Opcode = "xor";
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index b50199b..ce1cc65 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -174,6 +174,7 @@
     Or,
     Padd,
     Pand,
+    Pandn,
     Pcmpeq,
     Pcmpgt,
     Pextrw,
@@ -564,6 +565,7 @@
 typedef InstX8632Binop<InstX8632::Psub> InstX8632Psub;
 typedef InstX8632Binop<InstX8632::And> InstX8632And;
 typedef InstX8632Binop<InstX8632::Pand> InstX8632Pand;
+typedef InstX8632Binop<InstX8632::Pandn> InstX8632Pandn;
 typedef InstX8632Binop<InstX8632::Or> InstX8632Or;
 typedef InstX8632Binop<InstX8632::Por> InstX8632Por;
 typedef InstX8632Binop<InstX8632::Xor> InstX8632Xor;
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 71b4c17..c8cf170 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -3410,11 +3410,46 @@
 }
 
 void TargetX8632::lowerSelect(const InstSelect *Inst) {
-  // a=d?b:c ==> cmp d,0; a=b; jne L1; FakeUse(a); a=c; L1:
   Variable *Dest = Inst->getDest();
   Operand *SrcT = Inst->getTrueOperand();
   Operand *SrcF = Inst->getFalseOperand();
-  Operand *Condition = legalize(Inst->getCondition());
+  Operand *Condition = Inst->getCondition();
+
+  if (isVectorType(Dest->getType())) {
+    // a=d?b:c ==> d=sext(d); a=(b&d)|(c&~d)
+    // TODO(wala): SSE4.1 has blendvps and pblendvb.  SSE4.1 also has
+    // blendps and pblendw for constant condition operands.
+    Type SrcTy = SrcT->getType();
+    Variable *T = makeReg(SrcTy);
+    Variable *T2 = makeReg(SrcTy);
+    // Sign extend the condition operand if applicable.
+    if (SrcTy == IceType_v4f32) {
+      // The sext operation takes only integer arguments.
+      Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode());
+      lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
+      _movp(T, T3);
+    } else if (typeElementType(SrcTy) != IceType_i1) {
+      lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
+    } else {
+      _movp(T, Condition);
+    }
+    // ALIGNHACK: Until stack alignment support is implemented, the
+    // bitwise vector instructions need to have both operands in
+    // registers.  Once there is support for stack alignment, LEGAL_HACK
+    // can be removed.
+#define LEGAL_HACK(Vect) legalizeToVar((Vect))
+    _movp(T2, T);
+    _pand(T, LEGAL_HACK(SrcT));
+    _pandn(T2, LEGAL_HACK(SrcF));
+    _por(T, T2);
+    _movp(Dest, T);
+#undef LEGAL_HACK
+
+    return;
+  }
+
+  // a=d?b:c ==> cmp d,0; a=b; jne L1; FakeUse(a); a=c; L1:
+  Operand *ConditionRMI = legalize(Condition);
   Constant *Zero = Ctx->getConstantZero(IceType_i32);
   InstX8632Label *Label = InstX8632Label::create(Func, this);
 
@@ -3423,7 +3458,7 @@
     Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
     Operand *SrcLoRI = legalize(loOperand(SrcT), Legal_Reg | Legal_Imm, true);
     Operand *SrcHiRI = legalize(hiOperand(SrcT), Legal_Reg | Legal_Imm, true);
-    _cmp(Condition, Zero);
+    _cmp(ConditionRMI, Zero);
     _mov(DestLo, SrcLoRI);
     _mov(DestHi, SrcHiRI);
     _br(InstX8632Br::Br_ne, Label);
@@ -3436,7 +3471,7 @@
     _mov(DestLo, SrcLoRI);
     _mov(DestHi, SrcHiRI);
   } else {
-    _cmp(Condition, Zero);
+    _cmp(ConditionRMI, Zero);
     SrcT = legalize(SrcT, Legal_Reg | Legal_Imm, true);
     _mov(Dest, SrcT);
     _br(InstX8632Br::Br_ne, Label);
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 864881f..cbc98ce 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -304,6 +304,9 @@
   void _pand(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Pand::create(Func, Dest, Src0));
   }
+  void _pandn(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Pandn::create(Func, Dest, Src0));
+  }
   void _pcmpeq(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Pcmpeq::create(Func, Dest, Src0));
   }
diff --git a/tests_lit/llvm2ice_tests/vector-select.ll b/tests_lit/llvm2ice_tests/vector-select.ll
new file mode 100644
index 0000000..93f5941
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/vector-select.ll
@@ -0,0 +1,85 @@
+; This file tests support for the select instruction with vector valued inputs.
+; The patterns below match substrings, so the bare pand one also matches pandn.
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -Om1 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
+; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
+; RUN:                           | FileCheck --check-prefix=DUMP %s
+
+define <16 x i8> @test_select_v16i8(<16 x i1> %cond, <16 x i8> %arg1, <16 x i8> %arg2) {
+entry:
+  %res = select <16 x i1> %cond, <16 x i8> %arg1, <16 x i8> %arg2
+  ret <16 x i8> %res
+; CHECK-LABEL: test_select_v16i8:
+; CHECK: pand
+; CHECK: pandn
+; CHECK: por
+}
+
+define <16 x i1> @test_select_v16i1(<16 x i1> %cond, <16 x i1> %arg1, <16 x i1> %arg2) {
+entry:
+  %res = select <16 x i1> %cond, <16 x i1> %arg1, <16 x i1> %arg2
+  ret <16 x i1> %res
+; CHECK-LABEL: test_select_v16i1:
+; CHECK: pand
+; CHECK: pandn
+; CHECK: por
+}
+
+define <8 x i16> @test_select_v8i16(<8 x i1> %cond, <8 x i16> %arg1, <8 x i16> %arg2) {
+entry:
+  %res = select <8 x i1> %cond, <8 x i16> %arg1, <8 x i16> %arg2
+  ret <8 x i16> %res
+; CHECK-LABEL: test_select_v8i16:
+; CHECK: pand
+; CHECK: pandn
+; CHECK: por
+}
+
+define <8 x i1> @test_select_v8i1(<8 x i1> %cond, <8 x i1> %arg1, <8 x i1> %arg2) {
+entry:
+  %res = select <8 x i1> %cond, <8 x i1> %arg1, <8 x i1> %arg2
+  ret <8 x i1> %res
+; CHECK-LABEL: test_select_v8i1:
+; CHECK: pand
+; CHECK: pandn
+; CHECK: por
+}
+
+define <4 x i32> @test_select_v4i32(<4 x i1> %cond, <4 x i32> %arg1, <4 x i32> %arg2) {
+entry:
+  %res = select <4 x i1> %cond, <4 x i32> %arg1, <4 x i32> %arg2
+  ret <4 x i32> %res
+; CHECK-LABEL: test_select_v4i32:
+; CHECK: pand
+; CHECK: pandn
+; CHECK: por
+}
+
+define <4 x float> @test_select_v4f32(<4 x i1> %cond, <4 x float> %arg1, <4 x float> %arg2) {
+entry:
+  %res = select <4 x i1> %cond, <4 x float> %arg1, <4 x float> %arg2
+  ret <4 x float> %res
+; CHECK-LABEL: test_select_v4f32:
+; CHECK: pand
+; CHECK: pandn
+; CHECK: por
+}
+
+define <4 x i1> @test_select_v4i1(<4 x i1> %cond, <4 x i1> %arg1, <4 x i1> %arg2) {
+entry:
+  %res = select <4 x i1> %cond, <4 x i1> %arg1, <4 x i1> %arg2
+  ret <4 x i1> %res
+; CHECK-LABEL: test_select_v4i1:
+; CHECK: pand
+; CHECK: pandn
+; CHECK: por
+}
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ