Lower the select instruction when the operands are of vector type.
Select of vectors is implemented by appropriately masking and
combining the inputs with sign extend / bitwise operations
and without the use of branches.
BUG=none
R=jvoung@chromium.org, stichnot@chromium.org
Review URL: https://codereview.chromium.org/417653004
diff --git a/crosstest/runtests.sh b/crosstest/runtests.sh
index 06c43f4..7b81df1 100755
--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -74,6 +74,13 @@
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
--dir="${OUTDIR}" \
--llvm-bin-path="${LLVM_BIN_PATH}" \
+ --test=test_select.ll \
+ --driver=test_select_main.cpp \
+ --output=test_select_O${optlevel}
+
+ ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
+ --dir="${OUTDIR}" \
+ --llvm-bin-path="${LLVM_BIN_PATH}" \
--test=test_stacksave.c \
--driver=test_stacksave_main.c \
--output=test_stacksave_O${optlevel}
@@ -107,6 +114,7 @@
"${OUTDIR}"/test_fcmp_O${optlevel}
"${OUTDIR}"/test_global_O${optlevel}
"${OUTDIR}"/test_icmp_O${optlevel}
+ "${OUTDIR}"/test_select_O${optlevel}
"${OUTDIR}"/test_stacksave_O${optlevel}
"${OUTDIR}"/test_sync_atomic_O${optlevel}
"${OUTDIR}"/test_vector_ops_O${optlevel}
diff --git a/crosstest/test_select.h b/crosstest/test_select.h
new file mode 100644
index 0000000..6e890a0
--- /dev/null
+++ b/crosstest/test_select.h
@@ -0,0 +1,26 @@
+//===- subzero/crosstest/test_select.h - Test prototypes -----*- C++ -*----===//
+//
+// The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the function prototypes for cross testing the select
+// bitcode instruction.
+//
+//===----------------------------------------------------------------------===//
+
+#include "vectors.h"
+
+v4f32 select(v4si32 cond, v4f32 val1, v4f32 val2); // test_select.ll truncates each cond lane to i1
+v4si32 select(v4si32 cond, v4si32 val1, v4si32 val2);
+v4ui32 select(v4si32 cond, v4ui32 val1, v4ui32 val2);
+v8si16 select(v8si16 cond, v8si16 val1, v8si16 val2);
+v8ui16 select(v8si16 cond, v8ui16 val1, v8ui16 val2);
+v16si8 select(v16si8 cond, v16si8 val1, v16si8 val2);
+v16ui8 select(v16si8 cond, v16ui8 val1, v16ui8 val2);
+v4si32 select_i1(v4si32 cond, v4si32 val1, v4si32 val2); // test_select.ll truncates all operands to i1, sign-extends the result
+v8si16 select_i1(v8si16 cond, v8si16 val1, v8si16 val2);
+v16si8 select_i1(v16si8 cond, v16si8 val1, v16si8 val2);
diff --git a/crosstest/test_select.ll b/crosstest/test_select.ll
new file mode 100644
index 0000000..e286b2d
--- /dev/null
+++ b/crosstest/test_select.ll
@@ -0,0 +1,80 @@
+target triple = "i686-pc-linux-gnu"
+
+define <4 x float> @_Z6selectDv4_iDv4_fS0_(<4 x i32> %mask.wide, <4 x float> %on.true, <4 x float> %on.false) {
+entry:                                  ; C++ overload: select(v4si32, v4f32, v4f32)
+  %mask = trunc <4 x i32> %mask.wide to <4 x i1>
+  %picked = select <4 x i1> %mask, <4 x float> %on.true, <4 x float> %on.false
+  ret <4 x float> %picked
+}
+
+define <4 x i32> @_Z6selectDv4_iS_S_(<4 x i32> %mask.wide, <4 x i32> %on.true, <4 x i32> %on.false) {
+entry:                                  ; C++ overload: select(v4si32, v4si32, v4si32)
+  %mask = trunc <4 x i32> %mask.wide to <4 x i1>
+  %picked = select <4 x i1> %mask, <4 x i32> %on.true, <4 x i32> %on.false
+  ret <4 x i32> %picked
+}
+
+define <4 x i32> @_Z6selectDv4_iDv4_jS0_(<4 x i32> %mask.wide, <4 x i32> %on.true, <4 x i32> %on.false) {
+entry:                                  ; C++ overload: select(v4si32, v4ui32, v4ui32)
+  %mask = trunc <4 x i32> %mask.wide to <4 x i1>
+  %picked = select <4 x i1> %mask, <4 x i32> %on.true, <4 x i32> %on.false
+  ret <4 x i32> %picked
+}
+
+define <8 x i16> @_Z6selectDv8_sS_S_(<8 x i16> %mask.wide, <8 x i16> %on.true, <8 x i16> %on.false) {
+entry:                                  ; C++ overload: select(v8si16, v8si16, v8si16)
+  %mask = trunc <8 x i16> %mask.wide to <8 x i1>
+  %picked = select <8 x i1> %mask, <8 x i16> %on.true, <8 x i16> %on.false
+  ret <8 x i16> %picked
+}
+
+define <8 x i16> @_Z6selectDv8_sDv8_tS0_(<8 x i16> %mask.wide, <8 x i16> %on.true, <8 x i16> %on.false) {
+entry:                                  ; C++ overload: select(v8si16, v8ui16, v8ui16)
+  %mask = trunc <8 x i16> %mask.wide to <8 x i1>
+  %picked = select <8 x i1> %mask, <8 x i16> %on.true, <8 x i16> %on.false
+  ret <8 x i16> %picked
+}
+
+define <16 x i8> @_Z6selectDv16_aS_S_(<16 x i8> %mask.wide, <16 x i8> %on.true, <16 x i8> %on.false) {
+entry:                                  ; C++ overload: select(v16si8, v16si8, v16si8)
+  %mask = trunc <16 x i8> %mask.wide to <16 x i1>
+  %picked = select <16 x i1> %mask, <16 x i8> %on.true, <16 x i8> %on.false
+  ret <16 x i8> %picked
+}
+
+define <16 x i8> @_Z6selectDv16_aDv16_hS0_(<16 x i8> %mask.wide, <16 x i8> %on.true, <16 x i8> %on.false) {
+entry:                                  ; C++ overload: select(v16si8, v16ui8, v16ui8)
+  %mask = trunc <16 x i8> %mask.wide to <16 x i1>
+  %picked = select <16 x i1> %mask, <16 x i8> %on.true, <16 x i8> %on.false
+  ret <16 x i8> %picked
+}
+
+define <4 x i32> @_Z9select_i1Dv4_iS_S_(<4 x i32> %mask.wide, <4 x i32> %on.true.wide, <4 x i32> %on.false.wide) {
+entry:                                  ; C++ overload: select_i1(v4si32, v4si32, v4si32)
+  %mask = trunc <4 x i32> %mask.wide to <4 x i1>
+  %on.true = trunc <4 x i32> %on.true.wide to <4 x i1>
+  %on.false = trunc <4 x i32> %on.false.wide to <4 x i1>
+  %picked.i1 = select <4 x i1> %mask, <4 x i1> %on.true, <4 x i1> %on.false
+  %picked = sext <4 x i1> %picked.i1 to <4 x i32>   ; widen the i1 result for the C++ caller
+  ret <4 x i32> %picked
+}
+
+define <8 x i16> @_Z9select_i1Dv8_sS_S_(<8 x i16> %mask.wide, <8 x i16> %on.true.wide, <8 x i16> %on.false.wide) {
+entry:                                  ; C++ overload: select_i1(v8si16, v8si16, v8si16)
+  %mask = trunc <8 x i16> %mask.wide to <8 x i1>
+  %on.true = trunc <8 x i16> %on.true.wide to <8 x i1>
+  %on.false = trunc <8 x i16> %on.false.wide to <8 x i1>
+  %picked.i1 = select <8 x i1> %mask, <8 x i1> %on.true, <8 x i1> %on.false
+  %picked = sext <8 x i1> %picked.i1 to <8 x i16>   ; widen the i1 result for the C++ caller
+  ret <8 x i16> %picked
+}
+
+define <16 x i8> @_Z9select_i1Dv16_aS_S_(<16 x i8> %mask.wide, <16 x i8> %on.true.wide, <16 x i8> %on.false.wide) {
+entry:                                  ; C++ overload: select_i1(v16si8, v16si8, v16si8)
+  %mask = trunc <16 x i8> %mask.wide to <16 x i1>
+  %on.true = trunc <16 x i8> %on.true.wide to <16 x i1>
+  %on.false = trunc <16 x i8> %on.false.wide to <16 x i1>
+  %picked.i1 = select <16 x i1> %mask, <16 x i1> %on.true, <16 x i1> %on.false
+  %picked = sext <16 x i1> %picked.i1 to <16 x i8>   ; widen the i1 result for the C++ caller
+  ret <16 x i8> %picked
+}
diff --git a/crosstest/test_select_main.cpp b/crosstest/test_select_main.cpp
new file mode 100644
index 0000000..7456469
--- /dev/null
+++ b/crosstest/test_select_main.cpp
@@ -0,0 +1,168 @@
+//===- subzero/crosstest/test_select_main.cpp - Driver for tests ----------===//
+//
+// The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Driver for crosstesting the select bitcode instruction
+//
+//===----------------------------------------------------------------------===//
+
+/* crosstest.py --test=test_select.ll --driver=test_select_main.cpp \
+ --prefix=Subzero_ --output=test_select */
+
+#include <cfloat>
+#include <cstring>
+#include <iostream>
+
+#include "test_select.h"
+
+namespace Subzero_ {
+#include "test_select.h"
+}
+
+static const size_t MaxTestsPerFunc = 100000;
+
+template <typename T, typename TI1>
+void testSelect(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  // Cross-checks the unprefixed select() (llc) against Subzero_::select() (sz).
+  typedef typename Vectors<T>::Ty VecTy;
+  typedef typename Vectors<TI1>::Ty MaskTy;
+  // Operand pool: patterns around the 8-, 16- and 32-bit boundaries.
+  volatile unsigned Values[] = {
+      0x0,        0x1,        0x7ffffffe, 0x7fffffff, 0x80000000, 0x80000001,
+      0xfffffffe, 0xffffffff, 0x7e,       0x7f,       0x80,       0x81,
+      0xfe,       0xff,       0x100,      0x101,      0x7ffe,     0x7fff,
+      0x8000,     0x8001,     0xfffe,     0xffff,     0x10000,    0x10001
+  };
+  static const size_t NumValues = sizeof(Values) / sizeof(Values[0]);
+  static const size_t NumElements = Vectors<T>::NumElements;
+  PRNG Rand;
+  for (size_t Iter = 0; Iter != MaxTestsPerFunc; ++Iter) {
+    MaskTy Cond;
+    VecTy Value1, Value2;
+    for (size_t Lane = 0; Lane != NumElements; ++Lane) {
+      Cond[Lane] = Rand() % 2;
+      Value1[Lane] = Values[Rand() % NumValues];
+      Value2[Lane] = Values[Rand() % NumValues];
+    }
+    VecTy Expected = select(Cond, Value1, Value2);
+    VecTy Actual = Subzero_::select(Cond, Value1, Value2);
+    ++TotalTests;
+    if (memcmp(&Expected, &Actual, sizeof(Expected)) == 0) {
+      ++Passes;
+      continue;
+    }
+    ++Failures;
+    std::cout << "select<" << Vectors<T>::TypeName << ">(Cond="
+              << vectAsString<TI1>(Cond)
+              << ", Value1=" << vectAsString<T>(Value1)
+              << ", Value2=" << vectAsString<T>(Value2) << ")\n";
+    std::cout << "llc=" << vectAsString<T>(Expected) << "\n";
+    std::cout << "sz =" << vectAsString<T>(Actual) << "\n";
+  }
+}
+
+template<> void
+testSelect<v4f32, v4i1>(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  // Float specialization: same cross-check as the generic template, with an
+  // operand pool of integer-valued, infinite, NaN, signed-zero and limit values.
+  static const float NegInf = -1.0 / 0.0;
+  static const float PosInf = 1.0 / 0.0;
+  static const float Nan = 0.0 / 0.0;
+  static const float NegNan = -0.0 / 0.0;
+  volatile float Values[] = {
+      0, 1, 0x7e, 0x7f, 0x80, 0x81, 0xfe, 0xff,
+      0x7ffe, 0x7fff, 0x8000, 0x8001, 0xfffe, 0xffff,
+      0x7ffffffe, 0x7fffffff, 0x80000000, 0x80000001,
+      0xfffffffe, 0xffffffff, 0x100000000ll, 0x100000001ll,
+      0x7ffffffffffffffell, 0x7fffffffffffffffll,
+      0x8000000000000000ll, 0x8000000000000001ll,
+      0xfffffffffffffffell, 0xffffffffffffffffll,
+      NegInf, PosInf, Nan, NegNan, -0.0,
+      FLT_MIN, FLT_MAX,
+      DBL_MIN, DBL_MAX
+  };
+  static const size_t NumValues = sizeof(Values) / sizeof(Values[0]);
+  static const size_t NumElements = 4;
+  PRNG Rand;
+  for (size_t Iter = 0; Iter != MaxTestsPerFunc; ++Iter) {
+    v4si32 Cond;
+    v4f32 Value1, Value2;
+    for (size_t Lane = 0; Lane != NumElements; ++Lane) {
+      Cond[Lane] = Rand() % 2;
+      Value1[Lane] = Values[Rand() % NumValues];
+      Value2[Lane] = Values[Rand() % NumValues];
+    }
+    v4f32 Expected = select(Cond, Value1, Value2);
+    v4f32 Actual = Subzero_::select(Cond, Value1, Value2);
+    ++TotalTests;
+    // Bitwise comparison, so identical NaN bit patterns count as a match.
+    if (memcmp(&Expected, &Actual, sizeof(Expected)) == 0) {
+      ++Passes;
+      continue;
+    }
+    ++Failures;
+    std::cout << "select<v4f32>(Cond="
+              << vectAsString<v4i1>(Cond)
+              << ", Value1=" << vectAsString<v4f32>(Value1)
+              << ", Value2=" << vectAsString<v4f32>(Value2) << ")\n";
+    std::cout << "llc=" << vectAsString<v4f32>(Expected) << "\n";
+    std::cout << "sz =" << vectAsString<v4f32>(Actual) << "\n";
+  }
+}
+
+// Cross-checks select_i1() against Subzero_::select_i1() with every lane of
+// the condition and of both operands set to a random 0 or 1.
+template<typename T>
+void testSelectI1(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  typedef typename Vectors<T>::Ty Ty;
+  PRNG Index;
+  for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
+    Ty Cond, Value1, Value2;
+    for (size_t j = 0; j < Vectors<T>::NumElements; ++j) {
+      Cond[j] = Index() % 2;
+      Value1[j] = Index() % 2;
+      Value2[j] = Index() % 2;
+    }
+    Ty ResultLlc = select_i1(Cond, Value1, Value2);
+    Ty ResultSz = Subzero_::select_i1(Cond, Value1, Value2);
+    ++TotalTests;
+    if (!memcmp(&ResultLlc, &ResultSz, sizeof(ResultLlc))) {
+      ++Passes;
+    } else {
+      ++Failures;
+      std::cout << "select_i1<" << Vectors<T>::TypeName << ">(Cond=";
+      std::cout << vectAsString<T>(Cond)
+                << ", Value1=" << vectAsString<T>(Value1)
+                << ", Value2=" << vectAsString<T>(Value2) << ")\n";
+      std::cout << "llc=" << vectAsString<T>(ResultLlc) << "\n";
+      std::cout << "sz =" << vectAsString<T>(ResultSz) << "\n";
+    }
+  }
+}
+
+// Runs every select crosstest, prints the totals, and exits non-zero on failure.
+int main(int argc, char *argv[]) {
+  size_t TotalTests = 0, Passes = 0, Failures = 0;
+
+  testSelect<v4f32, v4i1>(TotalTests, Passes, Failures);
+  testSelect<v4si32, v4i1>(TotalTests, Passes, Failures);
+  testSelect<v4ui32, v4i1>(TotalTests, Passes, Failures);
+  testSelect<v8si16, v8i1>(TotalTests, Passes, Failures);
+  testSelect<v8ui16, v8i1>(TotalTests, Passes, Failures);
+  testSelect<v16si8, v16i1>(TotalTests, Passes, Failures);
+  testSelect<v16ui8, v16i1>(TotalTests, Passes, Failures);
+  testSelectI1<v4i1>(TotalTests, Passes, Failures);
+  testSelectI1<v8i1>(TotalTests, Passes, Failures);
+  testSelectI1<v16i1>(TotalTests, Passes, Failures);
+
+  std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
+            << " Failures=" << Failures << "\n";
+
+  // On POSIX only the low 8 bits of the status reach the parent, so a raw
+  // count that is a multiple of 256 would read as success; return a flag.
+  return Failures != 0 ? 1 : 0;
+}
diff --git a/src/IceInst.cpp b/src/IceInst.cpp
index 004b555..6893856 100644
--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -376,7 +376,7 @@
InstSelect::InstSelect(Cfg *Func, Variable *Dest, Operand *Condition,
Operand *SourceTrue, Operand *SourceFalse)
: Inst(Func, Inst::Select, 3, Dest) {
- assert(Condition->getType() == IceType_i1);
+ assert(typeElementType(Condition->getType()) == IceType_i1);
addSource(Condition);
addSource(SourceTrue);
addSource(SourceFalse);
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index f1a68da..93a872c 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -458,6 +458,7 @@
template <> const char *InstX8632Psub::Opcode = "psub";
template <> const char *InstX8632And::Opcode = "and";
template <> const char *InstX8632Pand::Opcode = "pand";
+template <> const char *InstX8632Pandn::Opcode = "pandn";
template <> const char *InstX8632Or::Opcode = "or";
template <> const char *InstX8632Por::Opcode = "por";
template <> const char *InstX8632Xor::Opcode = "xor";
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index b50199b..ce1cc65 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -174,6 +174,7 @@
Or,
Padd,
Pand,
+ Pandn,
Pcmpeq,
Pcmpgt,
Pextrw,
@@ -564,6 +565,7 @@
typedef InstX8632Binop<InstX8632::Psub> InstX8632Psub;
typedef InstX8632Binop<InstX8632::And> InstX8632And;
typedef InstX8632Binop<InstX8632::Pand> InstX8632Pand;
+typedef InstX8632Binop<InstX8632::Pandn> InstX8632Pandn;
typedef InstX8632Binop<InstX8632::Or> InstX8632Or;
typedef InstX8632Binop<InstX8632::Por> InstX8632Por;
typedef InstX8632Binop<InstX8632::Xor> InstX8632Xor;
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 71b4c17..c8cf170 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -3410,11 +3410,46 @@
}
void TargetX8632::lowerSelect(const InstSelect *Inst) {
- // a=d?b:c ==> cmp d,0; a=b; jne L1; FakeUse(a); a=c; L1:
Variable *Dest = Inst->getDest();
Operand *SrcT = Inst->getTrueOperand();
Operand *SrcF = Inst->getFalseOperand();
- Operand *Condition = legalize(Inst->getCondition());
+ Operand *Condition = Inst->getCondition();
+
+ if (isVectorType(Dest->getType())) {
+ // a=d?b:c ==> d=sext(d); a=(b&d)|(c&~d)
+ // TODO(wala): SSE4.1 has blendvps and pblendvb. SSE4.1 also has
+ // blendps and pblendw for constant condition operands.
+ Type SrcTy = SrcT->getType();
+ Variable *T = makeReg(SrcTy);
+ Variable *T2 = makeReg(SrcTy);
+ // Sign extend the condition operand if applicable.
+ if (SrcTy == IceType_v4f32) {
+ // The sext operation takes only integer arguments.
+ Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode());
+ lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
+ _movp(T, T3);
+ } else if (typeElementType(SrcTy) != IceType_i1) {
+ lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
+ } else {
+ _movp(T, Condition);
+ }
+ // ALIGNHACK: Until stack alignment support is implemented, the
+ // bitwise vector instructions need to have both operands in
+ // registers. Once there is support for stack alignment, LEGAL_HACK
+ // can be removed.
+#define LEGAL_HACK(Vect) legalizeToVar((Vect))
+ _movp(T2, T);
+ _pand(T, LEGAL_HACK(SrcT));
+ _pandn(T2, LEGAL_HACK(SrcF));
+ _por(T, T2);
+ _movp(Dest, T);
+#undef LEGAL_HACK
+
+ return;
+ }
+
+ // a=d?b:c ==> cmp d,0; a=b; jne L1; FakeUse(a); a=c; L1:
+ Operand *ConditionRMI = legalize(Condition);
Constant *Zero = Ctx->getConstantZero(IceType_i32);
InstX8632Label *Label = InstX8632Label::create(Func, this);
@@ -3423,7 +3458,7 @@
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Operand *SrcLoRI = legalize(loOperand(SrcT), Legal_Reg | Legal_Imm, true);
Operand *SrcHiRI = legalize(hiOperand(SrcT), Legal_Reg | Legal_Imm, true);
- _cmp(Condition, Zero);
+ _cmp(ConditionRMI, Zero);
_mov(DestLo, SrcLoRI);
_mov(DestHi, SrcHiRI);
_br(InstX8632Br::Br_ne, Label);
@@ -3436,7 +3471,7 @@
_mov(DestLo, SrcLoRI);
_mov(DestHi, SrcHiRI);
} else {
- _cmp(Condition, Zero);
+ _cmp(ConditionRMI, Zero);
SrcT = legalize(SrcT, Legal_Reg | Legal_Imm, true);
_mov(Dest, SrcT);
_br(InstX8632Br::Br_ne, Label);
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 864881f..cbc98ce 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -304,6 +304,9 @@
void _pand(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Pand::create(Func, Dest, Src0));
}
+ void _pandn(Variable *Dest, Operand *Src0) { // inserts an InstX8632Pandn; lowerSelect uses it for the (c & ~d) term
+ Context.insert(InstX8632Pandn::create(Func, Dest, Src0));
+ }
void _pcmpeq(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Pcmpeq::create(Func, Dest, Src0));
}
diff --git a/tests_lit/llvm2ice_tests/vector-select.ll b/tests_lit/llvm2ice_tests/vector-select.ll
new file mode 100644
index 0000000..93f5941
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/vector-select.ll
@@ -0,0 +1,85 @@
+; This file tests support for the select instruction with vector-valued inputs.
+
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s \
+; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -Om1 --verbose none %s \
+; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
+; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
+; RUN: | FileCheck --check-prefix=DUMP %s
+
+define <16 x i8> @test_select_v16i8(<16 x i1> %mask, <16 x i8> %lhs, <16 x i8> %rhs) {
+entry:
+  %sel = select <16 x i1> %mask, <16 x i8> %lhs, <16 x i8> %rhs
+  ret <16 x i8> %sel
+; CHECK-LABEL: test_select_v16i8:
+; CHECK: pand
+; CHECK: pandn
+; CHECK: por
+}
+
+define <16 x i1> @test_select_v16i1(<16 x i1> %mask, <16 x i1> %lhs, <16 x i1> %rhs) {
+entry:
+  %sel = select <16 x i1> %mask, <16 x i1> %lhs, <16 x i1> %rhs
+  ret <16 x i1> %sel
+; CHECK-LABEL: test_select_v16i1:
+; CHECK: pand
+; CHECK: pandn
+; CHECK: por
+}
+
+define <8 x i16> @test_select_v8i16(<8 x i1> %mask, <8 x i16> %lhs, <8 x i16> %rhs) {
+entry:
+  %sel = select <8 x i1> %mask, <8 x i16> %lhs, <8 x i16> %rhs
+  ret <8 x i16> %sel
+; CHECK-LABEL: test_select_v8i16:
+; CHECK: pand
+; CHECK: pandn
+; CHECK: por
+}
+
+define <8 x i1> @test_select_v8i1(<8 x i1> %mask, <8 x i1> %lhs, <8 x i1> %rhs) {
+entry:
+  %sel = select <8 x i1> %mask, <8 x i1> %lhs, <8 x i1> %rhs
+  ret <8 x i1> %sel
+; CHECK-LABEL: test_select_v8i1:
+; CHECK: pand
+; CHECK: pandn
+; CHECK: por
+}
+
+define <4 x i32> @test_select_v4i32(<4 x i1> %mask, <4 x i32> %lhs, <4 x i32> %rhs) {
+entry:
+  %sel = select <4 x i1> %mask, <4 x i32> %lhs, <4 x i32> %rhs
+  ret <4 x i32> %sel
+; CHECK-LABEL: test_select_v4i32:
+; CHECK: pand
+; CHECK: pandn
+; CHECK: por
+}
+
+define <4 x float> @test_select_v4f32(<4 x i1> %mask, <4 x float> %lhs, <4 x float> %rhs) {
+entry:
+  %sel = select <4 x i1> %mask, <4 x float> %lhs, <4 x float> %rhs
+  ret <4 x float> %sel
+; CHECK-LABEL: test_select_v4f32:
+; CHECK: pand
+; CHECK: pandn
+; CHECK: por
+}
+
+define <4 x i1> @test_select_v4i1(<4 x i1> %mask, <4 x i1> %lhs, <4 x i1> %rhs) {
+entry:
+  %sel = select <4 x i1> %mask, <4 x i1> %lhs, <4 x i1> %rhs
+  ret <4 x i1> %sel
+; CHECK-LABEL: test_select_v4i1:
+; CHECK: pand
+; CHECK: pandn
+; CHECK: por
+}
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ