Subzero: Add support for SSE4.1 instructions.
* Add initial support for code generation with SSE4.1 instructions. The
following operations are affected:
- multiplication with v4i32
- select
- insertelement
- extractelement
* Add appropriate lit checks for SSE4.1 instructions. Run the crosstests
in both SSE2 and SSE4.1 modes.
* Introduce the -mattr flag to llvm2ice to control which instruction set
gets used (sse2, the default, or sse4.1).
BUG=none
R=jvoung@chromium.org, stichnot@chromium.org
Review URL: https://codereview.chromium.org/427843002
diff --git a/crosstest/crosstest.py b/crosstest/crosstest.py
index c8e9442..9b64399 100755
--- a/crosstest/crosstest.py
+++ b/crosstest/crosstest.py
@@ -43,6 +43,9 @@
metavar='OPTLEVEL',
help='Optimization level ' +
'(m1 and -1 are equivalent)')
+ argparser.add_argument('--mattr', required=False, default='sse2',
+ dest='attr', choices=['sse2', 'sse4.1'],
+ metavar='ATTRIBUTE', help='Target attribute')
argparser.add_argument('--prefix', required=True,
metavar='SZ_PREFIX',
help='String prepended to Subzero symbol names')
@@ -93,6 +96,7 @@
obj_llc = os.path.join(args.dir, base + '.llc.o')
shellcmd(['../llvm2ice',
'-O' + args.optlevel,
+ '-mattr=' + args.attr,
'--target=' + args.target,
'--prefix=' + args.prefix,
'-o=' + asm_sz,
diff --git a/crosstest/runtests.sh b/crosstest/runtests.sh
index 7b81df1..0b79f48 100755
--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -6,116 +6,144 @@
set -eux
OPTLEVELS="m1 2"
+ATTRIBUTES="sse2 sse4.1"
OUTDIR=Output
# Clean the output directory to avoid reusing stale results.
rm -rf "${OUTDIR}"
mkdir -p "${OUTDIR}"
for optlevel in ${OPTLEVELS} ; do
+ for attribute in ${ATTRIBUTES} ; do
- ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
- --dir="${OUTDIR}" \
- --llvm-bin-path="${LLVM_BIN_PATH}" \
- --test=simple_loop.c \
- --driver=simple_loop_main.c \
- --output=simple_loop_O${optlevel}
+ ./crosstest.py -O${optlevel} --mattr ${attribute} \
+ --prefix=Subzero_ \
+ --target=x8632 \
+ --dir="${OUTDIR}" \
+ --llvm-bin-path="${LLVM_BIN_PATH}" \
+ --test=simple_loop.c \
+ --driver=simple_loop_main.c \
+ --output=simple_loop_O${optlevel}_${attribute}
- ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
- --dir="${OUTDIR}" \
- --llvm-bin-path="${LLVM_BIN_PATH}" \
- --test=mem_intrin.cpp \
- --driver=mem_intrin_main.cpp \
- --output=mem_intrin_O${optlevel}
+ ./crosstest.py -O${optlevel} --mattr ${attribute} \
+ --prefix=Subzero_ \
+ --target=x8632 \
+ --dir="${OUTDIR}" \
+ --llvm-bin-path="${LLVM_BIN_PATH}" \
+ --test=mem_intrin.cpp \
+ --driver=mem_intrin_main.cpp \
+ --output=mem_intrin_O${optlevel}_${attribute}
- ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
- --dir="${OUTDIR}" \
- --llvm-bin-path="${LLVM_BIN_PATH}" \
- --test=test_arith.cpp \
- --test=test_arith_frem.ll \
- --test=test_arith_sqrt.ll \
- --driver=test_arith_main.cpp \
- --output=test_arith_O${optlevel}
+ ./crosstest.py -O${optlevel} --mattr ${attribute} \
+ --prefix=Subzero_ \
+ --target=x8632 \
+ --dir="${OUTDIR}" \
+ --llvm-bin-path="${LLVM_BIN_PATH}" \
+ --test=test_arith.cpp \
+ --test=test_arith_frem.ll \
+ --test=test_arith_sqrt.ll \
+ --driver=test_arith_main.cpp \
+ --output=test_arith_O${optlevel}_${attribute}
- ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
- --dir="${OUTDIR}" \
- --llvm-bin-path="${LLVM_BIN_PATH}" \
- --test=test_bitmanip.cpp --test=test_bitmanip_intrin.ll \
- --driver=test_bitmanip_main.cpp \
- --output=test_bitmanip_O${optlevel}
+ ./crosstest.py -O${optlevel} --mattr ${attribute} \
+ --prefix=Subzero_ \
+ --target=x8632 \
+ --dir="${OUTDIR}" \
+ --llvm-bin-path="${LLVM_BIN_PATH}" \
+ --test=test_bitmanip.cpp --test=test_bitmanip_intrin.ll \
+ --driver=test_bitmanip_main.cpp \
+ --output=test_bitmanip_O${optlevel}_${attribute}
- ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
- --dir="${OUTDIR}" \
- --llvm-bin-path="${LLVM_BIN_PATH}" \
- --test=test_cast.cpp --test=test_cast_to_u1.ll \
- --driver=test_cast_main.cpp \
- --output=test_cast_O${optlevel}
+ ./crosstest.py -O${optlevel} --mattr ${attribute} \
+ --prefix=Subzero_ \
+ --target=x8632 \
+ --dir="${OUTDIR}" \
+ --llvm-bin-path="${LLVM_BIN_PATH}" \
+ --test=test_cast.cpp --test=test_cast_to_u1.ll \
+ --driver=test_cast_main.cpp \
+ --output=test_cast_O${optlevel}_${attribute}
- ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
- --dir="${OUTDIR}" \
- --llvm-bin-path="${LLVM_BIN_PATH}" \
- --test=test_fcmp.pnacl.ll \
- --driver=test_fcmp_main.cpp \
- --output=test_fcmp_O${optlevel}
+ ./crosstest.py -O${optlevel} --mattr ${attribute} \
+ --prefix=Subzero_ \
+ --target=x8632 \
+ --dir="${OUTDIR}" \
+ --llvm-bin-path="${LLVM_BIN_PATH}" \
+ --test=test_fcmp.pnacl.ll \
+ --driver=test_fcmp_main.cpp \
+ --output=test_fcmp_O${optlevel}_${attribute}
- ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
- --dir="${OUTDIR}" \
- --llvm-bin-path="${LLVM_BIN_PATH}" \
- --test=test_global.cpp \
- --driver=test_global_main.cpp \
- --output=test_global_O${optlevel}
+ ./crosstest.py -O${optlevel} --mattr ${attribute} \
+ --prefix=Subzero_ \
+ --target=x8632 \
+ --dir="${OUTDIR}" \
+ --llvm-bin-path="${LLVM_BIN_PATH}" \
+ --test=test_global.cpp \
+ --driver=test_global_main.cpp \
+ --output=test_global_O${optlevel}_${attribute}
- ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
- --dir="${OUTDIR}" \
- --llvm-bin-path="${LLVM_BIN_PATH}" \
- --test=test_icmp.cpp --test=test_icmp_i1vec.ll \
- --driver=test_icmp_main.cpp \
- --output=test_icmp_O${optlevel}
+ ./crosstest.py -O${optlevel} --mattr ${attribute} \
+ --prefix=Subzero_ \
+ --target=x8632 \
+ --dir="${OUTDIR}" \
+ --llvm-bin-path="${LLVM_BIN_PATH}" \
+ --test=test_icmp.cpp --test=test_icmp_i1vec.ll \
+ --driver=test_icmp_main.cpp \
+ --output=test_icmp_O${optlevel}_${attribute}
- ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
- --dir="${OUTDIR}" \
- --llvm-bin-path="${LLVM_BIN_PATH}" \
- --test=test_select.ll \
- --driver=test_select_main.cpp \
- --output=test_select_O${optlevel}
+ ./crosstest.py -O${optlevel} --mattr ${attribute} \
+ --prefix=Subzero_ \
+ --target=x8632 \
+ --dir="${OUTDIR}" \
+ --llvm-bin-path="${LLVM_BIN_PATH}" \
+ --test=test_select.ll \
+ --driver=test_select_main.cpp \
+ --output=test_select_O${optlevel}_${attribute}
- ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
- --dir="${OUTDIR}" \
- --llvm-bin-path="${LLVM_BIN_PATH}" \
- --test=test_stacksave.c \
- --driver=test_stacksave_main.c \
- --output=test_stacksave_O${optlevel}
+ ./crosstest.py -O${optlevel} --mattr ${attribute} \
+ --prefix=Subzero_ \
+ --target=x8632 \
+ --dir="${OUTDIR}" \
+ --llvm-bin-path="${LLVM_BIN_PATH}" \
+ --test=test_stacksave.c \
+ --driver=test_stacksave_main.c \
+ --output=test_stacksave_O${optlevel}_${attribute}
- # Compile the non-subzero object files straight from source
- # since the native LLVM backend does not understand how to
- # lower NaCl-specific intrinsics.
- ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
- --dir="${OUTDIR}" \
- --llvm-bin-path="${LLVM_BIN_PATH}" \
- --test=test_sync_atomic.cpp \
- --crosstest-bitcode=0 \
- --driver=test_sync_atomic_main.cpp \
- --output=test_sync_atomic_O${optlevel}
+ # Compile the non-subzero object files straight from source
+ # since the native LLVM backend does not understand how to
+ # lower NaCl-specific intrinsics.
+ ./crosstest.py -O${optlevel} --mattr ${attribute} \
+ --prefix=Subzero_ \
+ --target=x8632 \
+ --dir="${OUTDIR}" \
+ --llvm-bin-path="${LLVM_BIN_PATH}" \
+ --test=test_sync_atomic.cpp \
+ --crosstest-bitcode=0 \
+ --driver=test_sync_atomic_main.cpp \
+ --output=test_sync_atomic_O${optlevel}_${attribute}
- ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
- --dir="${OUTDIR}" \
- --llvm-bin-path="${LLVM_BIN_PATH}" \
- --test=test_vector_ops.ll \
- --driver=test_vector_ops_main.cpp \
- --output=test_vector_ops_O${optlevel}
+ ./crosstest.py -O${optlevel} --mattr ${attribute} \
+ --prefix=Subzero_ --target=x8632 \
+ --dir="${OUTDIR}" \
+ --llvm-bin-path="${LLVM_BIN_PATH}" \
+ --test=test_vector_ops.ll \
+ --driver=test_vector_ops_main.cpp \
+ --output=test_vector_ops_O${optlevel}_${attribute}
+ done
done
for optlevel in ${OPTLEVELS} ; do
- "${OUTDIR}"/simple_loop_O${optlevel}
- "${OUTDIR}"/mem_intrin_O${optlevel}
- "${OUTDIR}"/test_arith_O${optlevel}
- "${OUTDIR}"/test_bitmanip_O${optlevel}
- "${OUTDIR}"/test_cast_O${optlevel}
- "${OUTDIR}"/test_fcmp_O${optlevel}
- "${OUTDIR}"/test_global_O${optlevel}
- "${OUTDIR}"/test_icmp_O${optlevel}
- "${OUTDIR}"/test_select_O${optlevel}
- "${OUTDIR}"/test_stacksave_O${optlevel}
- "${OUTDIR}"/test_sync_atomic_O${optlevel}
- "${OUTDIR}"/test_vector_ops_O${optlevel}
+ for attribute in ${ATTRIBUTES}; do
+ "${OUTDIR}"/simple_loop_O${optlevel}_${attribute}
+ "${OUTDIR}"/mem_intrin_O${optlevel}_${attribute}
+ "${OUTDIR}"/test_arith_O${optlevel}_${attribute}
+ "${OUTDIR}"/test_bitmanip_O${optlevel}_${attribute}
+ "${OUTDIR}"/test_cast_O${optlevel}_${attribute}
+ "${OUTDIR}"/test_fcmp_O${optlevel}_${attribute}
+ "${OUTDIR}"/test_global_O${optlevel}_${attribute}
+ "${OUTDIR}"/test_icmp_O${optlevel}_${attribute}
+ "${OUTDIR}"/test_select_O${optlevel}_${attribute}
+ "${OUTDIR}"/test_stacksave_O${optlevel}_${attribute}
+ "${OUTDIR}"/test_sync_atomic_O${optlevel}_${attribute}
+ "${OUTDIR}"/test_vector_ops_O${optlevel}_${attribute}
+ done
done
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 7d930c2..be84554 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -484,7 +484,7 @@
template <> const char *InstX8632Imul::Opcode = "imul";
template <> const char *InstX8632Mulps::Opcode = "mulps";
template <> const char *InstX8632Mulss::Opcode = "mulss";
-template <> const char *InstX8632Pmullw::Opcode = "pmullw";
+template <> const char *InstX8632Pmull::Opcode = "pmull";
template <> const char *InstX8632Pmuludq::Opcode = "pmuludq";
template <> const char *InstX8632Div::Opcode = "div";
template <> const char *InstX8632Divps::Opcode = "divps";
@@ -500,10 +500,13 @@
template <> const char *InstX8632Pcmpgt::Opcode = "pcmpgt";
template <> const char *InstX8632Movss::Opcode = "movss";
// Ternary ops
+template <> const char *InstX8632Insertps::Opcode = "insertps";
template <> const char *InstX8632Shufps::Opcode = "shufps";
-template <> const char *InstX8632Pinsrw::Opcode = "pinsrw";
+template <> const char *InstX8632Pinsr::Opcode = "pinsr";
+template <> const char *InstX8632Blendvps::Opcode = "blendvps";
+template <> const char *InstX8632Pblendvb::Opcode = "pblendvb";
// Three address ops
-template <> const char *InstX8632Pextrw::Opcode = "pextrw";
+template <> const char *InstX8632Pextr::Opcode = "pextr";
template <> const char *InstX8632Pshufd::Opcode = "pshufd";
template <> void InstX8632Sqrtss::emit(const Cfg *Func) const {
@@ -532,6 +535,23 @@
emitTwoAddress(buf, this, Func);
}
+template <> void InstX8632Pmull::emit(const Cfg *Func) const {
+ char buf[30];
+ bool TypesAreValid = getDest()->getType() == IceType_v4i32 ||
+ getDest()->getType() == IceType_v8i16;
+ bool InstructionSetIsValid =
+ getDest()->getType() == IceType_v8i16 ||
+ static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet() >=
+ TargetX8632::SSE4_1;
+ (void)TypesAreValid;
+ (void)InstructionSetIsValid;
+ assert(TypesAreValid);
+ assert(InstructionSetIsValid);
+ snprintf(buf, llvm::array_lengthof(buf), "pmull%s",
+ TypeX8632Attributes[getDest()->getType()].PackString);
+ emitTwoAddress(buf, this, Func);
+}
+
template <> void InstX8632Subss::emit(const Cfg *Func) const {
char buf[30];
snprintf(buf, llvm::array_lengthof(buf), "sub%s",
@@ -553,12 +573,6 @@
emitTwoAddress(buf, this, Func);
}
-template <> void InstX8632Pmullw::emit(const Cfg *Func) const {
- assert(getSrc(0)->getType() == IceType_v8i16 &&
- getSrc(1)->getType() == IceType_v8i16);
- emitTwoAddress(Opcode, this, Func);
-}
-
template <> void InstX8632Pmuludq::emit(const Cfg *Func) const {
assert(getSrc(0)->getType() == IceType_v4i32 &&
getSrc(1)->getType() == IceType_v4i32);
@@ -588,6 +602,38 @@
Str << "\n";
}
+
+namespace {
+
+// pblendvb and blendvps take xmm0 as a final implicit argument.
+void emitVariableBlendInst(const char *Opcode, const Inst *Inst,
+ const Cfg *Func) {
+ Ostream &Str = Func->getContext()->getStrEmit();
+ assert(Inst->getSrcSize() == 3);
+ assert(llvm::isa<Variable>(Inst->getSrc(2)));
+ assert(llvm::cast<Variable>(Inst->getSrc(2))->getRegNum() ==
+ TargetX8632::Reg_xmm0);
+ Str << "\t" << Opcode << "\t";
+ Inst->getDest()->emit(Func);
+ Str << ", ";
+ Inst->getSrc(1)->emit(Func);
+ Str << "\n";
+}
+
+} // end anonymous namespace
+
+template <> void InstX8632Blendvps::emit(const Cfg *Func) const {
+ assert(static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet() >=
+ TargetX8632::SSE4_1);
+ emitVariableBlendInst(Opcode, this, Func);
+}
+
+template <> void InstX8632Pblendvb::emit(const Cfg *Func) const {
+ assert(static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet() >=
+ TargetX8632::SSE4_1);
+ emitVariableBlendInst(Opcode, this, Func);
+}
+
template <> void InstX8632Imul::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 2);
@@ -1127,13 +1173,19 @@
emitTwoAddress(buf, this, Func);
}
-template <> void InstX8632Pextrw::emit(const Cfg *Func) const {
+template <> void InstX8632Pextr::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 2);
- Str << "\t" << Opcode << "\t";
+ // pextrb and pextrd are SSE4.1 instructions.
+ assert(getSrc(0)->getType() == IceType_v8i16 ||
+ getSrc(0)->getType() == IceType_v8i1 ||
+ static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet()
+ >= TargetX8632::SSE4_1);
+ Str << "\t" << Opcode
+ << TypeX8632Attributes[getSrc(0)->getType()].PackString << "\t";
Variable *Dest = getDest();
- assert(Dest->hasReg() && Dest->getType() == IceType_i16);
- // pextrw takes r32 dest.
+ // pextrw must take a register dest.
+ assert(Dest->getType() != IceType_i16 || Dest->hasReg());
Dest->asType(IceType_i32).emit(Func);
Str << ", ";
getSrc(0)->emit(Func);
@@ -1142,16 +1194,26 @@
Str << "\n";
}
-template <> void InstX8632Pinsrw::emit(const Cfg *Func) const {
+template <> void InstX8632Pinsr::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 3);
- Str << "\t" << Opcode << "\t";
+ // pinsrb and pinsrd are SSE4.1 instructions.
+ assert(getDest()->getType() == IceType_v8i16 ||
+ getDest()->getType() == IceType_v8i1 ||
+ static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet()
+ >= TargetX8632::SSE4_1);
+ Str << "\t" << Opcode
+ << TypeX8632Attributes[getDest()->getType()].PackString << "\t";
getDest()->emit(Func);
Str << ", ";
Operand *Src1 = getSrc(1);
if (Variable *VSrc1 = llvm::dyn_cast<Variable>(Src1)) {
- // If src1 is a register, it should be r32.
- VSrc1->asType(VSrc1->hasReg() ? IceType_i32 : IceType_i16).emit(Func);
+ // If src1 is a register, it should always be r32.
+ if (VSrc1->hasReg()) {
+ VSrc1->asType(IceType_i32).emit(Func);
+ } else {
+ VSrc1->emit(Func);
+ }
} else {
Src1->emit(Func);
}
@@ -1216,7 +1278,9 @@
template <> void InstX8632Psll::emit(const Cfg *Func) const {
assert(getDest()->getType() == IceType_v8i16 ||
- getDest()->getType() == IceType_v4i32);
+ getDest()->getType() == IceType_v8i1 ||
+ getDest()->getType() == IceType_v4i32 ||
+ getDest()->getType() == IceType_v4i1);
char buf[30];
snprintf(buf, llvm::array_lengthof(buf), "psll%s",
TypeX8632Attributes[getDest()->getType()].PackString);
@@ -1225,7 +1289,9 @@
template <> void InstX8632Psra::emit(const Cfg *Func) const {
assert(getDest()->getType() == IceType_v8i16 ||
- getDest()->getType() == IceType_v4i32);
+ getDest()->getType() == IceType_v8i1 ||
+ getDest()->getType() == IceType_v4i32 ||
+ getDest()->getType() == IceType_v4i1);
char buf[30];
snprintf(buf, llvm::array_lengthof(buf), "psra%s",
TypeX8632Attributes[getDest()->getType()].PackString);
diff --git a/src/IceInstX8632.def b/src/IceInstX8632.def
index ece6a0a..932500c 100644
--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -88,9 +88,9 @@
X(IceType_i64, IceType_void, "si", "" , "" , "qword ptr") \
X(IceType_f32, IceType_void, "ss", "ss", "" , "dword ptr") \
X(IceType_f64, IceType_void, "sd", "sd", "" , "qword ptr") \
- X(IceType_v4i1, IceType_i32 , "?" , "" , "" , "xmmword ptr") \
- X(IceType_v8i1, IceType_i16 , "?" , "" , "" , "xmmword ptr") \
- X(IceType_v16i1, IceType_i8 , "?" , "" , "" , "xmmword ptr") \
+ X(IceType_v4i1, IceType_i32 , "?" , "" , "d", "xmmword ptr") \
+ X(IceType_v8i1, IceType_i16 , "?" , "" , "w", "xmmword ptr") \
+ X(IceType_v16i1, IceType_i8 , "?" , "" , "b", "xmmword ptr") \
X(IceType_v16i8, IceType_i8 , "?" , "" , "b", "xmmword ptr") \
X(IceType_v8i16, IceType_i16 , "?" , "" , "w", "xmmword ptr") \
X(IceType_v4i32, IceType_i32 , "dq", "" , "d", "xmmword ptr") \
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index ddea6b5..7c12f11 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -138,6 +138,7 @@
Addps,
Addss,
And,
+ Blendvps,
Br,
Bsf,
Bsr,
@@ -157,6 +158,7 @@
Icmp,
Idiv,
Imul,
+ Insertps,
Label,
Lea,
Load,
@@ -176,11 +178,12 @@
Padd,
Pand,
Pandn,
+ Pblendvb,
Pcmpeq,
Pcmpgt,
- Pextrw,
- Pinsrw,
- Pmullw,
+ Pextr,
+ Pinsr,
+ Pmull,
Pmuludq,
Pop,
Por,
@@ -573,7 +576,7 @@
typedef InstX8632Binop<InstX8632::Imul> InstX8632Imul;
typedef InstX8632Binop<InstX8632::Mulps> InstX8632Mulps;
typedef InstX8632Binop<InstX8632::Mulss> InstX8632Mulss;
-typedef InstX8632Binop<InstX8632::Pmullw> InstX8632Pmullw;
+typedef InstX8632Binop<InstX8632::Pmull> InstX8632Pmull;
typedef InstX8632Binop<InstX8632::Pmuludq> InstX8632Pmuludq;
typedef InstX8632Binop<InstX8632::Divps> InstX8632Divps;
typedef InstX8632Binop<InstX8632::Divss> InstX8632Divss;
@@ -594,9 +597,12 @@
typedef InstX8632Binop<InstX8632::Movss> InstX8632Movss;
typedef InstX8632Ternop<InstX8632::Idiv> InstX8632Idiv;
typedef InstX8632Ternop<InstX8632::Div> InstX8632Div;
-typedef InstX8632Ternop<InstX8632::Pinsrw> InstX8632Pinsrw;
+typedef InstX8632Ternop<InstX8632::Insertps> InstX8632Insertps;
+typedef InstX8632Ternop<InstX8632::Pinsr> InstX8632Pinsr;
typedef InstX8632Ternop<InstX8632::Shufps> InstX8632Shufps;
-typedef InstX8632ThreeAddressop<InstX8632::Pextrw> InstX8632Pextrw;
+typedef InstX8632Ternop<InstX8632::Blendvps> InstX8632Blendvps;
+typedef InstX8632Ternop<InstX8632::Pblendvb> InstX8632Pblendvb;
+typedef InstX8632ThreeAddressop<InstX8632::Pextr> InstX8632Pextr;
typedef InstX8632ThreeAddressop<InstX8632::Pshufd> InstX8632Pshufd;
// Base class for a lockable x86-32 instruction (emits a locked prefix).
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 00db25a..cc6f222 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -22,6 +22,7 @@
#include "IceOperand.h"
#include "IceTargetLoweringX8632.def"
#include "IceTargetLoweringX8632.h"
+#include "llvm/Support/CommandLine.h"
namespace Ice {
@@ -123,6 +124,17 @@
// The number of bits in a byte
const unsigned X86_CHAR_BIT = 8;
+// Instruction set options
+namespace cl = ::llvm::cl;
+cl::opt<TargetX8632::X86InstructionSet> CLInstructionSet(
+ "mattr", cl::desc("X86 target attributes"),
+ cl::init(TargetX8632::SSE2),
+ cl::values(
+ clEnumValN(TargetX8632::SSE2, "sse2",
+ "Enable SSE2 instructions (default)"),
+ clEnumValN(TargetX8632::SSE4_1, "sse4.1",
+ "Enable SSE 4.1 instructions"), clEnumValEnd));
+
// Return a string representation of the type that is suitable for use
// in an identifier.
IceString typeIdentString(const Type Ty) {
@@ -234,8 +246,9 @@
} // end of anonymous namespace
TargetX8632::TargetX8632(Cfg *Func)
- : TargetLowering(Func), IsEbpBasedFrame(false), FrameSizeLocals(0),
- LocalsSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false),
+ : TargetLowering(Func), InstructionSet(CLInstructionSet),
+ IsEbpBasedFrame(false), FrameSizeLocals(0), LocalsSizeBytes(0),
+ NextLabelNumber(0), ComputedLiveRanges(false),
PhysicalRegisters(VarList(Reg_NUM)) {
// TODO: Don't initialize IntegerRegisters and friends every time.
// Instead, initialize in some sort of static initializer for the
@@ -1228,7 +1241,16 @@
_movp(Dest, T);
} break;
case InstArithmetic::Mul: {
- if (Dest->getType() == IceType_v4i32) {
+ bool TypesAreValidForPmull =
+ Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16;
+ bool InstructionSetIsValidForPmull =
+ Dest->getType() == IceType_v8i16 || InstructionSet >= SSE4_1;
+ if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
+ Variable *T = makeReg(Dest->getType());
+ _movp(T, Src0);
+ _pmull(T, legalizeToVar(Src1));
+ _movp(Dest, T);
+ } else if (Dest->getType() == IceType_v4i32) {
// Lowering sequence:
// Note: The mask arguments have index 0 on the left.
//
@@ -1243,8 +1265,6 @@
// shufps T1, T2, {0,2,0,2}
// pshufd T4, T1, {0,2,1,3}
// movups Dest, T4
- //
- // TODO(wala): SSE4.1 has pmulld.
// Mask that directs pshufd to create a vector with entries
// Src[1, 0, 3, 0]
@@ -1273,11 +1293,6 @@
_shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202));
_pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213));
_movp(Dest, T4);
- } else if (Dest->getType() == IceType_v8i16) {
- Variable *T = makeReg(IceType_v8i16);
- _movp(T, Src0);
- _pmullw(T, legalizeToVar(Src1));
- _movp(Dest, T);
} else {
assert(Dest->getType() == IceType_v16i8);
// Sz_mul_v16i8
@@ -2155,10 +2170,15 @@
Variable *ExtractedElement = makeReg(InVectorElementTy);
// TODO(wala): Determine the best lowering sequences for each type.
- if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
- // Lower extractelement operations where the element is 32 bits
- // wide with pshufd.
- // TODO(wala): SSE4.1 has extractps and pextrd
+ bool CanUsePextr =
+ Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1;
+ if (CanUsePextr && Ty != IceType_v4f32) {
+ // Use pextrb, pextrw, or pextrd.
+ Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
+ Variable *SourceVectR = legalizeToVar(SourceVectOperand);
+ _pextr(ExtractedElement, SourceVectR, Mask);
+ } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
+ // Use pshufd and movd/movss.
//
// ALIGNHACK: Force vector operands to registers in instructions that
// require aligned memory operands until support for stack alignment
@@ -2187,13 +2207,9 @@
_movss(ExtractedElement, T);
}
#undef ALIGN_HACK
- } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) {
- Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
- _pextrw(ExtractedElement, legalizeToVar(SourceVectOperand), Mask);
} else {
assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
// Spill the value to a stack slot and do the extraction in memory.
- // TODO(wala): SSE4.1 has pextrb.
//
// TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when
// support for legalizing to mem is implemented.
@@ -2539,10 +2555,18 @@
ElementToInsert = Expanded;
}
- if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
- // Lower insertelement with 32-bit wide elements using shufps or
- // movss.
- // TODO(wala): SSE4.1 has pinsrd and insertps.
+ if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) {
+ // Use insertps, pinsrb, pinsrw, or pinsrd.
+ Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg);
+ Variable *T = makeReg(Ty);
+ _movp(T, SourceVectOperand);
+ if (Ty == IceType_v4f32)
+ _insertps(T, Element, Ctx->getConstantInt(IceType_i8, Index << 4));
+ else
+ _pinsr(T, Element, Ctx->getConstantInt(IceType_i8, Index));
+ _movp(Inst->getDest(), T);
+ } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
+ // Use shufps or movss.
Variable *Element = NULL;
if (InVectorElementTy == IceType_f32) {
// Element will be in an XMM register since it is floating point.
@@ -2607,17 +2631,10 @@
_movp(Inst->getDest(), T);
}
#undef ALIGN_HACK
- } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) {
- Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg);
- Variable *T = makeReg(Ty);
- _movp(T, SourceVectOperand);
- _pinsrw(T, Element, Ctx->getConstantInt(IceType_i8, Index));
- _movp(Inst->getDest(), T);
} else {
assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
// Spill the value to a stack slot and perform the insertion in
// memory.
- // TODO(wala): SSE4.1 has pinsrb.
//
// TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when
// support for legalizing to mem is implemented.
@@ -3551,11 +3568,42 @@
Operand *Condition = Inst->getCondition();
if (isVectorType(Dest->getType())) {
- // a=d?b:c ==> d=sext(d); a=(b&d)|(c&~d)
- // TODO(wala): SSE4.1 has blendvps and pblendvb. SSE4.1 also has
- // blendps and pblendw for constant condition operands.
Type SrcTy = SrcT->getType();
Variable *T = makeReg(SrcTy);
+ // ALIGNHACK: Until stack alignment support is implemented, vector
+ // instructions need to have vector operands in registers. Once
+ // there is support for stack alignment, LEGAL_HACK can be removed.
+#define LEGAL_HACK(Vect) legalizeToVar((Vect))
+ if (InstructionSet >= SSE4_1) {
+ // TODO(wala): If the condition operand is a constant, use blendps
+ // or pblendw.
+ //
+ // Use blendvps or pblendvb to implement select.
+ if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
+ SrcTy == IceType_v4f32) {
+ Variable *xmm0 = makeReg(IceType_v4i32, Reg_xmm0);
+ _movp(xmm0, Condition);
+ _psll(xmm0, Ctx->getConstantInt(IceType_i8, 31));
+ _movp(T, SrcF);
+ _blendvps(T, LEGAL_HACK(SrcT), xmm0);
+ _movp(Dest, T);
+ } else {
+ assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
+ Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16
+ : IceType_v16i8;
+ Variable *xmm0 = makeReg(SignExtTy, Reg_xmm0);
+ lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
+ _movp(T, SrcF);
+ _pblendvb(T, LEGAL_HACK(SrcT), xmm0);
+ _movp(Dest, T);
+ }
+ return;
+ }
+ // Lower select without SSE4.1:
+ // a=d?b:c ==>
+ // if elementtype(d) != i1:
+ // d=sext(d);
+ // a=(b&d)|(c&~d);
Variable *T2 = makeReg(SrcTy);
// Sign extend the condition operand if applicable.
if (SrcTy == IceType_v4f32) {
@@ -3568,11 +3616,6 @@
} else {
_movp(T, Condition);
}
- // ALIGNHACK: Until stack alignment support is implemented, the
- // bitwise vector instructions need to have both operands in
- // registers. Once there is support for stack alignment, LEGAL_HACK
- // can be removed.
-#define LEGAL_HACK(Vect) legalizeToVar((Vect))
_movp(T2, T);
_pand(T, LEGAL_HACK(SrcT));
_pandn(T2, LEGAL_HACK(SrcF));
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index daca0cd..0c87bee 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -71,6 +71,14 @@
Reg_NUM
};
+ enum X86InstructionSet {
+ // SSE2 is the PNaCl baseline instruction set.
+ SSE2,
+ SSE4_1
+ };
+
+ X86InstructionSet getInstructionSet() const { return InstructionSet; }
+
protected:
TargetX8632(Cfg *Func);
@@ -186,6 +194,9 @@
void _and(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632And::create(Func, Dest, Src0));
}
+ void _blendvps(Variable *Dest, Operand *Src0, Operand *Src1) {
+ Context.insert(InstX8632Blendvps::create(Func, Dest, Src0, Src1));
+ }
void _br(InstX8632::BrCond Condition, CfgNode *TargetTrue,
CfgNode *TargetFalse) {
Context.insert(
@@ -260,6 +271,9 @@
void _imul(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Imul::create(Func, Dest, Src0));
}
+ void _insertps(Variable *Dest, Operand *Src0, Operand *Src1) {
+ Context.insert(InstX8632Insertps::create(Func, Dest, Src0, Src1));
+ }
void _lea(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Lea::create(Func, Dest, Src0));
}
@@ -317,20 +331,23 @@
void _pandn(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Pandn::create(Func, Dest, Src0));
}
+ void _pblendvb(Variable *Dest, Operand *Src0, Operand *Src1) {
+ Context.insert(InstX8632Pblendvb::create(Func, Dest, Src0, Src1));
+ }
void _pcmpeq(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Pcmpeq::create(Func, Dest, Src0));
}
void _pcmpgt(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Pcmpgt::create(Func, Dest, Src0));
}
- void _pextrw(Variable *Dest, Operand *Src0, Operand *Src1) {
- Context.insert(InstX8632Pextrw::create(Func, Dest, Src0, Src1));
+ void _pextr(Variable *Dest, Operand *Src0, Operand *Src1) {
+ Context.insert(InstX8632Pextr::create(Func, Dest, Src0, Src1));
}
- void _pinsrw(Variable *Dest, Operand *Src0, Operand *Src1) {
- Context.insert(InstX8632Pinsrw::create(Func, Dest, Src0, Src1));
+ void _pinsr(Variable *Dest, Operand *Src0, Operand *Src1) {
+ Context.insert(InstX8632Pinsr::create(Func, Dest, Src0, Src1));
}
- void _pmullw(Variable *Dest, Operand *Src0) {
- Context.insert(InstX8632Pmullw::create(Func, Dest, Src0));
+ void _pmull(Variable *Dest, Operand *Src0) {
+ Context.insert(InstX8632Pmull::create(Func, Dest, Src0));
}
void _pmuludq(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Pmuludq::create(Func, Dest, Src0));
@@ -428,6 +445,7 @@
Context.insert(InstX8632Xor::create(Func, Dest, Src0));
}
+ const X86InstructionSet InstructionSet;
bool IsEbpBasedFrame;
size_t FrameSizeLocals;
size_t LocalsSizeBytes;
diff --git a/tests_lit/llvm2ice_tests/vector-arith.ll b/tests_lit/llvm2ice_tests/vector-arith.ll
index d300317..94acfe0 100644
--- a/tests_lit/llvm2ice_tests/vector-arith.ll
+++ b/tests_lit/llvm2ice_tests/vector-arith.ll
@@ -2,10 +2,18 @@
; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
+; RUN: | FileCheck %s --check-prefix=SSE41
+; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
+; RUN: | FileCheck %s --check-prefix=SSE41
; RUN: %llvm2ice -O2 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice -Om1 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
+; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
+; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
@@ -306,6 +314,9 @@
; CHECK-LABEL: test_mul_v4i32:
; CHECK: pmuludq
; CHECK: pmuludq
+;
+; SSE41-LABEL: test_mul_v4i32:
+; SSE41: pmulld
}
define <4 x i32> @test_shl_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -314,6 +325,9 @@
ret <4 x i32> %res
; CHECK-LABEL: test_shl_v4i32:
; CHECK: Sz_shl_v4i32
+
+; This line is to ensure that pmulld is generated in test_mul_v4i32 above.
+; SSE41-LABEL: test_shl_v4i32:
}
define <4 x i32> @test_lshr_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
diff --git a/tests_lit/llvm2ice_tests/vector-ops.ll b/tests_lit/llvm2ice_tests/vector-ops.ll
index 86647db..c730d73 100644
--- a/tests_lit/llvm2ice_tests/vector-ops.ll
+++ b/tests_lit/llvm2ice_tests/vector-ops.ll
@@ -1,10 +1,19 @@
; This checks support for insertelement and extractelement.
-; RUN: %llvm2ice --verbose inst %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
+; RUN: | FileCheck %s --check-prefix=SSE41
+; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
+; RUN: | FileCheck %s --check-prefix=SSE41
; RUN: %llvm2ice -O2 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice -Om1 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
+; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
+; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
@@ -18,6 +27,9 @@
ret <4 x float> %res
; CHECK-LABEL: insertelement_v4f32_0:
; CHECK: movss
+
+; SSE41-LABEL: insertelement_v4f32_0:
+; SSE41: insertps {{.*}}, {{.*}}, 0
}
define <4 x i32> @insertelement_v4i32_0(<4 x i32> %vec, i32 %elt) {
@@ -26,6 +38,9 @@
ret <4 x i32> %res
; CHECK-LABEL: insertelement_v4i32_0:
; CHECK: movss
+
+; SSE41-LABEL: insertelement_v4i32_0:
+; SSE41: pinsrd {{.*}}, {{.*}}, 0
}
@@ -36,6 +51,9 @@
; CHECK-LABEL: insertelement_v4f32_1:
; CHECK: shufps
; CHECK: shufps
+
+; SSE41-LABEL: insertelement_v4f32_1:
+; SSE41: insertps {{.*}}, {{.*}}, 16
}
define <4 x i32> @insertelement_v4i32_1(<4 x i32> %vec, i32 %elt) {
@@ -45,6 +63,9 @@
; CHECK-LABEL: insertelement_v4i32_1:
; CHECK: shufps
; CHECK: shufps
+
+; SSE41-LABEL: insertelement_v4i32_1:
+; SSE41: pinsrd {{.*}}, {{.*}}, 1
}
define <8 x i16> @insertelement_v8i16(<8 x i16> %vec, i32 %elt.arg) {
@@ -52,8 +73,11 @@
%elt = trunc i32 %elt.arg to i16
%res = insertelement <8 x i16> %vec, i16 %elt, i32 1
ret <8 x i16> %res
-; CHECK-LABEL: insertelement_v8i16
+; CHECK-LABEL: insertelement_v8i16:
; CHECK: pinsrw
+
+; SSE41-LABEL: insertelement_v8i16:
+; SSE41: pinsrw
}
define <16 x i8> @insertelement_v16i8(<16 x i8> %vec, i32 %elt.arg) {
@@ -65,6 +89,9 @@
; CHECK: movups
; CHECK: lea
; CHECK: mov
+
+; SSE41-LABEL: insertelement_v16i8:
+; SSE41: pinsrb
}
define <4 x i1> @insertelement_v4i1_0(<4 x i1> %vec, i32 %elt.arg) {
@@ -74,6 +101,9 @@
ret <4 x i1> %res
; CHECK-LABEL: insertelement_v4i1_0:
; CHECK: movss
+
+; SSE41-LABEL: insertelement_v4i1_0:
+; SSE41: pinsrd {{.*}}, {{.*}}, 0
}
define <4 x i1> @insertelement_v4i1_1(<4 x i1> %vec, i32 %elt.arg) {
@@ -84,6 +114,9 @@
; CHECK-LABEL: insertelement_v4i1_1:
; CHECK: shufps
; CHECK: shufps
+
+; SSE41-LABEL: insertelement_v4i1_1:
+; SSE41: pinsrd {{.*}}, {{.*}}, 1
}
define <8 x i1> @insertelement_v8i1(<8 x i1> %vec, i32 %elt.arg) {
@@ -93,6 +126,9 @@
ret <8 x i1> %res
; CHECK-LABEL: insertelement_v8i1:
; CHECK: pinsrw
+
+; SSE41-LABEL: insertelement_v8i1:
+; SSE41: pinsrw
}
define <16 x i1> @insertelement_v16i1(<16 x i1> %vec, i32 %elt.arg) {
@@ -104,6 +140,9 @@
; CHECK: movups
; CHECK: lea
; CHECK: mov
+
+; SSE41-LABEL: insertelement_v16i1:
+; SSE41: pinsrb
}
; extractelement operations
@@ -114,6 +153,9 @@
ret float %res
; CHECK-LABEL: extractelement_v4f32:
; CHECK: pshufd
+
+; SSE41-LABEL: extractelement_v4f32:
+; SSE41: pshufd
}
define i32 @extractelement_v4i32(<4 x i32> %vec) {
@@ -122,6 +164,9 @@
ret i32 %res
; CHECK-LABEL: extractelement_v4i32:
; CHECK: pshufd
+
+; SSE41-LABEL: extractelement_v4i32:
+; SSE41: pextrd
}
define i32 @extractelement_v8i16(<8 x i16> %vec) {
@@ -131,6 +176,9 @@
ret i32 %res.ext
; CHECK-LABEL: extractelement_v8i16:
; CHECK: pextrw
+
+; SSE41-LABEL: extractelement_v8i16:
+; SSE41: pextrw
}
define i32 @extractelement_v16i8(<16 x i8> %vec) {
@@ -142,6 +190,9 @@
; CHECK: movups
; CHECK: lea
; CHECK: mov
+
+; SSE41-LABEL: extractelement_v16i8:
+; SSE41: pextrb
}
define i32 @extractelement_v4i1(<4 x i1> %vec) {
@@ -151,6 +202,9 @@
ret i32 %res.ext
; CHECK-LABEL: extractelement_v4i1:
; CHECK: pshufd
+
+; SSE41-LABEL: extractelement_v4i1:
+; SSE41: pextrd
}
define i32 @extractelement_v8i1(<8 x i1> %vec) {
@@ -160,6 +214,9 @@
ret i32 %res.ext
; CHECK-LABEL: extractelement_v8i1:
; CHECK: pextrw
+
+; SSE41-LABEL: extractelement_v8i1:
+; SSE41: pextrw
}
define i32 @extractelement_v16i1(<16 x i1> %vec) {
@@ -171,6 +228,9 @@
; CHECK: movups
; CHECK: lea
; CHECK: mov
+
+; SSE41-LABEL: extractelement_v16i1:
+; SSE41: pextrb
}
; ERRORS-NOT: ICE translation error
diff --git a/tests_lit/llvm2ice_tests/vector-select.ll b/tests_lit/llvm2ice_tests/vector-select.ll
index 93f5941..67270fa 100644
--- a/tests_lit/llvm2ice_tests/vector-select.ll
+++ b/tests_lit/llvm2ice_tests/vector-select.ll
@@ -2,10 +2,18 @@
; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
+; RUN: | FileCheck %s --check-prefix=SSE41
+; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
+; RUN: | FileCheck %s --check-prefix=SSE41
; RUN: %llvm2ice -O2 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice -Om1 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
+; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
+; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
@@ -19,6 +27,9 @@
; CHECK: pand
; CHECK: pandn
; CHECK: por
+
+; SSE41-LABEL: test_select_v16i8:
+; SSE41: pblendvb
}
define <16 x i1> @test_select_v16i1(<16 x i1> %cond, <16 x i1> %arg1, <16 x i1> %arg2) {
@@ -29,6 +40,9 @@
; CHECK: pand
; CHECK: pandn
; CHECK: por
+
+; SSE41-LABEL: test_select_v16i1:
+; SSE41: pblendvb
}
define <8 x i16> @test_select_v8i16(<8 x i1> %cond, <8 x i16> %arg1, <8 x i16> %arg2) {
@@ -39,6 +53,9 @@
; CHECK: pand
; CHECK: pandn
; CHECK: por
+
+; SSE41-LABEL: test_select_v8i16:
+; SSE41: pblendvb
}
define <8 x i1> @test_select_v8i1(<8 x i1> %cond, <8 x i1> %arg1, <8 x i1> %arg2) {
@@ -49,6 +66,9 @@
; CHECK: pand
; CHECK: pandn
; CHECK: por
+
+; SSE41-LABEL: test_select_v8i1:
+; SSE41: pblendvb
}
define <4 x i32> @test_select_v4i32(<4 x i1> %cond, <4 x i32> %arg1, <4 x i32> %arg2) {
@@ -59,6 +79,10 @@
; CHECK: pand
; CHECK: pandn
; CHECK: por
+
+; SSE41-LABEL: test_select_v4i32:
+; SSE41: pslld xmm0, 31
+; SSE41: blendvps
}
define <4 x float> @test_select_v4f32(<4 x i1> %cond, <4 x float> %arg1, <4 x float> %arg2) {
@@ -69,6 +93,10 @@
; CHECK: pand
; CHECK: pandn
; CHECK: por
+
+; SSE41-LABEL: test_select_v4f32:
+; SSE41: pslld xmm0, 31
+; SSE41: blendvps
}
define <4 x i1> @test_select_v4i1(<4 x i1> %cond, <4 x i1> %arg1, <4 x i1> %arg2) {
@@ -79,6 +107,10 @@
; CHECK: pand
; CHECK: pandn
; CHECK: por
+
+; SSE41-LABEL: test_select_v4i1:
+; SSE41: pslld xmm0, 31
+; SSE41: blendvps
}
; ERRORS-NOT: ICE translation error