Subzero: Add support for SSE4.1 instructions.

* Add initial support for code generation with SSE4.1 instructions. The
following operations are affected:
 - multiplication with v4i32
 - select
 - insertelement
 - extractelement

* Add appropriate lit checks for SSE4.1 instructions. Run the crosstests
in both SSE2 and SSE4.1 modes.

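* Introduce the -mattr flag to llvm2ice to control which instruction set
gets used (sse2, the default, or sse4.1). crosstest.py gains a matching
--mattr option that is forwarded to llvm2ice.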

BUG=none
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/427843002
diff --git a/crosstest/crosstest.py b/crosstest/crosstest.py
index c8e9442..9b64399 100755
--- a/crosstest/crosstest.py
+++ b/crosstest/crosstest.py
@@ -43,6 +43,9 @@
                            metavar='OPTLEVEL',
                            help='Optimization level ' +
                                 '(m1 and -1 are equivalent)')
+    argparser.add_argument('--mattr',  required=False, default='sse2',
+                           dest='attr', choices=['sse2', 'sse4.1'],
+                           metavar='ATTRIBUTE', help='Target attribute')
     argparser.add_argument('--prefix', required=True,
                            metavar='SZ_PREFIX',
                            help='String prepended to Subzero symbol names')
@@ -93,6 +96,7 @@
         obj_llc = os.path.join(args.dir, base + '.llc.o')
         shellcmd(['../llvm2ice',
                   '-O' + args.optlevel,
+                  '-mattr=' + args.attr,
                   '--target=' + args.target,
                   '--prefix=' + args.prefix,
                   '-o=' + asm_sz,
diff --git a/crosstest/runtests.sh b/crosstest/runtests.sh
index 7b81df1..0b79f48 100755
--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -6,116 +6,144 @@
 set -eux
 
 OPTLEVELS="m1 2"
+ATTRIBUTES="sse2 sse4.1"
 OUTDIR=Output
 # Clean the output directory to avoid reusing stale results.
 rm -rf "${OUTDIR}"
 mkdir -p "${OUTDIR}"
 
 for optlevel in ${OPTLEVELS} ; do
+    for attribute in ${ATTRIBUTES} ; do
 
-    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
-        --dir="${OUTDIR}" \
-        --llvm-bin-path="${LLVM_BIN_PATH}" \
-        --test=simple_loop.c \
-        --driver=simple_loop_main.c \
-        --output=simple_loop_O${optlevel}
+        ./crosstest.py -O${optlevel} --mattr ${attribute} \
+            --prefix=Subzero_ \
+            --target=x8632 \
+            --dir="${OUTDIR}" \
+            --llvm-bin-path="${LLVM_BIN_PATH}" \
+            --test=simple_loop.c \
+            --driver=simple_loop_main.c \
+            --output=simple_loop_O${optlevel}_${attribute}
 
-    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
-        --dir="${OUTDIR}" \
-        --llvm-bin-path="${LLVM_BIN_PATH}" \
-        --test=mem_intrin.cpp \
-        --driver=mem_intrin_main.cpp \
-        --output=mem_intrin_O${optlevel}
+        ./crosstest.py -O${optlevel} --mattr ${attribute} \
+            --prefix=Subzero_ \
+            --target=x8632 \
+            --dir="${OUTDIR}" \
+            --llvm-bin-path="${LLVM_BIN_PATH}" \
+            --test=mem_intrin.cpp \
+            --driver=mem_intrin_main.cpp \
+            --output=mem_intrin_O${optlevel}_${attribute}
 
-    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
-        --dir="${OUTDIR}" \
-        --llvm-bin-path="${LLVM_BIN_PATH}" \
-        --test=test_arith.cpp \
-        --test=test_arith_frem.ll \
-        --test=test_arith_sqrt.ll \
-        --driver=test_arith_main.cpp \
-        --output=test_arith_O${optlevel}
+        ./crosstest.py -O${optlevel} --mattr ${attribute} \
+            --prefix=Subzero_ \
+            --target=x8632 \
+            --dir="${OUTDIR}" \
+            --llvm-bin-path="${LLVM_BIN_PATH}" \
+            --test=test_arith.cpp \
+            --test=test_arith_frem.ll \
+            --test=test_arith_sqrt.ll \
+            --driver=test_arith_main.cpp \
+            --output=test_arith_O${optlevel}_${attribute}
 
-    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
-        --dir="${OUTDIR}" \
-        --llvm-bin-path="${LLVM_BIN_PATH}" \
-        --test=test_bitmanip.cpp --test=test_bitmanip_intrin.ll \
-        --driver=test_bitmanip_main.cpp \
-        --output=test_bitmanip_O${optlevel}
+        ./crosstest.py -O${optlevel} --mattr ${attribute} \
+            --prefix=Subzero_ \
+            --target=x8632 \
+            --dir="${OUTDIR}" \
+            --llvm-bin-path="${LLVM_BIN_PATH}" \
+            --test=test_bitmanip.cpp --test=test_bitmanip_intrin.ll \
+            --driver=test_bitmanip_main.cpp \
+            --output=test_bitmanip_O${optlevel}_${attribute}
 
-    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
-        --dir="${OUTDIR}" \
-        --llvm-bin-path="${LLVM_BIN_PATH}" \
-        --test=test_cast.cpp --test=test_cast_to_u1.ll \
-        --driver=test_cast_main.cpp \
-        --output=test_cast_O${optlevel}
+        ./crosstest.py -O${optlevel} --mattr ${attribute} \
+            --prefix=Subzero_ \
+            --target=x8632 \
+            --dir="${OUTDIR}" \
+            --llvm-bin-path="${LLVM_BIN_PATH}" \
+            --test=test_cast.cpp --test=test_cast_to_u1.ll \
+            --driver=test_cast_main.cpp \
+            --output=test_cast_O${optlevel}_${attribute}
 
-    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
-        --dir="${OUTDIR}" \
-        --llvm-bin-path="${LLVM_BIN_PATH}" \
-        --test=test_fcmp.pnacl.ll \
-        --driver=test_fcmp_main.cpp \
-        --output=test_fcmp_O${optlevel}
+        ./crosstest.py -O${optlevel} --mattr ${attribute} \
+            --prefix=Subzero_ \
+            --target=x8632 \
+            --dir="${OUTDIR}" \
+            --llvm-bin-path="${LLVM_BIN_PATH}" \
+            --test=test_fcmp.pnacl.ll \
+            --driver=test_fcmp_main.cpp \
+            --output=test_fcmp_O${optlevel}_${attribute}
 
-    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
-        --dir="${OUTDIR}" \
-        --llvm-bin-path="${LLVM_BIN_PATH}" \
-        --test=test_global.cpp \
-        --driver=test_global_main.cpp \
-        --output=test_global_O${optlevel}
+        ./crosstest.py -O${optlevel} --mattr ${attribute} \
+            --prefix=Subzero_ \
+            --target=x8632 \
+            --dir="${OUTDIR}" \
+            --llvm-bin-path="${LLVM_BIN_PATH}" \
+            --test=test_global.cpp \
+            --driver=test_global_main.cpp \
+            --output=test_global_O${optlevel}_${attribute}
 
-    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
-        --dir="${OUTDIR}" \
-        --llvm-bin-path="${LLVM_BIN_PATH}" \
-        --test=test_icmp.cpp --test=test_icmp_i1vec.ll \
-        --driver=test_icmp_main.cpp \
-        --output=test_icmp_O${optlevel}
+        ./crosstest.py -O${optlevel} --mattr ${attribute} \
+            --prefix=Subzero_ \
+            --target=x8632 \
+            --dir="${OUTDIR}" \
+            --llvm-bin-path="${LLVM_BIN_PATH}" \
+            --test=test_icmp.cpp --test=test_icmp_i1vec.ll \
+            --driver=test_icmp_main.cpp \
+            --output=test_icmp_O${optlevel}_${attribute}
 
-    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
-        --dir="${OUTDIR}" \
-        --llvm-bin-path="${LLVM_BIN_PATH}" \
-        --test=test_select.ll \
-        --driver=test_select_main.cpp \
-        --output=test_select_O${optlevel}
+        ./crosstest.py -O${optlevel} --mattr ${attribute} \
+            --prefix=Subzero_ \
+            --target=x8632 \
+            --dir="${OUTDIR}" \
+            --llvm-bin-path="${LLVM_BIN_PATH}" \
+            --test=test_select.ll \
+            --driver=test_select_main.cpp \
+            --output=test_select_O${optlevel}_${attribute}
 
-    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
-        --dir="${OUTDIR}" \
-        --llvm-bin-path="${LLVM_BIN_PATH}" \
-        --test=test_stacksave.c \
-        --driver=test_stacksave_main.c \
-        --output=test_stacksave_O${optlevel}
+        ./crosstest.py -O${optlevel} --mattr ${attribute} \
+            --prefix=Subzero_ \
+            --target=x8632 \
+            --dir="${OUTDIR}" \
+            --llvm-bin-path="${LLVM_BIN_PATH}" \
+            --test=test_stacksave.c \
+            --driver=test_stacksave_main.c \
+            --output=test_stacksave_O${optlevel}_${attribute}
 
-    # Compile the non-subzero object files straight from source
-    # since the native LLVM backend does not understand how to
-    # lower NaCl-specific intrinsics.
-    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
-       --dir="${OUTDIR}" \
-       --llvm-bin-path="${LLVM_BIN_PATH}" \
-       --test=test_sync_atomic.cpp \
-       --crosstest-bitcode=0 \
-       --driver=test_sync_atomic_main.cpp \
-       --output=test_sync_atomic_O${optlevel}
+        # Compile the non-subzero object files straight from source
+        # since the native LLVM backend does not understand how to
+        # lower NaCl-specific intrinsics.
+        ./crosstest.py -O${optlevel} --mattr ${attribute} \
+            --prefix=Subzero_ \
+            --target=x8632 \
+            --dir="${OUTDIR}" \
+            --llvm-bin-path="${LLVM_BIN_PATH}" \
+            --test=test_sync_atomic.cpp \
+            --crosstest-bitcode=0 \
+            --driver=test_sync_atomic_main.cpp \
+            --output=test_sync_atomic_O${optlevel}_${attribute}
 
-    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
-        --dir="${OUTDIR}" \
-        --llvm-bin-path="${LLVM_BIN_PATH}" \
-        --test=test_vector_ops.ll \
-        --driver=test_vector_ops_main.cpp \
-        --output=test_vector_ops_O${optlevel}
+        ./crosstest.py -O${optlevel} --mattr ${attribute} \
+            --prefix=Subzero_ --target=x8632 \
+            --dir="${OUTDIR}" \
+            --llvm-bin-path="${LLVM_BIN_PATH}" \
+            --test=test_vector_ops.ll \
+            --driver=test_vector_ops_main.cpp \
+            --output=test_vector_ops_O${optlevel}_${attribute}
 
+    done
 done
 
 for optlevel in ${OPTLEVELS} ; do
-    "${OUTDIR}"/simple_loop_O${optlevel}
-    "${OUTDIR}"/mem_intrin_O${optlevel}
-    "${OUTDIR}"/test_arith_O${optlevel}
-    "${OUTDIR}"/test_bitmanip_O${optlevel}
-    "${OUTDIR}"/test_cast_O${optlevel}
-    "${OUTDIR}"/test_fcmp_O${optlevel}
-    "${OUTDIR}"/test_global_O${optlevel}
-    "${OUTDIR}"/test_icmp_O${optlevel}
-    "${OUTDIR}"/test_select_O${optlevel}
-    "${OUTDIR}"/test_stacksave_O${optlevel}
-    "${OUTDIR}"/test_sync_atomic_O${optlevel}
-    "${OUTDIR}"/test_vector_ops_O${optlevel}
+    for attribute in ${ATTRIBUTES}; do
+        "${OUTDIR}"/simple_loop_O${optlevel}_${attribute}
+        "${OUTDIR}"/mem_intrin_O${optlevel}_${attribute}
+        "${OUTDIR}"/test_arith_O${optlevel}_${attribute}
+        "${OUTDIR}"/test_bitmanip_O${optlevel}_${attribute}
+        "${OUTDIR}"/test_cast_O${optlevel}_${attribute}
+        "${OUTDIR}"/test_fcmp_O${optlevel}_${attribute}
+        "${OUTDIR}"/test_global_O${optlevel}_${attribute}
+        "${OUTDIR}"/test_icmp_O${optlevel}_${attribute}
+        "${OUTDIR}"/test_select_O${optlevel}_${attribute}
+        "${OUTDIR}"/test_stacksave_O${optlevel}_${attribute}
+        "${OUTDIR}"/test_sync_atomic_O${optlevel}_${attribute}
+        "${OUTDIR}"/test_vector_ops_O${optlevel}_${attribute}
+    done
 done
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 7d930c2..be84554 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -484,7 +484,7 @@
 template <> const char *InstX8632Imul::Opcode = "imul";
 template <> const char *InstX8632Mulps::Opcode = "mulps";
 template <> const char *InstX8632Mulss::Opcode = "mulss";
-template <> const char *InstX8632Pmullw::Opcode = "pmullw";
+template <> const char *InstX8632Pmull::Opcode = "pmull";
 template <> const char *InstX8632Pmuludq::Opcode = "pmuludq";
 template <> const char *InstX8632Div::Opcode = "div";
 template <> const char *InstX8632Divps::Opcode = "divps";
@@ -500,10 +500,13 @@
 template <> const char *InstX8632Pcmpgt::Opcode = "pcmpgt";
 template <> const char *InstX8632Movss::Opcode = "movss";
 // Ternary ops
+template <> const char *InstX8632Insertps::Opcode = "insertps";
 template <> const char *InstX8632Shufps::Opcode = "shufps";
-template <> const char *InstX8632Pinsrw::Opcode = "pinsrw";
+template <> const char *InstX8632Pinsr::Opcode = "pinsr";
+template <> const char *InstX8632Blendvps::Opcode = "blendvps";
+template <> const char *InstX8632Pblendvb::Opcode = "pblendvb";
 // Three address ops
-template <> const char *InstX8632Pextrw::Opcode = "pextrw";
+template <> const char *InstX8632Pextr::Opcode = "pextr";
 template <> const char *InstX8632Pshufd::Opcode = "pshufd";
 
 template <> void InstX8632Sqrtss::emit(const Cfg *Func) const {
@@ -532,6 +535,23 @@
   emitTwoAddress(buf, this, Func);
 }
 
+template <> void InstX8632Pmull::emit(const Cfg *Func) const {
+  char buf[30];
+  bool TypesAreValid = getDest()->getType() == IceType_v4i32 ||
+                       getDest()->getType() == IceType_v8i16;
+  bool InstructionSetIsValid =
+      getDest()->getType() == IceType_v8i16 ||
+      static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet() >=
+          TargetX8632::SSE4_1;
+  (void)TypesAreValid;
+  (void)InstructionSetIsValid;
+  assert(TypesAreValid);
+  assert(InstructionSetIsValid);
+  snprintf(buf, llvm::array_lengthof(buf), "pmull%s",
+           TypeX8632Attributes[getDest()->getType()].PackString);
+  emitTwoAddress(buf, this, Func);
+}
+
 template <> void InstX8632Subss::emit(const Cfg *Func) const {
   char buf[30];
   snprintf(buf, llvm::array_lengthof(buf), "sub%s",
@@ -553,12 +573,6 @@
   emitTwoAddress(buf, this, Func);
 }
 
-template <> void InstX8632Pmullw::emit(const Cfg *Func) const {
-  assert(getSrc(0)->getType() == IceType_v8i16 &&
-         getSrc(1)->getType() == IceType_v8i16);
-  emitTwoAddress(Opcode, this, Func);
-}
-
 template <> void InstX8632Pmuludq::emit(const Cfg *Func) const {
   assert(getSrc(0)->getType() == IceType_v4i32 &&
          getSrc(1)->getType() == IceType_v4i32);
@@ -588,6 +602,38 @@
   Str << "\n";
 }
 
+
+namespace {
+
+// pblendvb and blendvps take xmm0 as a final implicit argument.
+void emitVariableBlendInst(const char *Opcode, const Inst *Inst,
+                           const Cfg *Func) {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(Inst->getSrcSize() == 3);
+  assert(llvm::isa<Variable>(Inst->getSrc(2)));
+  assert(llvm::cast<Variable>(Inst->getSrc(2))->getRegNum() ==
+         TargetX8632::Reg_xmm0);
+  Str << "\t" << Opcode << "\t";
+  Inst->getDest()->emit(Func);
+  Str << ", ";
+  Inst->getSrc(1)->emit(Func);
+  Str << "\n";
+}
+
+} // end anonymous namespace
+
+template <> void InstX8632Blendvps::emit(const Cfg *Func) const {
+  assert(static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet() >=
+         TargetX8632::SSE4_1);
+  emitVariableBlendInst(Opcode, this, Func);
+}
+
+template <> void InstX8632Pblendvb::emit(const Cfg *Func) const {
+  assert(static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet() >=
+         TargetX8632::SSE4_1);
+  emitVariableBlendInst(Opcode, this, Func);
+}
+
 template <> void InstX8632Imul::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 2);
@@ -1127,13 +1173,19 @@
   emitTwoAddress(buf, this, Func);
 }
 
-template <> void InstX8632Pextrw::emit(const Cfg *Func) const {
+template <> void InstX8632Pextr::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 2);
-  Str << "\t" << Opcode << "\t";
+  // pextrb and pextrd are SSE4.1 instructions.
+  assert(getSrc(0)->getType() == IceType_v8i16 ||
+         getSrc(0)->getType() == IceType_v8i1 ||
+         static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet()
+             >= TargetX8632::SSE4_1);
+  Str << "\t" << Opcode
+      << TypeX8632Attributes[getSrc(0)->getType()].PackString << "\t";
   Variable *Dest = getDest();
-  assert(Dest->hasReg() && Dest->getType() == IceType_i16);
-  // pextrw takes r32 dest.
+  // pextrw must take a register dest.
+  assert(Dest->getType() != IceType_i16 || Dest->hasReg());
   Dest->asType(IceType_i32).emit(Func);
   Str << ", ";
   getSrc(0)->emit(Func);
@@ -1142,16 +1194,26 @@
   Str << "\n";
 }
 
-template <> void InstX8632Pinsrw::emit(const Cfg *Func) const {
+template <> void InstX8632Pinsr::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 3);
-  Str << "\t" << Opcode << "\t";
+  // pinsrb and pinsrd are SSE4.1 instructions.
+  assert(getDest()->getType() == IceType_v8i16 ||
+         getDest()->getType() == IceType_v8i1 ||
+         static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet()
+             >= TargetX8632::SSE4_1);
+  Str << "\t" << Opcode
+      << TypeX8632Attributes[getDest()->getType()].PackString << "\t";
   getDest()->emit(Func);
   Str << ", ";
   Operand *Src1 = getSrc(1);
   if (Variable *VSrc1 = llvm::dyn_cast<Variable>(Src1)) {
-    // If src1 is a register, it should be r32.
-    VSrc1->asType(VSrc1->hasReg() ? IceType_i32 : IceType_i16).emit(Func);
+    // If src1 is a register, it should always be r32.
+    if (VSrc1->hasReg()) {
+      VSrc1->asType(IceType_i32).emit(Func);
+    } else {
+      VSrc1->emit(Func);
+    }
   } else {
     Src1->emit(Func);
   }
@@ -1216,7 +1278,9 @@
 
 template <> void InstX8632Psll::emit(const Cfg *Func) const {
   assert(getDest()->getType() == IceType_v8i16 ||
-         getDest()->getType() == IceType_v4i32);
+         getDest()->getType() == IceType_v8i1 ||
+         getDest()->getType() == IceType_v4i32 ||
+         getDest()->getType() == IceType_v4i1);
   char buf[30];
   snprintf(buf, llvm::array_lengthof(buf), "psll%s",
            TypeX8632Attributes[getDest()->getType()].PackString);
@@ -1225,7 +1289,9 @@
 
 template <> void InstX8632Psra::emit(const Cfg *Func) const {
   assert(getDest()->getType() == IceType_v8i16 ||
-         getDest()->getType() == IceType_v4i32);
+         getDest()->getType() == IceType_v8i1 ||
+         getDest()->getType() == IceType_v4i32 ||
+         getDest()->getType() == IceType_v4i1);
   char buf[30];
   snprintf(buf, llvm::array_lengthof(buf), "psra%s",
            TypeX8632Attributes[getDest()->getType()].PackString);
diff --git a/src/IceInstX8632.def b/src/IceInstX8632.def
index ece6a0a..932500c 100644
--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -88,9 +88,9 @@
   X(IceType_i64,   IceType_void, "si", ""  , "" ,  "qword ptr")   \
   X(IceType_f32,   IceType_void, "ss", "ss", "" ,  "dword ptr")   \
   X(IceType_f64,   IceType_void, "sd", "sd", "" ,  "qword ptr")   \
-  X(IceType_v4i1,  IceType_i32 , "?" , ""  , "" ,  "xmmword ptr") \
-  X(IceType_v8i1,  IceType_i16 , "?" , ""  , "" ,  "xmmword ptr") \
-  X(IceType_v16i1, IceType_i8  , "?" , ""  , "" ,  "xmmword ptr") \
+  X(IceType_v4i1,  IceType_i32 , "?" , ""  , "d",  "xmmword ptr") \
+  X(IceType_v8i1,  IceType_i16 , "?" , ""  , "w",  "xmmword ptr") \
+  X(IceType_v16i1, IceType_i8  , "?" , ""  , "b",  "xmmword ptr") \
   X(IceType_v16i8, IceType_i8  , "?" , ""  , "b",  "xmmword ptr") \
   X(IceType_v8i16, IceType_i16 , "?" , ""  , "w",  "xmmword ptr") \
   X(IceType_v4i32, IceType_i32 , "dq", ""  , "d",  "xmmword ptr") \
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index ddea6b5..7c12f11 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -138,6 +138,7 @@
     Addps,
     Addss,
     And,
+    Blendvps,
     Br,
     Bsf,
     Bsr,
@@ -157,6 +158,7 @@
     Icmp,
     Idiv,
     Imul,
+    Insertps,
     Label,
     Lea,
     Load,
@@ -176,11 +178,12 @@
     Padd,
     Pand,
     Pandn,
+    Pblendvb,
     Pcmpeq,
     Pcmpgt,
-    Pextrw,
-    Pinsrw,
-    Pmullw,
+    Pextr,
+    Pinsr,
+    Pmull,
     Pmuludq,
     Pop,
     Por,
@@ -573,7 +576,7 @@
 typedef InstX8632Binop<InstX8632::Imul> InstX8632Imul;
 typedef InstX8632Binop<InstX8632::Mulps> InstX8632Mulps;
 typedef InstX8632Binop<InstX8632::Mulss> InstX8632Mulss;
-typedef InstX8632Binop<InstX8632::Pmullw> InstX8632Pmullw;
+typedef InstX8632Binop<InstX8632::Pmull> InstX8632Pmull;
 typedef InstX8632Binop<InstX8632::Pmuludq> InstX8632Pmuludq;
 typedef InstX8632Binop<InstX8632::Divps> InstX8632Divps;
 typedef InstX8632Binop<InstX8632::Divss> InstX8632Divss;
@@ -594,9 +597,12 @@
 typedef InstX8632Binop<InstX8632::Movss> InstX8632Movss;
 typedef InstX8632Ternop<InstX8632::Idiv> InstX8632Idiv;
 typedef InstX8632Ternop<InstX8632::Div> InstX8632Div;
-typedef InstX8632Ternop<InstX8632::Pinsrw> InstX8632Pinsrw;
+typedef InstX8632Ternop<InstX8632::Insertps> InstX8632Insertps;
+typedef InstX8632Ternop<InstX8632::Pinsr> InstX8632Pinsr;
 typedef InstX8632Ternop<InstX8632::Shufps> InstX8632Shufps;
-typedef InstX8632ThreeAddressop<InstX8632::Pextrw> InstX8632Pextrw;
+typedef InstX8632Ternop<InstX8632::Blendvps> InstX8632Blendvps;
+typedef InstX8632Ternop<InstX8632::Pblendvb> InstX8632Pblendvb;
+typedef InstX8632ThreeAddressop<InstX8632::Pextr> InstX8632Pextr;
 typedef InstX8632ThreeAddressop<InstX8632::Pshufd> InstX8632Pshufd;
 
 // Base class for a lockable x86-32 instruction (emits a locked prefix).
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 00db25a..cc6f222 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -22,6 +22,7 @@
 #include "IceOperand.h"
 #include "IceTargetLoweringX8632.def"
 #include "IceTargetLoweringX8632.h"
+#include "llvm/Support/CommandLine.h"
 
 namespace Ice {
 
@@ -123,6 +124,17 @@
 // The number of bits in a byte
 const unsigned X86_CHAR_BIT = 8;
 
+// Instruction set options
+namespace cl = ::llvm::cl;
+cl::opt<TargetX8632::X86InstructionSet> CLInstructionSet(
+    "mattr", cl::desc("X86 target attributes"),
+    cl::init(TargetX8632::SSE2),
+    cl::values(
+        clEnumValN(TargetX8632::SSE2, "sse2",
+                   "Enable SSE2 instructions (default)"),
+        clEnumValN(TargetX8632::SSE4_1, "sse4.1",
+                   "Enable SSE 4.1 instructions"), clEnumValEnd));
+
 // Return a string representation of the type that is suitable for use
 // in an identifier.
 IceString typeIdentString(const Type Ty) {
@@ -234,8 +246,9 @@
 } // end of anonymous namespace
 
 TargetX8632::TargetX8632(Cfg *Func)
-    : TargetLowering(Func), IsEbpBasedFrame(false), FrameSizeLocals(0),
-      LocalsSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false),
+    : TargetLowering(Func), InstructionSet(CLInstructionSet),
+      IsEbpBasedFrame(false), FrameSizeLocals(0), LocalsSizeBytes(0),
+      NextLabelNumber(0), ComputedLiveRanges(false),
       PhysicalRegisters(VarList(Reg_NUM)) {
   // TODO: Don't initialize IntegerRegisters and friends every time.
   // Instead, initialize in some sort of static initializer for the
@@ -1228,7 +1241,16 @@
       _movp(Dest, T);
     } break;
     case InstArithmetic::Mul: {
-      if (Dest->getType() == IceType_v4i32) {
+      bool TypesAreValidForPmull =
+          Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16;
+      bool InstructionSetIsValidForPmull =
+          Dest->getType() == IceType_v8i16 || InstructionSet >= SSE4_1;
+      if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
+        Variable *T = makeReg(Dest->getType());
+        _movp(T, Src0);
+        _pmull(T, legalizeToVar(Src1));
+        _movp(Dest, T);
+      } else if (Dest->getType() == IceType_v4i32) {
         // Lowering sequence:
         // Note: The mask arguments have index 0 on the left.
         //
@@ -1243,8 +1265,6 @@
         // shufps  T1, T2, {0,2,0,2}
         // pshufd  T4, T1, {0,2,1,3}
         // movups  Dest, T4
-        //
-        // TODO(wala): SSE4.1 has pmulld.
 
         // Mask that directs pshufd to create a vector with entries
         // Src[1, 0, 3, 0]
@@ -1273,11 +1293,6 @@
         _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202));
         _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213));
         _movp(Dest, T4);
-      } else if (Dest->getType() == IceType_v8i16) {
-        Variable *T = makeReg(IceType_v8i16);
-        _movp(T, Src0);
-        _pmullw(T, legalizeToVar(Src1));
-        _movp(Dest, T);
       } else {
         assert(Dest->getType() == IceType_v16i8);
         // Sz_mul_v16i8
@@ -2155,10 +2170,15 @@
   Variable *ExtractedElement = makeReg(InVectorElementTy);
 
   // TODO(wala): Determine the best lowering sequences for each type.
-  if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
-    // Lower extractelement operations where the element is 32 bits
-    // wide with pshufd.
-    // TODO(wala): SSE4.1 has extractps and pextrd
+  bool CanUsePextr =
+      Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1;
+  if (CanUsePextr && Ty != IceType_v4f32) {
+    // Use pextrb, pextrw, or pextrd.
+    Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
+    Variable *SourceVectR = legalizeToVar(SourceVectOperand);
+    _pextr(ExtractedElement, SourceVectR, Mask);
+  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
+    // Use pshufd and movd/movss.
     //
     // ALIGNHACK: Force vector operands to registers in instructions that
     // require aligned memory operands until support for stack alignment
@@ -2187,13 +2207,9 @@
       _movss(ExtractedElement, T);
     }
 #undef ALIGN_HACK
-  } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) {
-    Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
-    _pextrw(ExtractedElement, legalizeToVar(SourceVectOperand), Mask);
   } else {
     assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
     // Spill the value to a stack slot and do the extraction in memory.
-    // TODO(wala): SSE4.1 has pextrb.
     //
     // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when
     // support for legalizing to mem is implemented.
@@ -2539,10 +2555,18 @@
     ElementToInsert = Expanded;
   }
 
-  if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
-    // Lower insertelement with 32-bit wide elements using shufps or
-    // movss.
-    // TODO(wala): SSE4.1 has pinsrd and insertps.
+  if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) {
+    // Use insertps, pinsrb, pinsrw, or pinsrd.
+    Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg);
+    Variable *T = makeReg(Ty);
+    _movp(T, SourceVectOperand);
+    if (Ty == IceType_v4f32)
+      _insertps(T, Element, Ctx->getConstantInt(IceType_i8, Index << 4));
+    else
+      _pinsr(T, Element, Ctx->getConstantInt(IceType_i8, Index));
+    _movp(Inst->getDest(), T);
+  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
+    // Use shufps or movss.
     Variable *Element = NULL;
     if (InVectorElementTy == IceType_f32) {
       // Element will be in an XMM register since it is floating point.
@@ -2607,17 +2631,10 @@
       _movp(Inst->getDest(), T);
     }
 #undef ALIGN_HACK
-  } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) {
-    Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg);
-    Variable *T = makeReg(Ty);
-    _movp(T, SourceVectOperand);
-    _pinsrw(T, Element, Ctx->getConstantInt(IceType_i8, Index));
-    _movp(Inst->getDest(), T);
   } else {
     assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
     // Spill the value to a stack slot and perform the insertion in
     // memory.
-    // TODO(wala): SSE4.1 has pinsrb.
     //
     // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when
     // support for legalizing to mem is implemented.
@@ -3551,11 +3568,42 @@
   Operand *Condition = Inst->getCondition();
 
   if (isVectorType(Dest->getType())) {
-    // a=d?b:c ==> d=sext(d); a=(b&d)|(c&~d)
-    // TODO(wala): SSE4.1 has blendvps and pblendvb.  SSE4.1 also has
-    // blendps and pblendw for constant condition operands.
     Type SrcTy = SrcT->getType();
     Variable *T = makeReg(SrcTy);
+    // ALIGNHACK: Until stack alignment support is implemented, vector
+    // instructions need to have vector operands in registers.  Once
+    // there is support for stack alignment, LEGAL_HACK can be removed.
+#define LEGAL_HACK(Vect) legalizeToVar((Vect))
+    if (InstructionSet >= SSE4_1) {
+      // TODO(wala): If the condition operand is a constant, use blendps
+      // or pblendw.
+      //
+      // Use blendvps or pblendvb to implement select.
+      if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
+          SrcTy == IceType_v4f32) {
+        Variable *xmm0 = makeReg(IceType_v4i32, Reg_xmm0);
+        _movp(xmm0, Condition);
+        _psll(xmm0, Ctx->getConstantInt(IceType_i8, 31));
+        _movp(T, SrcF);
+        _blendvps(T, LEGAL_HACK(SrcT), xmm0);
+        _movp(Dest, T);
+      } else {
+        assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
+        Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16
+            : IceType_v16i8;
+        Variable *xmm0 = makeReg(SignExtTy, Reg_xmm0);
+        lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
+        _movp(T, SrcF);
+        _pblendvb(T, LEGAL_HACK(SrcT), xmm0);
+        _movp(Dest, T);
+      }
+      return;
+    }
+    // Lower select without SSE4.1:
+    // a=d?b:c ==>
+    //   if elementtype(d) != i1:
+    //      d=sext(d);
+    //   a=(b&d)|(c&~d);
     Variable *T2 = makeReg(SrcTy);
     // Sign extend the condition operand if applicable.
     if (SrcTy == IceType_v4f32) {
@@ -3568,11 +3616,6 @@
     } else {
       _movp(T, Condition);
     }
-    // ALIGNHACK: Until stack alignment support is implemented, the
-    // bitwise vector instructions need to have both operands in
-    // registers.  Once there is support for stack alignment, LEGAL_HACK
-    // can be removed.
-#define LEGAL_HACK(Vect) legalizeToVar((Vect))
     _movp(T2, T);
     _pand(T, LEGAL_HACK(SrcT));
     _pandn(T2, LEGAL_HACK(SrcF));
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index daca0cd..0c87bee 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -71,6 +71,14 @@
         Reg_NUM
   };
 
+  enum X86InstructionSet {
+    // SSE2 is the PNaCl baseline instruction set.
+    SSE2,
+    SSE4_1
+  };
+
+  X86InstructionSet getInstructionSet() const { return InstructionSet; }
+
 protected:
   TargetX8632(Cfg *Func);
 
@@ -186,6 +194,9 @@
   void _and(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632And::create(Func, Dest, Src0));
   }
+  void _blendvps(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Blendvps::create(Func, Dest, Src0, Src1));
+  }
   void _br(InstX8632::BrCond Condition, CfgNode *TargetTrue,
            CfgNode *TargetFalse) {
     Context.insert(
@@ -260,6 +271,9 @@
   void _imul(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Imul::create(Func, Dest, Src0));
   }
+  void _insertps(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Insertps::create(Func, Dest, Src0, Src1));
+  }
   void _lea(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Lea::create(Func, Dest, Src0));
   }
@@ -317,20 +331,23 @@
   void _pandn(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Pandn::create(Func, Dest, Src0));
   }
+  void _pblendvb(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Pblendvb::create(Func, Dest, Src0, Src1));
+  }
   void _pcmpeq(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Pcmpeq::create(Func, Dest, Src0));
   }
   void _pcmpgt(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Pcmpgt::create(Func, Dest, Src0));
   }
-  void _pextrw(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert(InstX8632Pextrw::create(Func, Dest, Src0, Src1));
+  void _pextr(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Pextr::create(Func, Dest, Src0, Src1));
   }
-  void _pinsrw(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert(InstX8632Pinsrw::create(Func, Dest, Src0, Src1));
+  void _pinsr(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Pinsr::create(Func, Dest, Src0, Src1));
   }
-  void _pmullw(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Pmullw::create(Func, Dest, Src0));
+  void _pmull(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Pmull::create(Func, Dest, Src0));
   }
   void _pmuludq(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Pmuludq::create(Func, Dest, Src0));
@@ -428,6 +445,7 @@
     Context.insert(InstX8632Xor::create(Func, Dest, Src0));
   }
 
+  const X86InstructionSet InstructionSet;
   bool IsEbpBasedFrame;
   size_t FrameSizeLocals;
   size_t LocalsSizeBytes;
diff --git a/tests_lit/llvm2ice_tests/vector-arith.ll b/tests_lit/llvm2ice_tests/vector-arith.ll
index d300317..94acfe0 100644
--- a/tests_lit/llvm2ice_tests/vector-arith.ll
+++ b/tests_lit/llvm2ice_tests/vector-arith.ll
@@ -2,10 +2,18 @@
 
 ; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
+; RUN:                | FileCheck %s --check-prefix=SSE41
+; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
+; RUN:                | FileCheck %s --check-prefix=SSE41
 ; RUN: %llvm2ice -O2 --verbose none %s \
 ; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
 ; RUN: %llvm2ice -Om1 --verbose none %s \
 ; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
 ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
 ; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
@@ -306,6 +314,9 @@
 ; CHECK-LABEL: test_mul_v4i32:
 ; CHECK: pmuludq
 ; CHECK: pmuludq
+;
+; SSE41-LABEL: test_mul_v4i32:
+; SSE41: pmulld
 }
 
 define <4 x i32> @test_shl_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -314,6 +325,9 @@
   ret <4 x i32> %res
 ; CHECK-LABEL: test_shl_v4i32:
 ; CHECK: Sz_shl_v4i32
+
+; The label below closes test_mul_v4i32's block, so pmulld must match in it.
+; SSE41-LABEL: test_shl_v4i32:
 }
 
 define <4 x i32> @test_lshr_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
diff --git a/tests_lit/llvm2ice_tests/vector-ops.ll b/tests_lit/llvm2ice_tests/vector-ops.ll
index 86647db..c730d73 100644
--- a/tests_lit/llvm2ice_tests/vector-ops.ll
+++ b/tests_lit/llvm2ice_tests/vector-ops.ll
@@ -1,10 +1,19 @@
 ; This checks support for insertelement and extractelement.
 
-; RUN: %llvm2ice --verbose inst %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
+; RUN:                | FileCheck %s --check-prefix=SSE41
+; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
+; RUN:                | FileCheck %s --check-prefix=SSE41
 ; RUN: %llvm2ice -O2 --verbose none %s \
 ; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
 ; RUN: %llvm2ice -Om1 --verbose none %s \
 ; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
 ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
 ; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
@@ -18,6 +27,9 @@
   ret <4 x float> %res
 ; CHECK-LABEL: insertelement_v4f32_0:
 ; CHECK: movss
+
+; SSE41-LABEL: insertelement_v4f32_0:
+; SSE41: insertps {{.*}}, {{.*}}, 0
 }
 
 define <4 x i32> @insertelement_v4i32_0(<4 x i32> %vec, i32 %elt) {
@@ -26,6 +38,9 @@
   ret <4 x i32> %res
 ; CHECK-LABEL: insertelement_v4i32_0:
 ; CHECK: movss
+
+; SSE41-LABEL: insertelement_v4i32_0:
+; SSE41: pinsrd {{.*}}, {{.*}}, 0
 }
 
 
@@ -36,6 +51,9 @@
 ; CHECK-LABEL: insertelement_v4f32_1:
 ; CHECK: shufps
 ; CHECK: shufps
+
+; SSE41-LABEL: insertelement_v4f32_1:
+; SSE41: insertps {{.*}}, {{.*}}, 16
 }
 
 define <4 x i32> @insertelement_v4i32_1(<4 x i32> %vec, i32 %elt) {
@@ -45,6 +63,9 @@
 ; CHECK-LABEL: insertelement_v4i32_1:
 ; CHECK: shufps
 ; CHECK: shufps
+
+; SSE41-LABEL: insertelement_v4i32_1:
+; SSE41: pinsrd {{.*}}, {{.*}}, 1
 }
 
 define <8 x i16> @insertelement_v8i16(<8 x i16> %vec, i32 %elt.arg) {
@@ -52,8 +73,11 @@
   %elt = trunc i32 %elt.arg to i16
   %res = insertelement <8 x i16> %vec, i16 %elt, i32 1
   ret <8 x i16> %res
-; CHECK-LABEL: insertelement_v8i16
+; CHECK-LABEL: insertelement_v8i16:
 ; CHECK: pinsrw
+
+; SSE41-LABEL: insertelement_v8i16:
+; SSE41: pinsrw
 }
 
 define <16 x i8> @insertelement_v16i8(<16 x i8> %vec, i32 %elt.arg) {
@@ -65,6 +89,9 @@
 ; CHECK: movups
 ; CHECK: lea
 ; CHECK: mov
+
+; SSE41-LABEL: insertelement_v16i8:
+; SSE41: pinsrb
 }
 
 define <4 x i1> @insertelement_v4i1_0(<4 x i1> %vec, i32 %elt.arg) {
@@ -74,6 +101,9 @@
   ret <4 x i1> %res
 ; CHECK-LABEL: insertelement_v4i1_0:
 ; CHECK: movss
+
+; SSE41-LABEL: insertelement_v4i1_0:
+; SSE41: pinsrd {{.*}}, {{.*}}, 0
 }
 
 define <4 x i1> @insertelement_v4i1_1(<4 x i1> %vec, i32 %elt.arg) {
@@ -84,6 +114,9 @@
 ; CHECK-LABEL: insertelement_v4i1_1:
 ; CHECK: shufps
 ; CHECK: shufps
+
+; SSE41-LABEL: insertelement_v4i1_1:
+; SSE41: pinsrd {{.*}}, {{.*}}, 1
 }
 
 define <8 x i1> @insertelement_v8i1(<8 x i1> %vec, i32 %elt.arg) {
@@ -93,6 +126,9 @@
   ret <8 x i1> %res
 ; CHECK-LABEL: insertelement_v8i1:
 ; CHECK: pinsrw
+
+; SSE41-LABEL: insertelement_v8i1:
+; SSE41: pinsrw
 }
 
 define <16 x i1> @insertelement_v16i1(<16 x i1> %vec, i32 %elt.arg) {
@@ -104,6 +140,9 @@
 ; CHECK: movups
 ; CHECK: lea
 ; CHECK: mov
+
+; SSE41-LABEL: insertelement_v16i1:
+; SSE41: pinsrb
 }
 
 ; extractelement operations
@@ -114,6 +153,9 @@
   ret float %res
 ; CHECK-LABEL: extractelement_v4f32:
 ; CHECK: pshufd
+
+; SSE41-LABEL: extractelement_v4f32:
+; SSE41: pshufd
 }
 
 define i32 @extractelement_v4i32(<4 x i32> %vec) {
@@ -122,6 +164,9 @@
   ret i32 %res
 ; CHECK-LABEL: extractelement_v4i32:
 ; CHECK: pshufd
+
+; SSE41-LABEL: extractelement_v4i32:
+; SSE41: pextrd
 }
 
 define i32 @extractelement_v8i16(<8 x i16> %vec) {
@@ -131,6 +176,9 @@
   ret i32 %res.ext
 ; CHECK-LABEL: extractelement_v8i16:
 ; CHECK: pextrw
+
+; SSE41-LABEL: extractelement_v8i16:
+; SSE41: pextrw
 }
 
 define i32 @extractelement_v16i8(<16 x i8> %vec) {
@@ -142,6 +190,9 @@
 ; CHECK: movups
 ; CHECK: lea
 ; CHECK: mov
+
+; SSE41-LABEL: extractelement_v16i8:
+; SSE41: pextrb
 }
 
 define i32 @extractelement_v4i1(<4 x i1> %vec) {
@@ -151,6 +202,9 @@
   ret i32 %res.ext
 ; CHECK-LABEL: extractelement_v4i1:
 ; CHECK: pshufd
+
+; SSE41-LABEL: extractelement_v4i1:
+; SSE41: pextrd
 }
 
 define i32 @extractelement_v8i1(<8 x i1> %vec) {
@@ -160,6 +214,9 @@
   ret i32 %res.ext
 ; CHECK-LABEL: extractelement_v8i1:
 ; CHECK: pextrw
+
+; SSE41-LABEL: extractelement_v8i1:
+; SSE41: pextrw
 }
 
 define i32 @extractelement_v16i1(<16 x i1> %vec) {
@@ -171,6 +228,9 @@
 ; CHECK: movups
 ; CHECK: lea
 ; CHECK: mov
+
+; SSE41-LABEL: extractelement_v16i1:
+; SSE41: pextrb
 }
 
 ; ERRORS-NOT: ICE translation error
diff --git a/tests_lit/llvm2ice_tests/vector-select.ll b/tests_lit/llvm2ice_tests/vector-select.ll
index 93f5941..67270fa 100644
--- a/tests_lit/llvm2ice_tests/vector-select.ll
+++ b/tests_lit/llvm2ice_tests/vector-select.ll
@@ -2,10 +2,18 @@
 
 ; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
+; RUN:                | FileCheck %s --check-prefix=SSE41
+; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
+; RUN:                | FileCheck %s --check-prefix=SSE41
 ; RUN: %llvm2ice -O2 --verbose none %s \
 ; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
 ; RUN: %llvm2ice -Om1 --verbose none %s \
 ; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
 ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
 ; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
@@ -19,6 +27,9 @@
 ; CHECK: pand
 ; CHECK: pandn
 ; CHECK: por
+
+; SSE41-LABEL: test_select_v16i8:
+; SSE41: pblendvb
 }
 
 define <16 x i1> @test_select_v16i1(<16 x i1> %cond, <16 x i1> %arg1, <16 x i1> %arg2) {
@@ -29,6 +40,9 @@
 ; CHECK: pand
 ; CHECK: pandn
 ; CHECK: por
+
+; SSE41-LABEL: test_select_v16i1:
+; SSE41: pblendvb
 }
 
 define <8 x i16> @test_select_v8i16(<8 x i1> %cond, <8 x i16> %arg1, <8 x i16> %arg2) {
@@ -39,6 +53,9 @@
 ; CHECK: pand
 ; CHECK: pandn
 ; CHECK: por
+
+; SSE41-LABEL: test_select_v8i16:
+; SSE41: pblendvb
 }
 
 define <8 x i1> @test_select_v8i1(<8 x i1> %cond, <8 x i1> %arg1, <8 x i1> %arg2) {
@@ -49,6 +66,9 @@
 ; CHECK: pand
 ; CHECK: pandn
 ; CHECK: por
+
+; SSE41-LABEL: test_select_v8i1:
+; SSE41: pblendvb
 }
 
 define <4 x i32> @test_select_v4i32(<4 x i1> %cond, <4 x i32> %arg1, <4 x i32> %arg2) {
@@ -59,6 +79,10 @@
 ; CHECK: pand
 ; CHECK: pandn
 ; CHECK: por
+
+; SSE41-LABEL: test_select_v4i32:
+; SSE41: pslld xmm0, 31
+; SSE41: blendvps
 }
 
 define <4 x float> @test_select_v4f32(<4 x i1> %cond, <4 x float> %arg1, <4 x float> %arg2) {
@@ -69,6 +93,10 @@
 ; CHECK: pand
 ; CHECK: pandn
 ; CHECK: por
+
+; SSE41-LABEL: test_select_v4f32:
+; SSE41: pslld xmm0, 31
+; SSE41: blendvps
 }
 
 define <4 x i1> @test_select_v4i1(<4 x i1> %cond, <4 x i1> %arg1, <4 x i1> %arg2) {
@@ -79,6 +107,10 @@
 ; CHECK: pand
 ; CHECK: pandn
 ; CHECK: por
+
+; SSE41-LABEL: test_select_v4i1:
+; SSE41: pslld xmm0, 31
+; SSE41: blendvps
 }
 
 ; ERRORS-NOT: ICE translation error