Subzero: Add support for SSE4.1 instructions.

* Add initial support for code generation with SSE4.1 instructions. Code
generation for the following operations can now use them:
 - multiplication of v4i32 vectors
 - select
 - insertelement
 - extractelement

* Add appropriate lit checks for SSE4.1 instructions. Run the crosstests
in both SSE2 and SSE4.1 mode.

* Introduce the -mattr flag to llvm2ice to control which instruction set
gets used.

BUG=none
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/427843002
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 7d930c2..be84554 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -484,7 +484,7 @@
 template <> const char *InstX8632Imul::Opcode = "imul";
 template <> const char *InstX8632Mulps::Opcode = "mulps";
 template <> const char *InstX8632Mulss::Opcode = "mulss";
-template <> const char *InstX8632Pmullw::Opcode = "pmullw";
+template <> const char *InstX8632Pmull::Opcode = "pmull";
 template <> const char *InstX8632Pmuludq::Opcode = "pmuludq";
 template <> const char *InstX8632Div::Opcode = "div";
 template <> const char *InstX8632Divps::Opcode = "divps";
@@ -500,10 +500,13 @@
 template <> const char *InstX8632Pcmpgt::Opcode = "pcmpgt";
 template <> const char *InstX8632Movss::Opcode = "movss";
 // Ternary ops
+template <> const char *InstX8632Insertps::Opcode = "insertps";
 template <> const char *InstX8632Shufps::Opcode = "shufps";
-template <> const char *InstX8632Pinsrw::Opcode = "pinsrw";
+template <> const char *InstX8632Pinsr::Opcode = "pinsr";
+template <> const char *InstX8632Blendvps::Opcode = "blendvps";
+template <> const char *InstX8632Pblendvb::Opcode = "pblendvb";
 // Three address ops
-template <> const char *InstX8632Pextrw::Opcode = "pextrw";
+template <> const char *InstX8632Pextr::Opcode = "pextr";
 template <> const char *InstX8632Pshufd::Opcode = "pshufd";
 
 template <> void InstX8632Sqrtss::emit(const Cfg *Func) const {
@@ -532,6 +535,23 @@
   emitTwoAddress(buf, this, Func);
 }
 
+template <> void InstX8632Pmull::emit(const Cfg *Func) const {
+  char buf[30];
+  bool TypesAreValid = getDest()->getType() == IceType_v4i32 ||
+                       getDest()->getType() == IceType_v8i16;
+  bool InstructionSetIsValid =
+      getDest()->getType() == IceType_v8i16 ||
+      static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet() >=
+          TargetX8632::SSE4_1;
+  (void)TypesAreValid;
+  (void)InstructionSetIsValid;
+  assert(TypesAreValid);
+  assert(InstructionSetIsValid);
+  snprintf(buf, llvm::array_lengthof(buf), "pmull%s",
+           TypeX8632Attributes[getDest()->getType()].PackString);
+  emitTwoAddress(buf, this, Func);
+}
+
 template <> void InstX8632Subss::emit(const Cfg *Func) const {
   char buf[30];
   snprintf(buf, llvm::array_lengthof(buf), "sub%s",
@@ -553,12 +573,6 @@
   emitTwoAddress(buf, this, Func);
 }
 
-template <> void InstX8632Pmullw::emit(const Cfg *Func) const {
-  assert(getSrc(0)->getType() == IceType_v8i16 &&
-         getSrc(1)->getType() == IceType_v8i16);
-  emitTwoAddress(Opcode, this, Func);
-}
-
 template <> void InstX8632Pmuludq::emit(const Cfg *Func) const {
   assert(getSrc(0)->getType() == IceType_v4i32 &&
          getSrc(1)->getType() == IceType_v4i32);
@@ -588,6 +602,38 @@
   Str << "\n";
 }
 
+
+namespace {
+
+// pblendvb and blendvps take xmm0 as a final implicit argument.
+void emitVariableBlendInst(const char *Opcode, const Inst *Inst,
+                           const Cfg *Func) {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(Inst->getSrcSize() == 3);
+  assert(llvm::isa<Variable>(Inst->getSrc(2)));
+  assert(llvm::cast<Variable>(Inst->getSrc(2))->getRegNum() ==
+         TargetX8632::Reg_xmm0);
+  Str << "\t" << Opcode << "\t";
+  Inst->getDest()->emit(Func);
+  Str << ", ";
+  Inst->getSrc(1)->emit(Func);
+  Str << "\n";
+}
+
+} // end anonymous namespace
+
+template <> void InstX8632Blendvps::emit(const Cfg *Func) const {
+  assert(static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet() >=
+         TargetX8632::SSE4_1);
+  emitVariableBlendInst(Opcode, this, Func);
+}
+
+template <> void InstX8632Pblendvb::emit(const Cfg *Func) const {
+  assert(static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet() >=
+         TargetX8632::SSE4_1);
+  emitVariableBlendInst(Opcode, this, Func);
+}
+
 template <> void InstX8632Imul::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 2);
@@ -1127,13 +1173,19 @@
   emitTwoAddress(buf, this, Func);
 }
 
-template <> void InstX8632Pextrw::emit(const Cfg *Func) const {
+template <> void InstX8632Pextr::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 2);
-  Str << "\t" << Opcode << "\t";
+  // pextrb and pextrd are SSE4.1 instructions.
+  assert(getSrc(0)->getType() == IceType_v8i16 ||
+         getSrc(0)->getType() == IceType_v8i1 ||
+         static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet()
+             >= TargetX8632::SSE4_1);
+  Str << "\t" << Opcode
+      << TypeX8632Attributes[getSrc(0)->getType()].PackString << "\t";
   Variable *Dest = getDest();
-  assert(Dest->hasReg() && Dest->getType() == IceType_i16);
-  // pextrw takes r32 dest.
+  // pextrw must take a register dest.
+  assert(Dest->getType() != IceType_i16 || Dest->hasReg());
   Dest->asType(IceType_i32).emit(Func);
   Str << ", ";
   getSrc(0)->emit(Func);
@@ -1142,16 +1194,26 @@
   Str << "\n";
 }
 
-template <> void InstX8632Pinsrw::emit(const Cfg *Func) const {
+template <> void InstX8632Pinsr::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 3);
-  Str << "\t" << Opcode << "\t";
+  // pinsrb and pinsrd are SSE4.1 instructions.
+  assert(getDest()->getType() == IceType_v8i16 ||
+         getDest()->getType() == IceType_v8i1 ||
+         static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet()
+             >= TargetX8632::SSE4_1);
+  Str << "\t" << Opcode
+      << TypeX8632Attributes[getDest()->getType()].PackString << "\t";
   getDest()->emit(Func);
   Str << ", ";
   Operand *Src1 = getSrc(1);
   if (Variable *VSrc1 = llvm::dyn_cast<Variable>(Src1)) {
-    // If src1 is a register, it should be r32.
-    VSrc1->asType(VSrc1->hasReg() ? IceType_i32 : IceType_i16).emit(Func);
+    // If src1 is a register, it should always be r32.
+    if (VSrc1->hasReg()) {
+      VSrc1->asType(IceType_i32).emit(Func);
+    } else {
+      VSrc1->emit(Func);
+    }
   } else {
     Src1->emit(Func);
   }
@@ -1216,7 +1278,9 @@
 
 template <> void InstX8632Psll::emit(const Cfg *Func) const {
   assert(getDest()->getType() == IceType_v8i16 ||
-         getDest()->getType() == IceType_v4i32);
+         getDest()->getType() == IceType_v8i1 ||
+         getDest()->getType() == IceType_v4i32 ||
+         getDest()->getType() == IceType_v4i1);
   char buf[30];
   snprintf(buf, llvm::array_lengthof(buf), "psll%s",
            TypeX8632Attributes[getDest()->getType()].PackString);
@@ -1225,7 +1289,9 @@
 
 template <> void InstX8632Psra::emit(const Cfg *Func) const {
   assert(getDest()->getType() == IceType_v8i16 ||
-         getDest()->getType() == IceType_v4i32);
+         getDest()->getType() == IceType_v8i1 ||
+         getDest()->getType() == IceType_v4i32 ||
+         getDest()->getType() == IceType_v4i1);
   char buf[30];
   snprintf(buf, llvm::array_lengthof(buf), "psra%s",
            TypeX8632Attributes[getDest()->getType()].PackString);