Handle "inplace" ops and unary ops w/ assembler

Extend the bswap test with a case that creates some register pressure, so
that more register encodings get exercised (at first it wasn't clear whether
the opcode byte is 0xC8 + reg or 0xC8 | reg, but the two are identical
because reg is always in the range 0-7).

BUG=none
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/595093002
diff --git a/crosstest/test_bitmanip.cpp b/crosstest/test_bitmanip.cpp
index 7106767..181286e 100644
--- a/crosstest/test_bitmanip.cpp
+++ b/crosstest/test_bitmanip.cpp
@@ -40,6 +40,18 @@
 #undef X
 
 #define X(type, builtin_name)                                                  \
-  type test_bswap(type a) { return builtin_name(a); }
+  type test_bswap(type a) { return builtin_name(a); }                          \
+  type test_bswap_alloca(type a) {                                             \
+    const size_t buf_size = 8;                                                 \
+    type buf[buf_size];                                                        \
+    for (size_t i = 0; i < buf_size; ++i) {                                    \
+      buf[i] = builtin_name(a * i) + builtin_name(a + i);                      \
+    }                                                                          \
+    type sum = 0;                                                              \
+    for (size_t i = 0; i < buf_size; ++i) {                                    \
+      sum += buf[i];                                                           \
+    }                                                                          \
+    return sum;                                                                \
+  }
 BSWAP_TABLE
 #undef X
diff --git a/crosstest/test_bitmanip.h b/crosstest/test_bitmanip.h
index d283d7f..c24c679 100644
--- a/crosstest/test_bitmanip.h
+++ b/crosstest/test_bitmanip.h
@@ -23,6 +23,8 @@
 FOR_ALL_BMI_OP_TYPES(X)
 #undef X
 
-#define X(type, builtin_name) type test_bswap(type);
+#define X(type, builtin_name)                                                  \
+  type test_bswap(type);                                                       \
+  type test_bswap_alloca(type);
 BSWAP_TABLE
 #undef X
diff --git a/crosstest/test_bitmanip_main.cpp b/crosstest/test_bitmanip_main.cpp
index 89dc8ba..592ad7e 100644
--- a/crosstest/test_bitmanip_main.cpp
+++ b/crosstest/test_bitmanip_main.cpp
@@ -102,19 +102,30 @@
 
 template <typename Type>
 void testByteSwap(size_t &TotalTests, size_t &Passes, size_t &Failures) {
-  for (size_t i = 0; i < NumValues; ++i) {
-    Type Value = static_cast<Type>(Values[i]);
-    ++TotalTests;
-    Type ResultSz = test_bswap(Value);
-    Type ResultLlc = Subzero_::test_bswap(Value);
-    if (ResultSz == ResultLlc) {
-      ++Passes;
-    } else {
-      ++Failures;
-      std::cout << "test_bswap" << (CHAR_BIT * sizeof(Type)) << "("
-                << static_cast<uint64_t>(Value)
-                << "): sz=" << static_cast<uint64_t>(ResultSz)
-                << " llc=" << static_cast<uint64_t>(ResultLlc) << "\n";
+  typedef Type (*FuncType)(Type);
+  static struct {
+    const char *Name;
+    FuncType FuncLlc;
+    FuncType FuncSz;
+  } Funcs[] = {
+        {"bswap", test_bswap, Subzero_::test_bswap},
+        {"bswap_alloca", test_bswap_alloca, Subzero_::test_bswap_alloca}};
+  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+  for (size_t f = 0; f < NumFuncs; ++f) {
+    for (size_t i = 0; i < NumValues; ++i) {
+      Type Value = static_cast<Type>(Values[i]);
+      ++TotalTests;
+      Type ResultSz = Funcs[f].FuncSz(Value);
+      Type ResultLlc = Funcs[f].FuncLlc(Value);
+      if (ResultSz == ResultLlc) {
+        ++Passes;
+      } else {
+        ++Failures;
+        std::cout << "test_" << Funcs[f].Name << (CHAR_BIT * sizeof(Type))
+                  << "(" << static_cast<uint64_t>(Value)
+                  << "): sz=" << static_cast<uint64_t>(ResultSz)
+                  << " llc=" << static_cast<uint64_t>(ResultLlc) << "\n";
+      }
     }
   }
 }
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 3fd74f9..72d7545 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -478,10 +478,66 @@
   Str << "\n";
 }
 
+// Emit Var's one-operand GPR op via Emitter.Reg, or Emitter.Addr if spilled.
+void emitIASVarTyGPR(const Cfg *Func, Type Ty, const Variable *Var,
+                     const x86::AssemblerX86::GPREmitterOneOp &Emitter) {
+  x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
+  intptr_t StartPosition = Asm->GetPosition();
+  if (Var->hasReg()) {
+    // We cheat a little and use GPRRegister even for byte operations.
+    (Asm->*(Emitter.Reg))(
+        Ty, RegX8632::getEncodedByteRegOrGPR(Ty, Var->getRegNum()));
+  } else {
+    assert(Emitter.Addr && "reg-only op (e.g. bswap) has no memory form");
+    x86::Address StackAddr(static_cast<TargetX8632 *>(Func->getTarget())
+                               ->stackVarToAsmOperand(Var));
+    (Asm->*(Emitter.Addr))(Ty, StackAddr);
+  }
+  emitIASBytes(Func->getContext()->getStrEmit(), Asm, StartPosition);
+}
+
+// Emit "op Var, Src" for a GPR instruction whose destination Var is in a
+// register. Src may be a Variable (register or stack), an OperandX8632Mem,
+// or a ConstantInteger32, and selects Emitter.GPRGPR/GPRAddr/GPRImm. An
+// Emitter entry may be NULL when the instruction lacks that form.
+void emitIASRegOpTyGPR(const Cfg *Func, Type Ty, const Variable *Var,
+                       const Operand *Src,
+                       const x86::AssemblerX86::GPREmitterRegOp &Emitter) {
+  x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
+  intptr_t StartPosition = Asm->GetPosition();
+  assert(Var->hasReg());
+  // We cheat a little and use GPRRegister even for byte operations.
+  RegX8632::GPRRegister VarReg =
+      RegX8632::getEncodedByteRegOrGPR(Ty, Var->getRegNum());
+  if (const Variable *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    if (SrcVar->hasReg()) {
+      // Same byte-vs-full-width encoding rule as for the destination.
+      RegX8632::GPRRegister SrcReg =
+          RegX8632::getEncodedByteRegOrGPR(Ty, SrcVar->getRegNum());
+      (Asm->*(Emitter.GPRGPR))(Ty, VarReg, SrcReg);
+    } else {
+      x86::Address SrcStackAddr = static_cast<TargetX8632 *>(Func->getTarget())
+                                      ->stackVarToAsmOperand(SrcVar);
+      (Asm->*(Emitter.GPRAddr))(Ty, VarReg, SrcStackAddr);
+    }
+  } else if (const OperandX8632Mem *Mem =
+                 llvm::dyn_cast<OperandX8632Mem>(Src)) {
+    x86::Address SrcAddr = Mem->toAsmAddress(Asm);
+    (Asm->*(Emitter.GPRAddr))(Ty, VarReg, SrcAddr);
+  } else if (const ConstantInteger32 *Imm =
+                 llvm::dyn_cast<ConstantInteger32>(Src)) {
+    (Asm->*(Emitter.GPRImm))(Ty, VarReg, x86::Immediate(Imm->getValue()));
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+  Ostream &Str = Func->getContext()->getStrEmit();
+  emitIASBytes(Str, Asm, StartPosition);
+}
+
 void
 emitIASVarOperandTyXMM(const Cfg *Func, Type Ty, const Variable *Var,
                        const Operand *Src,
-                       const x86::AssemblerX86::TypedXmmEmitters &Emitter) {
+                       const x86::AssemblerX86::XmmEmitterTwoOps &Emitter) {
   x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
   intptr_t StartPosition = Asm->GetPosition();
   assert(Var->hasReg());
@@ -586,54 +642,75 @@
 template <> const char *InstX8632Pextr::Opcode = "pextr";
 template <> const char *InstX8632Pshufd::Opcode = "pshufd";
 
+// Inplace GPR ops: {Reg, Addr} emitters; NULL marks an unavailable form.
+template <>
+const x86::AssemblerX86::GPREmitterOneOp InstX8632Bswap::Emitter = {
+    &x86::AssemblerX86::bswap, NULL /* only a reg form exists */};
+template <>
+const x86::AssemblerX86::GPREmitterOneOp InstX8632Neg::Emitter = {
+    &x86::AssemblerX86::neg, &x86::AssemblerX86::neg};
+
+// Unary GPR ops: {GPRGPR, GPRAddr, GPRImm} emitters; NULL as above.
+template <>
+const x86::AssemblerX86::GPREmitterRegOp InstX8632Bsf::Emitter = {
+    &x86::AssemblerX86::bsf, &x86::AssemblerX86::bsf, NULL};
+template <>
+const x86::AssemblerX86::GPREmitterRegOp InstX8632Bsr::Emitter = {
+    &x86::AssemblerX86::bsr, &x86::AssemblerX86::bsr, NULL};
+template <>
+const x86::AssemblerX86::GPREmitterRegOp InstX8632Lea::Emitter = {
+    /* reg/reg and reg/imm are illegal */ NULL, &x86::AssemblerX86::lea, NULL};
+
+// Unary XMM ops
+template <>
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Sqrtss::Emitter = {
+    &x86::AssemblerX86::sqrtss, &x86::AssemblerX86::sqrtss, NULL};
+
 // Binary XMM ops
 template <>
-const x86::AssemblerX86::TypedXmmEmitters InstX8632Addss::Emitter = {
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Addss::Emitter = {
     &x86::AssemblerX86::addss, &x86::AssemblerX86::addss, NULL};
 template <>
-const x86::AssemblerX86::TypedXmmEmitters InstX8632Addps::Emitter = {
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Addps::Emitter = {
     &x86::AssemblerX86::addps, &x86::AssemblerX86::addps, NULL};
 template <>
-const x86::AssemblerX86::TypedXmmEmitters InstX8632Divss::Emitter = {
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Divss::Emitter = {
     &x86::AssemblerX86::divss, &x86::AssemblerX86::divss, NULL};
 template <>
-const x86::AssemblerX86::TypedXmmEmitters InstX8632Divps::Emitter = {
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Divps::Emitter = {
     &x86::AssemblerX86::divps, &x86::AssemblerX86::divps, NULL};
 template <>
-const x86::AssemblerX86::TypedXmmEmitters InstX8632Mulss::Emitter = {
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Mulss::Emitter = {
     &x86::AssemblerX86::mulss, &x86::AssemblerX86::mulss, NULL};
 template <>
-const x86::AssemblerX86::TypedXmmEmitters InstX8632Mulps::Emitter = {
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Mulps::Emitter = {
     &x86::AssemblerX86::mulps, &x86::AssemblerX86::mulps, NULL};
 template <>
-const x86::AssemblerX86::TypedXmmEmitters InstX8632Padd::Emitter = {
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Padd::Emitter = {
     &x86::AssemblerX86::padd, &x86::AssemblerX86::padd, NULL};
 template <>
-const x86::AssemblerX86::TypedXmmEmitters InstX8632Pand::Emitter = {
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Pand::Emitter = {
     &x86::AssemblerX86::pand, &x86::AssemblerX86::pand, NULL};
 template <>
-const x86::AssemblerX86::TypedXmmEmitters InstX8632Pandn::Emitter = {
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Pandn::Emitter = {
     &x86::AssemblerX86::pandn, &x86::AssemblerX86::pandn, NULL};
 template <>
-const x86::AssemblerX86::TypedXmmEmitters InstX8632Pmuludq::Emitter = {
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Pmuludq::Emitter = {
     &x86::AssemblerX86::pmuludq, &x86::AssemblerX86::pmuludq, NULL};
 template <>
-const x86::AssemblerX86::TypedXmmEmitters InstX8632Por::Emitter = {
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Por::Emitter = {
     &x86::AssemblerX86::por, &x86::AssemblerX86::por, NULL};
 template <>
-const x86::AssemblerX86::TypedXmmEmitters InstX8632Psub::Emitter = {
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Psub::Emitter = {
     &x86::AssemblerX86::psub, &x86::AssemblerX86::psub, NULL};
 template <>
-const x86::AssemblerX86::TypedXmmEmitters InstX8632Pxor::Emitter = {
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Pxor::Emitter = {
     &x86::AssemblerX86::pxor, &x86::AssemblerX86::pxor, NULL};
 template <>
-const x86::AssemblerX86::TypedXmmEmitters InstX8632Sqrtss::Emitter = {
-    &x86::AssemblerX86::sqrtss, &x86::AssemblerX86::sqrtss, NULL};
-template <>
-const x86::AssemblerX86::TypedXmmEmitters InstX8632Subss::Emitter = {
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Subss::Emitter = {
     &x86::AssemblerX86::subss, &x86::AssemblerX86::subss, NULL};
 template <>
-const x86::AssemblerX86::TypedXmmEmitters InstX8632Subps::Emitter = {
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Subps::Emitter = {
     &x86::AssemblerX86::subps, &x86::AssemblerX86::subps, NULL};
 
 template <> void InstX8632Sqrtss::emit(const Cfg *Func) const {
@@ -1125,7 +1202,7 @@
   assert(llvm::isa<Variable>(getSrc(0)));
   const Variable *Src0 = llvm::cast<Variable>(getSrc(0));
   Type Ty = Src0->getType();
-  const static x86::AssemblerX86::TypedXmmEmitters Emitter = {
+  const static x86::AssemblerX86::XmmEmitterTwoOps Emitter = {
       &x86::AssemblerX86::ucomiss, &x86::AssemblerX86::ucomiss, NULL};
   emitIASVarOperandTyXMM(Func, Ty, Src0, getSrc(1), Emitter);
 }
@@ -1300,6 +1377,42 @@
   }
 }
 
+template <> void InstX8632Movd::emitIAS(const Cfg *Func) const {
+  x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
+  const intptr_t Begin = Asm->GetPosition();
+  assert(getSrcSize() == 1);
+  const Variable *To = getDest();
+  const Variable *From = llvm::cast<Variable>(getSrc(0));
+  // Insert/extract element: one side is an XMM vector, the other an i32.
+  if (From->getType() != IceType_i32) {
+    // Extract: XMM register -> i32 register or stack slot.
+    assert(isVectorType(From->getType()));
+    assert(From->hasReg());
+    assert(To->getType() == IceType_i32);
+    RegX8632::XmmRegister XmmSrc = RegX8632::getEncodedXmm(From->getRegNum());
+    if (!To->hasReg()) {
+      x86::Address Slot(static_cast<TargetX8632 *>(Func->getTarget())
+                            ->stackVarToAsmOperand(To));
+      Asm->movd(Slot, XmmSrc);
+    } else {
+      Asm->movd(RegX8632::getEncodedGPR(To->getRegNum()), XmmSrc);
+    }
+  } else {
+    // Insert: i32 register or stack slot -> XMM register.
+    assert(isVectorType(To->getType()));
+    assert(To->hasReg());
+    RegX8632::XmmRegister XmmDst = RegX8632::getEncodedXmm(To->getRegNum());
+    if (!From->hasReg()) {
+      x86::Address Slot(static_cast<TargetX8632 *>(Func->getTarget())
+                            ->stackVarToAsmOperand(From));
+      Asm->movd(XmmDst, Slot);
+    } else {
+      Asm->movd(XmmDst, RegX8632::getEncodedGPR(From->getRegNum()));
+    }
+  }
+  emitIASBytes(Func->getContext()->getStrEmit(), Asm, Begin);
+}
+
 template <> void InstX8632Movp::emit(const Cfg *Func) const {
   // TODO(wala,stichnot): movups works with all vector operands, but
   // there exist other instructions (movaps, movdqa, movdqu) that may
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index facd4f4..ac30870 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -429,13 +429,17 @@
   virtual ~InstX8632Call() {}
 };
 
+// Emit a one-operand (GPR) instruction.
+void emitIASVarTyGPR(const Cfg *Func, Type Ty, const Variable *Var,
+                     const x86::AssemblerX86::GPREmitterOneOp &Emitter);
+
 // Instructions of the form x := op(x).
 template <InstX8632::InstKindX8632 K>
-class InstX8632Inplaceop : public InstX8632 {
+class InstX8632InplaceopGPR : public InstX8632 {
 public:
-  static InstX8632Inplaceop *create(Cfg *Func, Operand *SrcDest) {
-    return new (Func->allocate<InstX8632Inplaceop>())
-        InstX8632Inplaceop(Func, SrcDest);
+  static InstX8632InplaceopGPR *create(Cfg *Func, Operand *SrcDest) {
+    return new (Func->allocate<InstX8632InplaceopGPR>())
+        InstX8632InplaceopGPR(Func, SrcDest);
   }
   virtual void emit(const Cfg *Func) const {
     Ostream &Str = Func->getContext()->getStrEmit();
@@ -444,6 +448,12 @@
     getSrc(0)->emit(Func);
     Str << "\n";
   }
+  virtual void emitIAS(const Cfg *Func) const {
+    assert(getSrcSize() == 1);
+    const Variable *Var = getDest();
+    Type Ty = Var->getType();
+    emitIASVarTyGPR(Func, Ty, Var, Emitter);
+  }
   virtual void dump(const Cfg *Func) const {
     Ostream &Str = Func->getContext()->getStrDump();
     dumpDest(Func);
@@ -453,24 +463,31 @@
   static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
 
 private:
-  InstX8632Inplaceop(Cfg *Func, Operand *SrcDest)
+  InstX8632InplaceopGPR(Cfg *Func, Operand *SrcDest)
       : InstX8632(Func, K, 1, llvm::dyn_cast<Variable>(SrcDest)) {
     addSource(SrcDest);
   }
-  InstX8632Inplaceop(const InstX8632Inplaceop &) LLVM_DELETED_FUNCTION;
-  InstX8632Inplaceop &
-  operator=(const InstX8632Inplaceop &) LLVM_DELETED_FUNCTION;
-  virtual ~InstX8632Inplaceop() {}
+  InstX8632InplaceopGPR(const InstX8632InplaceopGPR &) LLVM_DELETED_FUNCTION;
+  InstX8632InplaceopGPR &
+  operator=(const InstX8632InplaceopGPR &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632InplaceopGPR() {}
   static const char *Opcode;
+  static const x86::AssemblerX86::GPREmitterOneOp Emitter;
 };
 
+// Emit a two-operand (GPR) instruction, where the dest operand is a
+// Variable that's guaranteed to be a register.
+void emitIASRegOpTyGPR(const Cfg *Func, Type Ty, const Variable *Dst,
+                       const Operand *Src,
+                       const x86::AssemblerX86::GPREmitterRegOp &Emitter);
+
 // Instructions of the form x := op(y)
 template <InstX8632::InstKindX8632 K>
-class InstX8632Unaryop : public InstX8632 {
+class InstX8632UnaryopGPR : public InstX8632 {
 public:
-  static InstX8632Unaryop *create(Cfg *Func, Variable *Dest, Operand *Src) {
-    return new (Func->allocate<InstX8632Unaryop>())
-        InstX8632Unaryop(Func, Dest, Src);
+  static InstX8632UnaryopGPR *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    return new (Func->allocate<InstX8632UnaryopGPR>())
+        InstX8632UnaryopGPR(Func, Dest, Src);
   }
   virtual void emit(const Cfg *Func) const {
     Ostream &Str = Func->getContext()->getStrEmit();
@@ -481,7 +498,13 @@
     getSrc(0)->emit(Func);
     Str << "\n";
   }
-  virtual void emitIAS(const Cfg *Func) const { emit(Func); }
+  virtual void emitIAS(const Cfg *Func) const {
+    assert(getSrcSize() == 1);
+    const Variable *Var = getDest();
+    Type Ty = Var->getType();
+    const Operand *Src = getSrc(0);
+    emitIASRegOpTyGPR(Func, Ty, Var, Src, Emitter);
+  }
   virtual void dump(const Cfg *Func) const {
     Ostream &Str = Func->getContext()->getStrDump();
     dumpDest(Func);
@@ -491,19 +514,21 @@
   static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
 
 private:
-  InstX8632Unaryop(Cfg *Func, Variable *Dest, Operand *Src)
+  InstX8632UnaryopGPR(Cfg *Func, Variable *Dest, Operand *Src)
       : InstX8632(Func, K, 1, Dest) {
     addSource(Src);
   }
-  InstX8632Unaryop(const InstX8632Unaryop &) LLVM_DELETED_FUNCTION;
-  InstX8632Unaryop &operator=(const InstX8632Unaryop &) LLVM_DELETED_FUNCTION;
-  virtual ~InstX8632Unaryop() {}
+  InstX8632UnaryopGPR(const InstX8632UnaryopGPR &) LLVM_DELETED_FUNCTION;
+  InstX8632UnaryopGPR &
+  operator=(const InstX8632UnaryopGPR &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632UnaryopGPR() {}
   static const char *Opcode;
+  static const x86::AssemblerX86::GPREmitterRegOp Emitter;
 };
 
 void emitIASVarOperandTyXMM(const Cfg *Func, Type Ty, const Variable *Var,
                             const Operand *Src,
-                            const x86::AssemblerX86::TypedXmmEmitters &Emitter);
+                            const x86::AssemblerX86::XmmEmitterTwoOps &Emitter);
 
 template <InstX8632::InstKindX8632 K>
 class InstX8632UnaryopXmm : public InstX8632 {
@@ -544,7 +569,7 @@
   operator=(const InstX8632UnaryopXmm &) LLVM_DELETED_FUNCTION;
   virtual ~InstX8632UnaryopXmm() {}
   static const char *Opcode;
-  static const x86::AssemblerX86::TypedXmmEmitters Emitter;
+  static const x86::AssemblerX86::XmmEmitterTwoOps Emitter;
 };
 
 // See the definition of emitTwoAddress() for a description of
@@ -620,7 +645,7 @@
   InstX8632BinopXmm &operator=(const InstX8632BinopXmm &) LLVM_DELETED_FUNCTION;
   virtual ~InstX8632BinopXmm() {}
   static const char *Opcode;
-  static const x86::AssemblerX86::TypedXmmEmitters Emitter;
+  static const x86::AssemblerX86::XmmEmitterTwoOps Emitter;
 };
 
 template <InstX8632::InstKindX8632 K> class InstX8632Ternop : public InstX8632 {
@@ -741,15 +766,15 @@
   static const char *Opcode;
 };
 
-typedef InstX8632Inplaceop<InstX8632::Bswap> InstX8632Bswap;
-typedef InstX8632Inplaceop<InstX8632::Neg> InstX8632Neg;
-typedef InstX8632Unaryop<InstX8632::Bsf> InstX8632Bsf;
-typedef InstX8632Unaryop<InstX8632::Bsr> InstX8632Bsr;
-typedef InstX8632Unaryop<InstX8632::Lea> InstX8632Lea;
-typedef InstX8632Unaryop<InstX8632::Movd> InstX8632Movd;
-typedef InstX8632UnaryopXmm<InstX8632::Sqrtss> InstX8632Sqrtss;
+typedef InstX8632InplaceopGPR<InstX8632::Bswap> InstX8632Bswap;
+typedef InstX8632InplaceopGPR<InstX8632::Neg> InstX8632Neg;
+typedef InstX8632UnaryopGPR<InstX8632::Bsf> InstX8632Bsf;
+typedef InstX8632UnaryopGPR<InstX8632::Bsr> InstX8632Bsr;
+typedef InstX8632UnaryopGPR<InstX8632::Lea> InstX8632Lea;
 // Cbwdq instruction - wrapper for cbw, cwd, and cdq
-typedef InstX8632Unaryop<InstX8632::Cbwdq> InstX8632Cbwdq;
+typedef InstX8632UnaryopGPR<InstX8632::Cbwdq> InstX8632Cbwdq;
+typedef InstX8632UnaryopXmm<InstX8632::Movd> InstX8632Movd;
+typedef InstX8632UnaryopXmm<InstX8632::Sqrtss> InstX8632Sqrtss;
 // Move/assignment instruction - wrapper for mov/movss/movsd.
 typedef InstX8632Movlike<InstX8632::Mov> InstX8632Mov;
 // Move packed - copy 128 bit values between XMM registers, or mem128
@@ -1384,6 +1409,7 @@
 template <> void InstX8632Subss::emit(const Cfg *Func) const;
 
 template <> void InstX8632Cbwdq::emitIAS(const Cfg *Func) const;
+template <> void InstX8632Movd::emitIAS(const Cfg *Func) const;
 
 } // end of namespace Ice
 
diff --git a/src/IceRegistersX8632.h b/src/IceRegistersX8632.h
index 3aa8178..cbbcfc9 100644
--- a/src/IceRegistersX8632.h
+++ b/src/IceRegistersX8632.h
@@ -16,6 +16,7 @@
 
 #include "IceDefs.h"
 #include "IceInstX8632.def"
+#include "IceTypes.h"
 
 namespace Ice {
 
@@ -83,6 +84,13 @@
   return ByteRegister(RegNum - Reg_GPR_First);
 }
 
+static inline GPRRegister getEncodedByteRegOrGPR(Type Ty, int32_t RegNum) {
+  // i1/i8 go through getEncodedByteReg; every other type uses getEncodedGPR.
+  const bool IsByte = Ty == IceType_i8 || Ty == IceType_i1;
+  return IsByte ? GPRRegister(getEncodedByteReg(RegNum))
+                : getEncodedGPR(RegNum);
+}
+
 } // end of namespace RegX8632
 
 } // end of namespace Ice
diff --git a/src/assembler_ia32.cpp b/src/assembler_ia32.cpp
index 3cf9e25..4347f46 100644
--- a/src/assembler_ia32.cpp
+++ b/src/assembler_ia32.cpp
@@ -258,8 +258,11 @@
   EmitOperand(src, dst);
 }
 
-void AssemblerX86::leal(GPRRegister dst, const Address &src) {
+void AssemblerX86::lea(Type Ty, GPRRegister dst, const Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
   EmitUint8(0x8D);
   EmitOperand(dst, src);
 }
@@ -307,7 +310,15 @@
   EmitUint8(0x66);
   EmitUint8(0x0F);
   EmitUint8(0x6E);
-  EmitOperand(dst, Operand(src));
+  EmitRegisterOperand(dst, src);
+}
+
+void AssemblerX86::movd(XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x6E);
+  EmitOperand(dst, src);
 }
 
 void AssemblerX86::movd(GPRRegister dst, XmmRegister src) {
@@ -315,7 +326,15 @@
   EmitUint8(0x66);
   EmitUint8(0x0F);
   EmitUint8(0x7E);
-  EmitOperand(src, Operand(dst));
+  EmitRegisterOperand(src, dst);
+}
+
+void AssemblerX86::movd(const Address &dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x7E);
+  EmitOperand(src, dst);
 }
 
 void AssemblerX86::movq(const Address &dst, XmmRegister src) {
@@ -1494,10 +1513,26 @@
   EmitOperand(src, Operand(dst));
 }
 
-void AssemblerX86::negl(GPRRegister reg) {
+void AssemblerX86::neg(Type Ty, GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xF7);
-  EmitOperand(3, Operand(reg));
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0xF6);
+  else
+    EmitUint8(0xF7);
+  EmitRegisterOperand(3, reg);
+}
+
+// NEG on a memory operand: opcode byte 0xF6 for i1/i8, 0xF7 otherwise; an
+// i16 operand gets the operand-size override emitted before the opcode.
+void AssemblerX86::neg(Type Ty, const Address &addr) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  const bool IsByte = Ty == IceType_i8 || Ty == IceType_i1;
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  EmitUint8(IsByte ? 0xF6 : 0xF7);
+  EmitOperand(3, addr);
+}
 
 void AssemblerX86::notl(GPRRegister reg) {
@@ -1506,13 +1541,53 @@
   EmitUint8(0xD0 | reg);
 }
 
-void AssemblerX86::bsrl(GPRRegister dst, GPRRegister src) {
+void AssemblerX86::bswap(Type Ty, GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  assert(Ty == IceType_i32);
+  EmitUint8(0x0F);
+  EmitUint8(0xC8 | reg);
+}
+
+void AssemblerX86::bsf(Type Ty, GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  EmitUint8(0x0F);
+  EmitUint8(0xBC);
+  EmitRegisterOperand(dst, src);
+}
+
+void AssemblerX86::bsf(Type Ty, GPRRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  EmitUint8(0x0F);
+  EmitUint8(0xBC);
+  EmitOperand(dst, src);
+}
+
+void AssemblerX86::bsr(Type Ty, GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
   EmitUint8(0x0F);
   EmitUint8(0xBD);
   EmitRegisterOperand(dst, src);
 }
 
+void AssemblerX86::bsr(Type Ty, GPRRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  EmitUint8(0x0F);
+  EmitUint8(0xBD);
+  EmitOperand(dst, src);
+}
+
 void AssemblerX86::bt(GPRRegister base, GPRRegister offset) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x0F);
diff --git a/src/assembler_ia32.h b/src/assembler_ia32.h
index 6192e91..b5c9a16 100644
--- a/src/assembler_ia32.h
+++ b/src/assembler_ia32.h
@@ -336,13 +336,32 @@
   static const bool kNearJump = true;
   static const bool kFarJump = false;
 
+  // Operations to emit GPR instructions (and dispatch on operand type).
+  typedef void (AssemblerX86::*TypedEmitGPR)(Type, GPRRegister);
+  typedef void (AssemblerX86::*TypedEmitAddr)(Type, const Address &);
+  struct GPREmitterOneOp {
+    TypedEmitGPR Reg;
+    TypedEmitAddr Addr;
+  };
+
+  typedef void (AssemblerX86::*TypedEmitGPRGPR)(Type, GPRRegister, GPRRegister);
+  typedef void (AssemblerX86::*TypedEmitGPRAddr)(Type, GPRRegister,
+                                                 const Address &);
+  typedef void (AssemblerX86::*TypedEmitGPRImm)(Type, GPRRegister,
+                                                const Immediate &);
+  struct GPREmitterRegOp {
+    TypedEmitGPRGPR GPRGPR;
+    TypedEmitGPRAddr GPRAddr;
+    TypedEmitGPRImm GPRImm;
+  };
+
   // Operations to emit XMM instructions (and dispatch on operand type).
   typedef void (AssemblerX86::*TypedEmitXmmXmm)(Type, XmmRegister, XmmRegister);
   typedef void (AssemblerX86::*TypedEmitXmmAddr)(Type, XmmRegister,
                                                  const Address &);
   typedef void (AssemblerX86::*TypedEmitAddrXmm)(Type, const Address &,
                                                  XmmRegister);
-  struct TypedXmmEmitters {
+  struct XmmEmitterTwoOps {
     TypedEmitXmmXmm XmmXmm;
     TypedEmitXmmAddr XmmAddr;
     TypedEmitAddrXmm AddrXmm;
@@ -393,7 +412,7 @@
   void movw(GPRRegister dst, const Address &src);
   void movw(const Address &dst, GPRRegister src);
 
-  void leal(GPRRegister dst, const Address &src);
+  void lea(Type Ty, GPRRegister dst, const Address &src);
 
   void cmov(CondX86::BrCond cond, GPRRegister dst, GPRRegister src);
 
@@ -404,7 +423,9 @@
   void movss(XmmRegister dst, XmmRegister src);
 
   void movd(XmmRegister dst, GPRRegister src);
+  void movd(XmmRegister dst, const Address &src);
   void movd(GPRRegister dst, XmmRegister src);
+  void movd(const Address &dst, XmmRegister src);
 
   void movq(const Address &dst, XmmRegister src);
   void movq(XmmRegister dst, const Address &src);
@@ -622,10 +643,16 @@
   void shrd(GPRRegister dst, GPRRegister src, const Immediate &imm);
   void shrd(const Address &dst, GPRRegister src);
 
-  void negl(GPRRegister reg);
+  void neg(Type Ty, GPRRegister reg);
+  void neg(Type Ty, const Address &addr);
   void notl(GPRRegister reg);
 
-  void bsrl(GPRRegister dst, GPRRegister src);
+  void bsf(Type Ty, GPRRegister dst, GPRRegister src);
+  void bsf(Type Ty, GPRRegister dst, const Address &src);
+  void bsr(Type Ty, GPRRegister dst, GPRRegister src);
+  void bsr(Type Ty, GPRRegister dst, const Address &src);
+
+  void bswap(Type Ty, GPRRegister reg);
 
   void bt(GPRRegister base, GPRRegister offset);
 
diff --git a/tests_lit/llvm2ice_tests/vector-ops.ll b/tests_lit/llvm2ice_tests/vector-ops.ll
index 1866e62..a5dadde 100644
--- a/tests_lit/llvm2ice_tests/vector-ops.ll
+++ b/tests_lit/llvm2ice_tests/vector-ops.ll
@@ -37,6 +37,7 @@
   %res = insertelement <4 x i32> %vec, i32 %elt, i32 0
   ret <4 x i32> %res
 ; CHECK-LABEL: insertelement_v4i32_0:
+; CHECK: movd xmm{{.*}},
 ; CHECK: movss
 
 ; SSE41-LABEL: insertelement_v4i32_0:
@@ -164,6 +165,7 @@
   ret i32 %res
 ; CHECK-LABEL: extractelement_v4i32:
 ; CHECK: pshufd
+; CHECK: movd {{.*}}, xmm
 
 ; SSE41-LABEL: extractelement_v4i32:
 ; SSE41: pextrd