Handle add, adc, etc., mfence, div, idiv, mul in the assembler.

Add a test to check that the encodings are efficient for immediates
(chooses the i8, and eax encodings when appropriate).

The .byte syntax breaks NaCl bundle straddle checking in llvm-mc,
so I had to change one of the tests, which checked that a nop appeared
(it no longer does).

This also assumes that _add(), etc. are usually done with _add(T, ...) and
then _mov(dst, T), so that the dest is always a register.

BUG=none
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/604873003
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 72d7545..0ea67b9 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -345,20 +345,20 @@
   }
   if (LastFixupLoc < StartPosition) {
     // The fixup doesn't apply to this current block.
-    for (intptr_t i = 0; i < EndPosition - StartPosition; ++i) {
-      Str << "\t.byte "
-          << static_cast<uint32_t>(Asm->LoadBuffer<uint8_t>(StartPosition + i))
-          << "\n";
+    for (intptr_t i = StartPosition; i < EndPosition; ++i) {
+      Str << "\t.byte 0x";
+      Str.write_hex(Asm->LoadBuffer<uint8_t>(i));
+      Str << "\n";
     }
     return;
   }
   const intptr_t FixupSize = 4;
   assert(LastFixupLoc + FixupSize <= EndPosition);
   // The fixup does apply to this current block.
-  for (intptr_t i = 0; i < LastFixupLoc - StartPosition; ++i) {
-    Str << "\t.byte "
-        << static_cast<uint32_t>(Asm->LoadBuffer<uint8_t>(StartPosition + i))
-        << "\n";
+  for (intptr_t i = StartPosition; i < LastFixupLoc; ++i) {
+    Str << "\t.byte 0x";
+    Str.write_hex(Asm->LoadBuffer<uint8_t>(i));
+    Str << "\n";
   }
   Str << "\t.long " << LastFixup->value()->getName();
   if (LastFixup->value()->getOffset()) {
@@ -366,8 +366,9 @@
   }
   Str << "\n";
   for (intptr_t i = LastFixupLoc + FixupSize; i < EndPosition; ++i) {
-    Str << "\t.byte " << static_cast<uint32_t>(Asm->LoadBuffer<uint8_t>(i))
-        << "\n";
+    Str << "\t.byte 0x";
+    Str.write_hex(Asm->LoadBuffer<uint8_t>(i));
+    Str << "\n";
   }
 }
 
@@ -478,19 +479,25 @@
   Str << "\n";
 }
 
-void emitIASVarTyGPR(const Cfg *Func, Type Ty, const Variable *Var,
-                     const x86::AssemblerX86::GPREmitterOneOp &Emitter) {
+void emitIASOpTyGPR(const Cfg *Func, Type Ty, const Operand *Op,
+                    const x86::AssemblerX86::GPREmitterOneOp &Emitter) {
   x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
   intptr_t StartPosition = Asm->GetPosition();
-  if (Var->hasReg()) {
-    // We cheat a little and use GPRRegister even for byte operations.
-    RegX8632::GPRRegister VarReg =
-        RegX8632::getEncodedByteRegOrGPR(Ty, Var->getRegNum());
-    (Asm->*(Emitter.Reg))(Ty, VarReg);
+  if (const Variable *Var = llvm::dyn_cast<Variable>(Op)) {
+    if (Var->hasReg()) {
+      // We cheat a little and use GPRRegister even for byte operations.
+      RegX8632::GPRRegister VarReg =
+          RegX8632::getEncodedByteRegOrGPR(Ty, Var->getRegNum());
+      (Asm->*(Emitter.Reg))(Ty, VarReg);
+    } else {
+      x86::Address StackAddr(static_cast<TargetX8632 *>(Func->getTarget())
+                                 ->stackVarToAsmOperand(Var));
+      (Asm->*(Emitter.Addr))(Ty, StackAddr);
+    }
+  } else if (const OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Op)) {
+    (Asm->*(Emitter.Addr))(Ty, Mem->toAsmAddress(Asm));
   } else {
-    x86::Address StackAddr(static_cast<TargetX8632 *>(Func->getTarget())
-                               ->stackVarToAsmOperand(Var));
-    (Asm->*(Emitter.Addr))(Ty, StackAddr);
+    llvm_unreachable("Unexpected operand type");
   }
   Ostream &Str = Func->getContext()->getStrEmit();
   emitIASBytes(Str, Asm, StartPosition);
@@ -666,6 +673,29 @@
 const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Sqrtss::Emitter = {
     &x86::AssemblerX86::sqrtss, &x86::AssemblerX86::sqrtss, NULL};
 
+// Binary GPR ops
+template <>
+const x86::AssemblerX86::GPREmitterRegOp InstX8632Add::Emitter = {
+    &x86::AssemblerX86::add, &x86::AssemblerX86::add, &x86::AssemblerX86::add};
+template <>
+const x86::AssemblerX86::GPREmitterRegOp InstX8632Adc::Emitter = {
+    &x86::AssemblerX86::adc, &x86::AssemblerX86::adc, &x86::AssemblerX86::adc};
+template <>
+const x86::AssemblerX86::GPREmitterRegOp InstX8632And::Emitter = {
+    &x86::AssemblerX86::And, &x86::AssemblerX86::And, &x86::AssemblerX86::And};
+template <>
+const x86::AssemblerX86::GPREmitterRegOp InstX8632Or::Emitter = {
+    &x86::AssemblerX86::Or, &x86::AssemblerX86::Or, &x86::AssemblerX86::Or};
+template <>
+const x86::AssemblerX86::GPREmitterRegOp InstX8632Sbb::Emitter = {
+    &x86::AssemblerX86::sbb, &x86::AssemblerX86::sbb, &x86::AssemblerX86::sbb};
+template <>
+const x86::AssemblerX86::GPREmitterRegOp InstX8632Sub::Emitter = {
+    &x86::AssemblerX86::sub, &x86::AssemblerX86::sub, &x86::AssemblerX86::sub};
+template <>
+const x86::AssemblerX86::GPREmitterRegOp InstX8632Xor::Emitter = {
+    &x86::AssemblerX86::Xor, &x86::AssemblerX86::Xor, &x86::AssemblerX86::Xor};
+
 // Binary XMM ops
 template <>
 const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Addss::Emitter = {
@@ -798,6 +828,15 @@
   Str << "\n";
 }
 
+template <> void InstX8632Div::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 3);
+  static const x86::AssemblerX86::GPREmitterOneOp Emitter = {
+      &x86::AssemblerX86::div, &x86::AssemblerX86::div};
+  // Source 1 is the only operand passed on; its type picks the encoding.
+  const Operand *Divisor = getSrc(1);
+  emitIASOpTyGPR(Func, Divisor->getType(), Divisor, Emitter);
+}
+
 template <> void InstX8632Idiv::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 3);
@@ -806,6 +845,14 @@
   Str << "\n";
 }
 
+template <> void InstX8632Idiv::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 3);
+  static const x86::AssemblerX86::GPREmitterOneOp Emitter = {
+      &x86::AssemblerX86::idiv, &x86::AssemblerX86::idiv};
+  // Source 1 is the only operand passed on; its type picks the encoding.
+  const Operand *Divisor = getSrc(1);
+  emitIASOpTyGPR(Func, Divisor->getType(), Divisor, Emitter);
+}
 
 namespace {
 
@@ -926,6 +973,18 @@
   Str << "\n";
 }
 
+void InstX8632Mul::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 2);
+  // Source 0 and the dest must both be eax; only source 1 is handed on.
+  assert(llvm::isa<Variable>(getSrc(0)));
+  assert(llvm::cast<Variable>(getSrc(0))->getRegNum() == RegX8632::Reg_eax);
+  assert(getDest()->getRegNum() == RegX8632::Reg_eax); // TODO: allow edx?
+  const Operand *Src = getSrc(1);
+  static const x86::AssemblerX86::GPREmitterOneOp Emitter = {
+      &x86::AssemblerX86::mul, &x86::AssemblerX86::mul};
+  emitIASOpTyGPR(Func, Src->getType(), Src, Emitter);
+}
+
 void InstX8632Mul::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   dumpDest(Func);
@@ -1246,6 +1305,14 @@
   Str << "\tmfence\n";
 }
 
+void InstX8632Mfence::emitIAS(const Cfg *Func) const {
+  x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
+  // Record where the instruction begins so its bytes can be emitted below.
+  const intptr_t StartPosition = Asm->GetPosition();
+  Asm->mfence();
+  Ostream &Str = Func->getContext()->getStrEmit();
+  emitIASBytes(Str, Asm, StartPosition);
+}
+
 void InstX8632Mfence::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   Str << "mfence\n";
@@ -1661,7 +1728,7 @@
   Ostream &Str = Func->getContext()->getStrEmit();
   x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
   intptr_t StartPosition = Asm->GetPosition();
-  Asm->subl(RegX8632::Encoded_Reg_esp, x86::Immediate(Amount));
+  Asm->sub(IceType_i32, RegX8632::Encoded_Reg_esp, x86::Immediate(Amount));
   emitIASBytes(Str, Asm, StartPosition);
   Func->getTarget()->updateStackAdjustment(Amount);
 }
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 5d25dee..0f1a336 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -428,8 +428,8 @@
 };
 
 // Emit a one-operand (GPR) instruction.
-void emitIASVarTyGPR(const Cfg *Func, Type Ty, const Variable *Var,
-                     const x86::AssemblerX86::GPREmitterOneOp &Emitter);
+void emitIASOpTyGPR(const Cfg *Func, Type Ty, const Operand *Op,
+                    const x86::AssemblerX86::GPREmitterOneOp &Emitter);
 
 // Instructions of the form x := op(x).
 template <InstX8632::InstKindX8632 K>
@@ -450,7 +450,7 @@
     assert(getSrcSize() == 1);
     const Variable *Var = getDest();
     Type Ty = Var->getType();
-    emitIASVarTyGPR(Func, Ty, Var, Emitter);
+    emitIASOpTyGPR(Func, Ty, Var, Emitter);
   }
   void dump(const Cfg *Func) const override {
     Ostream &Str = Func->getContext()->getStrDump();
@@ -578,7 +578,7 @@
 template <InstX8632::InstKindX8632 K, bool ShiftHack = false>
 class InstX8632Binop : public InstX8632 {
 public:
-  // Create an ordinary binary-op instruction like add or sub.
+  // Create a binary-op instruction like shifts.
   static InstX8632Binop *create(Cfg *Func, Variable *Dest, Operand *Source) {
     return new (Func->allocate<InstX8632Binop>())
         InstX8632Binop(Func, Dest, Source);
@@ -606,6 +606,44 @@
   static const char *Opcode;
 };
 
+template <InstX8632::InstKindX8632 K>
+class InstX8632BinopGPR : public InstX8632 {
+public:
+  // Create an ordinary binary-op instruction like add or sub.
+  static InstX8632BinopGPR *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX8632BinopGPR>())
+        InstX8632BinopGPR(Func, Dest, Source);
+  }
+  void emit(const Cfg *Func) const override {
+    const bool ShiftHack = false;
+    emitTwoAddress(Opcode, this, Func, ShiftHack);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    Type Ty = getDest()->getType();
+    assert(getSrcSize() == 2);
+    emitIASRegOpTyGPR(Func, Ty, getDest(), getSrc(1), Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    Ostream &Str = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Str << " = " << Opcode << "." << getDest()->getType() << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+  InstX8632BinopGPR(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX8632(Func, K, 2, Dest) {
+    addSource(Dest);
+    addSource(Source);
+  }
+  InstX8632BinopGPR(const InstX8632BinopGPR &) LLVM_DELETED_FUNCTION;
+  InstX8632BinopGPR &operator=(const InstX8632BinopGPR &) LLVM_DELETED_FUNCTION;
+  ~InstX8632BinopGPR() override {}
+  static const char *Opcode;
+  static const x86::AssemblerX86::GPREmitterRegOp Emitter;
+};
+
 template <InstX8632::InstKindX8632 K, bool NeedsElementType>
 class InstX8632BinopXmm : public InstX8632 {
 public:
@@ -665,6 +703,7 @@
     getSrc(2)->emit(Func);
     Str << "\n";
   }
+  void emitIAS(const Cfg *Func) const override { emit(Func); }
   void dump(const Cfg *Func) const override {
     Ostream &Str = Func->getContext()->getStrDump();
     dumpDest(Func);
@@ -781,22 +820,22 @@
 typedef InstX8632Movlike<InstX8632::Movp> InstX8632Movp;
 // Movq - copy between XMM registers, or mem64 and XMM registers.
 typedef InstX8632Movlike<InstX8632::Movq> InstX8632Movq;
-typedef InstX8632Binop<InstX8632::Add> InstX8632Add;
+typedef InstX8632BinopGPR<InstX8632::Add> InstX8632Add;
 typedef InstX8632BinopXmm<InstX8632::Addps, true> InstX8632Addps;
-typedef InstX8632Binop<InstX8632::Adc> InstX8632Adc;
+typedef InstX8632BinopGPR<InstX8632::Adc> InstX8632Adc;
 typedef InstX8632BinopXmm<InstX8632::Addss, false> InstX8632Addss;
 typedef InstX8632BinopXmm<InstX8632::Padd, true> InstX8632Padd;
-typedef InstX8632Binop<InstX8632::Sub> InstX8632Sub;
+typedef InstX8632BinopGPR<InstX8632::Sub> InstX8632Sub;
 typedef InstX8632BinopXmm<InstX8632::Subps, true> InstX8632Subps;
 typedef InstX8632BinopXmm<InstX8632::Subss, false> InstX8632Subss;
-typedef InstX8632Binop<InstX8632::Sbb> InstX8632Sbb;
+typedef InstX8632BinopGPR<InstX8632::Sbb> InstX8632Sbb;
 typedef InstX8632BinopXmm<InstX8632::Psub, true> InstX8632Psub;
-typedef InstX8632Binop<InstX8632::And> InstX8632And;
+typedef InstX8632BinopGPR<InstX8632::And> InstX8632And;
 typedef InstX8632BinopXmm<InstX8632::Pand, false> InstX8632Pand;
 typedef InstX8632BinopXmm<InstX8632::Pandn, false> InstX8632Pandn;
-typedef InstX8632Binop<InstX8632::Or> InstX8632Or;
+typedef InstX8632BinopGPR<InstX8632::Or> InstX8632Or;
 typedef InstX8632BinopXmm<InstX8632::Por, false> InstX8632Por;
-typedef InstX8632Binop<InstX8632::Xor> InstX8632Xor;
+typedef InstX8632BinopGPR<InstX8632::Xor> InstX8632Xor;
 typedef InstX8632BinopXmm<InstX8632::Pxor, false> InstX8632Pxor;
 typedef InstX8632Binop<InstX8632::Imul> InstX8632Imul;
 typedef InstX8632BinopXmm<InstX8632::Mulps, true> InstX8632Mulps;
@@ -858,6 +897,7 @@
         InstX8632Mul(Func, Dest, Source1, Source2);
   }
   void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
   void dump(const Cfg *Func) const override;
   static bool classof(const Inst *Inst) { return isClassof(Inst, Mul); }
 
@@ -1112,6 +1152,7 @@
     return new (Func->allocate<InstX8632Mfence>()) InstX8632Mfence(Func);
   }
   void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
   void dump(const Cfg *Func) const override;
   static bool classof(const Inst *Inst) { return isClassof(Inst, Mfence); }
 
@@ -1407,6 +1448,8 @@
 template <> void InstX8632Sqrtss::emit(const Cfg *Func) const;
 template <> void InstX8632Subss::emit(const Cfg *Func) const;
 
+template <> void InstX8632Div::emitIAS(const Cfg *Func) const;
+template <> void InstX8632Idiv::emitIAS(const Cfg *Func) const;
 template <> void InstX8632Cbwdq::emitIAS(const Cfg *Func) const;
 template <> void InstX8632Movd::emitIAS(const Cfg *Func) const;
 
diff --git a/src/assembler_ia32.cpp b/src/assembler_ia32.cpp
index 4347f46..34e7a42 100644
--- a/src/assembler_ia32.cpp
+++ b/src/assembler_ia32.cpp
@@ -1146,18 +1146,6 @@
   EmitOperand(reg, address);
 }
 
-void AssemblerX86::addl(GPRRegister dst, GPRRegister src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x03);
-  EmitRegisterOperand(dst, src);
-}
-
-void AssemblerX86::addl(GPRRegister reg, const Address &address) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x03);
-  EmitOperand(reg, address);
-}
-
 void AssemblerX86::cmpl(const Address &address, GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x39);
@@ -1207,117 +1195,235 @@
   }
 }
 
-void AssemblerX86::andl(GPRRegister dst, GPRRegister src) {
+void AssemblerX86::And(Type Ty, GPRRegister dst, GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x23);
-  EmitOperand(dst, Operand(src));
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0x22);
+  else
+    EmitUint8(0x23);
+  EmitRegisterOperand(dst, src);
 }
 
-void AssemblerX86::andl(GPRRegister dst, const Immediate &imm) {
+void AssemblerX86::And(Type Ty, GPRRegister dst, const Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0x22);
+  else
+    EmitUint8(0x23);
+  EmitOperand(dst, address);
+}
+
+void AssemblerX86::And(Type Ty, GPRRegister dst, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_i8 || Ty == IceType_i1) {
+    EmitComplexI8(4, Operand(dst), imm);
+    return;
+  }
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
   EmitComplex(4, Operand(dst), imm);
 }
 
-void AssemblerX86::andl(GPRRegister dst, const Address &address) {
+void AssemblerX86::Or(Type Ty, GPRRegister dst, GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x23);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0x0A);
+  else
+    EmitUint8(0x0B);
+  EmitRegisterOperand(dst, src);
+}
+
+void AssemblerX86::Or(Type Ty, GPRRegister dst, const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0x0A);
+  else
+    EmitUint8(0x0B);
   EmitOperand(dst, address);
 }
 
-void AssemblerX86::orl(GPRRegister dst, GPRRegister src) {
+void AssemblerX86::Or(Type Ty, GPRRegister dst, const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x0B);
-  EmitOperand(dst, Operand(src));
-}
-
-void AssemblerX86::orl(GPRRegister dst, const Immediate &imm) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_i8 || Ty == IceType_i1) {
+    EmitComplexI8(1, Operand(dst), imm);
+    return;
+  }
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
   EmitComplex(1, Operand(dst), imm);
 }
 
-void AssemblerX86::orl(GPRRegister dst, const Address &address) {
+void AssemblerX86::Xor(Type Ty, GPRRegister dst, GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x0B);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0x32);
+  else
+    EmitUint8(0x33);
+  EmitRegisterOperand(dst, src);
+}
+
+void AssemblerX86::Xor(Type Ty, GPRRegister dst, const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0x32);
+  else
+    EmitUint8(0x33);
   EmitOperand(dst, address);
 }
 
-void AssemblerX86::xorl(GPRRegister dst, GPRRegister src) {
+void AssemblerX86::Xor(Type Ty, GPRRegister dst, const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x33);
-  EmitOperand(dst, Operand(src));
-}
-
-void AssemblerX86::xorl(GPRRegister dst, const Immediate &imm) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_i8 || Ty == IceType_i1) {
+    EmitComplexI8(6, Operand(dst), imm);
+    return;
+  }
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
   EmitComplex(6, Operand(dst), imm);
 }
 
-void AssemblerX86::xorl(GPRRegister dst, const Address &address) {
+void AssemblerX86::add(Type Ty, GPRRegister dst, GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x33);
-  EmitOperand(dst, address);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0x02);
+  else
+    EmitUint8(0x03);
+  EmitRegisterOperand(dst, src);
 }
 
-void AssemblerX86::addl(GPRRegister reg, const Immediate &imm) {
+void AssemblerX86::add(Type Ty, GPRRegister reg, const Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0x02);
+  else
+    EmitUint8(0x03);
+  EmitOperand(reg, address);
+}
+
+void AssemblerX86::add(Type Ty, GPRRegister reg, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_i8 || Ty == IceType_i1) {
+    EmitComplexI8(0, Operand(reg), imm);
+    return;
+  }
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
   EmitComplex(0, Operand(reg), imm);
 }
 
-void AssemblerX86::addl(const Address &address, GPRRegister reg) {
+void AssemblerX86::adc(Type Ty, GPRRegister dst, GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x01);
-  EmitOperand(reg, address);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0x12);
+  else
+    EmitUint8(0x13);
+  EmitRegisterOperand(dst, src);
 }
 
-void AssemblerX86::addl(const Address &address, const Immediate &imm) {
+void AssemblerX86::adc(Type Ty, GPRRegister dst, const Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitComplex(0, address, imm);
-}
-
-void AssemblerX86::adcl(GPRRegister reg, const Immediate &imm) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitComplex(2, Operand(reg), imm);
-}
-
-void AssemblerX86::adcl(GPRRegister dst, GPRRegister src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x13);
-  EmitOperand(dst, Operand(src));
-}
-
-void AssemblerX86::adcl(GPRRegister dst, const Address &address) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x13);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0x12);
+  else
+    EmitUint8(0x13);
   EmitOperand(dst, address);
 }
 
-void AssemblerX86::adcl(const Address &address, GPRRegister reg) {
+void AssemblerX86::adc(Type Ty, GPRRegister reg, const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x11);
+  if (Ty == IceType_i8 || Ty == IceType_i1) {
+    EmitComplexI8(2, Operand(reg), imm);
+    return;
+  }
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  EmitComplex(2, Operand(reg), imm);
+}
+
+void AssemblerX86::sub(Type Ty, GPRRegister dst, GPRRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0x2A);
+  else
+    EmitUint8(0x2B);
+  EmitRegisterOperand(dst, src);
+}
+
+void AssemblerX86::sub(Type Ty, GPRRegister reg, const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0x2A);
+  else
+    EmitUint8(0x2B);
   EmitOperand(reg, address);
 }
 
-void AssemblerX86::subl(GPRRegister dst, GPRRegister src) {
+void AssemblerX86::sub(Type Ty, GPRRegister reg, const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x2B);
-  EmitOperand(dst, Operand(src));
-}
-
-void AssemblerX86::subl(GPRRegister reg, const Immediate &imm) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_i8 || Ty == IceType_i1) {
+    EmitComplexI8(5, Operand(reg), imm);
+    return;
+  }
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
   EmitComplex(5, Operand(reg), imm);
 }
 
-void AssemblerX86::subl(GPRRegister reg, const Address &address) {
+void AssemblerX86::sbb(Type Ty, GPRRegister dst, GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x2B);
-  EmitOperand(reg, address);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0x1A);
+  else
+    EmitUint8(0x1B);
+  EmitRegisterOperand(dst, src);
 }
 
-void AssemblerX86::subl(const Address &address, GPRRegister reg) {
+void AssemblerX86::sbb(Type Ty, GPRRegister dst, const Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x29);
-  EmitOperand(reg, address);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0x1A);
+  else
+    EmitUint8(0x1B);
+  EmitOperand(dst, address);
+}
+
+void AssemblerX86::sbb(Type Ty, GPRRegister reg, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_i8 || Ty == IceType_i1) {
+    EmitComplexI8(3, Operand(reg), imm);
+    return;
+  }
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  EmitComplex(3, Operand(reg), imm);
 }
 
 void AssemblerX86::cbw() {
@@ -1337,10 +1443,48 @@
   EmitUint8(0x99);
 }
 
-void AssemblerX86::idivl(GPRRegister reg) {
+void AssemblerX86::div(Type Ty, GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xF7);
-  EmitUint8(0xF8 | reg);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0xF6);
+  else
+    EmitUint8(0xF7);
+  EmitRegisterOperand(6, reg);
+}
+
+void AssemblerX86::div(Type Ty, const Address &addr) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0xF6);
+  else
+    EmitUint8(0xF7);
+  EmitOperand(6, addr);
+}
+
+void AssemblerX86::idiv(Type Ty, GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0xF6);
+  else
+    EmitUint8(0xF7);
+  EmitRegisterOperand(7, reg);
+}
+
+void AssemblerX86::idiv(Type Ty, const Address &addr) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0xF6);
+  else
+    EmitUint8(0xF7);
+  EmitOperand(7, addr);
 }
 
 void AssemblerX86::imull(GPRRegister dst, GPRRegister src) {
@@ -1376,41 +1520,28 @@
   EmitOperand(5, address);
 }
 
-void AssemblerX86::mull(GPRRegister reg) {
+void AssemblerX86::mul(Type Ty, GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xF7);
-  EmitOperand(4, Operand(reg));
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0xF6);
+  else
+    EmitUint8(0xF7);
+  EmitRegisterOperand(4, reg);
 }
 
-void AssemblerX86::mull(const Address &address) {
+void AssemblerX86::mul(Type Ty, const Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xF7);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (Ty == IceType_i8 || Ty == IceType_i1)
+    EmitUint8(0xF6);
+  else
+    EmitUint8(0xF7);
   EmitOperand(4, address);
 }
 
-void AssemblerX86::sbbl(GPRRegister dst, GPRRegister src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x1B);
-  EmitOperand(dst, Operand(src));
-}
-
-void AssemblerX86::sbbl(GPRRegister reg, const Immediate &imm) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitComplex(3, Operand(reg), imm);
-}
-
-void AssemblerX86::sbbl(GPRRegister dst, const Address &address) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x1B);
-  EmitOperand(dst, address);
-}
-
-void AssemblerX86::sbbl(const Address &address, GPRRegister dst) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x19);
-  EmitOperand(dst, address);
-}
-
 void AssemblerX86::incl(GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x40 + reg);
@@ -1750,6 +1881,13 @@
   EmitInt32(-4);
 }
 
+void AssemblerX86::mfence() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F); // MFENCE is the fixed three-byte sequence 0F AE F0.
+  EmitUint8(0xAE);
+  EmitUint8(0xF0);
+}
+
 void AssemblerX86::lock() {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF0);
diff --git a/src/assembler_ia32.h b/src/assembler_ia32.h
index 2dc0c12..83c0b66 100644
--- a/src/assembler_ia32.h
+++ b/src/assembler_ia32.h
@@ -571,40 +571,43 @@
   void testl(GPRRegister reg1, GPRRegister reg2);
   void testl(GPRRegister reg, const Immediate &imm);
 
-  void andl(GPRRegister dst, const Immediate &imm);
-  void andl(GPRRegister dst, GPRRegister src);
-  void andl(GPRRegister dst, const Address &address);
+  void And(Type Ty, GPRRegister dst, GPRRegister src);
+  void And(Type Ty, GPRRegister dst, const Address &address);
+  void And(Type Ty, GPRRegister dst, const Immediate &imm);
 
-  void orl(GPRRegister dst, const Immediate &imm);
-  void orl(GPRRegister dst, GPRRegister src);
-  void orl(GPRRegister dst, const Address &address);
+  void Or(Type Ty, GPRRegister dst, GPRRegister src);
+  void Or(Type Ty, GPRRegister dst, const Address &address);
+  void Or(Type Ty, GPRRegister dst, const Immediate &imm);
 
-  void xorl(GPRRegister dst, const Immediate &imm);
-  void xorl(GPRRegister dst, GPRRegister src);
-  void xorl(GPRRegister dst, const Address &address);
+  void Xor(Type Ty, GPRRegister dst, GPRRegister src);
+  void Xor(Type Ty, GPRRegister dst, const Address &address);
+  void Xor(Type Ty, GPRRegister dst, const Immediate &imm);
 
-  void addl(GPRRegister dst, GPRRegister src);
-  void addl(GPRRegister reg, const Immediate &imm);
-  void addl(GPRRegister reg, const Address &address);
+  void add(Type Ty, GPRRegister dst, GPRRegister src);
+  void add(Type Ty, GPRRegister reg, const Address &address);
+  void add(Type Ty, GPRRegister reg, const Immediate &imm);
 
-  void addl(const Address &address, GPRRegister reg);
-  void addl(const Address &address, const Immediate &imm);
+  void adc(Type Ty, GPRRegister dst, GPRRegister src);
+  void adc(Type Ty, GPRRegister dst, const Address &address);
+  void adc(Type Ty, GPRRegister reg, const Immediate &imm);
 
-  void adcl(GPRRegister dst, GPRRegister src);
-  void adcl(GPRRegister reg, const Immediate &imm);
-  void adcl(GPRRegister dst, const Address &address);
-  void adcl(const Address &dst, GPRRegister src);
+  void sub(Type Ty, GPRRegister dst, GPRRegister src);
+  void sub(Type Ty, GPRRegister reg, const Address &address);
+  void sub(Type Ty, GPRRegister reg, const Immediate &imm);
 
-  void subl(GPRRegister dst, GPRRegister src);
-  void subl(GPRRegister reg, const Immediate &imm);
-  void subl(GPRRegister reg, const Address &address);
-  void subl(const Address &address, GPRRegister reg);
+  void sbb(Type Ty, GPRRegister dst, GPRRegister src);
+  void sbb(Type Ty, GPRRegister reg, const Address &address);
+  void sbb(Type Ty, GPRRegister reg, const Immediate &imm);
 
   void cbw();
   void cwd();
   void cdq();
 
-  void idivl(GPRRegister reg);
+  void div(Type Ty, GPRRegister reg);
+  void div(Type Ty, const Address &address);
+
+  void idiv(Type Ty, GPRRegister reg);
+  void idiv(Type Ty, const Address &address);
 
   void imull(GPRRegister dst, GPRRegister src);
   void imull(GPRRegister reg, const Immediate &imm);
@@ -613,13 +616,8 @@
   void imull(GPRRegister reg);
   void imull(const Address &address);
 
-  void mull(GPRRegister reg);
-  void mull(const Address &address);
-
-  void sbbl(GPRRegister dst, GPRRegister src);
-  void sbbl(GPRRegister reg, const Immediate &imm);
-  void sbbl(GPRRegister reg, const Address &address);
-  void sbbl(const Address &address, GPRRegister reg);
+  void mul(Type Ty, GPRRegister reg);
+  void mul(Type Ty, const Address &address);
 
   void incl(GPRRegister reg);
   void incl(const Address &address);
@@ -670,6 +668,8 @@
   void jmp(Label *label, bool near = kFarJump);
   void jmp(const ConstantRelocatable *label);
 
+  void mfence();
+
   void lock();
   void cmpxchg(Type Ty, const Address &address, GPRRegister reg);
   void cmpxchg8b(const Address &address);
diff --git a/tests_lit/assembler/x86/immediate_encodings.ll b/tests_lit/assembler/x86/immediate_encodings.ll
new file mode 100644
index 0000000..2a9fcb4
--- /dev/null
+++ b/tests_lit/assembler/x86/immediate_encodings.ll
@@ -0,0 +1,114 @@
+; Tests various aspects of x86 immediate encoding. Some encodings are shorter.
+; For example, the encoding is shorter for 8-bit immediates or when using EAX.
+; This assumes that EAX is chosen as the first free register in O2 mode.
+
+; RUN: %p2i -i %s --args -O2 --verbose none \
+; RUN:   | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
+; RUN:   | llvm-objdump -d --symbolize -x86-asm-syntax=intel - | FileCheck %s
+; RUN: %p2i -i %s --args --verbose none | FileCheck --check-prefix=ERRORS %s
+; RUN: %p2i -i %s --insts | %szdiff %s | FileCheck --check-prefix=DUMP %s
+
+define internal i32 @testXor8Imm8(i32 %arg) {
+entry:
+  %arg_i8 = trunc i32 %arg to i8
+  %result_i8 = xor i8 %arg_i8, 127
+  %result = zext i8 %result_i8 to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testXor8Imm8
+; CHECK: 34 7f   xor al, 127
+
+define internal i32 @testXor8Imm8Neg(i32 %arg) {
+entry:
+  %arg_i8 = trunc i32 %arg to i8
+  %result_i8 = xor i8 %arg_i8, -128
+  %result = zext i8 %result_i8 to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testXor8Imm8Neg
+; CHECK: 34 80   xor al, -128
+
+define internal i32 @testXor8Imm8NotEAX(i32 %arg, i32 %arg2, i32 %arg3) {
+entry:
+  %arg_i8 = trunc i32 %arg to i8
+  %arg2_i8 = trunc i32 %arg2 to i8
+  %arg3_i8 = trunc i32 %arg3 to i8
+  %x1 = xor i8 %arg_i8, 127
+  %x2 = xor i8 %arg2_i8, 127
+  %x3 = xor i8 %arg3_i8, 127
+  %x4 = add i8 %x1, %x2
+  %x5 = add i8 %x4, %x3
+  %result = zext i8 %x5 to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testXor8Imm8NotEAX
+; CHECK: 80 f{{[1-3]}} 7f xor {{[^a]}}l, 127
+
+define internal i32 @testXor32Imm8(i32 %arg) {
+entry:
+  %result = xor i32 %arg, 127
+  ret i32 %result
+}
+; CHECK-LABEL: testXor32Imm8
+; CHECK: 83 f0 7f   xor eax, 127
+
+define internal i32 @testXor32Imm8Neg(i32 %arg) {
+entry:
+  %result = xor i32 %arg, -128
+  ret i32 %result
+}
+; CHECK-LABEL: testXor32Imm8Neg
+; CHECK: 83 f0 80   xor eax, -128
+
+define internal i32 @testXor32Imm32Eax(i32 %arg) {
+entry:
+  %result = xor i32 %arg, 16777216
+  ret i32 %result
+}
+; CHECK-LABEL: testXor32Imm32Eax
+; CHECK: 35 00 00 00 01   xor eax, 16777216
+
+define internal i32 @testXor32Imm32NegEax(i32 %arg) {
+entry:
+  %result = xor i32 %arg, -256
+  ret i32 %result
+}
+; CHECK-LABEL: testXor32Imm32NegEax
+; CHECK: 35 00 ff ff ff   xor eax, 4294967040
+
+define internal i32 @testXor32Imm32NotEAX(i32 %arg, i32 %arg2, i32 %arg3) {
+entry:
+  %x = xor i32 %arg, 32767
+  %x2 = xor i32 %arg2, 32767
+  %x3 = xor i32 %arg3, 32767
+  %add1 = add i32 %x, %x2
+  %add2 = add i32 %add1, %x3
+  ret i32 %add2
+}
+; CHECK-LABEL: testXor32Imm32NotEAX
+; CHECK: 81 f{{[1-3]}} ff 7f 00 00   xor e{{[^a]}}x, 32767
+
+; Should be similar for add, sub, etc., so sample a few.
+
+define internal i32 @testAdd8Imm8(i32 %arg) {
+entry:
+  %arg_i8 = trunc i32 %arg to i8
+  %result_i8 = add i8 %arg_i8, 126
+  %result = zext i8 %result_i8 to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testAdd8Imm8
+; CHECK: 04 7e   add al, 126
+
+define internal i32 @testSub8Imm8(i32 %arg) {
+entry:
+  %arg_i8 = trunc i32 %arg to i8
+  %result_i8 = sub i8 %arg_i8, 125
+  %result = zext i8 %result_i8 to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testSub8Imm8
+; CHECK: 2c 7d  sub al, 125
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ
diff --git a/tests_lit/llvm2ice_tests/simple-loop.ll b/tests_lit/llvm2ice_tests/simple-loop.ll
index f40a9af..02b78ab 100644
--- a/tests_lit/llvm2ice_tests/simple-loop.ll
+++ b/tests_lit/llvm2ice_tests/simple-loop.ll
@@ -36,8 +36,6 @@
 ; CHECK:      mov ecx, dword ptr [esp{{.*}}+{{.*}}{{[0-9]+}}]
 ; CHECK:      cmp ecx, 0
 ; CHECK-NEXT: jle {{[0-9]}}
-; NaCl bundle padding
-; CHECK-NEXT: nop
 
 ; TODO: the mov from ebx to esi seems redundant here - so this may need to be
 ; modified later
diff --git a/tests_lit/llvm2ice_tests/test_i1.ll b/tests_lit/llvm2ice_tests/test_i1.ll
index b0b654e..697f0d1 100644
--- a/tests_lit/llvm2ice_tests/test_i1.ll
+++ b/tests_lit/llvm2ice_tests/test_i1.ll
@@ -9,6 +9,28 @@
 ; RUN: %p2i -i %s -a --verbose none | FileCheck --check-prefix=ERRORS %s
 ; RUN: %p2i -i %s --insts | %szdiff %s | FileCheck --check-prefix=DUMP %s
 
+; Test that and with true uses immediate 1, not -1.
+define internal i32 @testAndTrue(i32 %arg) {
+entry:
+  %arg_i1 = trunc i32 %arg to i1
+  %result_i1 = and i1 %arg_i1, true
+  %result = zext i1 %result_i1 to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testAndTrue
+; CHECK: and {{.*}}, 1
+
+; Test that or with true uses immediate 1, not -1.
+define internal i32 @testOrTrue(i32 %arg) {
+entry:
+  %arg_i1 = trunc i32 %arg to i1
+  %result_i1 = or i1 %arg_i1, true
+  %result = zext i1 %result_i1 to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testOrTrue
+; CHECK: or {{.*}}, 1
+
 ; Test that xor with true uses immediate 1, not -1.
 define internal i32 @testXorTrue(i32 %arg) {
 entry: