Handle imul, pcmpeq, pcmpgt.

Legalize the immediate operand of 8-bit imul into a register or memory
operand, since the 8-bit encoding only has the r/m form.
Add a test for that, and cover a few other i8 ops as well.

There is a one-byte-shorter form when Dest/Src0 == EAX and Src1 is not
an immediate, but this change does not take advantage of it.

Also add the shorter 8-bit-immediate encoding for i16/i32 multiplies
(i8 has no immediate form at all). Such multiplies show up sometimes in
spec, e.g., to multiply by 10. There are also many multiplies by 4,
which we could strength-reduce instead.

BUG=none
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/617593002
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 0a2d034..ab32335 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -720,6 +720,12 @@
 const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Pandn::Emitter = {
     &x86::AssemblerX86::pandn, &x86::AssemblerX86::pandn, NULL};
 template <>
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Pcmpeq::Emitter = {
+    &x86::AssemblerX86::pcmpeq, &x86::AssemblerX86::pcmpeq, NULL};
+template <>
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Pcmpgt::Emitter = {
+    &x86::AssemblerX86::pcmpgt, &x86::AssemblerX86::pcmpgt, NULL};
+template <>
 const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Pmuludq::Emitter = {
     &x86::AssemblerX86::pmuludq, &x86::AssemblerX86::pmuludq, NULL};
 template <>
@@ -904,6 +910,30 @@
   }
 }
 
+template <> void InstX8632Imul::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 2); // Exactly two sources: Src(0) and Src(1).
+  const Variable *Var = getDest();
+  Type Ty = Var->getType(); // The dest type picks the encoding below.
+  const Operand *Src = getSrc(1);
+  if (isByteSizedArithType(Ty)) {
+    // The 8-bit imul has only the one-operand form "imul r/m8" (Src0 in eax).
+    Variable *Src0 = llvm::dyn_cast<Variable>(getSrc(0));
+    (void)Src0; // Src0 is only read by the assert on the next line.
+    assert(Src0 && Src0->getRegNum() == RegX8632::Reg_eax);
+    const x86::AssemblerX86::GPREmitterOneOp Emitter = {
+        &x86::AssemblerX86::imul, &x86::AssemblerX86::imul};
+    emitIASOpTyGPR(Func, Ty, getSrc(1), Emitter); // Only Src(1) is passed on.
+  } else {
+    // We only use imul as a two-address instruction (Dest == Src(0)), even
+    // though there is a 3-operand version when one operand is a constant.
+    assert(Var == getSrc(0));
+    const x86::AssemblerX86::GPREmitterRegOp Emitter = {
+        &x86::AssemblerX86::imul, &x86::AssemblerX86::imul,
+        &x86::AssemblerX86::imul};
+    emitIASRegOpTyGPR(Func, Ty, Var, Src, Emitter);
+  }
+}
+
 template <> void InstX8632Cbwdq::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 1);
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 0f1a336..9facb2b 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -837,7 +837,7 @@
 typedef InstX8632BinopXmm<InstX8632::Por, false> InstX8632Por;
 typedef InstX8632BinopGPR<InstX8632::Xor> InstX8632Xor;
 typedef InstX8632BinopXmm<InstX8632::Pxor, false> InstX8632Pxor;
-typedef InstX8632Binop<InstX8632::Imul> InstX8632Imul;
+typedef InstX8632BinopGPR<InstX8632::Imul> InstX8632Imul;
 typedef InstX8632BinopXmm<InstX8632::Mulps, true> InstX8632Mulps;
 typedef InstX8632BinopXmm<InstX8632::Mulss, false> InstX8632Mulss;
 typedef InstX8632Binop<InstX8632::Pmull> InstX8632Pmull;
@@ -850,8 +850,8 @@
 typedef InstX8632Binop<InstX8632::Shr, true> InstX8632Shr;
 typedef InstX8632Binop<InstX8632::Sar, true> InstX8632Sar;
 typedef InstX8632Binop<InstX8632::Psra> InstX8632Psra;
-typedef InstX8632Binop<InstX8632::Pcmpeq> InstX8632Pcmpeq;
-typedef InstX8632Binop<InstX8632::Pcmpgt> InstX8632Pcmpgt;
+typedef InstX8632BinopXmm<InstX8632::Pcmpeq, true> InstX8632Pcmpeq;
+typedef InstX8632BinopXmm<InstX8632::Pcmpgt, true> InstX8632Pcmpgt;
 // TODO: movss is only a binary operation when the source and dest
 // operands are both registers.  In other cases, it behaves like a copy
 // (mov-like) operation.  Eventually, InstX8632Movss should assert that
@@ -1450,6 +1450,7 @@
 
 template <> void InstX8632Div::emitIAS(const Cfg *Func) const;
 template <> void InstX8632Idiv::emitIAS(const Cfg *Func) const;
+template <> void InstX8632Imul::emitIAS(const Cfg *Func) const;
 template <> void InstX8632Cbwdq::emitIAS(const Cfg *Func) const;
 template <> void InstX8632Movd::emitIAS(const Cfg *Func) const;
 
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 7a7fb12..96cc57b 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -1581,10 +1581,12 @@
       //
       // The 8-bit version of imul only allows the form "imul r/m8"
       // where T must be in eax.
-      if (isByteSizedArithType(Dest->getType()))
+      if (isByteSizedArithType(Dest->getType())) {
         _mov(T, Src0, RegX8632::Reg_eax);
-      else
+        Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+      } else {
         _mov(T, Src0);
+      }
       _imul(T, Src1);
       _mov(Dest, T);
       break;
diff --git a/src/assembler_ia32.cpp b/src/assembler_ia32.cpp
index 7791709..3dce9b7 100644
--- a/src/assembler_ia32.cpp
+++ b/src/assembler_ia32.cpp
@@ -1056,15 +1056,62 @@
   EmitXmmRegisterOperand(dst, src);
 }
 
-void AssemblerX86::pcmpeqq(XmmRegister dst, XmmRegister src) {
+void AssemblerX86::pcmpeq(Type Ty, XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
   EmitUint8(0x0F);
-  EmitUint8(0x38);
-  EmitUint8(0x29);
+  if (isByteSizedArithType(Ty)) { // Ty selects the final opcode byte.
+    EmitUint8(0x74); // PCMPEQB: packed byte equality.
+  } else if (Ty == IceType_i16) {
+    EmitUint8(0x75); // PCMPEQW: packed 16-bit equality.
+  } else {
+    EmitUint8(0x76); // PCMPEQD; any other Ty also lands here (no assert).
+  }
   EmitXmmRegisterOperand(dst, src);
 }
 
+void AssemblerX86::pcmpeq(Type Ty, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66); // Same 66 0F prefix bytes as the register-source overload.
+  EmitUint8(0x0F);
+  if (isByteSizedArithType(Ty)) { // Ty selects the final opcode byte.
+    EmitUint8(0x74); // PCMPEQB: packed byte equality.
+  } else if (Ty == IceType_i16) {
+    EmitUint8(0x75); // PCMPEQW: packed 16-bit equality.
+  } else {
+    EmitUint8(0x76); // PCMPEQD; any other Ty also lands here (no assert).
+  }
+  EmitOperand(dst, src); // Memory-source form: src is an Address.
+}
+
+void AssemblerX86::pcmpgt(Type Ty, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66); // 66 0F prefix bytes, followed by a Ty-dependent opcode.
+  EmitUint8(0x0F);
+  if (isByteSizedArithType(Ty)) { // Ty selects the final opcode byte.
+    EmitUint8(0x64); // PCMPGTB: packed signed byte greater-than.
+  } else if (Ty == IceType_i16) {
+    EmitUint8(0x65); // PCMPGTW: packed signed 16-bit greater-than.
+  } else {
+    EmitUint8(0x66); // PCMPGTD; any other Ty also lands here (no assert).
+  }
+  EmitXmmRegisterOperand(dst, src); // Register-source form.
+}
+
+void AssemblerX86::pcmpgt(Type Ty, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66); // Same 66 0F prefix bytes as the register-source overload.
+  EmitUint8(0x0F);
+  if (isByteSizedArithType(Ty)) { // Ty selects the final opcode byte.
+    EmitUint8(0x64); // PCMPGTB: packed signed byte greater-than.
+  } else if (Ty == IceType_i16) {
+    EmitUint8(0x65); // PCMPGTW: packed signed 16-bit greater-than.
+  } else {
+    EmitUint8(0x66); // PCMPGTD; any other Ty also lands here (no assert).
+  }
+  EmitOperand(dst, src); // Memory-source form: src is an Address.
+}
+
 void AssemblerX86::roundsd(XmmRegister dst, XmmRegister src,
                            RoundingMode mode) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1489,36 +1536,61 @@
   EmitOperand(7, addr);
 }
 
-void AssemblerX86::imull(GPRRegister dst, GPRRegister src) {
+void AssemblerX86::imul(Type Ty, GPRRegister dst, GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  assert(Ty == IceType_i16 || Ty == IceType_i32); // i8 is rejected here.
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride(); // Only the i16 form gets the size prefix.
   EmitUint8(0x0F);
   EmitUint8(0xAF);
-  EmitOperand(dst, Operand(src));
+  EmitRegisterOperand(dst, src); // Two-operand form: dst and src registers.
 }
 
-void AssemblerX86::imull(GPRRegister reg, const Immediate &imm) {
+void AssemblerX86::imul(Type Ty, GPRRegister reg, const Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x69);
-  EmitOperand(reg, Operand(reg));
-  EmitImmediate(BrokenType, imm);
-}
-
-void AssemblerX86::imull(GPRRegister reg, const Address &address) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  assert(Ty == IceType_i16 || Ty == IceType_i32); // i8 is rejected here.
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride(); // Only the i16 form gets the size prefix.
   EmitUint8(0x0F);
   EmitUint8(0xAF);
   EmitOperand(reg, address);
 }
 
-void AssemblerX86::imull(GPRRegister reg) {
+void AssemblerX86::imul(Type Ty, GPRRegister reg, const Immediate &imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xF7);
-  EmitOperand(5, Operand(reg));
+  assert(Ty == IceType_i16 || Ty == IceType_i32); // i8 is rejected here.
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride(); // Only the i16 form gets the size prefix.
+  if (imm.is_int8()) {
+    EmitUint8(0x6B); // Short form: imul r, r/m, imm8 (sign-extended).
+    EmitRegisterOperand(reg, reg); // reg is both destination and source.
+    EmitUint8(imm.value() & 0xFF); // Emit just the low byte of the value.
+  } else {
+    EmitUint8(0x69); // Long form: imul r, r/m, imm16/imm32.
+    EmitRegisterOperand(reg, reg); // reg is both destination and source.
+    EmitImmediate(Ty, imm); // Ty is passed so the immediate matches it.
+  }
 }
 
-void AssemblerX86::imull(const Address &address) {
+void AssemblerX86::imul(Type Ty, GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xF7);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride(); // Only the i16 form gets the size prefix.
+  if (isByteSizedArithType(Ty))
+    EmitUint8(0xF6); // One-operand imul on a byte-sized r/m.
+  else
+    EmitUint8(0xF7); // One-operand imul on a 16- or 32-bit r/m.
+  EmitRegisterOperand(5, reg); // 5 is the /5 opcode extension for imul.
+}
+
+void AssemblerX86::imul(Type Ty, const Address &address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride(); // Only the i16 form gets the size prefix.
+  if (isByteSizedArithType(Ty))
+    EmitUint8(0xF6); // One-operand imul on a byte-sized memory operand.
+  else
+    EmitUint8(0xF7); // One-operand imul on a 16- or 32-bit memory operand.
   EmitOperand(5, address);
 }
 
diff --git a/src/assembler_ia32.h b/src/assembler_ia32.h
index 3e9a937..0f95e35 100644
--- a/src/assembler_ia32.h
+++ b/src/assembler_ia32.h
@@ -535,7 +535,11 @@
 
   void pextrd(GPRRegister dst, XmmRegister src, const Immediate &imm);
   void pmovsxdq(XmmRegister dst, XmmRegister src);
-  void pcmpeqq(XmmRegister dst, XmmRegister src);
+
+  void pcmpeq(Type Ty, XmmRegister dst, XmmRegister src);
+  void pcmpeq(Type Ty, XmmRegister dst, const Address &src);
+  void pcmpgt(Type Ty, XmmRegister dst, XmmRegister src);
+  void pcmpgt(Type Ty, XmmRegister dst, const Address &src);
 
   enum RoundingMode {
     kRoundToNearest = 0x0,
@@ -609,12 +613,12 @@
   void idiv(Type Ty, GPRRegister reg);
   void idiv(Type Ty, const Address &address);
 
-  void imull(GPRRegister dst, GPRRegister src);
-  void imull(GPRRegister reg, const Immediate &imm);
-  void imull(GPRRegister reg, const Address &address);
+  void imul(Type Ty, GPRRegister dst, GPRRegister src);
+  void imul(Type Ty, GPRRegister reg, const Immediate &imm);
+  void imul(Type Ty, GPRRegister reg, const Address &address);
 
-  void imull(GPRRegister reg);
-  void imull(const Address &address);
+  void imul(Type Ty, GPRRegister reg);
+  void imul(Type Ty, const Address &address);
 
   void mul(Type Ty, GPRRegister reg);
   void mul(Type Ty, const Address &address);
diff --git a/tests_lit/assembler/x86/immediate_encodings.ll b/tests_lit/assembler/x86/immediate_encodings.ll
index a80fe5b..ff23e55 100644
--- a/tests_lit/assembler/x86/immediate_encodings.ll
+++ b/tests_lit/assembler/x86/immediate_encodings.ll
@@ -171,5 +171,88 @@
 ; CHECK-LABEL: testSub8Imm8
 ; CHECK: 2c 7d  sub al, 125
 
+; imul has a shorter encoding when the immediate fits in 8 bits.
+; It also has a shorter encoding for eax, but we don't use that yet.
+
+define internal i32 @testMul16Imm8(i32 %arg) {
+entry:
+  %arg_i16 = trunc i32 %arg to i16
+  %tmp = mul i16 %arg_i16, 99
+  %result_i16 = add i16 %tmp, 1
+  %result = zext i16 %result_i16 to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testMul16Imm8
+; CHECK: 66 6b c0 63  imul ax, ax, 99
+; CHECK-NEXT: add ax, 1
+
+define internal i32 @testMul16Imm8Neg(i32 %arg) {
+entry:
+  %arg_i16 = trunc i32 %arg to i16
+  %tmp = mul i16 %arg_i16, -111
+  %result_i16 = add i16 %tmp, 1
+  %result = zext i16 %result_i16 to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testMul16Imm8Neg
+; CHECK: 66 6b c0 91  imul ax, ax, 145
+; CHECK-NEXT: add ax, 1
+
+define internal i32 @testMul16Imm16(i32 %arg) {
+entry:
+  %arg_i16 = trunc i32 %arg to i16
+  %tmp = mul i16 %arg_i16, 1024
+  %result_i16 = add i16 %tmp, 1
+  %result = zext i16 %result_i16 to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testMul16Imm16
+; CHECK: 66 69 c0 00 04  imul ax, ax, 1024
+; CHECK-NEXT: add ax, 1
+
+define internal i32 @testMul16Imm16Neg(i32 %arg) {
+entry:
+  %arg_i16 = trunc i32 %arg to i16
+  %tmp = mul i16 %arg_i16, -256
+  %result_i16 = add i16 %tmp, 1
+  %result = zext i16 %result_i16 to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testMul16Imm16Neg
+; CHECK: 66 69 c0 00 ff  imul ax, ax, 65280
+; CHECK-NEXT: add ax, 1
+
+define internal i32 @testMul32Imm8(i32 %arg) {
+entry:
+  %result = mul i32 %arg, 99
+  ret i32 %result
+}
+; CHECK-LABEL: testMul32Imm8
+; CHECK: 6b c0 63  imul eax, eax, 99
+
+define internal i32 @testMul32Imm8Neg(i32 %arg) {
+entry:
+  %result = mul i32 %arg, -111
+  ret i32 %result
+}
+; CHECK-LABEL: testMul32Imm8Neg
+; CHECK: 6b c0 91  imul eax, eax, -111
+
+define internal i32 @testMul32Imm16(i32 %arg) {
+entry:
+  %result = mul i32 %arg, 1024
+  ret i32 %result
+}
+; CHECK-LABEL: testMul32Imm16
+; CHECK: 69 c0 00 04 00 00  imul eax, eax, 1024
+
+define internal i32 @testMul32Imm16Neg(i32 %arg) {
+entry:
+  %result = mul i32 %arg, -256
+  ret i32 %result
+}
+; CHECK-LABEL: testMul32Imm16Neg
+; CHECK: 69 c0 00 ff ff ff  imul eax, eax, 4294967040
+
 ; ERRORS-NOT: ICE translation error
 ; DUMP-NOT: SZ
diff --git a/tests_lit/llvm2ice_tests/8bit.pnacl.ll b/tests_lit/llvm2ice_tests/8bit.pnacl.ll
new file mode 100644
index 0000000..2307712
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/8bit.pnacl.ll
@@ -0,0 +1,164 @@
+; This tries to be a comprehensive test of i8 operations.
+
+; RUN: %p2i -i %s --args -O2 --verbose none \
+; RUN:   | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
+; RUN:   | llvm-objdump -d --symbolize -x86-asm-syntax=intel - | FileCheck %s
+; RUN: %p2i -i %s --args -Om1 --verbose none \
+; RUN:   | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
+; RUN:   | llvm-objdump -d --symbolize -x86-asm-syntax=intel - | FileCheck %s
+; RUN: %p2i -i %s --args --verbose none | FileCheck --check-prefix=ERRORS %s
+; RUN: %p2i -i %s --insts | %szdiff %s | FileCheck --check-prefix=DUMP %s
+
+define internal i32 @add8Bit(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %b_8 = trunc i32 %b to i8
+  %add = add i8 %b_8, %a_8
+  %ret = zext i8 %add to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: add8Bit
+; CHECK: add {{[abcd]l}}
+
+define internal i32 @add8BitConst(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %add = add i8 %a_8, 123
+  %ret = zext i8 %add to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: add8BitConst
+; CHECK: add {{[abcd]l}}
+
+define internal i32 @sub8Bit(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %b_8 = trunc i32 %b to i8
+  %sub = sub i8 %b_8, %a_8
+  %ret = zext i8 %sub to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: sub8Bit
+; XCHECK: sub {{[abcd]l}}
+
+define internal i32 @sub8BitConst(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %sub = sub i8 %a_8, 123
+  %ret = zext i8 %sub to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: sub8BitConst
+; XCHECK: sub {{[abcd]l}}
+
+define internal i32 @mul8Bit(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %b_8 = trunc i32 %b to i8
+  %mul = mul i8 %b_8, %a_8
+  %ret = zext i8 %mul to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: mul8Bit
+; CHECK: mul {{[abcd]l|byte ptr}}
+
+define internal i32 @mul8BitConst(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %mul = mul i8 %a_8, 56
+  %ret = zext i8 %mul to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: mul8BitConst
+; 8-bit imul only accepts r/m, not imm, so the constant is moved first.
+; CHECK: mov {{.*}}, 56
+; CHECK: mul {{[abcd]l|byte ptr}}
+
+define internal i32 @udiv8Bit(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %b_8 = trunc i32 %b to i8
+  %udiv = udiv i8 %b_8, %a_8
+  %ret = zext i8 %udiv to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: udiv8Bit
+; CHECK: div {{[abcd]l|byte ptr}}
+
+define internal i32 @udiv8BitConst(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %udiv = udiv i8 %a_8, 123
+  %ret = zext i8 %udiv to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: udiv8BitConst
+; CHECK: div {{[abcd]l|byte ptr}}
+
+define internal i32 @urem8Bit(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %b_8 = trunc i32 %b to i8
+  %urem = urem i8 %b_8, %a_8
+  %ret = zext i8 %urem to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: urem8Bit
+; CHECK: div {{[abcd]l|byte ptr}}
+
+define internal i32 @urem8BitConst(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %urem = urem i8 %a_8, 123
+  %ret = zext i8 %urem to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: urem8BitConst
+; CHECK: div {{[abcd]l|byte ptr}}
+
+
+define internal i32 @sdiv8Bit(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %b_8 = trunc i32 %b to i8
+  %sdiv = sdiv i8 %b_8, %a_8
+  %ret = zext i8 %sdiv to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: sdiv8Bit
+; CHECK: idiv {{[abcd]l|byte ptr}}
+
+define internal i32 @sdiv8BitConst(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %sdiv = sdiv i8 %a_8, 123
+  %ret = zext i8 %sdiv to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: sdiv8BitConst
+; CHECK: idiv {{[abcd]l|byte ptr}}
+
+define internal i32 @srem8Bit(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %b_8 = trunc i32 %b to i8
+  %srem = srem i8 %b_8, %a_8
+  %ret = zext i8 %srem to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: srem8Bit
+; CHECK: idiv {{[abcd]l|byte ptr}}
+
+define internal i32 @srem8BitConst(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %srem = srem i8 %a_8, 123
+  %ret = zext i8 %srem to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: srem8BitConst
+; CHECK: idiv {{[abcd]l|byte ptr}}
+
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ