Use three-address form of imul

Previously we did not take advantage of the three-address form of the imul
instruction.  With this change we avoid some copies before an imul by a constant.

BUG=
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1365433004 .
diff --git a/src/IceAssemblerX86Base.h b/src/IceAssemblerX86Base.h
index 7c4be6f..f9de5a5 100644
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -751,6 +751,11 @@
   void imul(Type Ty, typename Traits::GPRRegister reg);
   void imul(Type Ty, const typename Traits::Address &address);
 
+  void imul(Type Ty, typename Traits::GPRRegister dst,
+            typename Traits::GPRRegister src, const Immediate &imm);
+  void imul(Type Ty, typename Traits::GPRRegister dst,
+            const typename Traits::Address &address, const Immediate &imm);
+
   void mul(Type Ty, typename Traits::GPRRegister reg);
   void mul(Type Ty, const typename Traits::Address &address);
 
diff --git a/src/IceAssemblerX86BaseImpl.h b/src/IceAssemblerX86BaseImpl.h
index b1013d6..9bdc449 100644
--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -2581,6 +2581,46 @@
 }
 
 template <class Machine>
+void AssemblerX86Base<Machine>::imul(Type Ty, typename Traits::GPRRegister dst,
+                                     typename Traits::GPRRegister src,
+                                     const Immediate &imm) { // dst = src * imm
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  if (Ty == IceType_i16)
+    emitOperandSizeOverride(); // 16-bit form; the lit tests expect a 66 prefix
+  emitRexRB(Ty, dst, src);
+  if (imm.is_int8()) {
+    emitUint8(0x6B); // short form: opcode 0x6B carries a one-byte immediate
+    emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
+    emitUint8(imm.value() & 0xFF);
+  } else {
+    emitUint8(0x69); // long form: opcode 0x69 carries a Ty-sized immediate
+    emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
+    emitImmediate(Ty, imm);
+  }
+}
+
+template <class Machine>
+void AssemblerX86Base<Machine>::imul(Type Ty, typename Traits::GPRRegister dst,
+                                     const typename Traits::Address &address,
+                                     const Immediate &imm) { // dst = [mem] * imm
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  if (Ty == IceType_i16)
+    emitOperandSizeOverride(); // 16-bit form; the lit tests expect a 66 prefix
+  emitRex(Ty, address, dst);
+  if (imm.is_int8()) {
+    emitUint8(0x6B); // short form: opcode 0x6B carries a one-byte immediate
+    emitOperand(gprEncoding(dst), address);
+    emitUint8(imm.value() & 0xFF);
+  } else {
+    emitUint8(0x69); // long form: opcode 0x69 carries a Ty-sized immediate
+    emitOperand(gprEncoding(dst), address);
+    emitImmediate(Ty, imm);
+  }
+}
+
+template <class Machine>
 void AssemblerX86Base<Machine>::mul(Type Ty, typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
diff --git a/src/IceInstX86Base.h b/src/IceInstX86Base.h
index 9b0b5fc..36b34ac 100644
--- a/src/IceInstX86Base.h
+++ b/src/IceInstX86Base.h
@@ -68,6 +68,7 @@
     Icmp,
     Idiv,
     Imul,
+    ImulImm,
     Insertps,
     Jmp,
     Label,
@@ -1622,6 +1623,25 @@
 };
 
 template <class Machine>
+class InstX86ImulImm
+    : public InstX86BaseThreeAddressop<Machine, InstX86Base<Machine>::ImulImm> {
+public:
+  static InstX86ImulImm *create(Cfg *Func, Variable *Dest, Operand *Source0,
+                                Operand *Source1) {
+    return new (Func->allocate<InstX86ImulImm>())
+        InstX86ImulImm(Func, Dest, Source0, Source1);
+  }
+
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+
+private:
+  InstX86ImulImm(Cfg *Func, Variable *Dest, Operand *Source0, Operand *Source1)
+      : InstX86BaseThreeAddressop<Machine, InstX86Base<Machine>::ImulImm>(
+            Func, Dest, Source0, Source1) {}
+};
+
+template <class Machine>
 class InstX86Mulps
     : public InstX86BaseBinopXmm<Machine, InstX86Base<Machine>::Mulps, true> {
 public:
@@ -2790,6 +2810,7 @@
   using XorRMW = InstX86XorRMW<Machine>;
   using Pxor = InstX86Pxor<Machine>;
   using Imul = InstX86Imul<Machine>;
+  using ImulImm = InstX86ImulImm<Machine>;
   using Mulps = InstX86Mulps<Machine>;
   using Mulss = InstX86Mulss<Machine>;
   using Pmull = InstX86Pmull<Machine>;
@@ -2897,6 +2918,7 @@
   template <> const char *InstX86XorRMW<Machine>::Base::Opcode = "xor";        \
   template <> const char *InstX86Pxor<Machine>::Base::Opcode = "pxor";         \
   template <> const char *InstX86Imul<Machine>::Base::Opcode = "imul";         \
+  template <> const char *InstX86ImulImm<Machine>::Base::Opcode = "imul";      \
   template <> const char *InstX86Mulps<Machine>::Base::Opcode = "mulps";       \
   template <> const char *InstX86Mulss<Machine>::Base::Opcode = "mulss";       \
   template <> const char *InstX86Pmull<Machine>::Base::Opcode = "pmull";       \
diff --git a/src/IceInstX86BaseImpl.h b/src/IceInstX86BaseImpl.h
index c6ebd3a..f2e7ff2 100644
--- a/src/IceInstX86BaseImpl.h
+++ b/src/IceInstX86BaseImpl.h
@@ -1330,8 +1330,8 @@
         &InstX86Base<Machine>::Traits::Assembler::imul};
     emitIASOpTyGPR<Machine>(Func, Ty, this->getSrc(1), Emitter);
   } else {
-    // We only use imul as a two-address instruction even though there is a 3
-    // operand version when one of the operands is a constant.
+    // The two-address version is used when multiplying by a non-constant
+    // or doing an 8-bit multiply.
     assert(Var == this->getSrc(0));
     static const typename InstX86Base<
         Machine>::Traits::Assembler::GPREmitterRegOp Emitter = {
@@ -1343,6 +1343,43 @@
 }
 
 template <class Machine>
+void InstX86ImulImm<Machine>::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  Variable *Dest = this->getDest();
+  assert(Dest->getType() == IceType_i16 || Dest->getType() == IceType_i32);
+  assert(llvm::isa<Constant>(this->getSrc(1)));
+  Str << "\timul" << this->getWidthString(Dest->getType()) << "\t";
+  this->getSrc(1)->emit(Func); // the constant multiplier is printed first
+  Str << ", ";
+  this->getSrc(0)->emit(Func); // then the other source operand
+  Str << ", ";
+  Dest->emit(Func); // the destination is printed last
+}
+
+template <class Machine>
+void InstX86ImulImm<Machine>::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  const Variable *Dest = this->getDest();
+  Type Ty = Dest->getType();
+  assert(llvm::isa<Constant>(this->getSrc(1)));
+  static const typename InstX86Base<Machine>::Traits::Assembler::
+      template ThreeOpImmEmitter<
+          typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister,
+          typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister>
+          Emitter = {&InstX86Base<Machine>::Traits::Assembler::imul,
+                     &InstX86Base<Machine>::Traits::Assembler::imul};
+  emitIASThreeOpImmOps<
+      Machine, typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister,
+      typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister,
+      InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR,
+      InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR>(
+      Func, Ty, Dest, this->getSrc(0), this->getSrc(1), Emitter);
+}
+
+template <class Machine>
 void InstX86Insertps<Machine>::emitIAS(const Cfg *Func) const {
   assert(this->getSrcSize() == 3);
   assert(static_cast<typename InstX86Base<Machine>::Traits::TargetLowering *>(
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 4875e65..1df99e5 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -428,6 +428,9 @@
   void _imul(Variable *Dest, Operand *Src0) {
     Context.insert(Traits::Insts::Imul::create(Func, Dest, Src0));
   }
+  void _imul_imm(Variable *Dest, Operand *Src0, Constant *Imm) {
+    Context.insert(Traits::Insts::ImulImm::create(Func, Dest, Src0, Imm));
+  }
   void _insertps(Variable *Dest, Operand *Src0, Operand *Src1) {
     Context.insert(Traits::Insts::Insertps::create(Func, Dest, Src0, Src1));
   }
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 677bc98..00c2870 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -1618,11 +1618,17 @@
     if (isByteSizedArithType(Dest->getType())) {
       _mov(T, Src0, Traits::RegisterSet::Reg_eax);
       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+      _imul(T, Src0 == Src1 ? T : Src1);
+      _mov(Dest, T);
+    } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+      T = makeReg(Dest->getType());
+      _imul_imm(T, Src0, ImmConst);
+      _mov(Dest, T);
     } else {
       _mov(T, Src0);
+      _imul(T, Src0 == Src1 ? T : Src1);
+      _mov(Dest, T);
     }
-    _imul(T, Src0 == Src1 ? T : Src1);
-    _mov(Dest, T);
     break;
   case InstArithmetic::Shl:
     _mov(T, Src0);
diff --git a/tests_lit/assembler/x86/immediate_encodings.ll b/tests_lit/assembler/x86/immediate_encodings.ll
index 0c96720..e16ade3 100644
--- a/tests_lit/assembler/x86/immediate_encodings.ll
+++ b/tests_lit/assembler/x86/immediate_encodings.ll
@@ -250,6 +250,88 @@
 ; CHECK-LABEL: testMul32Imm16Neg
 ; CHECK: 69 c0 01 ff ff ff  imul eax,eax,0xffffff01
 
+define i32 @testMul32Imm32ThreeAddress(i32 %a) {
+entry:
+  %mul = mul i32 232, %a
+  %add = add i32 %mul, %a
+  ret i32 %add
+}
+; CHECK-LABEL: testMul32Imm32ThreeAddress
+; CHECK: 69 c8 e8 00 00 00  imul ecx,eax,0xe8
+
+define i32 @testMul32Mem32Imm32ThreeAddress(i32 %addr_arg) {
+entry:
+  %__1 = inttoptr i32 %addr_arg to i32*
+  %a = load i32, i32* %__1, align 1
+  %mul = mul i32 232, %a
+  ret i32 %mul
+}
+; CHECK-LABEL: testMul32Mem32Imm32ThreeAddress
+; CHECK: 69 00 e8 00 00 00  imul eax,DWORD PTR [eax],0xe8
+
+define i32 @testMul32Imm8ThreeAddress(i32 %a) {
+entry:
+  %mul = mul i32 127, %a
+  %add = add i32 %mul, %a
+  ret i32 %add
+}
+; CHECK-LABEL: testMul32Imm8ThreeAddress
+; CHECK: 6b c8 7f imul ecx,eax,0x7f
+
+define i32 @testMul32Mem32Imm8ThreeAddress(i32 %addr_arg) {
+entry:
+  %__1 = inttoptr i32 %addr_arg to i32*
+  %a = load i32, i32* %__1, align 1
+  %mul = mul i32 127, %a
+  ret i32 %mul
+}
+; CHECK-LABEL: testMul32Mem32Imm8ThreeAddress
+; CHECK: 6b 00 7f imul eax,DWORD PTR [eax],0x7f
+
+define i32 @testMul16Imm16ThreeAddress(i32 %a) {
+entry:
+  %arg_i16 = trunc i32 %a to i16
+  %mul = mul i16 232, %arg_i16
+  %add = add i16 %mul, %arg_i16
+  %result = zext i16 %add to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testMul16Imm16ThreeAddress
+; CHECK: 66 69 c8 e8 00 imul cx,ax,0xe8
+
+define i32 @testMul16Mem16Imm16ThreeAddress(i32 %addr_arg) {
+entry:
+  %__1 = inttoptr i32 %addr_arg to i16*
+  %a = load i16, i16* %__1, align 1
+  %mul = mul i16 232, %a
+  %result = zext i16 %mul to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testMul16Mem16Imm16ThreeAddress
+; CHECK: 66 69 00 e8 00 imul ax,WORD PTR [eax],0xe8
+
+define i32 @testMul16Imm8ThreeAddress(i32 %a) {
+entry:
+  %arg_i16 = trunc i32 %a to i16
+  %mul = mul i16 127, %arg_i16
+  %add = add i16 %mul, %arg_i16
+  %result = zext i16 %add to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testMul16Imm8ThreeAddress
+; CHECK: 66 6b c8 7f imul cx,ax,0x7f
+
+define i32 @testMul16Mem16Imm8ThreeAddress(i32 %addr_arg) {
+entry:
+  %__1 = inttoptr i32 %addr_arg to i16*
+  %a = load i16, i16* %__1, align 1
+  %mul = mul i16 127, %a
+  %result = zext i16 %mul to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testMul16Mem16Imm8ThreeAddress
+; CHECK: 66 6b 00 7f imul ax,WORD PTR [eax],0x7f
+
 ; The GPR shift instructions either allow an 8-bit immediate or
 ; have a special encoding for "1".
 define internal i32 @testShl16Imm8(i32 %arg) {