Handle imul, pcmpeq, and pcmpgt in the x86-32 assembler (emitIAS).
Be sure to legalize 8-bit imul immediates (there is only the r/m form).
Add a test for that, and cover several other i8 ops as well (add, sub,
udiv, urem, sdiv, srem).
There is a one-byte-shorter form when Dest/Src0 == EAX and Src1 is not
an immediate, but that isn't taken advantage of.
Also add the shorter encoding for 8-bit immediates for i16/i32
(not allowed for i8). It shows up sometimes in spec, e.g., to multiply by 10.
There are also many multiplies by 4, which we could strength-reduce.
BUG=none
R=stichnot@chromium.org
Review URL: https://codereview.chromium.org/617593002
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 0a2d034..ab32335 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -720,6 +720,12 @@
const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Pandn::Emitter = {
&x86::AssemblerX86::pandn, &x86::AssemblerX86::pandn, NULL};
template <>
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Pcmpeq::Emitter = {
+ &x86::AssemblerX86::pcmpeq, &x86::AssemblerX86::pcmpeq, NULL};
+template <>
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Pcmpgt::Emitter = {
+ &x86::AssemblerX86::pcmpgt, &x86::AssemblerX86::pcmpgt, NULL};
+template <>
const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Pmuludq::Emitter = {
&x86::AssemblerX86::pmuludq, &x86::AssemblerX86::pmuludq, NULL};
template <>
@@ -904,6 +910,30 @@
}
}
+template <> void InstX8632Imul::emitIAS(const Cfg *Func) const {
+ assert(getSrcSize() == 2);
+ const Variable *Var = getDest();
+ Type Ty = Var->getType();
+ const Operand *Src = getSrc(1);
+ if (isByteSizedArithType(Ty)) {
+ // The 8-bit version of imul only allows the form "imul r/m8".
+ Variable *Src0 = llvm::dyn_cast<Variable>(getSrc(0));
+ (void)Src0;
+ assert(Src0 && Src0->getRegNum() == RegX8632::Reg_eax);
+ const x86::AssemblerX86::GPREmitterOneOp Emitter = {
+ &x86::AssemblerX86::imul, &x86::AssemblerX86::imul};
+ emitIASOpTyGPR(Func, Ty, getSrc(1), Emitter);
+ } else {
+ // We only use imul as a two-address instruction even though
+ // there is a 3 operand version when one of the operands is a constant.
+ assert(Var == getSrc(0));
+ const x86::AssemblerX86::GPREmitterRegOp Emitter = {
+ &x86::AssemblerX86::imul, &x86::AssemblerX86::imul,
+ &x86::AssemblerX86::imul};
+ emitIASRegOpTyGPR(Func, Ty, Var, Src, Emitter);
+ }
+}
+
template <> void InstX8632Cbwdq::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 1);
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 0f1a336..9facb2b 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -837,7 +837,7 @@
typedef InstX8632BinopXmm<InstX8632::Por, false> InstX8632Por;
typedef InstX8632BinopGPR<InstX8632::Xor> InstX8632Xor;
typedef InstX8632BinopXmm<InstX8632::Pxor, false> InstX8632Pxor;
-typedef InstX8632Binop<InstX8632::Imul> InstX8632Imul;
+typedef InstX8632BinopGPR<InstX8632::Imul> InstX8632Imul;
typedef InstX8632BinopXmm<InstX8632::Mulps, true> InstX8632Mulps;
typedef InstX8632BinopXmm<InstX8632::Mulss, false> InstX8632Mulss;
typedef InstX8632Binop<InstX8632::Pmull> InstX8632Pmull;
@@ -850,8 +850,8 @@
typedef InstX8632Binop<InstX8632::Shr, true> InstX8632Shr;
typedef InstX8632Binop<InstX8632::Sar, true> InstX8632Sar;
typedef InstX8632Binop<InstX8632::Psra> InstX8632Psra;
-typedef InstX8632Binop<InstX8632::Pcmpeq> InstX8632Pcmpeq;
-typedef InstX8632Binop<InstX8632::Pcmpgt> InstX8632Pcmpgt;
+typedef InstX8632BinopXmm<InstX8632::Pcmpeq, true> InstX8632Pcmpeq;
+typedef InstX8632BinopXmm<InstX8632::Pcmpgt, true> InstX8632Pcmpgt;
// TODO: movss is only a binary operation when the source and dest
// operands are both registers. In other cases, it behaves like a copy
// (mov-like) operation. Eventually, InstX8632Movss should assert that
@@ -1450,6 +1450,7 @@
template <> void InstX8632Div::emitIAS(const Cfg *Func) const;
template <> void InstX8632Idiv::emitIAS(const Cfg *Func) const;
+template <> void InstX8632Imul::emitIAS(const Cfg *Func) const;
template <> void InstX8632Cbwdq::emitIAS(const Cfg *Func) const;
template <> void InstX8632Movd::emitIAS(const Cfg *Func) const;
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 7a7fb12..96cc57b 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -1581,10 +1581,12 @@
//
// The 8-bit version of imul only allows the form "imul r/m8"
// where T must be in eax.
- if (isByteSizedArithType(Dest->getType()))
+ if (isByteSizedArithType(Dest->getType())) {
_mov(T, Src0, RegX8632::Reg_eax);
- else
+ Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+ } else {
_mov(T, Src0);
+ }
_imul(T, Src1);
_mov(Dest, T);
break;
diff --git a/src/assembler_ia32.cpp b/src/assembler_ia32.cpp
index 7791709..3dce9b7 100644
--- a/src/assembler_ia32.cpp
+++ b/src/assembler_ia32.cpp
@@ -1056,15 +1056,62 @@
EmitXmmRegisterOperand(dst, src);
}
-void AssemblerX86::pcmpeqq(XmmRegister dst, XmmRegister src) {
+void AssemblerX86::pcmpeq(Type Ty, XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0x66);
EmitUint8(0x0F);
- EmitUint8(0x38);
- EmitUint8(0x29);
+ if (isByteSizedArithType(Ty)) {
+ EmitUint8(0x74);
+ } else if (Ty == IceType_i16) {
+ EmitUint8(0x75);
+ } else {
+ EmitUint8(0x76);
+ }
EmitXmmRegisterOperand(dst, src);
}
+void AssemblerX86::pcmpeq(Type Ty, XmmRegister dst, const Address &src) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ EmitUint8(0x66);
+ EmitUint8(0x0F);
+ if (isByteSizedArithType(Ty)) {
+ EmitUint8(0x74);
+ } else if (Ty == IceType_i16) {
+ EmitUint8(0x75);
+ } else {
+ EmitUint8(0x76);
+ }
+ EmitOperand(dst, src);
+}
+
+void AssemblerX86::pcmpgt(Type Ty, XmmRegister dst, XmmRegister src) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ EmitUint8(0x66);
+ EmitUint8(0x0F);
+ if (isByteSizedArithType(Ty)) {
+ EmitUint8(0x64);
+ } else if (Ty == IceType_i16) {
+ EmitUint8(0x65);
+ } else {
+ EmitUint8(0x66);
+ }
+ EmitXmmRegisterOperand(dst, src);
+}
+
+void AssemblerX86::pcmpgt(Type Ty, XmmRegister dst, const Address &src) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ EmitUint8(0x66);
+ EmitUint8(0x0F);
+ if (isByteSizedArithType(Ty)) {
+ EmitUint8(0x64);
+ } else if (Ty == IceType_i16) {
+ EmitUint8(0x65);
+ } else {
+ EmitUint8(0x66);
+ }
+ EmitOperand(dst, src);
+}
+
void AssemblerX86::roundsd(XmmRegister dst, XmmRegister src,
RoundingMode mode) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1489,36 +1536,61 @@
EmitOperand(7, addr);
}
-void AssemblerX86::imull(GPRRegister dst, GPRRegister src) {
+void AssemblerX86::imul(Type Ty, GPRRegister dst, GPRRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ assert(Ty == IceType_i16 || Ty == IceType_i32);
+ if (Ty == IceType_i16)
+ EmitOperandSizeOverride();
EmitUint8(0x0F);
EmitUint8(0xAF);
- EmitOperand(dst, Operand(src));
+ EmitRegisterOperand(dst, src);
}
-void AssemblerX86::imull(GPRRegister reg, const Immediate &imm) {
+void AssemblerX86::imul(Type Ty, GPRRegister reg, const Address &address) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
- EmitUint8(0x69);
- EmitOperand(reg, Operand(reg));
- EmitImmediate(BrokenType, imm);
-}
-
-void AssemblerX86::imull(GPRRegister reg, const Address &address) {
- AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ assert(Ty == IceType_i16 || Ty == IceType_i32);
+ if (Ty == IceType_i16)
+ EmitOperandSizeOverride();
EmitUint8(0x0F);
EmitUint8(0xAF);
EmitOperand(reg, address);
}
-void AssemblerX86::imull(GPRRegister reg) {
+void AssemblerX86::imul(Type Ty, GPRRegister reg, const Immediate &imm) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
- EmitUint8(0xF7);
- EmitOperand(5, Operand(reg));
+ assert(Ty == IceType_i16 || Ty == IceType_i32);
+ if (Ty == IceType_i16)
+ EmitOperandSizeOverride();
+ if (imm.is_int8()) {
+ EmitUint8(0x6B);
+ EmitRegisterOperand(reg, reg);
+ EmitUint8(imm.value() & 0xFF);
+ } else {
+ EmitUint8(0x69);
+ EmitRegisterOperand(reg, reg);
+ EmitImmediate(Ty, imm);
+ }
}
-void AssemblerX86::imull(const Address &address) {
+void AssemblerX86::imul(Type Ty, GPRRegister reg) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
- EmitUint8(0xF7);
+ if (Ty == IceType_i16)
+ EmitOperandSizeOverride();
+ if (isByteSizedArithType(Ty))
+ EmitUint8(0xF6);
+ else
+ EmitUint8(0xF7);
+ EmitRegisterOperand(5, reg);
+}
+
+void AssemblerX86::imul(Type Ty, const Address &address) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ if (Ty == IceType_i16)
+ EmitOperandSizeOverride();
+ if (isByteSizedArithType(Ty))
+ EmitUint8(0xF6);
+ else
+ EmitUint8(0xF7);
EmitOperand(5, address);
}
diff --git a/src/assembler_ia32.h b/src/assembler_ia32.h
index 3e9a937..0f95e35 100644
--- a/src/assembler_ia32.h
+++ b/src/assembler_ia32.h
@@ -535,7 +535,11 @@
void pextrd(GPRRegister dst, XmmRegister src, const Immediate &imm);
void pmovsxdq(XmmRegister dst, XmmRegister src);
- void pcmpeqq(XmmRegister dst, XmmRegister src);
+
+ void pcmpeq(Type Ty, XmmRegister dst, XmmRegister src);
+ void pcmpeq(Type Ty, XmmRegister dst, const Address &src);
+ void pcmpgt(Type Ty, XmmRegister dst, XmmRegister src);
+ void pcmpgt(Type Ty, XmmRegister dst, const Address &src);
enum RoundingMode {
kRoundToNearest = 0x0,
@@ -609,12 +613,12 @@
void idiv(Type Ty, GPRRegister reg);
void idiv(Type Ty, const Address &address);
- void imull(GPRRegister dst, GPRRegister src);
- void imull(GPRRegister reg, const Immediate &imm);
- void imull(GPRRegister reg, const Address &address);
+ void imul(Type Ty, GPRRegister dst, GPRRegister src);
+ void imul(Type Ty, GPRRegister reg, const Immediate &imm);
+ void imul(Type Ty, GPRRegister reg, const Address &address);
- void imull(GPRRegister reg);
- void imull(const Address &address);
+ void imul(Type Ty, GPRRegister reg);
+ void imul(Type Ty, const Address &address);
void mul(Type Ty, GPRRegister reg);
void mul(Type Ty, const Address &address);
diff --git a/tests_lit/assembler/x86/immediate_encodings.ll b/tests_lit/assembler/x86/immediate_encodings.ll
index a80fe5b..ff23e55 100644
--- a/tests_lit/assembler/x86/immediate_encodings.ll
+++ b/tests_lit/assembler/x86/immediate_encodings.ll
@@ -171,5 +171,88 @@
; CHECK-LABEL: testSub8Imm8
; CHECK: 2c 7d sub al, 125
+; imul has some shorter 8-bit immediate encodings.
+; It also has a shorter encoding for eax, but we don't do that yet.
+
+define internal i32 @testMul16Imm8(i32 %arg) {
+entry:
+ %arg_i16 = trunc i32 %arg to i16
+ %tmp = mul i16 %arg_i16, 99
+ %result_i16 = add i16 %tmp, 1
+ %result = zext i16 %result_i16 to i32
+ ret i32 %result
+}
+; CHECK-LABEL: testMul16Imm8
+; CHECK: 66 6b c0 63 imul ax, ax, 99
+; CHECK-NEXT: add ax, 1
+
+define internal i32 @testMul16Imm8Neg(i32 %arg) {
+entry:
+ %arg_i16 = trunc i32 %arg to i16
+ %tmp = mul i16 %arg_i16, -111
+ %result_i16 = add i16 %tmp, 1
+ %result = zext i16 %result_i16 to i32
+ ret i32 %result
+}
+; CHECK-LABEL: testMul16Imm8Neg
+; CHECK: 66 6b c0 91 imul ax, ax, 145
+; CHECK-NEXT: add ax, 1
+
+define internal i32 @testMul16Imm16(i32 %arg) {
+entry:
+ %arg_i16 = trunc i32 %arg to i16
+ %tmp = mul i16 %arg_i16, 1024
+ %result_i16 = add i16 %tmp, 1
+ %result = zext i16 %result_i16 to i32
+ ret i32 %result
+}
+; CHECK-LABEL: testMul16Imm16
+; CHECK: 66 69 c0 00 04 imul ax, ax, 1024
+; CHECK-NEXT: add ax, 1
+
+define internal i32 @testMul16Imm16Neg(i32 %arg) {
+entry:
+ %arg_i16 = trunc i32 %arg to i16
+ %tmp = mul i16 %arg_i16, -256
+ %result_i16 = add i16 %tmp, 1
+ %result = zext i16 %result_i16 to i32
+ ret i32 %result
+}
+; CHECK-LABEL: testMul16Imm16Neg
+; CHECK: 66 69 c0 00 ff imul ax, ax, 65280
+; CHECK-NEXT: add ax, 1
+
+define internal i32 @testMul32Imm8(i32 %arg) {
+entry:
+ %result = mul i32 %arg, 99
+ ret i32 %result
+}
+; CHECK-LABEL: testMul32Imm8
+; CHECK: 6b c0 63 imul eax, eax, 99
+
+define internal i32 @testMul32Imm8Neg(i32 %arg) {
+entry:
+ %result = mul i32 %arg, -111
+ ret i32 %result
+}
+; CHECK-LABEL: testMul32Imm8Neg
+; CHECK: 6b c0 91 imul eax, eax, -111
+
+define internal i32 @testMul32Imm16(i32 %arg) {
+entry:
+ %result = mul i32 %arg, 1024
+ ret i32 %result
+}
+; CHECK-LABEL: testMul32Imm16
+; CHECK: 69 c0 00 04 00 00 imul eax, eax, 1024
+
+define internal i32 @testMul32Imm16Neg(i32 %arg) {
+entry:
+ %result = mul i32 %arg, -256
+ ret i32 %result
+}
+; CHECK-LABEL: testMul32Imm16Neg
+; CHECK: 69 c0 00 ff ff ff imul eax, eax, 4294967040
+
; ERRORS-NOT: ICE translation error
; DUMP-NOT: SZ
diff --git a/tests_lit/llvm2ice_tests/8bit.pnacl.ll b/tests_lit/llvm2ice_tests/8bit.pnacl.ll
new file mode 100644
index 0000000..2307712
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/8bit.pnacl.ll
@@ -0,0 +1,164 @@
+; This tries to be a comprehensive test of i8 operations.
+
+; RUN: %p2i -i %s --args -O2 --verbose none \
+; RUN: | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
+; RUN: | llvm-objdump -d --symbolize -x86-asm-syntax=intel - | FileCheck %s
+; RUN: %p2i -i %s --args -Om1 --verbose none \
+; RUN: | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
+; RUN: | llvm-objdump -d --symbolize -x86-asm-syntax=intel - | FileCheck %s
+; RUN: %p2i -i %s --args --verbose none | FileCheck --check-prefix=ERRORS %s
+; RUN: %p2i -i %s --insts | %szdiff %s | FileCheck --check-prefix=DUMP %s
+
+define internal i32 @add8Bit(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %b_8 = trunc i32 %b to i8
+ %add = add i8 %b_8, %a_8
+ %ret = zext i8 %add to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: add8Bit
+; CHECK: add {{[abcd]l}}
+
+define internal i32 @add8BitConst(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %add = add i8 %a_8, 123
+ %ret = zext i8 %add to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: add8BitConst
+; CHECK: add {{[abcd]l}}
+
+define internal i32 @sub8Bit(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %b_8 = trunc i32 %b to i8
+ %sub = sub i8 %b_8, %a_8
+ %ret = zext i8 %sub to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: sub8Bit
+; XCHECK: sub {{[abcd]l}}
+
+define internal i32 @sub8BitConst(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %sub = sub i8 %a_8, 123
+ %ret = zext i8 %sub to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: sub8BitConst
+; XCHECK: sub {{[abcd]l}}
+
+define internal i32 @mul8Bit(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %b_8 = trunc i32 %b to i8
+ %mul = mul i8 %b_8, %a_8
+ %ret = zext i8 %mul to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: mul8Bit
+; CHECK: mul {{[abcd]l|byte ptr}}
+
+define internal i32 @mul8BitConst(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %mul = mul i8 %a_8, 56
+ %ret = zext i8 %mul to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: mul8BitConst
+; 8-bit imul only accepts r/m, not imm
+; CHECK: mov {{.*}}, 56
+; CHECK: mul {{[abcd]l|byte ptr}}
+
+define internal i32 @udiv8Bit(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %b_8 = trunc i32 %b to i8
+ %udiv = udiv i8 %b_8, %a_8
+ %ret = zext i8 %udiv to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: udiv8Bit
+; CHECK: div {{[abcd]l|byte ptr}}
+
+define internal i32 @udiv8BitConst(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %udiv = udiv i8 %a_8, 123
+ %ret = zext i8 %udiv to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: udiv8BitConst
+; CHECK: div {{[abcd]l|byte ptr}}
+
+define internal i32 @urem8Bit(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %b_8 = trunc i32 %b to i8
+ %urem = urem i8 %b_8, %a_8
+ %ret = zext i8 %urem to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: urem8Bit
+; CHECK: div {{[abcd]l|byte ptr}}
+
+define internal i32 @urem8BitConst(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %urem = urem i8 %a_8, 123
+ %ret = zext i8 %urem to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: urem8BitConst
+; CHECK: div {{[abcd]l|byte ptr}}
+
+
+define internal i32 @sdiv8Bit(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %b_8 = trunc i32 %b to i8
+ %sdiv = sdiv i8 %b_8, %a_8
+ %ret = zext i8 %sdiv to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: sdiv8Bit
+; CHECK: idiv {{[abcd]l|byte ptr}}
+
+define internal i32 @sdiv8BitConst(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %sdiv = sdiv i8 %a_8, 123
+ %ret = zext i8 %sdiv to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: sdiv8BitConst
+; CHECK: idiv {{[abcd]l|byte ptr}}
+
+define internal i32 @srem8Bit(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %b_8 = trunc i32 %b to i8
+ %srem = srem i8 %b_8, %a_8
+ %ret = zext i8 %srem to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: srem8Bit
+; CHECK: idiv {{[abcd]l|byte ptr}}
+
+define internal i32 @srem8BitConst(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %srem = srem i8 %a_8, 123
+ %ret = zext i8 %srem to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: srem8BitConst
+; CHECK: idiv {{[abcd]l|byte ptr}}
+
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ