Generate better two-address code by using commutativity
For operations such as
t0 = t1 + t2
Subzero's pattern for arithmetic operations generates two-address code that
looks like:
movl ...t1..., %ecx
addl ...t2..., %ecx // t0 is in %ecx
When register pressure is high, this sometimes becomes:
movl ...t2..., SPILL
movl ...t1..., %ecx
addl SPILL, %ecx // t0 is in %ecx
This CL takes advantage of cases where this operation is the last use of t2
but not of t1, so the register that held t2 before the operation can be reused
for the result. The optimization simply swaps the operands of the
(commutative) operation, giving
t0 = t2 + t1
which then generates the following code:
movl ...t2..., %ecx
addl ...t1..., %ecx // t0 is in %ecx
This optimization applies to every commutative operation. That set now
includes Fadd and Fmul, which were erroneously marked as non-commutative; see
the comment in IceInst.def for the rationale and the relevant IEEE 754 wording.
BUG=
R=jfb@chromium.org, stichnot@chromium.org
Review URL: https://codereview.chromium.org/1371703003 .
diff --git a/src/IceInst.def b/src/IceInst.def
index d265213..6bd2efb 100644
--- a/src/IceInst.def
+++ b/src/IceInst.def
@@ -14,14 +14,31 @@
#ifndef SUBZERO_SRC_ICEINST_DEF
#define SUBZERO_SRC_ICEINST_DEF
+// Floating point addition and multiplication are commutative.
+// 1) non-special values and infinities are required to commute.
+// 2) signed zeroes are handled by:
+// From IEEE standard 754-2008:
+// When the sum of two operands with opposite signs (or the difference of
+// two operands with like signs) is exactly zero, the sign of that sum
+// (or difference) shall be +0 in all rounding-direction attributes
+// except roundTowardNegative; under that attribute, the sign of an exact
+// zero sum (or difference) shall be −0.
+// 3) NaNs are handled by:
+// http://grouper.ieee.org/groups/1788/email/msg03558.html
+// clause of 754 at work is 6.2.3 NaN propagation:
+// "If two or more inputs are NaN, then the payload of the resulting NaN
+// should be identical to the payload of one of the input NaNs if
+// representable in the destination format. This standard does not
+// specify which of the input NaNs will provide the payload."
+
#define ICEINSTARITHMETIC_TABLE \
/* enum value, printable string, commutative */ \
X(Add, "add", 1) \
- X(Fadd, "fadd", 0) \
+ X(Fadd, "fadd", 1) \
X(Sub, "sub", 0) \
X(Fsub, "fsub", 0) \
X(Mul, "mul", 1) \
- X(Fmul, "fmul", 0) \
+ X(Fmul, "fmul", 1) \
X(Udiv, "udiv", 0) \
X(Sdiv, "sdiv", 0) \
X(Fdiv, "fdiv", 0) \
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 00c2870..755d9ca 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -1293,10 +1293,23 @@
Operand *Src0 = legalize(Inst->getSrc(0));
Operand *Src1 = legalize(Inst->getSrc(1));
if (Inst->isCommutative()) {
- if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1))
+ uint32_t SwapCount = 0;
+ if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) {
std::swap(Src0, Src1);
- if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1))
+ ++SwapCount;
+ }
+ if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) {
std::swap(Src0, Src1);
+ ++SwapCount;
+ }
+ // Improve two-address code patterns by avoiding a copy to the dest
+ // register when one of the source operands ends its lifetime here.
+ if (!Inst->isLastUse(Src0) && Inst->isLastUse(Src1)) {
+ std::swap(Src0, Src1);
+ ++SwapCount;
+ }
+ assert(SwapCount <= 1);
+ (void) SwapCount;
}
if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
// These x86-32 helper-call-involved instructions are lowered in this
diff --git a/tests_lit/assembler/x86/opcode_register_encodings.ll b/tests_lit/assembler/x86/opcode_register_encodings.ll
index 9685182..57226e7 100644
--- a/tests_lit/assembler/x86/opcode_register_encodings.ll
+++ b/tests_lit/assembler/x86/opcode_register_encodings.ll
@@ -16,14 +16,14 @@
; Test register and address mode encoding.
define <8 x i16> @test_mul_v8i16_more_regs(<8 x i1> %cond, <8 x i16> %arg0, <8 x i16> %arg1, <8 x i16> %arg2, <8 x i16> %arg3, <8 x i16> %arg4, <8 x i16> %arg5, <8 x i16> %arg6, <8 x i16> %arg7, <8 x i16> %arg8) {
entry:
- %res1 = mul <8 x i16> %arg0, %arg1
- %res2 = mul <8 x i16> %arg0, %arg2
- %res3 = mul <8 x i16> %arg0, %arg3
- %res4 = mul <8 x i16> %arg0, %arg4
- %res5 = mul <8 x i16> %arg0, %arg5
- %res6 = mul <8 x i16> %arg0, %arg6
- %res7 = mul <8 x i16> %arg0, %arg7
- %res8 = mul <8 x i16> %arg0, %arg8
+ %res1 = sub <8 x i16> %arg0, %arg1
+ %res2 = sub <8 x i16> %arg0, %arg2
+ %res3 = sub <8 x i16> %arg0, %arg3
+ %res4 = sub <8 x i16> %arg0, %arg4
+ %res5 = sub <8 x i16> %arg0, %arg5
+ %res6 = sub <8 x i16> %arg0, %arg6
+ %res7 = sub <8 x i16> %arg0, %arg7
+ %res8 = sub <8 x i16> %arg0, %arg8
%res_acc1 = select <8 x i1> %cond, <8 x i16> %res1, <8 x i16> %res2
%res_acc2 = select <8 x i1> %cond, <8 x i16> %res3, <8 x i16> %res4
%res_acc3 = select <8 x i1> %cond, <8 x i16> %res5, <8 x i16> %res6
@@ -33,14 +33,14 @@
%res = select <8 x i1> %cond, <8 x i16> %res_acc1_3, <8 x i16> %res_acc2_4
ret <8 x i16> %res
; CHECK-LABEL: test_mul_v8i16_more_regs
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,XMMWORD PTR [esp
-; CHECK-DAG: pmullw xmm1,XMMWORD PTR [esp
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,XMMWORD PTR [esp
+; CHECK-DAG: psubw xmm1,XMMWORD PTR [esp
}
define <4 x i32> @test_mul_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -53,14 +53,14 @@
define <4 x i32> @test_mul_v4i32_more_regs(<4 x i1> %cond, <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, <4 x i32> %arg3, <4 x i32> %arg4, <4 x i32> %arg5, <4 x i32> %arg6, <4 x i32> %arg7, <4 x i32> %arg8) {
entry:
- %res1 = mul <4 x i32> %arg0, %arg1
- %res2 = mul <4 x i32> %arg0, %arg2
- %res3 = mul <4 x i32> %arg0, %arg3
- %res4 = mul <4 x i32> %arg0, %arg4
- %res5 = mul <4 x i32> %arg0, %arg5
- %res6 = mul <4 x i32> %arg0, %arg6
- %res7 = mul <4 x i32> %arg0, %arg7
- %res8 = mul <4 x i32> %arg0, %arg8
+ %res1 = sub <4 x i32> %arg0, %arg1
+ %res2 = sub <4 x i32> %arg0, %arg2
+ %res3 = sub <4 x i32> %arg0, %arg3
+ %res4 = sub <4 x i32> %arg0, %arg4
+ %res5 = sub <4 x i32> %arg0, %arg5
+ %res6 = sub <4 x i32> %arg0, %arg6
+ %res7 = sub <4 x i32> %arg0, %arg7
+ %res8 = sub <4 x i32> %arg0, %arg8
%res_acc1 = select <4 x i1> %cond, <4 x i32> %res1, <4 x i32> %res2
%res_acc2 = select <4 x i1> %cond, <4 x i32> %res3, <4 x i32> %res4
%res_acc3 = select <4 x i1> %cond, <4 x i32> %res5, <4 x i32> %res6
@@ -70,14 +70,14 @@
%res = select <4 x i1> %cond, <4 x i32> %res_acc1_3, <4 x i32> %res_acc2_4
ret <4 x i32> %res
; CHECK-LABEL: test_mul_v4i32_more_regs
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,XMMWORD PTR [esp
-; CHECK-DAG: pmulld xmm1,XMMWORD PTR [esp
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,XMMWORD PTR [esp
+; CHECK-DAG: psubd xmm1,XMMWORD PTR [esp
}
; Test movq, which is used by atomic stores.
diff --git a/tests_lit/llvm2ice_tests/commutativity.ll b/tests_lit/llvm2ice_tests/commutativity.ll
new file mode 100644
index 0000000..e90c035
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/commutativity.ll
@@ -0,0 +1,103 @@
+; Test the lowering sequence for commutative operations. If there is a source
+; operand whose lifetime ends in an operation, it should be the first operand,
+; eliminating the need for a move to start the new lifetime.
+
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN: --target x8632 -i %s --args -O2 \
+; RUN: | %if --need=target_X8632 --command FileCheck %s
+
+define i32 @integerAddLeft(i32 %a, i32 %b) {
+entry:
+ %tmp = add i32 %a, %b
+ %result = add i32 %a, %tmp
+ ret i32 %result
+}
+; CHECK-LABEL: integerAddLeft
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: add {{e..}},{{e..}}
+; CHECK-NEXT: add {{e..}},{{e..}}
+
+define i32 @integerAddRight(i32 %a, i32 %b) {
+entry:
+ %tmp = add i32 %a, %b
+ %result = add i32 %b, %tmp
+ ret i32 %result
+}
+; CHECK-LABEL: integerAddRight
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: add {{e..}},{{e..}}
+; CHECK-NEXT: add {{e..}},{{e..}}
+
+define i32 @integerMultiplyLeft(i32 %a, i32 %b) {
+entry:
+ %tmp = mul i32 %a, %b
+ %result = mul i32 %a, %tmp
+ ret i32 %result
+}
+; CHECK-LABEL: integerMultiplyLeft
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: imul {{e..}},{{e..}}
+; CHECK-NEXT: imul {{e..}},{{e..}}
+
+define i32 @integerMultiplyRight(i32 %a, i32 %b) {
+entry:
+ %tmp = mul i32 %a, %b
+ %result = mul i32 %b, %tmp
+ ret i32 %result
+}
+; CHECK-LABEL: integerMultiplyRight
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: imul {{e..}},{{e..}}
+; CHECK-NEXT: imul {{e..}},{{e..}}
+
+define float @floatAddLeft(float %a, float %b) {
+entry:
+ %tmp = fadd float %a, %b
+ %result = fadd float %a, %tmp
+ ret float %result
+}
+; CHECK-LABEL: floatAddLeft
+; CHECK-NEXT: movss xmm0,DWORD PTR
+; CHECK-NEXT: movss xmm1,DWORD PTR
+; CHECK-NEXT: addss xmm1,xmm0
+; CHECK-NEXT: addss xmm0,xmm1
+
+define float @floatAddRight(float %a, float %b) {
+entry:
+ %tmp = fadd float %a, %b
+ %result = fadd float %b, %tmp
+ ret float %result
+}
+; CHECK-LABEL: floatAddRight
+; CHECK-NEXT: movss xmm0,DWORD PTR
+; CHECK-NEXT: movss xmm1,DWORD PTR
+; CHECK-NEXT: addss xmm0,xmm1
+; CHECK-NEXT: addss xmm1,xmm0
+
+define float @floatMultiplyLeft(float %a, float %b) {
+entry:
+ %tmp = fmul float %a, %b
+ %result = fmul float %a, %tmp
+ ret float %result
+}
+; CHECK-LABEL: floatMultiplyLeft
+; CHECK-NEXT: movss xmm0,DWORD PTR
+; CHECK-NEXT: movss xmm1,DWORD PTR
+; CHECK-NEXT: mulss xmm1,xmm0
+; CHECK-NEXT: mulss xmm0,xmm1
+
+define float @floatMultiplyRight(float %a, float %b) {
+entry:
+ %tmp = fmul float %a, %b
+ %result = fmul float %b, %tmp
+ ret float %result
+}
+; CHECK-LABEL: floatMultiplyRight
+; CHECK-NEXT: movss xmm0,DWORD PTR
+; CHECK-NEXT: movss xmm1,DWORD PTR
+; CHECK-NEXT: mulss xmm0,xmm1
+; CHECK-NEXT: mulss xmm1,xmm0