Generate better two address code by using commutativity

For operations such as
  t0 = t1 + t2
Subzero's pattern for arithmetic operations generates two address code that
looks like
  movl ...t1..., %ecx
  addl ...t2..., %ecx
  // t0 is in %ecx
When register pressure is high this sometimes becomes:
  movl ...t2..., SPILL
  movl ...t1..., %ecx
  addl SPILL, %ecx
  // t0 is in %ecx
This CL takes advantage of cases where the use of t2 is the last one, so the
register that held t2 before the operation can be reused. The optimization
simply swaps the (commutative) operation to
  t0 = t2 + t1
which then generates code as
  movl ...t2..., %ecx
  addl ...t1..., %ecx
  // t0 is in %ecx

This optimization is used for any commutative operation, which now includes
Fadd and Fmul, which were erroneously marked as non-commutative. See the
rationale in IceInst.def for the IEEE wordings.

BUG=
R=jfb@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/1371703003 .

commit: 487bad0253e0d70d0cb191e8e5c71d01bf0044ee [log] [tgz]
author: David Sehr <sehr@chromium.org> Tue Oct 06 17:41:26 2015 -0700
committer: David Sehr <sehr@chromium.org> Tue Oct 06 17:41:26 2015 -0700
tree: fbeaa29d88968a35c86559e5811a5fdaa44222a2
parent: e11f878a3569f9e316c478d2e5595e0b000d0720 [diff]
diff --git a/src/IceInst.def b/src/IceInst.def
index d265213..6bd2efb 100644
--- a/src/IceInst.def
+++ b/src/IceInst.def

@@ -14,14 +14,31 @@
 #ifndef SUBZERO_SRC_ICEINST_DEF
 #define SUBZERO_SRC_ICEINST_DEF
 
+// Floating point addition and multiplication are commutative.
+// 1) non-special values and infinities are required to commute.
+// 2) signed zeroes are handled by:
+//    From IEEE standard 754-2008:
+//      When the sum of two operands with opposite signs (or the difference of
+//      two operands with like signs) is exactly zero, the sign of that sum
+//      (or difference) shall be +0 in all rounding-direction attributes
+//      except roundTowardNegative; under that attribute, the sign of an exact
+//      zero sum (or difference) shall be −0.
+// 3) NaNs are handled by:
+//    http://grouper.ieee.org/groups/1788/email/msg03558.html
+//      clause of 754 at work is 6.2.3 NaN propagation:
+//      "If two or more inputs are NaN, then the payload of the resulting NaN
+//      should be identical to the payload of one of the input NaNs if
+//      representable in the destination format. This standard does not
+//      specify which of the input NaNs will provide the payload."
+
 #define ICEINSTARITHMETIC_TABLE                   \
   /* enum value, printable string, commutative */ \
   X(Add,         "add",            1)             \
-  X(Fadd,        "fadd",           0)             \
+  X(Fadd,        "fadd",           1)             \
   X(Sub,         "sub",            0)             \
   X(Fsub,        "fsub",           0)             \
   X(Mul,         "mul",            1)             \
-  X(Fmul,        "fmul",           0)             \
+  X(Fmul,        "fmul",           1)             \
   X(Udiv,        "udiv",           0)             \
   X(Sdiv,        "sdiv",           0)             \
   X(Fdiv,        "fdiv",           0)             \

diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 00c2870..755d9ca 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h

@@ -1293,10 +1293,23 @@
   Operand *Src0 = legalize(Inst->getSrc(0));
   Operand *Src1 = legalize(Inst->getSrc(1));
   if (Inst->isCommutative()) {
-    if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1))
+    uint32_t SwapCount = 0;
+    if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) {
       std::swap(Src0, Src1);
-    if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1))
+      ++SwapCount;
+    }
+    if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) {
       std::swap(Src0, Src1);
+      ++SwapCount;
+    }
+    // Improve two-address code patterns by avoiding a copy to the dest
+    // register when one of the source operands ends its lifetime here.
+    if (!Inst->isLastUse(Src0) && Inst->isLastUse(Src1)) {
+      std::swap(Src0, Src1);
+      ++SwapCount;
+    }
+    assert(SwapCount <= 1);
+    (void) SwapCount;
   }
   if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
     // These x86-32 helper-call-involved instructions are lowered in this

diff --git a/tests_lit/assembler/x86/opcode_register_encodings.ll b/tests_lit/assembler/x86/opcode_register_encodings.ll
index 9685182..57226e7 100644
--- a/tests_lit/assembler/x86/opcode_register_encodings.ll
+++ b/tests_lit/assembler/x86/opcode_register_encodings.ll

@@ -16,14 +16,14 @@
 ; Test register and address mode encoding.
 define <8 x i16> @test_mul_v8i16_more_regs(<8 x i1> %cond, <8 x i16> %arg0, <8 x i16> %arg1, <8 x i16> %arg2, <8 x i16> %arg3, <8 x i16> %arg4, <8 x i16> %arg5, <8 x i16> %arg6, <8 x i16> %arg7, <8 x i16> %arg8) {
 entry:
-  %res1 = mul <8 x i16> %arg0, %arg1
-  %res2 = mul <8 x i16> %arg0, %arg2
-  %res3 = mul <8 x i16> %arg0, %arg3
-  %res4 = mul <8 x i16> %arg0, %arg4
-  %res5 = mul <8 x i16> %arg0, %arg5
-  %res6 = mul <8 x i16> %arg0, %arg6
-  %res7 = mul <8 x i16> %arg0, %arg7
-  %res8 = mul <8 x i16> %arg0, %arg8
+  %res1 = sub <8 x i16> %arg0, %arg1
+  %res2 = sub <8 x i16> %arg0, %arg2
+  %res3 = sub <8 x i16> %arg0, %arg3
+  %res4 = sub <8 x i16> %arg0, %arg4
+  %res5 = sub <8 x i16> %arg0, %arg5
+  %res6 = sub <8 x i16> %arg0, %arg6
+  %res7 = sub <8 x i16> %arg0, %arg7
+  %res8 = sub <8 x i16> %arg0, %arg8
   %res_acc1 = select <8 x i1> %cond, <8 x i16> %res1, <8 x i16> %res2
   %res_acc2 = select <8 x i1> %cond, <8 x i16> %res3, <8 x i16> %res4
   %res_acc3 = select <8 x i1> %cond, <8 x i16> %res5, <8 x i16> %res6
@@ -33,14 +33,14 @@
   %res = select <8 x i1> %cond, <8 x i16> %res_acc1_3, <8 x i16> %res_acc2_4
   ret <8 x i16> %res
 ; CHECK-LABEL: test_mul_v8i16_more_regs
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,XMMWORD PTR [esp
-; CHECK-DAG: pmullw xmm1,XMMWORD PTR [esp
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,XMMWORD PTR [esp
+; CHECK-DAG: psubw xmm1,XMMWORD PTR [esp
 }
 
 define <4 x i32> @test_mul_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -53,14 +53,14 @@
 
 define <4 x i32> @test_mul_v4i32_more_regs(<4 x i1> %cond, <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, <4 x i32> %arg3, <4 x i32> %arg4, <4 x i32> %arg5, <4 x i32> %arg6, <4 x i32> %arg7, <4 x i32> %arg8) {
 entry:
-  %res1 = mul <4 x i32> %arg0, %arg1
-  %res2 = mul <4 x i32> %arg0, %arg2
-  %res3 = mul <4 x i32> %arg0, %arg3
-  %res4 = mul <4 x i32> %arg0, %arg4
-  %res5 = mul <4 x i32> %arg0, %arg5
-  %res6 = mul <4 x i32> %arg0, %arg6
-  %res7 = mul <4 x i32> %arg0, %arg7
-  %res8 = mul <4 x i32> %arg0, %arg8
+  %res1 = sub <4 x i32> %arg0, %arg1
+  %res2 = sub <4 x i32> %arg0, %arg2
+  %res3 = sub <4 x i32> %arg0, %arg3
+  %res4 = sub <4 x i32> %arg0, %arg4
+  %res5 = sub <4 x i32> %arg0, %arg5
+  %res6 = sub <4 x i32> %arg0, %arg6
+  %res7 = sub <4 x i32> %arg0, %arg7
+  %res8 = sub <4 x i32> %arg0, %arg8
   %res_acc1 = select <4 x i1> %cond, <4 x i32> %res1, <4 x i32> %res2
   %res_acc2 = select <4 x i1> %cond, <4 x i32> %res3, <4 x i32> %res4
   %res_acc3 = select <4 x i1> %cond, <4 x i32> %res5, <4 x i32> %res6
@@ -70,14 +70,14 @@
   %res = select <4 x i1> %cond, <4 x i32> %res_acc1_3, <4 x i32> %res_acc2_4
   ret <4 x i32> %res
 ; CHECK-LABEL: test_mul_v4i32_more_regs
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,XMMWORD PTR [esp
-; CHECK-DAG: pmulld xmm1,XMMWORD PTR [esp
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,XMMWORD PTR [esp
+; CHECK-DAG: psubd xmm1,XMMWORD PTR [esp
 }
 
 ; Test movq, which is used by atomic stores.

diff --git a/tests_lit/llvm2ice_tests/commutativity.ll b/tests_lit/llvm2ice_tests/commutativity.ll
new file mode 100644
index 0000000..e90c035
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/commutativity.ll

@@ -0,0 +1,103 @@
+; Test the lowering sequence for commutative operations.  If there is a source
+; operand whose lifetime ends in an operation, it should be the first operand,
+; eliminating the need for a move to start the new lifetime.
+
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+
+define i32 @integerAddLeft(i32 %a, i32 %b) {
+entry:
+  %tmp = add i32 %a, %b
+  %result = add i32 %a, %tmp
+  ret i32 %result
+}
+; CHECK-LABEL: integerAddLeft
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: add {{e..}},{{e..}}
+; CHECK-NEXT: add {{e..}},{{e..}}
+
+define i32 @integerAddRight(i32 %a, i32 %b) {
+entry:
+  %tmp = add i32 %a, %b
+  %result = add i32 %b, %tmp
+  ret i32 %result
+}
+; CHECK-LABEL: integerAddRight
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: add {{e..}},{{e..}}
+; CHECK-NEXT: add {{e..}},{{e..}}
+
+define i32 @integerMultiplyLeft(i32 %a, i32 %b) {
+entry:
+  %tmp = mul i32 %a, %b
+  %result = mul i32 %a, %tmp
+  ret i32 %result
+}
+; CHECK-LABEL: integerMultiplyLeft
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: imul {{e..}},{{e..}}
+; CHECK-NEXT: imul {{e..}},{{e..}}
+
+define i32 @integerMultiplyRight(i32 %a, i32 %b) {
+entry:
+  %tmp = mul i32 %a, %b
+  %result = mul i32 %b, %tmp
+  ret i32 %result
+}
+; CHECK-LABEL: integerMultiplyRight
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: imul {{e..}},{{e..}}
+; CHECK-NEXT: imul {{e..}},{{e..}}
+
+define float @floatAddLeft(float %a, float %b) {
+entry:
+  %tmp = fadd float %a, %b
+  %result = fadd float %a, %tmp
+  ret float %result
+}
+; CHECK-LABEL: floatAddLeft
+; CHECK-NEXT: movss xmm0,DWORD PTR
+; CHECK-NEXT: movss xmm1,DWORD PTR
+; CHECK-NEXT: addss xmm1,xmm0
+; CHECK-NEXT: addss xmm0,xmm1
+
+define float @floatAddRight(float %a, float %b) {
+entry:
+  %tmp = fadd float %a, %b
+  %result = fadd float %b, %tmp
+  ret float %result
+}
+; CHECK-LABEL: floatAddRight
+; CHECK-NEXT: movss xmm0,DWORD PTR
+; CHECK-NEXT: movss xmm1,DWORD PTR
+; CHECK-NEXT: addss xmm0,xmm1
+; CHECK-NEXT: addss xmm1,xmm0
+
+define float @floatMultiplyLeft(float %a, float %b) {
+entry:
+  %tmp = fmul float %a, %b
+  %result = fmul float %a, %tmp
+  ret float %result
+}
+; CHECK-LABEL: floatMultiplyLeft
+; CHECK-NEXT: movss xmm0,DWORD PTR
+; CHECK-NEXT: movss xmm1,DWORD PTR
+; CHECK-NEXT: mulss xmm1,xmm0
+; CHECK-NEXT: mulss xmm0,xmm1
+
+define float @floatMultiplyRight(float %a, float %b) {
+entry:
+  %tmp = fmul float %a, %b
+  %result = fmul float %b, %tmp
+  ret float %result
+}
+; CHECK-LABEL: floatMultiplyRight
+; CHECK-NEXT: movss xmm0,DWORD PTR
+; CHECK-NEXT: movss xmm1,DWORD PTR
+; CHECK-NEXT: mulss xmm0,xmm1
+; CHECK-NEXT: mulss xmm1,xmm0
commit	487bad0253e0d70d0cb191e8e5c71d01bf0044ee	[log] [tgz]
author	David Sehr <sehr@chromium.org>	Tue Oct 06 17:41:26 2015 -0700
committer	David Sehr <sehr@chromium.org>	Tue Oct 06 17:41:26 2015 -0700
tree	fbeaa29d88968a35c86559e5811a5fdaa44222a2
parent	e11f878a3569f9e316c478d2e5595e0b000d0720 [diff]