Subzero: Apply commutativity to the RMW optimization.

The read-modify-write (RMW) optimization looks for patterns like this:

  a = Load addr
  b = <op> a, other
  Store b, addr

and essentially transforms them into this:

  RMW <op>, addr, other

This CL also applies the transformation when the middle instruction is
  b = <op> other, a
and <op> is commutative.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4095
R=jpp@chromium.org

Review URL: https://codereview.chromium.org/1193103005
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 6e80a8f..c1ba404 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -711,10 +711,13 @@
             if (!isSameMemAddressOperand(Load->getSourceAddress(),
                                          Store->getAddr()))
               continue;
-            if (false && Load->getSourceAddress() != Store->getAddr())
-              continue;
-            if (Arith->getSrc(0) != Load->getDest())
-              continue;
+            Operand *ArithSrcFromLoad = Arith->getSrc(0);
+            Operand *ArithSrcOther = Arith->getSrc(1);
+            if (ArithSrcFromLoad != Load->getDest()) {
+              if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
+                continue;
+              std::swap(ArithSrcFromLoad, ArithSrcOther);
+            }
             if (Arith->getDest() != Store->getData())
               continue;
             if (!canRMW(Arith))
@@ -734,8 +737,7 @@
             InstFakeDef *BeaconDef = InstFakeDef::create(Func, Beacon);
             Node->getInsts().insert(I3, BeaconDef);
             InstX8632FakeRMW *RMW = InstX8632FakeRMW::create(
-                Func, Arith->getSrc(1), Store->getAddr(), Beacon,
-                Arith->getOp());
+                Func, ArithSrcOther, Store->getAddr(), Beacon, Arith->getOp());
             Node->getInsts().insert(I3, RMW);
           }
         }
diff --git a/tests_lit/llvm2ice_tests/rmw.ll b/tests_lit/llvm2ice_tests/rmw.ll
index 321f612..12d365d 100644
--- a/tests_lit/llvm2ice_tests/rmw.ll
+++ b/tests_lit/llvm2ice_tests/rmw.ll
@@ -102,3 +102,31 @@
 ; Look for something like: add DWORD PTR [eax+ecx*4+12],ecx
 ; CHECK-LABEL: rmw_add_i32_var_addropt
 ; CHECK: add DWORD PTR [e{{..}}+e{{..}}*4+0xc],e{{ax|bx|cx|dx|bp|di|si}}
+
+; Test for commutativity opportunities.  This is the same as rmw_add_i32_var
+; except with the "add" operands reversed.
+define internal void @rmw_add_i32_var_comm(i32 %addr_arg, i32 %var) {
+entry:
+  %addr = inttoptr i32 %addr_arg to i32*
+  %val = load i32, i32* %addr, align 1
+  %rmw = add i32 %var, %val
+  store i32 %rmw, i32* %addr, align 1
+  ret void
+}
+; Look for something like: add DWORD PTR [eax],ecx
+; CHECK-LABEL: rmw_add_i32_var_comm
+; CHECK: add DWORD PTR [e{{ax|bx|cx|dx|bp|di|si}}],e{{ax|bx|cx|dx|bp|di|si}}
+
+; Test that commutativity isn't triggered for a non-commutative arithmetic
+; operator (sub).  This is the same as rmw_add_i32_var_comm except that it
+; uses a "sub" operation and returns the result.
+define internal i32 @no_rmw_sub_i32_var(i32 %addr_arg, i32 %var) {
+entry:
+  %addr = inttoptr i32 %addr_arg to i32*
+  %val = load i32, i32* %addr, align 1
+  %rmw = sub i32 %var, %val
+  store i32 %rmw, i32* %addr, align 1
+  ret i32 %rmw
+}
+; CHECK-LABEL: no_rmw_sub_i32_var
+; CHECK: sub e{{ax|bx|cx|dx|bp|di|si}},DWORD PTR [e{{ax|bx|cx|dx|bp|di|si}}]