Subzero: Strength-reduce mul by certain constants. These all appear to some degree in spec2k. This is implemented for i8/i16/i32 types. It is done as part of core lowering, so in theory all optimization levels could benefit, but it is explicitly disabled for Om1/O0 to keep things simple there. While clang appears to strength-reduce udiv/urem by a constant power of 2, for some reason it does not always strength-reduce multiplies (given that they appear in the spec2k bitcode). For multiplies by 3, 5, or 9, we can make use of the lea instruction. We can do combinations of shift and lea to multiply by other constants, e.g. 100=5*5*4. If too many operations would be required, just give up and use the mul instruction. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4095 R=jpp@chromium.org, jvoung@chromium.org Review URL: https://codereview.chromium.org/1146803002

commit: 0933c0cfb8183a662e44a61d8671b20987d12c78 [log] [tgz]
author: Jim Stichnoth <stichnot@chromium.org> Fri Jun 12 10:41:16 2015 -0700
committer: Jim Stichnoth <stichnot@chromium.org> Fri Jun 12 10:41:16 2015 -0700
tree: e107d32e24ebf0605782265e74bb421441b74fde
parent: 326534a3bb3783ce52aae0f9a3d684aedbab2790 [diff]
diff --git a/crosstest/crosstest.cfg b/crosstest/crosstest.cfg
index 2222b90..620edb6 100644
--- a/crosstest/crosstest.cfg
+++ b/crosstest/crosstest.cfg

@@ -42,6 +42,13 @@
 driver: test_stacksave_main.c
 test: test_stacksave.c
 
+[test_strengthreduce]
+driver: test_strengthreduce_main.cpp
+test: test_strengthreduce.cpp
+# Disable clang-side optimizations so that pnacl-sz sees suitable
+# bitcode patterns.
+flags: --clang-opt=0
+
 [test_sync_atomic]
 driver: test_sync_atomic_main.cpp
 # Compile the non-Subzero object files straight from source since the native

diff --git a/crosstest/test_strengthreduce.cpp b/crosstest/test_strengthreduce.cpp
new file mode 100644
index 0000000..b6be659
--- /dev/null
+++ b/crosstest/test_strengthreduce.cpp

@@ -0,0 +1,30 @@
+//===- subzero/crosstest/test_strengthreduce.cpp - Strength reduction -----===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation for crosstesting strength reduction.
+//
+//===----------------------------------------------------------------------===//
+
+#include "test_strengthreduce.h"
+
+// TODO(stichnot): Extend to i16 and i8 types, and also test the
+// commutativity transformations.  This may require hand-generating
+// .ll files, because of C/C++ integer promotion rules for arithmetic,
+// and because clang prefers to do its own commutativity
+// transformation.
+
+#define X(constant, suffix)                                                    \
+  uint32_t multiplyByConst##suffix(uint32_t Val) {                             \
+    return Val * (uint32_t)constant;                                           \
+  }                                                                            \
+  int32_t multiplyByConst##suffix(int32_t Val) {                               \
+    return Val * (int32_t)constant;                                            \
+  }
+CONST_TABLE
+#undef X

diff --git a/crosstest/test_strengthreduce.def b/crosstest/test_strengthreduce.def
new file mode 100644
index 0000000..a65437c
--- /dev/null
+++ b/crosstest/test_strengthreduce.def

@@ -0,0 +1,38 @@
+//===- subzero/crosstest/test_strengthreduce.def - macros -----*- C++ -*---===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines macros for crosstesting strength reduction.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_STRENGTHREDUCE_DEF
+#define TEST_STRENGTHREDUCE_DEF
+
+#define XSTR(s) STR(s)
+#define STR(s) #s
+
+#define CONST_TABLE \
+  X(   -10,    _10) \
+  X(    -7,     _7) \
+  X(    -2,     _2) \
+  X(    -1,     _1) \
+  X(     0,      0) \
+  X(     1,      1) \
+  X(     2,      2) \
+  X(     3,      3) \
+  X(     4,      4) \
+  X(     5,      5) \
+  X(     7,      7) \
+  X(     9,      9) \
+  X(    10,     10) \
+  X(   100,    100) \
+  X(100000, 100000) \
+//#define X(constant, suffix)
+
+#endif // !TEST_STRENGTHREDUCE_DEF

diff --git a/crosstest/test_strengthreduce.h b/crosstest/test_strengthreduce.h
new file mode 100644
index 0000000..309eef9
--- /dev/null
+++ b/crosstest/test_strengthreduce.h

@@ -0,0 +1,23 @@
+//===- subzero/crosstest/test_strengthreduce.h - Prototypes ---*- C++ -*---===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the function prototypes used for crosstesting strength
+// reduction.
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+
+#include "test_strengthreduce.def"
+
+#define X(constant, suffix)                                                    \
+  uint32_t multiplyByConst##suffix(uint32_t val);                              \
+  int32_t multiplyByConst##suffix(int32_t val);
+CONST_TABLE
+#undef X

diff --git a/crosstest/test_strengthreduce_main.cpp b/crosstest/test_strengthreduce_main.cpp
new file mode 100644
index 0000000..2c2aa98
--- /dev/null
+++ b/crosstest/test_strengthreduce_main.cpp

@@ -0,0 +1,65 @@
+//===- subzero/crosstest/test_strengthreduce_main.cpp - Driver for tests --===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Driver for crosstesting arithmetic strength-reducing optimizations.
+//
+//===----------------------------------------------------------------------===//
+
+/* crosstest.py --test=test_strengthreduce.cpp \
+   --driver=test_strengthreduce_main.cpp \
+   --prefix=Subzero_ --clang-opt=0 --output=test_strengthreduce */
+
+#include <iostream>
+
+// Include test_strengthreduce.h twice - once normally, and once
+// within the Subzero_ namespace, corresponding to the llc and Subzero
+// translated object files, respectively.
+#include "test_strengthreduce.h"
+namespace Subzero_ {
+#include "test_strengthreduce.h"
+}
+
+int main(int argc, char **argv) {
+  size_t TotalTests = 0;
+  size_t Passes = 0;
+  size_t Failures = 0;
+  static int32_t Values[] = {-100, -50, 0, 1, 8, 123, 0x33333333, 0x77777777};
+  for (size_t i = 0; i < sizeof(Values) / sizeof(*Values); ++i) {
+    int32_t SVal = Values[i];
+    int32_t ResultLlcS, ResultSzS;
+    uint32_t UVal = (uint32_t)Values[i];
+    int32_t ResultLlcU, ResultSzU;
+
+#define X(constant, suffix)                                                    \
+  ResultLlcS = multiplyByConst##suffix(UVal);                                  \
+  ResultSzS = Subzero_::multiplyByConst##suffix(UVal);                         \
+  if (ResultLlcS == ResultSzS) {                                               \
+    ++Passes;                                                                  \
+  } else {                                                                     \
+    ++Failures;                                                                \
+    std::cout << "multiplyByConstS" STR(suffix) "(" << SVal                    \
+              << "): sz=" << ResultSzS << " llc=" << ResultLlcS << "\n";       \
+  }                                                                            \
+  ResultLlcU = multiplyByConst##suffix(UVal);                                  \
+  ResultSzU = Subzero_::multiplyByConst##suffix(UVal);                         \
+  if (ResultLlcU == ResultSzU) {                                               \
+    ++Passes;                                                                  \
+  } else {                                                                     \
+    ++Failures;                                                                \
+    std::cout << "multiplyByConstU" STR(suffix) "(" << UVal                    \
+              << "): sz=" << ResultSzU << " llc=" << ResultLlcU << "\n";       \
+  }
+    CONST_TABLE
+#undef X
+  }
+
+  std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
+            << " Failures=" << Failures << "\n";
+  return Failures;
+}

diff --git a/pydir/crosstest.py b/pydir/crosstest.py
index c8c7fc8..f5d87a4 100755
--- a/pydir/crosstest.py
+++ b/pydir/crosstest.py

@@ -40,9 +40,11 @@
     argparser.add_argument('-O', required=False, default='2', dest='optlevel',
                            choices=['m1', '-1', '0', '1', '2'],
                            metavar='OPTLEVEL',
-                           help='Optimization level ' +
+                           help='Optimization level for llc and Subzero ' +
                                 '(m1 and -1 are equivalent).' +
                                 ' Default %(default)s.')
+    argparser.add_argument('--clang-opt', required=False, default=True,
+                           dest='clang_opt')
     argparser.add_argument('--mattr',  required=False, default='sse2',
                            dest='attr', choices=['sse2', 'sse4.1'],
                            metavar='ATTRIBUTE',
@@ -92,7 +94,8 @@
             bitcode_nonfinal = os.path.join(args.dir, base + '.' + key + '.bc')
             bitcode = os.path.join(args.dir, base + '.' + key + '.pnacl.ll')
             shellcmd(['{bin}/pnacl-clang'.format(bin=bindir),
-                      '-O2', '-c', arg, '-o', bitcode_nonfinal])
+                      ('-O2' if args.clang_opt else '-O0'), '-c', arg,
+                      '-o', bitcode_nonfinal])
             shellcmd(['{bin}/pnacl-opt'.format(bin=bindir),
                       '-pnacl-abi-simplify-preopt',
                       '-pnacl-abi-simplify-postopt',

diff --git a/src/IceELFObjectWriter.cpp b/src/IceELFObjectWriter.cpp
index 9c75482..edb46fa 100644
--- a/src/IceELFObjectWriter.cpp
+++ b/src/IceELFObjectWriter.cpp

@@ -383,8 +383,9 @@
       for (VariableDeclaration::Initializer *Init : Var->getInitializers()) {
         switch (Init->getKind()) {
         case VariableDeclaration::Initializer::DataInitializerKind: {
-          const auto Data = llvm::cast<VariableDeclaration::DataInitializer>(
-                                Init)->getContents();
+          const auto Data =
+              llvm::cast<VariableDeclaration::DataInitializer>(Init)
+                  ->getContents();
           Section->appendData(Str, llvm::StringRef(Data.data(), Data.size()));
           break;
         }

diff --git a/src/IceGlobalInits.h b/src/IceGlobalInits.h
index 4c51606..754f181 100644
--- a/src/IceGlobalInits.h
+++ b/src/IceGlobalInits.h

@@ -291,9 +291,7 @@
     return isExternal() && !hasInitializer();
   }
 
-  void setSuppressMangling() {
-    ForceSuppressMangling = true;
-  }
+  void setSuppressMangling() { ForceSuppressMangling = true; }
 
 private:
   // list of initializers for the declared variable.

diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 506b315..f6cd26f 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp

@@ -1287,6 +1287,102 @@
   _mov(Dest, esp);
 }
 
+// Strength-reduce scalar integer multiplication by a constant (for
+// i32 or narrower) for certain constants.  The lea instruction can be
+// used to multiply by 3, 5, or 9, and the lsh instruction can be used
+// to multiply by powers of 2.  These can be combined such that
+// e.g. multiplying by 100 can be done as 2 lea-based multiplies by 5,
+// combined with left-shifting by 2.
+bool TargetX8632::optimizeScalarMul(Variable *Dest, Operand *Src0,
+                                    int32_t Src1) {
+  // Disable this optimization for Om1 and O0, just to keep things
+  // simple there.
+  if (Ctx->getFlags().getOptLevel() < Opt_1)
+    return false;
+  Type Ty = Dest->getType();
+  Variable *T = nullptr;
+  if (Src1 == -1) {
+    _mov(T, Src0);
+    _neg(T);
+    _mov(Dest, T);
+    return true;
+  }
+  if (Src1 == 0) {
+    _mov(Dest, Ctx->getConstantZero(Ty));
+    return true;
+  }
+  if (Src1 == 1) {
+    _mov(T, Src0);
+    _mov(Dest, T);
+    return true;
+  }
+  // Don't bother with the edge case where Src1 == MININT.
+  if (Src1 == -Src1)
+    return false;
+  const bool Src1IsNegative = Src1 < 0;
+  if (Src1IsNegative)
+    Src1 = -Src1;
+  uint32_t Count9 = 0;
+  uint32_t Count5 = 0;
+  uint32_t Count3 = 0;
+  uint32_t Count2 = 0;
+  uint32_t CountOps = 0;
+  while (Src1 > 1) {
+    if (Src1 % 9 == 0) {
+      ++CountOps;
+      ++Count9;
+      Src1 /= 9;
+    } else if (Src1 % 5 == 0) {
+      ++CountOps;
+      ++Count5;
+      Src1 /= 5;
+    } else if (Src1 % 3 == 0) {
+      ++CountOps;
+      ++Count3;
+      Src1 /= 3;
+    } else if (Src1 % 2 == 0) {
+      if (Count2 == 0)
+        ++CountOps;
+      ++Count2;
+      Src1 /= 2;
+    } else {
+      return false;
+    }
+  }
+  // Lea optimization only works for i16 and i32 types, not i8.
+  if (Ty != IceType_i16 && Ty != IceType_i32 && (Count3 || Count5 || Count9))
+    return false;
+  // Limit the number of lea/shl operations for a single multiply, to
+  // a somewhat arbitrary choice of 3.
+  const uint32_t MaxOpsForOptimizedMul = 3;
+  if (CountOps > MaxOpsForOptimizedMul)
+    return false;
+  _mov(T, Src0);
+  Constant *Zero = Ctx->getConstantZero(IceType_i32);
+  for (uint32_t i = 0; i < Count9; ++i) {
+    const uint16_t Shift = 3; // log2(9-1)
+    _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
+    _set_dest_nonkillable();
+  }
+  for (uint32_t i = 0; i < Count5; ++i) {
+    const uint16_t Shift = 2; // log2(5-1)
+    _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
+    _set_dest_nonkillable();
+  }
+  for (uint32_t i = 0; i < Count3; ++i) {
+    const uint16_t Shift = 1; // log2(3-1)
+    _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
+    _set_dest_nonkillable();
+  }
+  if (Count2) {
+    _shl(T, Ctx->getConstantInt(Ty, Count2));
+  }
+  if (Src1IsNegative)
+    _neg(T);
+  _mov(Dest, T);
+  return true;
+}
+
 void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
   Variable *Dest = Inst->getDest();
   Operand *Src0 = legalize(Inst->getSrc(0));
@@ -1294,6 +1390,8 @@
   if (Inst->isCommutative()) {
     if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1))
       std::swap(Src0, Src1);
+    if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1))
+      std::swap(Src0, Src1);
   }
   if (Dest->getType() == IceType_i64) {
     Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
@@ -1521,7 +1619,9 @@
       llvm_unreachable("FP instruction with i64 type");
       break;
     }
-  } else if (isVectorType(Dest->getType())) {
+    return;
+  }
+  if (isVectorType(Dest->getType())) {
     // TODO: Trap on integer divide and integer modulo by zero.
     // See: https://code.google.com/p/nativeclient/issues/detail?id=3899
     if (llvm::isa<OperandX8632Mem>(Src1))
@@ -1650,175 +1750,248 @@
       scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
       break;
     }
-  } else { // Dest->getType() is non-i64 scalar
-    Variable *T_edx = nullptr;
-    Variable *T = nullptr;
-    switch (Inst->getOp()) {
-    case InstArithmetic::_num:
-      llvm_unreachable("Unknown arithmetic operator");
-      break;
-    case InstArithmetic::Add:
-      _mov(T, Src0);
-      _add(T, Src1);
-      _mov(Dest, T);
-      break;
-    case InstArithmetic::And:
-      _mov(T, Src0);
-      _and(T, Src1);
-      _mov(Dest, T);
-      break;
-    case InstArithmetic::Or:
-      _mov(T, Src0);
-      _or(T, Src1);
-      _mov(Dest, T);
-      break;
-    case InstArithmetic::Xor:
-      _mov(T, Src0);
-      _xor(T, Src1);
-      _mov(Dest, T);
-      break;
-    case InstArithmetic::Sub:
-      _mov(T, Src0);
-      _sub(T, Src1);
-      _mov(Dest, T);
-      break;
-    case InstArithmetic::Mul:
-      // TODO: Optimize for llvm::isa<Constant>(Src1)
-      // TODO: Strength-reduce multiplications by a constant,
-      // particularly -1 and powers of 2.  Advanced: use lea to
-      // multiply by 3, 5, 9.
-      //
-      // The 8-bit version of imul only allows the form "imul r/m8"
-      // where T must be in eax.
-      if (isByteSizedArithType(Dest->getType())) {
-        _mov(T, Src0, RegX8632::Reg_eax);
-        Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-      } else {
-        _mov(T, Src0);
-      }
-      _imul(T, Src1);
-      _mov(Dest, T);
-      break;
-    case InstArithmetic::Shl:
-      _mov(T, Src0);
-      if (!llvm::isa<Constant>(Src1))
-        Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
-      _shl(T, Src1);
-      _mov(Dest, T);
-      break;
-    case InstArithmetic::Lshr:
-      _mov(T, Src0);
-      if (!llvm::isa<Constant>(Src1))
-        Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
-      _shr(T, Src1);
-      _mov(Dest, T);
-      break;
-    case InstArithmetic::Ashr:
-      _mov(T, Src0);
-      if (!llvm::isa<Constant>(Src1))
-        Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
-      _sar(T, Src1);
-      _mov(Dest, T);
-      break;
-    case InstArithmetic::Udiv:
-      // div and idiv are the few arithmetic operators that do not allow
-      // immediates as the operand.
-      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-      if (isByteSizedArithType(Dest->getType())) {
-        Variable *T_ah = nullptr;
-        Constant *Zero = Ctx->getConstantZero(IceType_i8);
-        _mov(T, Src0, RegX8632::Reg_eax);
-        _mov(T_ah, Zero, RegX8632::Reg_ah);
-        _div(T, Src1, T_ah);
-        _mov(Dest, T);
-      } else {
-        Constant *Zero = Ctx->getConstantZero(IceType_i32);
-        _mov(T, Src0, RegX8632::Reg_eax);
-        _mov(T_edx, Zero, RegX8632::Reg_edx);
-        _div(T, Src1, T_edx);
-        _mov(Dest, T);
-      }
-      break;
-    case InstArithmetic::Sdiv:
-      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-      if (isByteSizedArithType(Dest->getType())) {
-        _mov(T, Src0, RegX8632::Reg_eax);
-        _cbwdq(T, T);
-        _idiv(T, Src1, T);
-        _mov(Dest, T);
-      } else {
-        T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
-        _mov(T, Src0, RegX8632::Reg_eax);
-        _cbwdq(T_edx, T);
-        _idiv(T, Src1, T_edx);
-        _mov(Dest, T);
-      }
-      break;
-    case InstArithmetic::Urem:
-      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-      if (isByteSizedArithType(Dest->getType())) {
-        Variable *T_ah = nullptr;
-        Constant *Zero = Ctx->getConstantZero(IceType_i8);
-        _mov(T, Src0, RegX8632::Reg_eax);
-        _mov(T_ah, Zero, RegX8632::Reg_ah);
-        _div(T_ah, Src1, T);
-        _mov(Dest, T_ah);
-      } else {
-        Constant *Zero = Ctx->getConstantZero(IceType_i32);
-        _mov(T_edx, Zero, RegX8632::Reg_edx);
-        _mov(T, Src0, RegX8632::Reg_eax);
-        _div(T_edx, Src1, T);
-        _mov(Dest, T_edx);
-      }
-      break;
-    case InstArithmetic::Srem:
-      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-      if (isByteSizedArithType(Dest->getType())) {
-        Variable *T_ah = makeReg(IceType_i8, RegX8632::Reg_ah);
-        _mov(T, Src0, RegX8632::Reg_eax);
-        _cbwdq(T, T);
-        Context.insert(InstFakeDef::create(Func, T_ah));
-        _idiv(T_ah, Src1, T);
-        _mov(Dest, T_ah);
-      } else {
-        T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
-        _mov(T, Src0, RegX8632::Reg_eax);
-        _cbwdq(T_edx, T);
-        _idiv(T_edx, Src1, T);
-        _mov(Dest, T_edx);
-      }
-      break;
-    case InstArithmetic::Fadd:
-      _mov(T, Src0);
-      _addss(T, Src1);
-      _mov(Dest, T);
-      break;
-    case InstArithmetic::Fsub:
-      _mov(T, Src0);
-      _subss(T, Src1);
-      _mov(Dest, T);
-      break;
-    case InstArithmetic::Fmul:
-      _mov(T, Src0);
-      _mulss(T, Src1);
-      _mov(Dest, T);
-      break;
-    case InstArithmetic::Fdiv:
-      _mov(T, Src0);
-      _divss(T, Src1);
-      _mov(Dest, T);
-      break;
-    case InstArithmetic::Frem: {
-      const SizeT MaxSrcs = 2;
-      Type Ty = Dest->getType();
-      InstCall *Call =
-          makeHelperCall(isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64,
-                         Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      return lowerCall(Call);
-    } break;
+    return;
+  }
+  Variable *T_edx = nullptr;
+  Variable *T = nullptr;
+  switch (Inst->getOp()) {
+  case InstArithmetic::_num:
+    llvm_unreachable("Unknown arithmetic operator");
+    break;
+  case InstArithmetic::Add:
+    _mov(T, Src0);
+    _add(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::And:
+    _mov(T, Src0);
+    _and(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Or:
+    _mov(T, Src0);
+    _or(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Xor:
+    _mov(T, Src0);
+    _xor(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Sub:
+    _mov(T, Src0);
+    _sub(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Mul:
+    if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+      if (optimizeScalarMul(Dest, Src0, C->getValue()))
+        return;
     }
+    // The 8-bit version of imul only allows the form "imul r/m8"
+    // where T must be in eax.
+    if (isByteSizedArithType(Dest->getType())) {
+      _mov(T, Src0, RegX8632::Reg_eax);
+      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    } else {
+      _mov(T, Src0);
+    }
+    _imul(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Shl:
+    _mov(T, Src0);
+    if (!llvm::isa<Constant>(Src1))
+      Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
+    _shl(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Lshr:
+    _mov(T, Src0);
+    if (!llvm::isa<Constant>(Src1))
+      Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
+    _shr(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Ashr:
+    _mov(T, Src0);
+    if (!llvm::isa<Constant>(Src1))
+      Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
+    _sar(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Udiv:
+    // div and idiv are the few arithmetic operators that do not allow
+    // immediates as the operand.
+    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    if (isByteSizedArithType(Dest->getType())) {
+      Variable *T_ah = nullptr;
+      Constant *Zero = Ctx->getConstantZero(IceType_i8);
+      _mov(T, Src0, RegX8632::Reg_eax);
+      _mov(T_ah, Zero, RegX8632::Reg_ah);
+      _div(T, Src1, T_ah);
+      _mov(Dest, T);
+    } else {
+      Constant *Zero = Ctx->getConstantZero(IceType_i32);
+      _mov(T, Src0, RegX8632::Reg_eax);
+      _mov(T_edx, Zero, RegX8632::Reg_edx);
+      _div(T, Src1, T_edx);
+      _mov(Dest, T);
+    }
+    break;
+  case InstArithmetic::Sdiv:
+    // TODO(stichnot): Enable this after doing better performance
+    // and cross testing.
+    if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {
+      // Optimize division by constant power of 2, but not for Om1
+      // or O0, just to keep things simple there.
+      if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+        int32_t Divisor = C->getValue();
+        uint32_t UDivisor = static_cast<uint32_t>(Divisor);
+        if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
+          uint32_t LogDiv = llvm::Log2_32(UDivisor);
+          Type Ty = Dest->getType();
+          // LLVM does the following for dest=src/(1<<log):
+          //   t=src
+          //   sar t,typewidth-1 // -1 if src is negative, 0 if not
+          //   shr t,typewidth-log
+          //   add t,src
+          //   sar t,log
+          //   dest=t
+          uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
+          _mov(T, Src0);
+          // If for some reason we are dividing by 1, just treat it
+          // like an assignment.
+          if (LogDiv > 0) {
+            // The initial sar is unnecessary when dividing by 2.
+            if (LogDiv > 1)
+              _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
+            _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
+            _add(T, Src0);
+            _sar(T, Ctx->getConstantInt(Ty, LogDiv));
+          }
+          _mov(Dest, T);
+          return;
+        }
+      }
+    }
+    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    if (isByteSizedArithType(Dest->getType())) {
+      _mov(T, Src0, RegX8632::Reg_eax);
+      _cbwdq(T, T);
+      _idiv(T, Src1, T);
+      _mov(Dest, T);
+    } else {
+      T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
+      _mov(T, Src0, RegX8632::Reg_eax);
+      _cbwdq(T_edx, T);
+      _idiv(T, Src1, T_edx);
+      _mov(Dest, T);
+    }
+    break;
+  case InstArithmetic::Urem:
+    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    if (isByteSizedArithType(Dest->getType())) {
+      Variable *T_ah = nullptr;
+      Constant *Zero = Ctx->getConstantZero(IceType_i8);
+      _mov(T, Src0, RegX8632::Reg_eax);
+      _mov(T_ah, Zero, RegX8632::Reg_ah);
+      _div(T_ah, Src1, T);
+      _mov(Dest, T_ah);
+    } else {
+      Constant *Zero = Ctx->getConstantZero(IceType_i32);
+      _mov(T_edx, Zero, RegX8632::Reg_edx);
+      _mov(T, Src0, RegX8632::Reg_eax);
+      _div(T_edx, Src1, T);
+      _mov(Dest, T_edx);
+    }
+    break;
+  case InstArithmetic::Srem:
+    // TODO(stichnot): Enable this after doing better performance
+    // and cross testing.
+    if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {
+      // Optimize mod by constant power of 2, but not for Om1 or O0,
+      // just to keep things simple there.
+      if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+        int32_t Divisor = C->getValue();
+        uint32_t UDivisor = static_cast<uint32_t>(Divisor);
+        if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
+          uint32_t LogDiv = llvm::Log2_32(UDivisor);
+          Type Ty = Dest->getType();
+          // LLVM does the following for dest=src%(1<<log):
+          //   t=src
+          //   sar t,typewidth-1 // -1 if src is negative, 0 if not
+          //   shr t,typewidth-log
+          //   add t,src
+          //   and t, -(1<<log)
+          //   sub t,src
+          //   neg t
+          //   dest=t
+          uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
+          // If for some reason we are dividing by 1, just assign 0.
+          if (LogDiv == 0) {
+            _mov(Dest, Ctx->getConstantZero(Ty));
+            return;
+          }
+          _mov(T, Src0);
+          // The initial sar is unnecessary when dividing by 2.
+          if (LogDiv > 1)
+            _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
+          _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
+          _add(T, Src0);
+          _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
+          _sub(T, Src0);
+          _neg(T);
+          _mov(Dest, T);
+          return;
+        }
+      }
+    }
+    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    if (isByteSizedArithType(Dest->getType())) {
+      Variable *T_ah = makeReg(IceType_i8, RegX8632::Reg_ah);
+      _mov(T, Src0, RegX8632::Reg_eax);
+      _cbwdq(T, T);
+      Context.insert(InstFakeDef::create(Func, T_ah));
+      _idiv(T_ah, Src1, T);
+      _mov(Dest, T_ah);
+    } else {
+      T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
+      _mov(T, Src0, RegX8632::Reg_eax);
+      _cbwdq(T_edx, T);
+      _idiv(T_edx, Src1, T);
+      _mov(Dest, T_edx);
+    }
+    break;
+  case InstArithmetic::Fadd:
+    _mov(T, Src0);
+    _addss(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Fsub:
+    _mov(T, Src0);
+    _subss(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Fmul:
+    _mov(T, Src0);
+    _mulss(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Fdiv:
+    _mov(T, Src0);
+    _divss(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Frem: {
+    const SizeT MaxSrcs = 2;
+    Type Ty = Dest->getType();
+    InstCall *Call = makeHelperCall(
+        isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, Dest, MaxSrcs);
+    Call->addArg(Src0);
+    Call->addArg(Src1);
+    return lowerCall(Call);
+  }
   }
 }
 
@@ -3119,10 +3292,11 @@
       Func->setError("Unexpected memory ordering for AtomicRMW");
       return;
     }
-    lowerAtomicRMW(Instr->getDest(),
-                   static_cast<uint32_t>(llvm::cast<ConstantInteger32>(
-                                             Instr->getArg(0))->getValue()),
-                   Instr->getArg(1), Instr->getArg(2));
+    lowerAtomicRMW(
+        Instr->getDest(),
+        static_cast<uint32_t>(
+            llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
+        Instr->getArg(1), Instr->getArg(2));
     return;
   case Intrinsics::AtomicStore: {
     if (!Intrinsics::isMemoryOrderValid(

diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index ca26fe1..8d108bd 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h

@@ -560,6 +560,8 @@
     Context.getLastInserted()->setDestNonKillable();
   }
 
+  bool optimizeScalarMul(Variable *Dest, Operand *Src0, int32_t Src1);
+
   const X86InstructionSet InstructionSet;
   bool IsEbpBasedFrame;
   bool NeedsStackAlignment;

diff --git a/tests_lit/assembler/x86/immediate_encodings.ll b/tests_lit/assembler/x86/immediate_encodings.ll
index c829f1b..0c96720 100644
--- a/tests_lit/assembler/x86/immediate_encodings.ll
+++ b/tests_lit/assembler/x86/immediate_encodings.ll

@@ -197,25 +197,25 @@
 define internal i32 @testMul16Imm16(i32 %arg) {
 entry:
   %arg_i16 = trunc i32 %arg to i16
-  %tmp = mul i16 %arg_i16, 1024
+  %tmp = mul i16 %arg_i16, 1025
   %result_i16 = add i16 %tmp, 1
   %result = zext i16 %result_i16 to i32
   ret i32 %result
 }
 ; CHECK-LABEL: testMul16Imm16
-; CHECK: 66 69 c0 00 04  imul ax,ax
+; CHECK: 66 69 c0 01 04  imul ax,ax
 ; CHECK-NEXT: add ax,0x1
 
 define internal i32 @testMul16Imm16Neg(i32 %arg) {
 entry:
   %arg_i16 = trunc i32 %arg to i16
-  %tmp = mul i16 %arg_i16, -256
+  %tmp = mul i16 %arg_i16, -255
   %result_i16 = add i16 %tmp, 1
   %result = zext i16 %result_i16 to i32
   ret i32 %result
 }
 ; CHECK-LABEL: testMul16Imm16Neg
-; CHECK: 66 69 c0 00 ff  imul ax,ax
+; CHECK: 66 69 c0 01 ff  imul ax,ax,0xff01
 ; CHECK-NEXT: add ax,0x1
 
 define internal i32 @testMul32Imm8(i32 %arg) {
@@ -236,19 +236,19 @@
 
 define internal i32 @testMul32Imm16(i32 %arg) {
 entry:
-  %result = mul i32 %arg, 1024
+  %result = mul i32 %arg, 1025
   ret i32 %result
 }
 ; CHECK-LABEL: testMul32Imm16
-; CHECK: 69 c0 00 04 00 00  imul eax,eax
+; CHECK: 69 c0 01 04 00 00  imul eax,eax
 
 define internal i32 @testMul32Imm16Neg(i32 %arg) {
 entry:
-  %result = mul i32 %arg, -256
+  %result = mul i32 %arg, -255
   ret i32 %result
 }
 ; CHECK-LABEL: testMul32Imm16Neg
-; CHECK: 69 c0 00 ff ff ff  imul eax,eax
+; CHECK: 69 c0 01 ff ff ff  imul eax,eax,0xffffff01
 
 ; The GPR shift instructions either allow an 8-bit immediate or
 ; have a special encoding for "1".

diff --git a/tests_lit/llvm2ice_tests/strength-reduce.ll b/tests_lit/llvm2ice_tests/strength-reduce.ll
new file mode 100644
index 0000000..50ca6e8
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/strength-reduce.ll

@@ -0,0 +1,67 @@
+; This tests various strength reduction operations.
+
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+
+define internal i32 @mul_i32_arg_5(i32 %arg) {
+  %result = mul i32 %arg, 5
+  ret i32 %result
+}
+; CHECK-LABEL: mul_i32_arg_5
+; CHECK: lea [[REG:e..]],{{\[}}[[REG]]+[[REG]]*4]
+
+define internal i32 @mul_i32_5_arg(i32 %arg) {
+  %result = mul i32 5, %arg
+  ret i32 %result
+}
+; CHECK-LABEL: mul_i32_5_arg
+; CHECK: lea [[REG:e..]],{{\[}}[[REG]]+[[REG]]*4]
+
+define internal i32 @mul_i32_arg_18(i32 %arg) {
+  %result = mul i32 %arg, 18
+  ret i32 %result
+}
+; CHECK-LABEL: mul_i32_arg_18
+; CHECK-DAG: lea [[REG:e..]],{{\[}}[[REG]]+[[REG]]*8]
+; CHECK-DAG: shl [[REG]],1
+
+define internal i32 @mul_i32_arg_27(i32 %arg) {
+  %result = mul i32 %arg, 27
+  ret i32 %result
+}
+; CHECK-LABEL: mul_i32_arg_27
+; CHECK-DAG: lea [[REG:e..]],{{\[}}[[REG]]+[[REG]]*2]
+; CHECK-DAG: lea [[REG]],{{\[}}[[REG]]+[[REG]]*8]
+
+define internal i32 @mul_i32_arg_m45(i32 %arg) {
+  %result = mul i32 %arg, -45
+  ret i32 %result
+}
+; CHECK-LABEL: mul_i32_arg_m45
+; CHECK-DAG: lea [[REG:e..]],{{\[}}[[REG]]+[[REG]]*8]
+; CHECK-DAG: lea [[REG]],{{\[}}[[REG]]+[[REG]]*4]
+; CHECK: neg [[REG]]
+
+define internal i16 @mul_i16_arg_18(i16 %arg) {
+  %result = mul i16 %arg, 18
+  ret i16 %result
+}
+; Disassembly will look like "lea ax,[eax+eax*8]".
+; CHECK-LABEL: mul_i16_arg_18
+; CHECK-DAG: lea [[REG:..]],{{\[}}e[[REG]]+e[[REG]]*8]
+; CHECK-DAG: shl [[REG]],1
+
+define internal i8 @mul_i8_arg_16(i8 %arg) {
+  %result = mul i8 %arg, 16
+  ret i8 %result
+}
+; CHECK-LABEL: mul_i8_arg_16
+; CHECK: shl {{.*}},0x4
+
+define internal i8 @mul_i8_arg_18(i8 %arg) {
+  %result = mul i8 %arg, 18
+  ret i8 %result
+}
+; CHECK-LABEL: mul_i8_arg_18
+; CHECK: imul
commit	0933c0cfb8183a662e44a61d8671b20987d12c78	[log] [tgz]
author	Jim Stichnoth <stichnot@chromium.org>	Fri Jun 12 10:41:16 2015 -0700
committer	Jim Stichnoth <stichnot@chromium.org>	Fri Jun 12 10:41:16 2015 -0700
tree	e107d32e24ebf0605782265e74bb421441b74fde
parent	326534a3bb3783ce52aae0f9a3d684aedbab2790 [diff]