Lower a few basic ARM binops for i{8,16,32,64}.

Do basic lowering for add, sub, and, or, xor, and mul.
We don't yet take advantage of commuting immediate operands
(e.g., using rsb to reverse-subtract instead of sub) or of
inverting immediate operands (e.g., using bic to bit-clear
instead of and).

The binary operations can set the flags register (e.g., to
produce the carry bit for use with a subsequent adc
instruction). Setting the flags is optional for the "data
processing" instructions.

I'm not yet able to compile 8bit.pnacl.ll and
64bit.pnacl.ll, so 8-bit and 64-bit are not well tested yet.
The only tests are in the arith.ll file (like arith-opt.ll,
but assembled instead of testing the "verbose inst" output).

Not doing divide yet. ARM divide by 0 does not trap, but
PNaCl requires uniform behavior for such bad code. Thus,
LLVM inserts a 0 check, and we would have to do the same.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1127003003
diff --git a/src/IceInstARM32.cpp b/src/IceInstARM32.cpp
index 6ac1698..67f0fd4 100644
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -60,11 +60,24 @@
   assert(Inst->getSrcSize() == 2);
   Variable *Dest = Inst->getDest();
   assert(Dest == Inst->getSrc(0));
-  Operand *Src1 = Inst->getSrc(1);
   Str << "\t" << Opcode << "\t";
   Dest->emit(Func);
   Str << ", ";
-  Src1->emit(Func);
+  Inst->getSrc(1)->emit(Func);
+}
+
+void emitThreeAddr(const char *Opcode, const Inst *Inst, const Cfg *Func,
+                   bool SetFlags) {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(Inst->getSrcSize() == 2);
+  Str << "\t" << Opcode << (SetFlags ? "s" : "") << "\t";
+  Inst->getDest()->emit(Func);
+  Str << ", ";
+  Inst->getSrc(0)->emit(Func);
+  Str << ", ";
+  Inst->getSrc(1)->emit(Func);
 }
 
 OperandARM32Mem::OperandARM32Mem(Cfg * /* Func */, Type Ty, Variable *Base,
@@ -146,6 +159,14 @@
   addSource(Mem);
 }
 
+InstARM32Mla::InstARM32Mla(Cfg *Func, Variable *Dest, Variable *Src0,
+                           Variable *Src1, Variable *Acc)
+    : InstARM32(Func, InstARM32::Mla, 3, Dest) {
+  addSource(Src0);
+  addSource(Src1);
+  addSource(Acc);
+}
+
 InstARM32Ret::InstARM32Ret(Cfg *Func, Variable *LR, Variable *Source)
     : InstARM32(Func, InstARM32::Ret, Source ? 2 : 1, nullptr) {
   addSource(LR);
@@ -153,6 +174,15 @@
     addSource(Source);
 }
 
+InstARM32Umull::InstARM32Umull(Cfg *Func, Variable *DestLo, Variable *DestHi,
+                               Variable *Src0, Variable *Src1)
+    : InstARM32(Func, InstARM32::Umull, 2, DestLo),
+      // DestHi is expected to have a FakeDef inserted by the lowering code.
+      DestHi(DestHi) {
+  addSource(Src0);
+  addSource(Src1);
+}
+
 // ======================== Dump routines ======================== //
 
 // Two-addr ops
@@ -162,6 +192,15 @@
 template <> const char *InstARM32Mvn::Opcode = "mvn";
 // Mov-like ops
 template <> const char *InstARM32Mov::Opcode = "mov";
+// Three-addr ops
+template <> const char *InstARM32Adc::Opcode = "adc";
+template <> const char *InstARM32Add::Opcode = "add";
+template <> const char *InstARM32And::Opcode = "and";
+template <> const char *InstARM32Eor::Opcode = "eor";
+template <> const char *InstARM32Mul::Opcode = "mul";
+template <> const char *InstARM32Orr::Opcode = "orr";
+template <> const char *InstARM32Sbc::Opcode = "sbc";
+template <> const char *InstARM32Sub::Opcode = "sub";
 
 void InstARM32::dump(const Cfg *Func) const {
   if (!ALLOW_DUMP)
@@ -217,7 +256,7 @@
 }
 
 void InstARM32Ldr::emitIAS(const Cfg *Func) const {
-  assert(getSrcSize() == 2);
+  assert(getSrcSize() == 1);
   (void)Func;
   llvm_unreachable("Not yet implemented");
 }
@@ -227,7 +266,40 @@
     return;
   Ostream &Str = Func->getContext()->getStrDump();
   dumpDest(Func);
-  Str << "ldr." << getSrc(0)->getType() << " ";
+  Str << " = ldr." << getSrc(0)->getType() << " ";
+  dumpSources(Func);
+}
+
+void InstARM32Mla::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 3);
+  assert(getDest()->hasReg());
+  Str << "\t"
+      << "mla"
+      << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  getSrc(0)->emit(Func);
+  Str << ", ";
+  getSrc(1)->emit(Func);
+  Str << ", ";
+  getSrc(2)->emit(Func);
+}
+
+void InstARM32Mla::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 3);
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+
+void InstARM32Mla::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  dumpDest(Func);
+  Str << " = mla." << getSrc(0)->getType() << " ";
   dumpSources(Func);
 }
 
@@ -274,7 +346,9 @@
   assert(LR->hasReg());
   assert(LR->getRegNum() == RegARM32::Reg_lr);
   Ostream &Str = Func->getContext()->getStrEmit();
-  Str << "\tbx\t";
+  Str << "\t"
+      << "bx"
+      << "\t";
   LR->emit(Func);
 }
 
@@ -292,6 +366,39 @@
   dumpSources(Func);
 }
 
+void InstARM32Umull::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 2);
+  assert(getDest()->hasReg());
+  Str << "\t"
+      << "umull"
+      << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  DestHi->emit(Func);
+  Str << ", ";
+  getSrc(0)->emit(Func);
+  Str << ", ";
+  getSrc(1)->emit(Func);
+}
+
+void InstARM32Umull::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 2);
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+
+void InstARM32Umull::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  dumpDest(Func);
+  Str << " = umull." << getSrc(0)->getType() << " ";
+  dumpSources(Func);
+}
+
 void OperandARM32Mem::emit(const Cfg *Func) const {
   if (!ALLOW_DUMP)
     return;
diff --git a/src/IceInstARM32.def b/src/IceInstARM32.def
index 7970e58..c314305 100644
--- a/src/IceInstARM32.def
+++ b/src/IceInstARM32.def
@@ -82,13 +82,13 @@
 //#define X(tag, elementty, width, sbits, ubits)
 
 // Shifter types for Data-processing operands as defined in section A5.1.2.
-#define ICEINSTARM32SHIFT_TABLE                 \
-  /* enum value, emit */                        \
-  X(LSL, "lsl")                                 \
-  X(LSR, "lsr")                                 \
-  X(ASR, "asr")                                 \
-  X(ROR, "ror")                                 \
-  X(RRX, "rrx")                                 \
+#define ICEINSTARM32SHIFT_TABLE                                         \
+  /* enum value, emit */                                                \
+  X(LSL, "lsl")                                                         \
+  X(LSR, "lsr")                                                         \
+  X(ASR, "asr")                                                         \
+  X(ROR, "ror")                                                         \
+  X(RRX, "rrx")                                                         \
 //#define X(tag, emit)
 
 #endif // SUBZERO_SRC_ICEINSTARM32_DEF
diff --git a/src/IceInstARM32.h b/src/IceInstARM32.h
index 2e3f764..c8d01e6 100644
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -245,12 +245,22 @@
 public:
   enum InstKindARM32 {
     k__Start = Inst::Target,
+    Adc,
+    Add,
+    And,
+    Eor,
+    Ldr,
+    Mla,
     Mov,
     Movt,
     Movw,
+    Mul,
     Mvn,
+    Orr,
     Ret,
-    Ldr
+    Sbc,
+    Sub,
+    Umull
   };
 
   static const char *getWidthString(Type Ty);
@@ -267,6 +277,8 @@
 };
 
 void emitTwoAddr(const char *Opcode, const Inst *Inst, const Cfg *Func);
+void emitThreeAddr(const char *Opcode, const Inst *Inst, const Cfg *Func,
+                   bool SetFlags);
 
 // TODO(jvoung): add condition codes if instruction can be predicated.
 
@@ -397,6 +409,63 @@
   static const char *Opcode;
 };
 
+// Instructions of the form x := y op z. May have the side-effect of setting
+// status flags.
+template <InstARM32::InstKindARM32 K>
+class InstARM32ThreeAddrGPR : public InstARM32 {
+  InstARM32ThreeAddrGPR() = delete;
+  InstARM32ThreeAddrGPR(const InstARM32ThreeAddrGPR &) = delete;
+  InstARM32ThreeAddrGPR &operator=(const InstARM32ThreeAddrGPR &) = delete;
+
+public:
+  // Create an ordinary binary-op instruction like add and sub.
+  // Dest and Src1 must be registers.
+  static InstARM32ThreeAddrGPR *create(Cfg *Func, Variable *Dest,
+                                       Variable *Src1, Operand *Src2,
+                                       bool SetFlags = false) {
+    return new (Func->allocate<InstARM32ThreeAddrGPR>())
+        InstARM32ThreeAddrGPR(Func, Dest, Src1, Src2, SetFlags);
+  }
+  void emit(const Cfg *Func) const override {
+    if (!ALLOW_DUMP)
+      return;
+    emitThreeAddr(Opcode, this, Func, SetFlags);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    (void)Func;
+    llvm::report_fatal_error("Not yet implemented");
+  }
+  void dump(const Cfg *Func) const override {
+    if (!ALLOW_DUMP)
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Str << " = " << Opcode << (SetFlags ? "s" : "") << "."
+        << getDest()->getType() << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+  InstARM32ThreeAddrGPR(Cfg *Func, Variable *Dest, Variable *Src1,
+                        Operand *Src2, bool SetFlags)
+      : InstARM32(Func, K, 2, Dest), SetFlags(SetFlags) {
+    addSource(Src1);
+    addSource(Src2);
+  }
+  ~InstARM32ThreeAddrGPR() override {}
+  static const char *Opcode;
+  bool SetFlags;
+};
+
+typedef InstARM32ThreeAddrGPR<InstARM32::Adc> InstARM32Adc;
+typedef InstARM32ThreeAddrGPR<InstARM32::Add> InstARM32Add;
+typedef InstARM32ThreeAddrGPR<InstARM32::And> InstARM32And;
+typedef InstARM32ThreeAddrGPR<InstARM32::Eor> InstARM32Eor;
+typedef InstARM32ThreeAddrGPR<InstARM32::Mul> InstARM32Mul;
+typedef InstARM32ThreeAddrGPR<InstARM32::Orr> InstARM32Orr;
+typedef InstARM32ThreeAddrGPR<InstARM32::Sbc> InstARM32Sbc;
+typedef InstARM32ThreeAddrGPR<InstARM32::Sub> InstARM32Sub;
 // Move instruction (variable <- flex). This is more of a pseudo-inst.
 // If var is a register, then we use "mov". If var is stack, then we use
 // "str" to store to the stack.
@@ -428,6 +497,30 @@
   ~InstARM32Ldr() override {}
 };
 
+// Multiply Accumulate: d := x * y + a
+class InstARM32Mla : public InstARM32 {
+  InstARM32Mla() = delete;
+  InstARM32Mla(const InstARM32Mla &) = delete;
+  InstARM32Mla &operator=(const InstARM32Mla &) = delete;
+
+public:
+  // Everything must be a register.
+  static InstARM32Mla *create(Cfg *Func, Variable *Dest, Variable *Src0,
+                              Variable *Src1, Variable *Acc) {
+    return new (Func->allocate<InstARM32Mla>())
+        InstARM32Mla(Func, Dest, Src0, Src1, Acc);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Mla); }
+
+private:
+  InstARM32Mla(Cfg *Func, Variable *Dest, Variable *Src0, Variable *Src1,
+               Variable *Acc);
+  ~InstARM32Mla() override {}
+};
+
 // Ret pseudo-instruction.  This is actually a "bx" instruction with
 // an "lr" register operand, but epilogue lowering will search for a Ret
 // instead of a generic "bx". This instruction also takes a Source
@@ -453,6 +546,31 @@
   ~InstARM32Ret() override {}
 };
 
+// Unsigned Multiply Long: d.lo, d.hi := x * y
+class InstARM32Umull : public InstARM32 {
+  InstARM32Umull() = delete;
+  InstARM32Umull(const InstARM32Umull &) = delete;
+  InstARM32Umull &operator=(const InstARM32Umull &) = delete;
+
+public:
+  // Everything must be a register.
+  static InstARM32Umull *create(Cfg *Func, Variable *DestLo, Variable *DestHi,
+                                Variable *Src0, Variable *Src1) {
+    return new (Func->allocate<InstARM32Umull>())
+        InstARM32Umull(Func, DestLo, DestHi, Src0, Src1);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Umull); }
+
+private:
+  InstARM32Umull(Cfg *Func, Variable *DestLo, Variable *DestHi, Variable *Src0,
+                 Variable *Src1);
+  ~InstARM32Umull() override {}
+  Variable *DestHi;
+};
+
 // Declare partial template specializations of emit() methods that
 // already have default implementations.  Without this, there is the
 // possibility of ODR violations and link errors.
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index d65b546..73eb77c 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -540,40 +540,135 @@
   // Or it may be the case that the operands aren't swapped, but the
   // bits can be flipped and a different operation applied.
   // E.g., use BIC (bit clear) instead of AND for some masks.
-  Variable *Src0 = legalizeToVar(Inst->getSrc(0));
-  Operand *Src1 = legalize(Inst->getSrc(1), Legal_Reg | Legal_Flex);
-  (void)Src0;
-  (void)Src1;
+  Operand *Src0 = Inst->getSrc(0);
+  Operand *Src1 = Inst->getSrc(1);
   if (Dest->getType() == IceType_i64) {
-    UnimplementedError(Func->getContext()->getFlags());
+    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    Variable *Src0RLo = legalizeToVar(loOperand(Src0));
+    Variable *Src0RHi = legalizeToVar(hiOperand(Src0));
+    Operand *Src1Lo = legalize(loOperand(Src1), Legal_Reg | Legal_Flex);
+    Operand *Src1Hi = legalize(hiOperand(Src1), Legal_Reg | Legal_Flex);
+    Variable *T_Lo = makeReg(DestLo->getType());
+    Variable *T_Hi = makeReg(DestHi->getType());
+    switch (Inst->getOp()) {
+    case InstArithmetic::_num:
+      llvm_unreachable("Unknown arithmetic operator");
+      break;
+    case InstArithmetic::Add:
+      _adds(T_Lo, Src0RLo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _adc(T_Hi, Src0RHi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::And:
+      _and(T_Lo, Src0RLo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _and(T_Hi, Src0RHi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::Or:
+      _orr(T_Lo, Src0RLo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _orr(T_Hi, Src0RHi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::Xor:
+      _eor(T_Lo, Src0RLo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _eor(T_Hi, Src0RHi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::Sub:
+      _subs(T_Lo, Src0RLo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _sbc(T_Hi, Src0RHi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::Mul: {
+      // GCC 4.8 does:
+      // a=b*c ==>
+      //   t_acc =(mul) (b.lo * c.hi)
+      //   t_acc =(mla) (c.lo * b.hi) + t_acc
+      //   t.hi,t.lo =(umull) b.lo * c.lo
+      //   t.hi += t_acc
+      //   a.lo = t.lo
+      //   a.hi = t.hi
+      //
+      // LLVM does:
+      //   t.hi,t.lo =(umull) b.lo * c.lo
+      //   t.hi =(mla) (b.lo * c.hi) + t.hi
+      //   t.hi =(mla) (b.hi * c.lo) + t.hi
+      //   a.lo = t.lo
+      //   a.hi = t.hi
+      //
+      // LLVM's lowering has fewer instructions, but more register pressure:
+      // t.lo is live from beginning to end, while GCC delays the two-dest
+      // instruction till the end, and kills c.hi immediately.
+      Variable *T_Acc = makeReg(IceType_i32);
+      Variable *T_Acc1 = makeReg(IceType_i32);
+      Variable *T_Hi1 = makeReg(IceType_i32);
+      Variable *Src1RLo = legalizeToVar(Src1Lo);
+      Variable *Src1RHi = legalizeToVar(Src1Hi);
+      _mul(T_Acc, Src0RLo, Src1RHi);
+      _mla(T_Acc1, Src1RLo, Src0RHi, T_Acc);
+      _umull(T_Lo, T_Hi1, Src0RLo, Src1RLo);
+      _add(T_Hi, T_Hi1, T_Acc1);
+      _mov(DestLo, T_Lo);
+      _mov(DestHi, T_Hi);
+    } break;
+    case InstArithmetic::Shl:
+    case InstArithmetic::Lshr:
+    case InstArithmetic::Ashr:
+    case InstArithmetic::Udiv:
+    case InstArithmetic::Sdiv:
+    case InstArithmetic::Urem:
+    case InstArithmetic::Srem:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Fadd:
+    case InstArithmetic::Fsub:
+    case InstArithmetic::Fmul:
+    case InstArithmetic::Fdiv:
+    case InstArithmetic::Frem:
+      llvm_unreachable("FP instruction with i64 type");
+      break;
+    }
   } else if (isVectorType(Dest->getType())) {
     UnimplementedError(Func->getContext()->getFlags());
   } else { // Dest->getType() is non-i64 scalar
+    Variable *Src0R = legalizeToVar(Inst->getSrc(0));
+    Src1 = legalize(Inst->getSrc(1), Legal_Reg | Legal_Flex);
+    Variable *T = makeReg(Dest->getType());
     switch (Inst->getOp()) {
     case InstArithmetic::_num:
       llvm_unreachable("Unknown arithmetic operator");
       break;
     case InstArithmetic::Add: {
-      UnimplementedError(Func->getContext()->getFlags());
-      // Variable *T = makeReg(Dest->getType());
-      // _add(T, Src0, Src1);
-      // _mov(Dest, T);
+      _add(T, Src0R, Src1);
+      _mov(Dest, T);
     } break;
-    case InstArithmetic::And:
-      UnimplementedError(Func->getContext()->getFlags());
-      break;
-    case InstArithmetic::Or:
-      UnimplementedError(Func->getContext()->getFlags());
-      break;
-    case InstArithmetic::Xor:
-      UnimplementedError(Func->getContext()->getFlags());
-      break;
-    case InstArithmetic::Sub:
-      UnimplementedError(Func->getContext()->getFlags());
-      break;
-    case InstArithmetic::Mul:
-      UnimplementedError(Func->getContext()->getFlags());
-      break;
+    case InstArithmetic::And: {
+      _and(T, Src0R, Src1);
+      _mov(Dest, T);
+    } break;
+    case InstArithmetic::Or: {
+      _orr(T, Src0R, Src1);
+      _mov(Dest, T);
+    } break;
+    case InstArithmetic::Xor: {
+      _eor(T, Src0R, Src1);
+      _mov(Dest, T);
+    } break;
+    case InstArithmetic::Sub: {
+      _sub(T, Src0R, Src1);
+      _mov(Dest, T);
+    } break;
+    case InstArithmetic::Mul: {
+      Variable *Src1R = legalizeToVar(Src1);
+      _mul(T, Src0R, Src1R);
+      _mov(Dest, T);
+    } break;
     case InstArithmetic::Shl:
       UnimplementedError(Func->getContext()->getFlags());
       break;
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 0c21c7c..88a8eb1 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -130,9 +130,28 @@
   // with minimal syntactic overhead, so that the lowering code can
   // look as close to assembly as practical.
 
+  void _add(Variable *Dest, Variable *Src0, Operand *Src1) {
+    Context.insert(InstARM32Add::create(Func, Dest, Src0, Src1));
+  }
+  void _adds(Variable *Dest, Variable *Src0, Operand *Src1) {
+    const bool SetFlags = true;
+    Context.insert(InstARM32Add::create(Func, Dest, Src0, Src1, SetFlags));
+  }
+  void _adc(Variable *Dest, Variable *Src0, Operand *Src1) {
+    Context.insert(InstARM32Adc::create(Func, Dest, Src0, Src1));
+  }
+  void _and(Variable *Dest, Variable *Src0, Operand *Src1) {
+    Context.insert(InstARM32And::create(Func, Dest, Src0, Src1));
+  }
+  void _eor(Variable *Dest, Variable *Src0, Operand *Src1) {
+    Context.insert(InstARM32Eor::create(Func, Dest, Src0, Src1));
+  }
   void _ldr(Variable *Dest, OperandARM32Mem *Addr) {
     Context.insert(InstARM32Ldr::create(Func, Dest, Addr));
   }
+  void _mla(Variable *Dest, Variable *Src0, Variable *Src1, Variable *Acc) {
+    Context.insert(InstARM32Mla::create(Func, Dest, Src0, Src1, Acc));
+  }
   // If Dest=nullptr is passed in, then a new variable is created,
   // marked as infinite register allocation weight, and returned
   // through the in/out Dest argument.
@@ -144,18 +163,40 @@
   }
   // The Operand can only be a 16-bit immediate or a ConstantRelocatable
   // (with an upper16 relocation).
-  void _movt(Variable *&Dest, Operand *Src0) {
+  void _movt(Variable *Dest, Operand *Src0) {
     Context.insert(InstARM32Movt::create(Func, Dest, Src0));
   }
-  void _movw(Variable *&Dest, Operand *Src0) {
+  void _movw(Variable *Dest, Operand *Src0) {
     Context.insert(InstARM32Movw::create(Func, Dest, Src0));
   }
-  void _mvn(Variable *&Dest, Operand *Src0) {
+  void _mul(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert(InstARM32Mul::create(Func, Dest, Src0, Src1));
+  }
+  void _mvn(Variable *Dest, Operand *Src0) {
     Context.insert(InstARM32Mvn::create(Func, Dest, Src0));
   }
+  void _orr(Variable *Dest, Variable *Src0, Operand *Src1) {
+    Context.insert(InstARM32Orr::create(Func, Dest, Src0, Src1));
+  }
+  void _sbc(Variable *Dest, Variable *Src0, Operand *Src1) {
+    Context.insert(InstARM32Sbc::create(Func, Dest, Src0, Src1));
+  }
+  void _sub(Variable *Dest, Variable *Src0, Operand *Src1) {
+    Context.insert(InstARM32Sub::create(Func, Dest, Src0, Src1));
+  }
+  void _subs(Variable *Dest, Variable *Src0, Operand *Src1) {
+    const bool SetFlags = true;
+    Context.insert(InstARM32Sub::create(Func, Dest, Src0, Src1, SetFlags));
+  }
   void _ret(Variable *LR, Variable *Src0 = nullptr) {
     Context.insert(InstARM32Ret::create(Func, LR, Src0));
   }
+  void _umull(Variable *DestLo, Variable *DestHi, Variable *Src0,
+              Variable *Src1) {
+    Context.insert(InstARM32Umull::create(Func, DestLo, DestHi, Src0, Src1));
+    // Model the modification to the second dest as a fake def.
+    Context.insert(InstFakeDef::create(Func, DestHi, DestLo));
+  }
 
   bool UsesFramePointer;
   bool NeedsStackAlignment;
diff --git a/tests_lit/llvm2ice_tests/arith-opt.ll b/tests_lit/llvm2ice_tests/arith-opt.ll
index 1f6a69b..66be97f 100644
--- a/tests_lit/llvm2ice_tests/arith-opt.ll
+++ b/tests_lit/llvm2ice_tests/arith-opt.ll
@@ -5,13 +5,6 @@
 
 ; RUN: %p2i -i %s --filetype=asm --args --verbose inst -threads=0 | FileCheck %s
 
-; TODO(jvoung): Enable test when it does not llvm::report_fatal_error.
-; The test runner wrappers don't handle error expected errors
-; so we can't just "not" the command.
-; RUIN: %if --need=target_ARM32 --command %p2i -i %s --filetype=asm \
-; RUIN:   --args --verbose inst -threads=0 --target arm32 \
-; RUIN:   | %if --need=target_ARM32 --command FileCheck %s --check-prefix ARM32
-
 define i32 @Add(i32 %a, i32 %b) {
 ; CHECK: define i32 @Add
 entry:
diff --git a/tests_lit/llvm2ice_tests/arith.ll b/tests_lit/llvm2ice_tests/arith.ll
new file mode 100644
index 0000000..976af4d
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/arith.ll
@@ -0,0 +1,149 @@
+; Assembly test for simple arithmetic operations.
+
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+
+; TODO(jvoung): Stop skipping unimplemented parts (via --skip-unimplemented)
+; once enough infrastructure is in. Also, switch to --filetype=obj
+; when possible.
+; RUN: %if --need=target_ARM32 --command %p2i --filetype=asm --assemble \
+; RUN:   --disassemble --target arm32 -i %s --args -O2 --skip-unimplemented \
+; RUN:   | %if --need=target_ARM32 --command FileCheck --check-prefix ARM32 %s
+
+define i32 @Add(i32 %a, i32 %b) {
+entry:
+  %add = add i32 %b, %a
+  ret i32 %add
+}
+; CHECK-LABEL: Add
+; CHECK: add e
+; ARM32-LABEL: Add
+; ARM32: add r
+
+define i32 @And(i32 %a, i32 %b) {
+entry:
+  %and = and i32 %b, %a
+  ret i32 %and
+}
+; CHECK-LABEL: And
+; CHECK: and e
+; ARM32-LABEL: And
+; ARM32: and r
+
+define i32 @Or(i32 %a, i32 %b) {
+entry:
+  %or = or i32 %b, %a
+  ret i32 %or
+}
+; CHECK-LABEL: Or
+; CHECK: or e
+; ARM32-LABEL: Or
+; ARM32: orr r
+
+define i32 @Xor(i32 %a, i32 %b) {
+entry:
+  %xor = xor i32 %b, %a
+  ret i32 %xor
+}
+; CHECK-LABEL: Xor
+; CHECK: xor e
+; ARM32-LABEL: Xor
+; ARM32: eor r
+
+define i32 @Sub(i32 %a, i32 %b) {
+entry:
+  %sub = sub i32 %a, %b
+  ret i32 %sub
+}
+; CHECK-LABEL: Sub
+; CHECK: sub e
+; ARM32-LABEL: Sub
+; ARM32: sub r
+
+define i32 @Mul(i32 %a, i32 %b) {
+entry:
+  %mul = mul i32 %b, %a
+  ret i32 %mul
+}
+; CHECK-LABEL: Mul
+; CHECK: imul e
+; ARM32-LABEL: Mul
+; ARM32: mul r
+
+; Check for a valid ARM mul instruction where operands have to be registers.
+; On the other hand x86-32 does allow an immediate.
+define i32 @MulImm(i32 %a, i32 %b) {
+entry:
+  %mul = mul i32 %a, 99
+  ret i32 %mul
+}
+; CHECK-LABEL: MulImm
+; CHECK: imul e{{.*}},e{{.*}},0x63
+; ARM32-LABEL: MulImm
+; ARM32: mov {{.*}}, #99
+; ARM32: mul r{{.*}}, r{{.*}}, r{{.*}}
+
+; Check for a valid addressing mode in the x86-32 mul instruction when
+; the second source operand is an immediate.
+define i64 @MulImm64(i64 %a) {
+entry:
+  %mul = mul i64 %a, 99
+  ret i64 %mul
+}
+; NOTE: the lowering is currently a bit inefficient for small 64-bit constants.
+; The top bits of the immediate are 0, but the instructions modeling that
+; multiply by 0 are not eliminated (see expanded 64-bit ARM lowering).
+; CHECK-LABEL: MulImm64
+; CHECK: mov {{.*}},0x63
+; CHECK: mov {{.*}},0x0
+; CHECK-NOT: mul {{[0-9]+}}
+;
+; ARM32-LABEL: MulImm64
+; ARM32: mov {{.*}}, #99
+; ARM32: mov {{.*}}, #0
+; ARM32: mul r
+; ARM32: mla r
+; ARM32: umull r
+; ARM32: add r
+
+define i32 @Sdiv(i32 %a, i32 %b) {
+entry:
+  %div = sdiv i32 %a, %b
+  ret i32 %div
+}
+; CHECK-LABEL: Sdiv
+; CHECK: cdq
+; CHECK: idiv e
+; ARM32-LABEL: Sdiv
+; TODO(jvoung) -- implement divide and check here.
+; The lowering needs to check if the denominator is 0 and trap, since
+; ARM normally doesn't trap on divide by 0.
+
+define i32 @Srem(i32 %a, i32 %b) {
+entry:
+  %rem = srem i32 %a, %b
+  ret i32 %rem
+}
+; CHECK-LABEL: Srem
+; CHECK: cdq
+; CHECK: idiv e
+; ARM32-LABEL: Srem
+
+define i32 @Udiv(i32 %a, i32 %b) {
+entry:
+  %div = udiv i32 %a, %b
+  ret i32 %div
+}
+; CHECK-LABEL: Udiv
+; CHECK: div e
+; ARM32-LABEL: Udiv
+
+define i32 @Urem(i32 %a, i32 %b) {
+entry:
+  %rem = urem i32 %a, %b
+  ret i32 %rem
+}
+; CHECK-LABEL: Urem
+; CHECK: div e
+; ARM32-LABEL: Urem
diff --git a/tests_lit/llvm2ice_tests/function_aligned.ll b/tests_lit/llvm2ice_tests/function_aligned.ll
index 2b3da9a..c976dce 100644
--- a/tests_lit/llvm2ice_tests/function_aligned.ll
+++ b/tests_lit/llvm2ice_tests/function_aligned.ll
@@ -4,6 +4,7 @@
 ; Also, we are currently using hlts for non-executable padding.
 
 ; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
+
 ; TODO(jvoung): Stop skipping unimplemented parts (via --skip-unimplemented)
 ; once enough infrastructure is in. Also, switch to --filetype=obj
 ; when possible.
diff --git a/tests_lit/llvm2ice_tests/int-arg.ll b/tests_lit/llvm2ice_tests/int-arg.ll
index 9be4e1e..ae4a6ab 100644
--- a/tests_lit/llvm2ice_tests/int-arg.ll
+++ b/tests_lit/llvm2ice_tests/int-arg.ll
@@ -1,8 +1,7 @@
 ; This file checks that Subzero generates code in accordance with the
 ; calling convention for integers.
 
-; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 \
-; RUN:   | FileCheck %s
+; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s
 
 ; TODO(jvoung): Stop skipping unimplemented parts (via --skip-unimplemented)
 ; once enough infrastructure is in. Also, switch to --filetype=obj