ARM lowering integer divide and remainder, with div by 0 checks.

ARM normally just returns 0 when dividing by 0, with both the
software and hardware implementations, which is different from
what X86 does. So, for NaCl, we've modified LLVM to trap
by inserting explicit 0 checks.

Uses the -mattr=hwdiv-arm attribute to decide whether 32-bit
sdiv/udiv are supported.

Also lower the unreachable-inst to a trap-inst, since we
need a trap instruction for divide by 0 anyway.

Misc: fix switch test under MINIMAL=1, since ARM requires
allow_dump for filetype=asm.
Random clang-format changes...

TODO: check via cross tests

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1214693004.
diff --git a/runtime/szrt.c b/runtime/szrt.c
index 009ebe3..ec6b8cd 100644
--- a/runtime/szrt.c
+++ b/runtime/szrt.c
@@ -42,6 +42,10 @@
 
 // Other helper calls emitted by Subzero but not implemented here:
 // Compiler-rt:
+//   __udivsi3     - udiv i32
+//   __divsi3      - sdiv i32
+//   __umodsi3     - urem i32
+//   __modsi3      - srem i32
 //   __udivdi3     - udiv i64
 //   __divdi3      - sdiv i64
 //   __umoddi3     - urem i64
diff --git a/src/IceInstARM32.cpp b/src/IceInstARM32.cpp
index 2620b29..0476934 100644
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -125,6 +125,34 @@
   Inst->getSrc(1)->emit(Func);
 }
 
+void InstARM32Pred::emitFourAddr(const char *Opcode, const InstARM32Pred *Inst,
+                                 const Cfg *Func) {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(Inst->getSrcSize() == 3);
+  Str << "\t" << Opcode << Inst->getPredicate() << "\t";
+  Inst->getDest()->emit(Func);
+  Str << ", ";
+  Inst->getSrc(0)->emit(Func);
+  Str << ", ";
+  Inst->getSrc(1)->emit(Func);
+  Str << ", ";
+  Inst->getSrc(2)->emit(Func);
+}
+
+void InstARM32Pred::emitCmpLike(const char *Opcode, const InstARM32Pred *Inst,
+                                const Cfg *Func) {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(Inst->getSrcSize() == 2);
+  Str << "\t" << Opcode << Inst->getPredicate() << "\t";
+  Inst->getSrc(0)->emit(Func);
+  Str << ", ";
+  Inst->getSrc(1)->emit(Func);
+}
+
 OperandARM32Mem::OperandARM32Mem(Cfg * /* Func */, Type Ty, Variable *Base,
                                  ConstantInteger32 *ImmOffset, AddrMode Mode)
     : OperandARM32(kMem, Ty), Base(Base), ImmOffset(ImmOffset), Index(nullptr),
@@ -207,15 +235,19 @@
 }
 
 InstARM32Br::InstARM32Br(Cfg *Func, const CfgNode *TargetTrue,
-                         const CfgNode *TargetFalse, CondARM32::Cond Pred)
+                         const CfgNode *TargetFalse,
+                         const InstARM32Label *Label, CondARM32::Cond Pred)
     : InstARM32Pred(Func, InstARM32::Br, 0, nullptr, Pred),
-      TargetTrue(TargetTrue), TargetFalse(TargetFalse) {}
+      TargetTrue(TargetTrue), TargetFalse(TargetFalse), Label(Label) {}
 
 bool InstARM32Br::optimizeBranch(const CfgNode *NextNode) {
   // If there is no next block, then there can be no fallthrough to
   // optimize.
   if (NextNode == nullptr)
     return false;
+  // Intra-block conditional branches can't be optimized.
+  if (Label)
+    return false;
   // If there is no fallthrough node, such as a non-default case label
   // for a switch instruction, then there is no opportunity to
   // optimize.
@@ -264,11 +296,12 @@
   addSource(CallTarget);
 }
 
-InstARM32Cmp::InstARM32Cmp(Cfg *Func, Variable *Src1, Operand *Src2,
-                           CondARM32::Cond Predicate)
-    : InstARM32Pred(Func, InstARM32::Cmp, 2, nullptr, Predicate) {
-  addSource(Src1);
-  addSource(Src2);
+InstARM32Label::InstARM32Label(Cfg *Func, TargetARM32 *Target)
+    : InstARM32(Func, InstARM32::Label, 0, nullptr),
+      Number(Target->makeNextLabelNumber()) {}
+
+IceString InstARM32Label::getName(const Cfg *Func) const {
+  return ".L" + Func->getFunctionName() + "$local$__" + std::to_string(Number);
 }
 
 InstARM32Ldr::InstARM32Ldr(Cfg *Func, Variable *Dest, OperandARM32Mem *Mem,
@@ -277,15 +310,6 @@
   addSource(Mem);
 }
 
-InstARM32Mla::InstARM32Mla(Cfg *Func, Variable *Dest, Variable *Src0,
-                           Variable *Src1, Variable *Acc,
-                           CondARM32::Cond Predicate)
-    : InstARM32Pred(Func, InstARM32::Mla, 3, Dest, Predicate) {
-  addSource(Src0);
-  addSource(Src1);
-  addSource(Acc);
-}
-
 InstARM32Pop::InstARM32Pop(Cfg *Func, const VarList &Dests)
     : InstARM32(Func, InstARM32::Pop, 0, nullptr), Dests(Dests) {
   // Track modifications to Dests separately via FakeDefs.
@@ -314,6 +338,9 @@
   addSource(Mem);
 }
 
+InstARM32Trap::InstARM32Trap(Cfg *Func)
+    : InstARM32(Func, InstARM32::Trap, 0, nullptr) {}
+
 InstARM32Umull::InstARM32Umull(Cfg *Func, Variable *DestLo, Variable *DestHi,
                                Variable *Src0, Variable *Src1,
                                CondARM32::Cond Predicate)
@@ -348,7 +375,15 @@
 template <> const char *InstARM32Orr::Opcode = "orr";
 template <> const char *InstARM32Rsb::Opcode = "rsb";
 template <> const char *InstARM32Sbc::Opcode = "sbc";
+template <> const char *InstARM32Sdiv::Opcode = "sdiv";
 template <> const char *InstARM32Sub::Opcode = "sub";
+template <> const char *InstARM32Udiv::Opcode = "udiv";
+// Four-addr ops
+template <> const char *InstARM32Mla::Opcode = "mla";
+template <> const char *InstARM32Mls::Opcode = "mls";
+// Cmp-like ops
+template <> const char *InstARM32Cmp::Opcode = "cmp";
+template <> const char *InstARM32Tst::Opcode = "tst";
 
 void InstARM32::dump(const Cfg *Func) const {
   if (!BuildDefs::dump())
@@ -402,14 +437,18 @@
   Ostream &Str = Func->getContext()->getStrEmit();
   Str << "\t"
       << "b" << getPredicate() << "\t";
-  if (isUnconditionalBranch()) {
-    Str << getTargetFalse()->getAsmName();
+  if (Label) {
+    Str << Label->getName(Func);
   } else {
-    Str << getTargetTrue()->getAsmName();
-    if (getTargetFalse()) {
-      Str << "\n\t"
-          << "b"
-          << "\t" << getTargetFalse()->getAsmName();
+    if (isUnconditionalBranch()) {
+      Str << getTargetFalse()->getAsmName();
+    } else {
+      Str << getTargetTrue()->getAsmName();
+      if (getTargetFalse()) {
+        Str << "\n\t"
+            << "b"
+            << "\t" << getTargetFalse()->getAsmName();
+      }
     }
   }
 }
@@ -426,13 +465,18 @@
   Str << "br ";
 
   if (getPredicate() == CondARM32::AL) {
-    Str << "label %" << getTargetFalse()->getName();
+    Str << "label %"
+        << (Label ? Label->getName(Func) : getTargetFalse()->getName());
     return;
   }
 
-  Str << getPredicate() << ", label %" << getTargetTrue()->getName();
-  if (getTargetFalse()) {
-    Str << ", label %" << getTargetFalse()->getName();
+  if (Label) {
+    Str << "label %" << Label->getName(Func);
+  } else {
+    Str << getPredicate() << ", label %" << getTargetTrue()->getName();
+    if (getTargetFalse()) {
+      Str << ", label %" << getTargetFalse()->getName();
+    }
   }
 }
 
@@ -479,30 +523,23 @@
   getCallTarget()->dump(Func);
 }
 
-void InstARM32Cmp::emit(const Cfg *Func) const {
+void InstARM32Label::emit(const Cfg *Func) const {
   if (!BuildDefs::dump())
     return;
   Ostream &Str = Func->getContext()->getStrEmit();
-  assert(getSrcSize() == 2);
-  Str << "\t"
-      << "cmp" << getPredicate() << "\t";
-  getSrc(0)->emit(Func);
-  Str << ", ";
-  getSrc(1)->emit(Func);
+  Str << getName(Func) << ":";
 }
 
-void InstARM32Cmp::emitIAS(const Cfg *Func) const {
-  assert(getSrcSize() == 2);
+void InstARM32Label::emitIAS(const Cfg *Func) const {
   (void)Func;
   llvm_unreachable("Not yet implemented");
 }
 
-void InstARM32Cmp::dump(const Cfg *Func) const {
+void InstARM32Label::dump(const Cfg *Func) const {
   if (!BuildDefs::dump())
     return;
   Ostream &Str = Func->getContext()->getStrDump();
-  dumpOpcodePred(Str, "cmp", getSrc(0)->getType());
-  dumpSources(Func);
+  Str << getName(Func) << ":";
 }
 
 void InstARM32Ldr::emit(const Cfg *Func) const {
@@ -536,40 +573,6 @@
   dumpSources(Func);
 }
 
-void InstARM32Mla::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(getSrcSize() == 3);
-  assert(getDest()->hasReg());
-  Str << "\t"
-      << "mla" << getPredicate() << "\t";
-  getDest()->emit(Func);
-  Str << ", ";
-  getSrc(0)->emit(Func);
-  Str << ", ";
-  getSrc(1)->emit(Func);
-  Str << ", ";
-  getSrc(2)->emit(Func);
-}
-
-void InstARM32Mla::emitIAS(const Cfg *Func) const {
-  assert(getSrcSize() == 3);
-  (void)Func;
-  llvm_unreachable("Not yet implemented");
-}
-
-void InstARM32Mla::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  dumpDest(Func);
-  Str << " = ";
-  dumpOpcodePred(Str, "mla", getDest()->getType());
-  Str << " ";
-  dumpSources(Func);
-}
-
 template <> void InstARM32Movw::emit(const Cfg *Func) const {
   if (!BuildDefs::dump())
     return;
@@ -757,6 +760,33 @@
   getSrc(0)->dump(Func);
 }
 
+void InstARM32Trap::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 0);
+  // There isn't a mnemonic for the special NaCl Trap encoding, so dump
+  // the raw bytes.
+  Str << "\t.long 0x";
+  ARM32::AssemblerARM32 *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  for (uint8_t I : Asm->getNonExecBundlePadding()) {
+    Str.write_hex(I);
+  }
+}
+
+void InstARM32Trap::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 0);
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+
+void InstARM32Trap::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "trap";
+}
+
 void InstARM32Umull::emit(const Cfg *Func) const {
   if (!BuildDefs::dump())
     return;
diff --git a/src/IceInstARM32.h b/src/IceInstARM32.h
index 8167ed4..caef19f 100644
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -262,10 +262,12 @@
     Call,
     Cmp,
     Eor,
+    Label,
     Ldr,
     Lsl,
     Lsr,
     Mla,
+    Mls,
     Mov,
     Movt,
     Movw,
@@ -277,9 +279,13 @@
     Ret,
     Rsb,
     Sbc,
+    Sdiv,
     Str,
     Sub,
     Sxt,
+    Trap,
+    Tst,
+    Udiv,
     Umull,
     Uxt
   };
@@ -322,6 +328,10 @@
                           const Cfg *Func);
   static void emitThreeAddr(const char *Opcode, const InstARM32Pred *Inst,
                             const Cfg *Func, bool SetFlags);
+  static void emitFourAddr(const char *Opcode, const InstARM32Pred *Inst,
+                           const Cfg *Func);
+  static void emitCmpLike(const char *Opcode, const InstARM32Pred *Inst,
+                          const Cfg *Func);
 
 protected:
   CondARM32::Cond Predicate;
@@ -477,11 +487,11 @@
   // Create an ordinary binary-op instruction like add, and sub.
   // Dest and Src1 must be registers.
   static InstARM32ThreeAddrGPR *create(Cfg *Func, Variable *Dest,
-                                       Variable *Src1, Operand *Src2,
+                                       Variable *Src0, Operand *Src1,
                                        CondARM32::Cond Predicate,
                                        bool SetFlags = false) {
     return new (Func->allocate<InstARM32ThreeAddrGPR>())
-        InstARM32ThreeAddrGPR(Func, Dest, Src1, Src2, Predicate, SetFlags);
+        InstARM32ThreeAddrGPR(Func, Dest, Src0, Src1, Predicate, SetFlags);
   }
   void emit(const Cfg *Func) const override {
     if (!BuildDefs::dump())
@@ -505,15 +515,107 @@
   static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
 
 private:
-  InstARM32ThreeAddrGPR(Cfg *Func, Variable *Dest, Variable *Src1,
-                        Operand *Src2, CondARM32::Cond Predicate, bool SetFlags)
+  InstARM32ThreeAddrGPR(Cfg *Func, Variable *Dest, Variable *Src0,
+                        Operand *Src1, CondARM32::Cond Predicate, bool SetFlags)
       : InstARM32Pred(Func, K, 2, Dest, Predicate), SetFlags(SetFlags) {
+    addSource(Src0);
+    addSource(Src1);
+  }
+
+  static const char *Opcode;
+  bool SetFlags;
+};
+
+// Instructions of the form x := a op1 (y op2 z). E.g., multiply accumulate.
+template <InstARM32::InstKindARM32 K>
+class InstARM32FourAddrGPR : public InstARM32Pred {
+  InstARM32FourAddrGPR() = delete;
+  InstARM32FourAddrGPR(const InstARM32FourAddrGPR &) = delete;
+  InstARM32FourAddrGPR &operator=(const InstARM32FourAddrGPR &) = delete;
+
+public:
+  // Every operand must be a register.
+  static InstARM32FourAddrGPR *create(Cfg *Func, Variable *Dest, Variable *Src0,
+                                      Variable *Src1, Variable *Src2,
+                                      CondARM32::Cond Predicate) {
+    return new (Func->allocate<InstARM32FourAddrGPR>())
+        InstARM32FourAddrGPR(Func, Dest, Src0, Src1, Src2, Predicate);
+  }
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    emitFourAddr(Opcode, this, Func);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    (void)Func;
+    llvm::report_fatal_error("Not yet implemented");
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Str << " = ";
+    dumpOpcodePred(Str, Opcode, getDest()->getType());
+    Str << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+  InstARM32FourAddrGPR(Cfg *Func, Variable *Dest, Variable *Src0,
+                       Variable *Src1, Variable *Src2,
+                       CondARM32::Cond Predicate)
+      : InstARM32Pred(Func, K, 3, Dest, Predicate) {
+    addSource(Src0);
     addSource(Src1);
     addSource(Src2);
   }
 
   static const char *Opcode;
-  bool SetFlags;
+};
+
+// Instructions of the form x cmpop y (setting flags).
+template <InstARM32::InstKindARM32 K>
+class InstARM32CmpLike : public InstARM32Pred {
+  InstARM32CmpLike() = delete;
+  InstARM32CmpLike(const InstARM32CmpLike &) = delete;
+  InstARM32CmpLike &operator=(const InstARM32CmpLike &) = delete;
+
+public:
+  static InstARM32CmpLike *create(Cfg *Func, Variable *Src0, Operand *Src1,
+                                  CondARM32::Cond Predicate) {
+    return new (Func->allocate<InstARM32CmpLike>())
+        InstARM32CmpLike(Func, Src0, Src1, Predicate);
+  }
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    emitCmpLike(Opcode, this, Func);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    (void)Func;
+    llvm_unreachable("Not yet implemented");
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    dumpOpcodePred(Str, Opcode, getSrc(0)->getType());
+    Str << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+  InstARM32CmpLike(Cfg *Func, Variable *Src0, Operand *Src1,
+                   CondARM32::Cond Predicate)
+      : InstARM32Pred(Func, K, 2, nullptr, Predicate) {
+    addSource(Src0);
+    addSource(Src1);
+  }
+
+  static const char *Opcode;
 };
 
 typedef InstARM32ThreeAddrGPR<InstARM32::Adc> InstARM32Adc;
@@ -528,7 +630,9 @@
 typedef InstARM32ThreeAddrGPR<InstARM32::Orr> InstARM32Orr;
 typedef InstARM32ThreeAddrGPR<InstARM32::Rsb> InstARM32Rsb;
 typedef InstARM32ThreeAddrGPR<InstARM32::Sbc> InstARM32Sbc;
+typedef InstARM32ThreeAddrGPR<InstARM32::Sdiv> InstARM32Sdiv;
 typedef InstARM32ThreeAddrGPR<InstARM32::Sub> InstARM32Sub;
+typedef InstARM32ThreeAddrGPR<InstARM32::Udiv> InstARM32Udiv;
 // Move instruction (variable <- flex). This is more of a pseudo-inst.
 // If var is a register, then we use "mov". If var is stack, then we use
 // "str" to store to the stack.
@@ -543,6 +647,35 @@
 // but we aren't using that for now, so just model as a Unaryop.
 typedef InstARM32UnaryopGPR<InstARM32::Sxt> InstARM32Sxt;
 typedef InstARM32UnaryopGPR<InstARM32::Uxt> InstARM32Uxt;
+typedef InstARM32FourAddrGPR<InstARM32::Mla> InstARM32Mla;
+typedef InstARM32FourAddrGPR<InstARM32::Mls> InstARM32Mls;
+typedef InstARM32CmpLike<InstARM32::Cmp> InstARM32Cmp;
+typedef InstARM32CmpLike<InstARM32::Tst> InstARM32Tst;
+
+// InstARM32Label represents an intra-block label that is the target
+// of an intra-block branch.  The offset between the label and the
+// branch must fit in the instruction immediate (considered "near").
+class InstARM32Label : public InstARM32 {
+  InstARM32Label() = delete;
+  InstARM32Label(const InstARM32Label &) = delete;
+  InstARM32Label &operator=(const InstARM32Label &) = delete;
+
+public:
+  static InstARM32Label *create(Cfg *Func, TargetARM32 *Target) {
+    return new (Func->allocate<InstARM32Label>()) InstARM32Label(Func, Target);
+  }
+  uint32_t getEmitInstCount() const override { return 0; }
+  IceString getName(const Cfg *Func) const;
+  SizeT getNumber() const { return Number; }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+
+private:
+  InstARM32Label(Cfg *Func, TargetARM32 *Target);
+
+  SizeT Number; // used for unique label generation.
+};
 
 // Direct branch instruction.
 class InstARM32Br : public InstARM32Pred {
@@ -555,14 +688,16 @@
   static InstARM32Br *create(Cfg *Func, CfgNode *TargetTrue,
                              CfgNode *TargetFalse, CondARM32::Cond Predicate) {
     assert(Predicate != CondARM32::AL);
+    constexpr InstARM32Label *NoLabel = nullptr;
     return new (Func->allocate<InstARM32Br>())
-        InstARM32Br(Func, TargetTrue, TargetFalse, Predicate);
+        InstARM32Br(Func, TargetTrue, TargetFalse, NoLabel, Predicate);
   }
   // Create an unconditional branch to a node.
   static InstARM32Br *create(Cfg *Func, CfgNode *Target) {
-    const CfgNode *NoCondTarget = nullptr;
+    constexpr CfgNode *NoCondTarget = nullptr;
+    constexpr InstARM32Label *NoLabel = nullptr;
     return new (Func->allocate<InstARM32Br>())
-        InstARM32Br(Func, NoCondTarget, Target, CondARM32::AL);
+        InstARM32Br(Func, NoCondTarget, Target, NoLabel, CondARM32::AL);
   }
   // Create a non-terminator conditional branch to a node, with a
   // fallthrough to the next instruction in the current node.  This is
@@ -570,15 +705,27 @@
   static InstARM32Br *create(Cfg *Func, CfgNode *Target,
                              CondARM32::Cond Predicate) {
     assert(Predicate != CondARM32::AL);
-    const CfgNode *NoUncondTarget = nullptr;
+    constexpr CfgNode *NoUncondTarget = nullptr;
+    constexpr InstARM32Label *NoLabel = nullptr;
     return new (Func->allocate<InstARM32Br>())
-        InstARM32Br(Func, Target, NoUncondTarget, Predicate);
+        InstARM32Br(Func, Target, NoUncondTarget, NoLabel, Predicate);
+  }
+  // Create a conditional intra-block branch (or unconditional, if
+  // Predicate==AL) to a label in the current block.
+  static InstARM32Br *create(Cfg *Func, InstARM32Label *Label,
+                             CondARM32::Cond Predicate) {
+    constexpr CfgNode *NoCondTarget = nullptr;
+    constexpr CfgNode *NoUncondTarget = nullptr;
+    return new (Func->allocate<InstARM32Br>())
+        InstARM32Br(Func, NoCondTarget, NoUncondTarget, Label, Predicate);
   }
   const CfgNode *getTargetTrue() const { return TargetTrue; }
   const CfgNode *getTargetFalse() const { return TargetFalse; }
   bool optimizeBranch(const CfgNode *NextNode);
   uint32_t getEmitInstCount() const override {
     uint32_t Sum = 0;
+    if (Label)
+      ++Sum;
     if (getTargetTrue())
       ++Sum;
     if (getTargetFalse())
@@ -596,10 +743,11 @@
 
 private:
   InstARM32Br(Cfg *Func, const CfgNode *TargetTrue, const CfgNode *TargetFalse,
-              CondARM32::Cond Predicate);
+              const InstARM32Label *Label, CondARM32::Cond Predicate);
 
   const CfgNode *TargetTrue;
   const CfgNode *TargetFalse;
+  const InstARM32Label *Label; // Intra-block branch target
 };
 
 // AdjustStack instruction - subtracts SP by the given amount and
@@ -653,28 +801,6 @@
   InstARM32Call(Cfg *Func, Variable *Dest, Operand *CallTarget);
 };
 
-// Integer compare instruction.
-class InstARM32Cmp : public InstARM32Pred {
-  InstARM32Cmp() = delete;
-  InstARM32Cmp(const InstARM32Cmp &) = delete;
-  InstARM32Cmp &operator=(const InstARM32Cmp &) = delete;
-
-public:
-  static InstARM32Cmp *create(Cfg *Func, Variable *Src1, Operand *Src2,
-                              CondARM32::Cond Predicate) {
-    return new (Func->allocate<InstARM32Cmp>())
-        InstARM32Cmp(Func, Src1, Src2, Predicate);
-  }
-  void emit(const Cfg *Func) const override;
-  void emitIAS(const Cfg *Func) const override;
-  void dump(const Cfg *Func) const override;
-  static bool classof(const Inst *Inst) { return isClassof(Inst, Cmp); }
-
-private:
-  InstARM32Cmp(Cfg *Func, Variable *Src1, Operand *Src2,
-               CondARM32::Cond Predicate);
-};
-
 // Load instruction.
 class InstARM32Ldr : public InstARM32Pred {
   InstARM32Ldr() = delete;
@@ -698,30 +824,6 @@
                CondARM32::Cond Predicate);
 };
 
-// Multiply Accumulate: d := x * y + a
-class InstARM32Mla : public InstARM32Pred {
-  InstARM32Mla() = delete;
-  InstARM32Mla(const InstARM32Mla &) = delete;
-  InstARM32Mla &operator=(const InstARM32Mla &) = delete;
-
-public:
-  // Everything must be a register.
-  static InstARM32Mla *create(Cfg *Func, Variable *Dest, Variable *Src0,
-                              Variable *Src1, Variable *Acc,
-                              CondARM32::Cond Predicate) {
-    return new (Func->allocate<InstARM32Mla>())
-        InstARM32Mla(Func, Dest, Src0, Src1, Acc, Predicate);
-  }
-  void emit(const Cfg *Func) const override;
-  void emitIAS(const Cfg *Func) const override;
-  void dump(const Cfg *Func) const override;
-  static bool classof(const Inst *Inst) { return isClassof(Inst, Mla); }
-
-private:
-  InstARM32Mla(Cfg *Func, Variable *Dest, Variable *Src0, Variable *Src1,
-               Variable *Acc, CondARM32::Cond Predicate);
-};
-
 // Pop into a list of GPRs. Technically this can be predicated, but we don't
 // need that functionality.
 class InstARM32Pop : public InstARM32 {
@@ -816,6 +918,24 @@
                CondARM32::Cond Predicate);
 };
 
+class InstARM32Trap : public InstARM32 {
+  InstARM32Trap() = delete;
+  InstARM32Trap(const InstARM32Trap &) = delete;
+  InstARM32Trap &operator=(const InstARM32Trap &) = delete;
+
+public:
+  static InstARM32Trap *create(Cfg *Func) {
+    return new (Func->allocate<InstARM32Trap>()) InstARM32Trap(Func);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Trap); }
+
+private:
+  explicit InstARM32Trap(Cfg *Func);
+};
+
 // Unsigned Multiply Long: d.lo, d.hi := x * y
 class InstARM32Umull : public InstARM32Pred {
   InstARM32Umull() = delete;
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index ca62d79..96d801a 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -1939,7 +1939,7 @@
   if (!BuildDefs::dump())
     return;
   Ostream &Str = Func->getContext()->getStrDump();
-  Str << "ud2\n";
+  Str << "ud2";
 }
 
 void InstX8632Test::emit(const Cfg *Func) const {
@@ -1998,7 +1998,7 @@
   if (!BuildDefs::dump())
     return;
   Ostream &Str = Func->getContext()->getStrDump();
-  Str << "mfence\n";
+  Str << "mfence";
 }
 
 void InstX8632Store::emit(const Cfg *Func) const {
@@ -2512,7 +2512,6 @@
   Ostream &Str = Func->getContext()->getStrDump();
   dumpDest(Func);
   Str << " = fstp." << getDest()->getType() << ", st(0)";
-  Str << "\n";
 }
 
 template <> void InstX8632Pcmpeq::emit(const Cfg *Func) const {
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 205e573..b543208 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -353,16 +353,20 @@
   const static constexpr char *H_fptoui_f64_i64 = "__Sz_fptoui_f64_i64";
   const static constexpr char *H_frem_f32 = "fmodf";
   const static constexpr char *H_frem_f64 = "fmod";
+  const static constexpr char *H_sdiv_i32 = "__divsi3";
   const static constexpr char *H_sdiv_i64 = "__divdi3";
   const static constexpr char *H_sitofp_i64_f32 = "__Sz_sitofp_i64_f32";
   const static constexpr char *H_sitofp_i64_f64 = "__Sz_sitofp_i64_f64";
+  const static constexpr char *H_srem_i32 = "__modsi3";
   const static constexpr char *H_srem_i64 = "__moddi3";
+  const static constexpr char *H_udiv_i32 = "__udivsi3";
   const static constexpr char *H_udiv_i64 = "__udivdi3";
   const static constexpr char *H_uitofp_4xi32_4xf32 = "__Sz_uitofp_4xi32_4xf32";
   const static constexpr char *H_uitofp_i32_f32 = "__Sz_uitofp_i32_f32";
   const static constexpr char *H_uitofp_i32_f64 = "__Sz_uitofp_i32_f64";
   const static constexpr char *H_uitofp_i64_f32 = "__Sz_uitofp_i64_f32";
   const static constexpr char *H_uitofp_i64_f64 = "__Sz_uitofp_i64_f64";
+  const static constexpr char *H_urem_i32 = "__umodsi3";
   const static constexpr char *H_urem_i64 = "__umoddi3";
 
 private:
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index fad9bcf..c090075 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -141,21 +141,34 @@
   return Utils::applyAlignment(Value, typeAlignInBytes);
 }
 
+// Conservatively check if at compile time we know that the operand is
+// definitely a non-zero integer.
+bool isGuaranteedNonzeroInt(const Operand *Op) {
+  if (auto *Const = llvm::dyn_cast_or_null<ConstantInteger32>(Op)) {
+    return Const->getValue() != 0;
+  }
+  return false;
+}
+
 } // end of anonymous namespace
 
-TargetARM32::TargetARM32(Cfg *Func) : TargetLowering(Func) {
+TargetARM32Features::TargetARM32Features(const ClFlags &Flags) {
   static_assert(
       (ARM32InstructionSet::End - ARM32InstructionSet::Begin) ==
           (TargetInstructionSet::ARM32InstructionSet_End -
            TargetInstructionSet::ARM32InstructionSet_Begin),
       "ARM32InstructionSet range different from TargetInstructionSet");
-  if (Func->getContext()->getFlags().getTargetInstructionSet() !=
+  if (Flags.getTargetInstructionSet() !=
       TargetInstructionSet::BaseInstructionSet) {
     InstructionSet = static_cast<ARM32InstructionSet>(
-        (Func->getContext()->getFlags().getTargetInstructionSet() -
+        (Flags.getTargetInstructionSet() -
          TargetInstructionSet::ARM32InstructionSet_Begin) +
         ARM32InstructionSet::Begin);
   }
+}
+
+TargetARM32::TargetARM32(Cfg *Func)
+    : TargetLowering(Func), CPUFeatures(Func->getContext()->getFlags()) {
   // TODO: Don't initialize IntegerRegisters and friends every time.
   // Instead, initialize in some sort of static initializer for the
   // class.
@@ -1009,6 +1022,75 @@
   _mov(Dest, SP);
 }
 
+void TargetARM32::div0Check(Type Ty, Operand *SrcLo, Operand *SrcHi) {
+  if (isGuaranteedNonzeroInt(SrcLo) || isGuaranteedNonzeroInt(SrcHi))
+    return;
+  Variable *SrcLoReg = legalizeToVar(SrcLo);
+  switch (Ty) {
+  default:
+    llvm_unreachable("Unexpected type");
+  case IceType_i8: {
+    Operand *Mask =
+        legalize(Ctx->getConstantInt32(0xFF), Legal_Reg | Legal_Flex);
+    _tst(SrcLoReg, Mask);
+    break;
+  }
+  case IceType_i16: {
+    Operand *Mask =
+        legalize(Ctx->getConstantInt32(0xFFFF), Legal_Reg | Legal_Flex);
+    _tst(SrcLoReg, Mask);
+    break;
+  }
+  case IceType_i32: {
+    _tst(SrcLoReg, SrcLoReg);
+    break;
+  }
+  case IceType_i64: {
+    Variable *ScratchReg = makeReg(IceType_i32);
+    _orrs(ScratchReg, SrcLoReg, SrcHi);
+    // ScratchReg isn't going to be used, but we need the
+    // side-effect of setting flags from this operation.
+    Context.insert(InstFakeUse::create(Func, ScratchReg));
+  }
+  }
+  InstARM32Label *Label = InstARM32Label::create(Func, this);
+  _br(Label, CondARM32::NE);
+  _trap();
+  Context.insert(Label);
+}
+
+void TargetARM32::lowerIDivRem(Variable *Dest, Variable *T, Variable *Src0R,
+                               Operand *Src1, ExtInstr ExtFunc,
+                               DivInstr DivFunc, const char *DivHelperName,
+                               bool IsRemainder) {
+  div0Check(Dest->getType(), Src1, nullptr);
+  Variable *Src1R = legalizeToVar(Src1);
+  Variable *T0R = Src0R;
+  Variable *T1R = Src1R;
+  if (Dest->getType() != IceType_i32) {
+    T0R = makeReg(IceType_i32);
+    (this->*ExtFunc)(T0R, Src0R, CondARM32::AL);
+    T1R = makeReg(IceType_i32);
+    (this->*ExtFunc)(T1R, Src1R, CondARM32::AL);
+  }
+  if (hasCPUFeature(TargetARM32Features::HWDivArm)) {
+    (this->*DivFunc)(T, T0R, T1R, CondARM32::AL);
+    if (IsRemainder) {
+      Variable *T2 = makeReg(IceType_i32);
+      _mls(T2, T, T1R, T0R);
+      T = T2;
+    }
+    _mov(Dest, T);
+  } else {
+    constexpr SizeT MaxSrcs = 2;
+    InstCall *Call = makeHelperCall(DivHelperName, Dest, MaxSrcs);
+    Call->addArg(T0R);
+    Call->addArg(T1R);
+    lowerCall(Call);
+  }
+  return;
+}
+
 void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
   Variable *Dest = Inst->getDest();
   // TODO(jvoung): Should be able to flip Src0 and Src1 if it is easier
@@ -1182,9 +1264,47 @@
     case InstArithmetic::Udiv:
     case InstArithmetic::Sdiv:
     case InstArithmetic::Urem:
-    case InstArithmetic::Srem:
-      UnimplementedError(Func->getContext()->getFlags());
-      break;
+    case InstArithmetic::Srem: {
+      // Check for divide by 0 (ARM normally doesn't trap, but we want it
+      // to trap for NaCl). Src1Lo and Src1Hi may have already been legalized
+      // to a register, which will hide a constant source operand.
+      // Instead, check the not-yet-legalized Src1 to optimize-out a divide
+      // by 0 check.
+      if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Src1)) {
+        if (C64->getValue() == 0) {
+          div0Check(IceType_i64, Src1Lo, Src1Hi);
+        }
+      } else {
+        div0Check(IceType_i64, Src1Lo, Src1Hi);
+      }
+      // Technically, ARM has its own aeabi routines, but we can use the
+      // non-aeabi routine as well.  LLVM uses __aeabi_ldivmod for div,
+      // but uses the more standard __moddi3 for rem.
+      const char *HelperName = "";
+      switch (Inst->getOp()) {
+      case InstArithmetic::Udiv:
+        HelperName = H_udiv_i64;
+        break;
+      case InstArithmetic::Sdiv:
+        HelperName = H_sdiv_i64;
+        break;
+      case InstArithmetic::Urem:
+        HelperName = H_urem_i64;
+        break;
+      case InstArithmetic::Srem:
+        HelperName = H_srem_i64;
+        break;
+      default:
+        llvm_unreachable("Should have only matched div ops.");
+        break;
+      }
+      constexpr SizeT MaxSrcs = 2;
+      InstCall *Call = makeHelperCall(HelperName, Dest, MaxSrcs);
+      Call->addArg(Inst->getSrc(0));
+      Call->addArg(Inst->getSrc(1));
+      lowerCall(Call);
+      return;
+    }
     case InstArithmetic::Fadd:
     case InstArithmetic::Fsub:
     case InstArithmetic::Fmul:
@@ -1197,61 +1317,73 @@
     UnimplementedError(Func->getContext()->getFlags());
   } else { // Dest->getType() is non-i64 scalar
     Variable *Src0R = legalizeToVar(Inst->getSrc(0));
-    Src1 = legalize(Inst->getSrc(1), Legal_Reg | Legal_Flex);
+    Operand *Src1RF = legalize(Inst->getSrc(1), Legal_Reg | Legal_Flex);
     Variable *T = makeReg(Dest->getType());
     switch (Inst->getOp()) {
     case InstArithmetic::_num:
       llvm_unreachable("Unknown arithmetic operator");
       break;
     case InstArithmetic::Add: {
-      _add(T, Src0R, Src1);
+      _add(T, Src0R, Src1RF);
       _mov(Dest, T);
     } break;
     case InstArithmetic::And: {
-      _and(T, Src0R, Src1);
+      _and(T, Src0R, Src1RF);
       _mov(Dest, T);
     } break;
     case InstArithmetic::Or: {
-      _orr(T, Src0R, Src1);
+      _orr(T, Src0R, Src1RF);
       _mov(Dest, T);
     } break;
     case InstArithmetic::Xor: {
-      _eor(T, Src0R, Src1);
+      _eor(T, Src0R, Src1RF);
       _mov(Dest, T);
     } break;
     case InstArithmetic::Sub: {
-      _sub(T, Src0R, Src1);
+      _sub(T, Src0R, Src1RF);
       _mov(Dest, T);
     } break;
     case InstArithmetic::Mul: {
-      Variable *Src1R = legalizeToVar(Src1);
+      Variable *Src1R = legalizeToVar(Src1RF);
       _mul(T, Src0R, Src1R);
       _mov(Dest, T);
     } break;
     case InstArithmetic::Shl:
-      _lsl(T, Src0R, Src1);
+      _lsl(T, Src0R, Src1RF);
       _mov(Dest, T);
       break;
     case InstArithmetic::Lshr:
-      _lsr(T, Src0R, Src1);
+      _lsr(T, Src0R, Src1RF);
       _mov(Dest, T);
       break;
     case InstArithmetic::Ashr:
-      _asr(T, Src0R, Src1);
+      _asr(T, Src0R, Src1RF);
       _mov(Dest, T);
       break;
-    case InstArithmetic::Udiv:
-      UnimplementedError(Func->getContext()->getFlags());
-      break;
-    case InstArithmetic::Sdiv:
-      UnimplementedError(Func->getContext()->getFlags());
-      break;
-    case InstArithmetic::Urem:
-      UnimplementedError(Func->getContext()->getFlags());
-      break;
-    case InstArithmetic::Srem:
-      UnimplementedError(Func->getContext()->getFlags());
-      break;
+    case InstArithmetic::Udiv: {
+      constexpr bool IsRemainder = false;
+      lowerIDivRem(Dest, T, Src0R, Src1, &TargetARM32::_uxt,
+                   &TargetARM32::_udiv, H_udiv_i32, IsRemainder);
+      return;
+    }
+    case InstArithmetic::Sdiv: {
+      constexpr bool IsRemainder = false;
+      lowerIDivRem(Dest, T, Src0R, Src1, &TargetARM32::_sxt,
+                   &TargetARM32::_sdiv, H_sdiv_i32, IsRemainder);
+      return;
+    }
+    case InstArithmetic::Urem: {
+      constexpr bool IsRemainder = true;
+      lowerIDivRem(Dest, T, Src0R, Src1, &TargetARM32::_uxt,
+                   &TargetARM32::_udiv, H_urem_i32, IsRemainder);
+      return;
+    }
+    case InstArithmetic::Srem: {
+      constexpr bool IsRemainder = true;
+      lowerIDivRem(Dest, T, Src0R, Src1, &TargetARM32::_sxt,
+                   &TargetARM32::_sdiv, H_srem_i32, IsRemainder);
+      return;
+    }
     case InstArithmetic::Fadd:
       UnimplementedError(Func->getContext()->getFlags());
       break;
@@ -1322,7 +1454,7 @@
   Variable *Src0R = legalizeToVar(Cond);
   Constant *Zero = Ctx->getConstantZero(IceType_i32);
   _cmp(Src0R, Zero);
-  _br(CondARM32::NE, Inst->getTargetTrue(), Inst->getTargetFalse());
+  _br(Inst->getTargetTrue(), Inst->getTargetFalse(), CondARM32::NE);
 }
 
 void TargetARM32::lowerCall(const InstCall *Instr) {
@@ -2113,7 +2245,7 @@
 }
 
 void TargetARM32::lowerUnreachable(const InstUnreachable * /*Inst*/) {
-  UnimplementedError(Func->getContext()->getFlags());
+  _trap();
 }
 
 // Turn an i64 Phi instruction into a pair of i32 Phi instructions, to
@@ -2417,7 +2549,7 @@
 }
 
 TargetHeaderARM32::TargetHeaderARM32(GlobalContext *Ctx)
-    : TargetHeaderLowering(Ctx) {}
+    : TargetHeaderLowering(Ctx), CPUFeatures(Ctx->getFlags()) {}
 
 void TargetHeaderARM32::lower() {
   OstreamLocker L(Ctx);
@@ -2431,12 +2563,18 @@
   // sub-subsection of the first public subsection of the attributes.
   Str << ".eabi_attribute 67, \"2.09\"      @ Tag_conformance\n";
   // Chromebooks are at least A15, but do A9 for higher compat.
-  Str << ".cpu    cortex-a9\n"
-      << ".eabi_attribute 6, 10   @ Tag_CPU_arch: ARMv7\n"
+  // For some reason, the LLVM ARM asm parser has the .cpu directive override
+  // the mattr specified on the commandline. So to test hwdiv, we need to set
+  // the .cpu directive higher (can't just rely on --mattr=...).
+  if (CPUFeatures.hasFeature(TargetARM32Features::HWDivArm)) {
+    Str << ".cpu    cortex-a15\n";
+  } else {
+    Str << ".cpu    cortex-a9\n";
+  }
+  Str << ".eabi_attribute 6, 10   @ Tag_CPU_arch: ARMv7\n"
       << ".eabi_attribute 7, 65   @ Tag_CPU_arch_profile: App profile\n";
   Str << ".eabi_attribute 8, 1    @ Tag_ARM_ISA_use: Yes\n"
       << ".eabi_attribute 9, 2    @ Tag_THUMB_ISA_use: Thumb-2\n";
-  // TODO(jvoung): check other CPU features like HW div.
   Str << ".fpu    neon\n"
       << ".eabi_attribute 17, 1   @ Tag_ABI_PCS_GOT_use: permit directly\n"
       << ".eabi_attribute 20, 1   @ Tag_ABI_FP_denormal\n"
@@ -2450,6 +2588,9 @@
       << ".eabi_attribute 38, 1   @ Tag_ABI_FP_16bit_format\n"
       << ".eabi_attribute 42, 1   @ Tag_MPextension_use\n"
       << ".eabi_attribute 68, 1   @ Tag_Virtualization_use\n";
+  if (CPUFeatures.hasFeature(TargetARM32Features::HWDivArm)) {
+    Str << ".eabi_attribute 44, 2   @ Tag_DIV_use\n";
+  }
   // Technically R9 is used for TLS with Sandboxing, and we reserve it.
   // However, for compatibility with current NaCl LLVM, don't claim that.
   Str << ".eabi_attribute 14, 3   @ Tag_ABI_PCS_R9_use: Not used\n";
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 1691d6c..02d9080 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -22,6 +22,30 @@
 
 namespace Ice {
 
+// Class encapsulating ARM CPU features / instruction set.
+class TargetARM32Features {
+  TargetARM32Features() = delete;
+  TargetARM32Features(const TargetARM32Features &) = delete;
+  TargetARM32Features &operator=(const TargetARM32Features &) = delete;
+
+public:
+  explicit TargetARM32Features(const ClFlags &Flags);
+
+  enum ARM32InstructionSet {
+    Begin,
+    // Neon is the PNaCl baseline instruction set.
+    Neon = Begin,
+    HWDivArm, // HW divide in ARM mode (not just Thumb mode).
+    End
+  };
+
+  bool hasFeature(ARM32InstructionSet I) const { return I <= InstructionSet; }
+
+private:
+  ARM32InstructionSet InstructionSet = ARM32InstructionSet::Begin;
+};
+
+// The target lowering logic for ARM32.
 class TargetARM32 : public TargetLowering {
   TargetARM32() = delete;
   TargetARM32(const TargetARM32 &) = delete;
@@ -75,15 +99,9 @@
   void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
                               size_t BasicFrameOffset, size_t &InArgsSizeBytes);
 
-  enum ARM32InstructionSet {
-    Begin,
-    // Neon is the PNaCl baseline instruction set.
-    Neon = Begin,
-    HWDivArm, // HW divide in ARM mode (not just Thumb mode).
-    End
-  };
-
-  ARM32InstructionSet getInstructionSet() const { return InstructionSet; }
+  bool hasCPUFeature(TargetARM32Features::ARM32InstructionSet I) const {
+    return CPUFeatures.hasFeature(I);
+  }
 
 protected:
   explicit TargetARM32(Cfg *Func);
@@ -141,6 +159,18 @@
       llvm::SmallVectorImpl<int32_t> &Permutation,
       const llvm::SmallBitVector &ExcludeRegisters) const override;
 
+  // If a divide-by-zero check is needed, inserts the sequence:
+  // test; branch .LSKIP; trap; .LSKIP: <continuation>.
+  // If no check is needed, nothing is inserted.
+  void div0Check(Type Ty, Operand *SrcLo, Operand *SrcHi);
+  typedef void (TargetARM32::*ExtInstr)(Variable *, Variable *,
+                                        CondARM32::Cond);
+  typedef void (TargetARM32::*DivInstr)(Variable *, Variable *, Variable *,
+                                        CondARM32::Cond);
+  void lowerIDivRem(Variable *Dest, Variable *T, Variable *Src0R, Operand *Src1,
+                    ExtInstr ExtFunc, DivInstr DivFunc,
+                    const char *DivHelperName, bool IsRemainder);
+
   // The following are helpers that insert lowered ARM32 instructions
   // with minimal syntactic overhead, so that the lowering code can
   // look as close to assembly as practical.
@@ -175,8 +205,8 @@
             CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert(InstARM32Bic::create(Func, Dest, Src0, Src1, Pred));
   }
-  void _br(CondARM32::Cond Condition, CfgNode *TargetTrue,
-           CfgNode *TargetFalse) {
+  void _br(CfgNode *TargetTrue, CfgNode *TargetFalse,
+           CondARM32::Cond Condition) {
     Context.insert(
         InstARM32Br::create(Func, TargetTrue, TargetFalse, Condition));
   }
@@ -186,6 +216,9 @@
   void _br(CfgNode *Target, CondARM32::Cond Condition) {
     Context.insert(InstARM32Br::create(Func, Target, Condition));
   }
+  void _br(InstARM32Label *Label, CondARM32::Cond Condition) {
+    Context.insert(InstARM32Br::create(Func, Label, Condition));
+  }
   void _cmp(Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert(InstARM32Cmp::create(Func, Src0, Src1, Pred));
@@ -210,6 +243,10 @@
             CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert(InstARM32Mla::create(Func, Dest, Src0, Src1, Acc, Pred));
   }
+  void _mls(Variable *Dest, Variable *Src0, Variable *Src1, Variable *Acc,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Mls::create(Func, Dest, Src0, Src1, Acc, Pred));
+  }
   // If Dest=nullptr is passed in, then a new variable is created,
   // marked as infinite register allocation weight, and returned
   // through the in/out Dest argument.
@@ -248,6 +285,12 @@
             CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert(InstARM32Orr::create(Func, Dest, Src0, Src1, Pred));
   }
+  void _orrs(Variable *Dest, Variable *Src0, Operand *Src1,
+             CondARM32::Cond Pred = CondARM32::AL) {
+    const bool SetFlags = true;
+    Context.insert(
+        InstARM32Orr::create(Func, Dest, Src0, Src1, Pred, SetFlags));
+  }
   void _push(const VarList &Sources) {
     Context.insert(InstARM32Push::create(Func, Sources));
   }
@@ -257,6 +300,9 @@
     for (Variable *Dest : Dests)
       Context.insert(InstFakeDef::create(Func, Dest));
   }
+  void _ret(Variable *LR, Variable *Src0 = nullptr) {
+    Context.insert(InstARM32Ret::create(Func, LR, Src0));
+  }
   void _rsb(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert(InstARM32Rsb::create(Func, Dest, Src0, Src1, Pred));
@@ -271,6 +317,10 @@
     Context.insert(
         InstARM32Sbc::create(Func, Dest, Src0, Src1, Pred, SetFlags));
   }
+  void _sdiv(Variable *Dest, Variable *Src0, Variable *Src1,
+             CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Sdiv::create(Func, Dest, Src0, Src1, Pred));
+  }
   void _str(Variable *Value, OperandARM32Mem *Addr,
             CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert(InstARM32Str::create(Func, Value, Addr, Pred));
@@ -289,8 +339,14 @@
             CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert(InstARM32Sxt::create(Func, Dest, Src0, Pred));
   }
-  void _ret(Variable *LR, Variable *Src0 = nullptr) {
-    Context.insert(InstARM32Ret::create(Func, LR, Src0));
+  void _tst(Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Tst::create(Func, Src0, Src1, Pred));
+  }
+  void _trap() { Context.insert(InstARM32Trap::create(Func)); }
+  void _udiv(Variable *Dest, Variable *Src0, Variable *Src1,
+             CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Udiv::create(Func, Dest, Src0, Src1, Pred));
   }
   void _umull(Variable *DestLo, Variable *DestHi, Variable *Src0,
               Variable *Src1, CondARM32::Cond Pred = CondARM32::AL) {
@@ -305,7 +361,7 @@
     Context.insert(InstARM32Uxt::create(Func, Dest, Src0, Pred));
   }
 
-  ARM32InstructionSet InstructionSet = ARM32InstructionSet::Begin;
+  TargetARM32Features CPUFeatures;
   bool UsesFramePointer = false;
   bool NeedsStackAlignment = false;
   bool MaybeLeafFunc = true;
@@ -386,6 +442,8 @@
 
 private:
   ~TargetHeaderARM32() = default;
+
+  TargetARM32Features CPUFeatures;
 };
 
 } // end of namespace Ice
diff --git a/tests_lit/llvm2ice_tests/64bit.pnacl.ll b/tests_lit/llvm2ice_tests/64bit.pnacl.ll
index 54d8ff0..2460a24 100644
--- a/tests_lit/llvm2ice_tests/64bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/64bit.pnacl.ll
@@ -315,6 +315,11 @@
 
 ; OPTM1-LABEL: div64BitSigned
 ; OPTM1: call {{.*}} R_{{.*}}    __divdi3
+;
+; ARM32-LABEL: div64BitSigned
+; ARM32: orrs {{r.*}}, {{r.*}}
+; ARM32: bne
+; ARM32: bl {{.*}} __divdi3
 
 define internal i64 @div64BitSignedConst(i64 %a) {
 entry:
@@ -330,6 +335,14 @@
 ; OPTM1: mov     DWORD PTR [esp+0xc],0xb3a
 ; OPTM1: mov     DWORD PTR [esp+0x8],0x73ce2ff2
 ; OPTM1: call {{.*}} R_{{.*}}    __divdi3
+;
+; ARM32-LABEL: div64BitSignedConst
+; For a nonzero constant divisor, the divide-by-zero check should be optimized out.
+; ARM32-NOT: orrs
+; ARM32: movw {{.*}} ; 0x2ff2
+; ARM32: movt {{.*}} ; 0x73ce
+; ARM32: movw {{.*}} ; 0xb3a
+; ARM32: bl {{.*}} __divdi3
 
 define internal i64 @div64BitUnsigned(i64 %a, i64 %b) {
 entry:
@@ -341,6 +354,11 @@
 ;
 ; OPTM1-LABEL: div64BitUnsigned
 ; OPTM1: call {{.*}} R_{{.*}}    __udivdi3
+;
+; ARM32-LABEL: div64BitUnsigned
+; ARM32: orrs {{r.*}}, {{r.*}}
+; ARM32: bne
+; ARM32: bl {{.*}} __udivdi3
 
 define internal i64 @rem64BitSigned(i64 %a, i64 %b) {
 entry:
@@ -352,6 +370,11 @@
 ;
 ; OPTM1-LABEL: rem64BitSigned
 ; OPTM1: call {{.*}} R_{{.*}}    __moddi3
+;
+; ARM32-LABEL: rem64BitSigned
+; ARM32: orrs {{r.*}}, {{r.*}}
+; ARM32: bne
+; ARM32: bl {{.*}} __moddi3
 
 define internal i64 @rem64BitUnsigned(i64 %a, i64 %b) {
 entry:
@@ -363,6 +386,11 @@
 ;
 ; OPTM1-LABEL: rem64BitUnsigned
 ; OPTM1: call {{.*}} R_{{.*}}    __umoddi3
+;
+; ARM32-LABEL: rem64BitUnsigned
+; ARM32: orrs {{r.*}}, {{r.*}}
+; ARM32: bne
+; ARM32: bl {{.*}} __umoddi3
 
 define internal i64 @shl64BitSigned(i64 %a, i64 %b) {
 entry:
diff --git a/tests_lit/llvm2ice_tests/arith.ll b/tests_lit/llvm2ice_tests/arith.ll
index ed82426..f6318ae 100644
--- a/tests_lit/llvm2ice_tests/arith.ll
+++ b/tests_lit/llvm2ice_tests/arith.ll
@@ -8,10 +8,15 @@
 ; once enough infrastructure is in. Also, switch to --filetype=obj
 ; when possible.
 ; RUN: %if --need=target_ARM32 --need=allow_dump \
-; RUN:   --command %p2i --filetype=asm --assemble \
-; RUN:   --disassemble --target arm32 -i %s --args -O2 --skip-unimplemented \
+; RUN:   --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
+; RUN:   -i %s --args -O2 --skip-unimplemented \
 ; RUN:   | %if --need=target_ARM32 --need=allow_dump \
 ; RUN:   --command FileCheck --check-prefix ARM32 %s
+; RUN: %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
+; RUN:   -i %s --args -O2 --mattr=hwdiv-arm --skip-unimplemented \
+; RUN:   | %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command FileCheck --check-prefix ARM32HWDIV %s
 
 define i32 @Add(i32 %a, i32 %b) {
 entry:
@@ -117,10 +122,32 @@
 ; CHECK-LABEL: Sdiv
 ; CHECK: cdq
 ; CHECK: idiv e
+;
 ; ARM32-LABEL: Sdiv
-; TODO(jvoung) -- implement divide and check here.
-; The lowering needs to check if the denominator is 0 and trap, since
-; ARM normally doesn't trap on divide by 0.
+; ARM32: tst [[DENOM:r.*]], [[DENOM]]
+; ARM32: bne [[LABEL:[0-9a-f]+]]
+; ARM32: .word 0xe7fedef0
+; ARM32: [[LABEL]]: {{.*}} bl {{.*}} __divsi3
+; ARM32HWDIV-LABEL: Sdiv
+; ARM32HWDIV: tst
+; ARM32HWDIV: bne
+; ARM32HWDIV: sdiv
+
+define i32 @SdivConst(i32 %a) {
+entry:
+  %div = sdiv i32 %a, 219
+  ret i32 %div
+}
+; CHECK-LABEL: SdivConst
+; CHECK: cdq
+; CHECK: idiv e
+;
+; ARM32-LABEL: SdivConst
+; ARM32-NOT: tst
+; ARM32: bl {{.*}} __divsi3
+; ARM32HWDIV-LABEL: SdivConst
+; ARM32HWDIV-NOT: tst
+; ARM32HWDIV: sdiv
 
 define i32 @Srem(i32 %a, i32 %b) {
 entry:
@@ -130,7 +157,16 @@
 ; CHECK-LABEL: Srem
 ; CHECK: cdq
 ; CHECK: idiv e
+;
 ; ARM32-LABEL: Srem
+; ARM32: tst [[DENOM:r.*]], [[DENOM]]
+; ARM32: bne
+; ARM32: bl {{.*}} __modsi3
+; ARM32HWDIV-LABEL: Srem
+; ARM32HWDIV: tst
+; ARM32HWDIV: bne
+; ARM32HWDIV: sdiv
+; ARM32HWDIV: mls
 
 define i32 @Udiv(i32 %a, i32 %b) {
 entry:
@@ -139,7 +175,15 @@
 }
 ; CHECK-LABEL: Udiv
 ; CHECK: div e
+;
 ; ARM32-LABEL: Udiv
+; ARM32: tst [[DENOM:r.*]], [[DENOM]]
+; ARM32: bne
+; ARM32: bl {{.*}} __udivsi3
+; ARM32HWDIV-LABEL: Udiv
+; ARM32HWDIV: tst
+; ARM32HWDIV: bne
+; ARM32HWDIV: udiv
 
 define i32 @Urem(i32 %a, i32 %b) {
 entry:
@@ -148,4 +192,13 @@
 }
 ; CHECK-LABEL: Urem
 ; CHECK: div e
+;
 ; ARM32-LABEL: Urem
+; ARM32: tst [[DENOM:r.*]], [[DENOM]]
+; ARM32: bne
+; ARM32: bl {{.*}} __umodsi3
+; ARM32HWDIV-LABEL: Urem
+; ARM32HWDIV: tst
+; ARM32HWDIV: bne
+; ARM32HWDIV: udiv
+; ARM32HWDIV: mls
diff --git a/tests_lit/llvm2ice_tests/switch-opt.ll b/tests_lit/llvm2ice_tests/switch-opt.ll
index ed6b8b7..3cf0daf 100644
--- a/tests_lit/llvm2ice_tests/switch-opt.ll
+++ b/tests_lit/llvm2ice_tests/switch-opt.ll
@@ -5,9 +5,11 @@
 ; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s
 
 ; TODO(jvoung): Update to -02 once the phi assignments is done for ARM
-; RUN: %if --need=target_ARM32 --command %p2i --filetype=asm --assemble \
-; RUN:   --disassemble --target arm32 -i %s --args -Om1 --skip-unimplemented \
-; RUN:   | %if --need=target_ARM32 --command FileCheck --check-prefix ARM32 %s
+; RUN: %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command %p2i --filetype=asm --assemble --disassemble \
+; RUN:   --target arm32 -i %s --args -Om1 --skip-unimplemented \
+; RUN:   | %if --need=target_ARM32 --need=allow_dump \
+; RUN:     --command FileCheck --check-prefix ARM32 %s
 
 define i32 @testSwitch(i32 %a) {
 entry:
diff --git a/tests_lit/llvm2ice_tests/unreachable.ll b/tests_lit/llvm2ice_tests/unreachable.ll
index bc08bcd..1309b6f 100644
--- a/tests_lit/llvm2ice_tests/unreachable.ll
+++ b/tests_lit/llvm2ice_tests/unreachable.ll
@@ -1,7 +1,17 @@
 ; This tests the basic structure of the Unreachable instruction.
 
-; RUN: %p2i -i %s --filetype=obj --disassemble -a -O2 | FileCheck %s
-; RUN: %p2i -i %s --filetype=obj --disassemble -a -Om1 | FileCheck %s
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -Om1 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+
+; RUN: %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command %p2i --filetype=asm --assemble \
+; RUN:   --disassemble --target arm32 -i %s --args -O2 --skip-unimplemented \
+; RUN:   | %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command FileCheck --check-prefix ARM32 %s
 
 define internal i32 @divide(i32 %num, i32 %den) {
 entry:
@@ -22,3 +32,9 @@
 ; CHECK: cdq
 ; CHECK: idiv
 ; CHECK: ret
+
+; ARM32-LABEL: divide
+; ARM32: cmp
+; ARM32: .word 0xe7fedef0
+; ARM32: bl {{.*}} __divsi3
+; ARM32: bx lr