ARM32: Lower more integer intrinsics and test.

Lower stacksave/restore.
Lower ctlz, cttz, bswap, and popcount. Popcount is just
done with a helper call. Ctz can use the clz instruction
after reversing the bits.

For now we can only crosstest stacksave/restore, since that test
happens to be written in C (it exercises C99 VLAs). The C++
crosstests don't currently compile with arm-cross-g++
(missing headers), so I will check those later, after
resolving the cross-compilation issue.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=jpp@chromium.org

Review URL: https://codereview.chromium.org/1222943003 .
diff --git a/Makefile.standalone b/Makefile.standalone
index 83eddac..3d61754 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -327,7 +327,8 @@
 	  --toolchain-root $(TOOLCHAIN_ROOT) \
 	  -i x8632,native,sse2 -i x8632,native,sse4.1,test_vector_ops \
 	  -i x8632,sandbox,sse4.1,Om1 \
-	  -i arm32,native,neon,Om1,simple_loop
+	  -i arm32,native,neon,Om1,simple_loop \
+	  -i arm32,native,neon,Om1,test_stacksave
 	PNACL_BIN_PATH=$(PNACL_BIN_PATH) \
 	$(LLVM_SRC_PATH)/utils/lit/lit.py -sv crosstest/Output
 endif
diff --git a/src/IceInstARM32.cpp b/src/IceInstARM32.cpp
index 72178ac..e95f6b1 100644
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -80,18 +80,15 @@
 }
 
 void InstARM32Pred::emitUnaryopGPR(const char *Opcode,
-                                   const InstARM32Pred *Inst, const Cfg *Func) {
+                                   const InstARM32Pred *Inst, const Cfg *Func,
+                                   bool NeedsWidthSuffix) {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(Inst->getSrcSize() == 1);
   Type SrcTy = Inst->getSrc(0)->getType();
-  Type DestTy = Inst->getDest()->getType();
   Str << "\t" << Opcode;
-  // Sxt and Uxt need source type width letter to define the operation.
-  // The other unary operations have the same source and dest type and
-  // as a result need only one letter.
-  if (SrcTy != DestTy)
+  if (NeedsWidthSuffix)
     Str << getWidthString(SrcTy);
-  Str << "\t";
+  Str << Inst->getPredicate() << "\t";
   Inst->getDest()->emit(Func);
   Str << ", ";
   Inst->getSrc(0)->emit(Func);
@@ -358,7 +355,10 @@
 template <> const char *InstARM32Movt::Opcode = "movt";
 // Unary ops
 template <> const char *InstARM32Movw::Opcode = "movw";
+template <> const char *InstARM32Clz::Opcode = "clz";
 template <> const char *InstARM32Mvn::Opcode = "mvn";
+template <> const char *InstARM32Rbit::Opcode = "rbit";
+template <> const char *InstARM32Rev::Opcode = "rev";
 template <> const char *InstARM32Sxt::Opcode = "sxt"; // still requires b/h
 template <> const char *InstARM32Uxt::Opcode = "uxt"; // still requires b/h
 // Mov-like ops
diff --git a/src/IceInstARM32.h b/src/IceInstARM32.h
index d12c4ff..8a7e1da 100644
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -262,6 +262,7 @@
     Br,
     Call,
     Cmp,
+    Clz,
     Eor,
     Label,
     Ldr,
@@ -277,7 +278,9 @@
     Orr,
     Pop,
     Push,
+    Rbit,
     Ret,
+    Rev,
     Rsb,
     Sbc,
     Sdiv,
@@ -324,7 +327,7 @@
 
   /// Shared emit routines for common forms of instructions.
   static void emitUnaryopGPR(const char *Opcode, const InstARM32Pred *Inst,
-                             const Cfg *Func);
+                             const Cfg *Func, bool NeedsWidthSuffix);
   static void emitTwoAddr(const char *Opcode, const InstARM32Pred *Inst,
                           const Cfg *Func);
   static void emitThreeAddr(const char *Opcode, const InstARM32Pred *Inst,
@@ -345,7 +348,7 @@
 }
 
 /// Instructions of the form x := op(y).
-template <InstARM32::InstKindARM32 K>
+template <InstARM32::InstKindARM32 K, bool NeedsWidthSuffix>
 class InstARM32UnaryopGPR : public InstARM32Pred {
   InstARM32UnaryopGPR() = delete;
   InstARM32UnaryopGPR(const InstARM32UnaryopGPR &) = delete;
@@ -360,7 +363,7 @@
   void emit(const Cfg *Func) const override {
     if (!BuildDefs::dump())
       return;
-    emitUnaryopGPR(Opcode, this, Func);
+    emitUnaryopGPR(Opcode, this, Func, NeedsWidthSuffix);
   }
   void emitIAS(const Cfg *Func) const override {
     (void)Func;
@@ -641,13 +644,16 @@
 /// MovT leaves the bottom bits alone so dest is also a source.
 /// This helps indicate that a previous MovW setting dest is not dead code.
 typedef InstARM32TwoAddrGPR<InstARM32::Movt> InstARM32Movt;
-typedef InstARM32UnaryopGPR<InstARM32::Movw> InstARM32Movw;
-typedef InstARM32UnaryopGPR<InstARM32::Mvn> InstARM32Mvn;
+typedef InstARM32UnaryopGPR<InstARM32::Movw, false> InstARM32Movw;
+typedef InstARM32UnaryopGPR<InstARM32::Clz, false> InstARM32Clz;
+typedef InstARM32UnaryopGPR<InstARM32::Mvn, false> InstARM32Mvn;
+typedef InstARM32UnaryopGPR<InstARM32::Rbit, false> InstARM32Rbit;
+typedef InstARM32UnaryopGPR<InstARM32::Rev, false> InstARM32Rev;
 // Technically, the uxt{b,h} and sxt{b,h} instructions have a rotation
 // operand as well (rotate source by 8, 16, 24 bits prior to extending),
 // but we aren't using that for now, so just model as a Unaryop.
-typedef InstARM32UnaryopGPR<InstARM32::Sxt> InstARM32Sxt;
-typedef InstARM32UnaryopGPR<InstARM32::Uxt> InstARM32Uxt;
+typedef InstARM32UnaryopGPR<InstARM32::Sxt, true> InstARM32Sxt;
+typedef InstARM32UnaryopGPR<InstARM32::Uxt, true> InstARM32Uxt;
 typedef InstARM32FourAddrGPR<InstARM32::Mla> InstARM32Mla;
 typedef InstARM32FourAddrGPR<InstARM32::Mls> InstARM32Mls;
 typedef InstARM32CmpLike<InstARM32::Cmp> InstARM32Cmp;
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 64672c4..a5b52ce 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -321,6 +321,9 @@
     Context.insert(InstBundleLock::create(Func, BundleOption));
   }
   void _bundle_unlock() { Context.insert(InstBundleUnlock::create(Func)); }
+  void _set_dest_nonkillable() {
+    Context.getLastInserted()->setDestNonKillable();
+  }
 
   Cfg *Func;
   GlobalContext *Ctx;
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 6639da8..10fdfe1 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -1575,7 +1575,7 @@
 
   // Copy arguments that are passed on the stack to the appropriate
   // stack locations.
-  Variable *SP = Func->getTarget()->getPhysicalRegister(RegARM32::Reg_sp);
+  Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
   for (auto &StackArg : StackArgs) {
     ConstantInteger32 *Loc =
         llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(StackArg.second));
@@ -1662,7 +1662,7 @@
   if (ParameterAreaSizeBytes) {
     Operand *AddAmount = legalize(Ctx->getConstantInt32(ParameterAreaSizeBytes),
                                   Legal_Reg | Legal_Flex);
-    Variable *SP = Func->getTarget()->getPhysicalRegister(RegARM32::Reg_sp);
+    Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
     _add(SP, SP, AddAmount);
   }
 
@@ -2032,19 +2032,91 @@
     return;
   }
   case Intrinsics::Bswap: {
-    UnimplementedError(Func->getContext()->getFlags());
+    Variable *Dest = Instr->getDest();
+    Operand *Val = Instr->getArg(0);
+    Type Ty = Val->getType();
+    if (Ty == IceType_i64) {
+      Variable *Val_Lo = legalizeToVar(loOperand(Val));
+      Variable *Val_Hi = legalizeToVar(hiOperand(Val));
+      Variable *T_Lo = makeReg(IceType_i32);
+      Variable *T_Hi = makeReg(IceType_i32);
+      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      _rev(T_Lo, Val_Lo);
+      _rev(T_Hi, Val_Hi);
+      _mov(DestLo, T_Hi);
+      _mov(DestHi, T_Lo);
+    } else {
+      assert(Ty == IceType_i32 || Ty == IceType_i16);
+      Variable *ValR = legalizeToVar(Val);
+      Variable *T = makeReg(Ty);
+      _rev(T, ValR);
+      if (Val->getType() == IceType_i16) {
+        Operand *Sixteen =
+            legalize(Ctx->getConstantInt32(16), Legal_Reg | Legal_Flex);
+        _lsr(T, T, Sixteen);
+      }
+      _mov(Dest, T);
+    }
     return;
   }
   case Intrinsics::Ctpop: {
-    UnimplementedError(Func->getContext()->getFlags());
+    Variable *Dest = Instr->getDest();
+    Operand *Val = Instr->getArg(0);
+    InstCall *Call = makeHelperCall(isInt32Asserting32Or64(Val->getType())
+                                        ? H_call_ctpop_i32
+                                        : H_call_ctpop_i64,
+                                    Dest, 1);
+    Call->addArg(Val);
+    lowerCall(Call);
+    // The popcount helpers always return 32-bit values, while the intrinsic's
+    // signature matches some 64-bit platforms' native instructions and
+    // expects to fill a 64-bit reg. Thus, clear the upper bits of the dest
+    // just in case the user doesn't do that in the IR or doesn't toss the bits
+    // via truncate.
+    if (Val->getType() == IceType_i64) {
+      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      Constant *Zero = Ctx->getConstantZero(IceType_i32);
+      _mov(DestHi, Zero);
+    }
     return;
   }
   case Intrinsics::Ctlz: {
-    UnimplementedError(Func->getContext()->getFlags());
+    // The "is zero undef" parameter is ignored and we always return
+    // a well-defined value.
+    Operand *Val = Instr->getArg(0);
+    Variable *ValLoR;
+    Variable *ValHiR = nullptr;
+    if (Val->getType() == IceType_i64) {
+      ValLoR = legalizeToVar(loOperand(Val));
+      ValHiR = legalizeToVar(hiOperand(Val));
+    } else {
+      ValLoR = legalizeToVar(Val);
+    }
+    lowerCLZ(Instr->getDest(), ValLoR, ValHiR);
     return;
   }
   case Intrinsics::Cttz: {
-    UnimplementedError(Func->getContext()->getFlags());
+    // Essentially like Clz, but reverse the bits first.
+    Operand *Val = Instr->getArg(0);
+    Variable *ValLoR;
+    Variable *ValHiR = nullptr;
+    if (Val->getType() == IceType_i64) {
+      ValLoR = legalizeToVar(loOperand(Val));
+      ValHiR = legalizeToVar(hiOperand(Val));
+      Variable *TLo = makeReg(IceType_i32);
+      Variable *THi = makeReg(IceType_i32);
+      _rbit(TLo, ValLoR);
+      _rbit(THi, ValHiR);
+      ValLoR = THi;
+      ValHiR = TLo;
+    } else {
+      ValLoR = legalizeToVar(Val);
+      Variable *T = makeReg(IceType_i32);
+      _rbit(T, ValLoR);
+      ValLoR = T;
+    }
+    lowerCLZ(Instr->getDest(), ValLoR, ValHiR);
     return;
   }
   case Intrinsics::Fabs: {
@@ -2077,13 +2149,15 @@
     return;
   }
   case Intrinsics::Memset: {
-    // The value operand needs to be extended to a stack slot size
-    // because the PNaCl ABI requires arguments to be at least 32 bits
-    // wide.
+    // The value operand needs to be extended to a stack slot size because the
+    // PNaCl ABI requires arguments to be at least 32 bits wide.
     Operand *ValOp = Instr->getArg(1);
     assert(ValOp->getType() == IceType_i8);
     Variable *ValExt = Func->makeVariable(stackSlotType());
     lowerCast(InstCast::create(Func, InstCast::Zext, ValExt, ValOp));
+    // Technically, ARM has their own __aeabi_memset, but we can use plain
+    // memset too. The value and size argument need to be flipped if we ever
+    // decide to use __aeabi_memset.
     InstCall *Call = makeHelperCall(H_call_memset, nullptr, 3);
     Call->addArg(Instr->getArg(0));
     Call->addArg(ValExt);
@@ -2111,15 +2185,19 @@
     return;
   }
   case Intrinsics::Stacksave: {
-    UnimplementedError(Func->getContext()->getFlags());
+    Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
+    Variable *Dest = Instr->getDest();
+    _mov(Dest, SP);
     return;
   }
   case Intrinsics::Stackrestore: {
-    UnimplementedError(Func->getContext()->getFlags());
+    Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
+    Operand *Val = legalize(Instr->getArg(0), Legal_Reg | Legal_Flex);
+    _mov_nonkillable(SP, Val);
     return;
   }
   case Intrinsics::Trap:
-    UnimplementedError(Func->getContext()->getFlags());
+    _trap();
     return;
   case Intrinsics::UnknownIntrinsic:
     Func->setError("Should not be lowering UnknownIntrinsic");
@@ -2128,6 +2206,34 @@
   return;
 }
 
+void TargetARM32::lowerCLZ(Variable *Dest, Variable *ValLoR, Variable *ValHiR) {
+  Type Ty = Dest->getType();
+  assert(Ty == IceType_i32 || Ty == IceType_i64);
+  Variable *T = makeReg(IceType_i32);
+  _clz(T, ValLoR);
+  if (Ty == IceType_i64) {
+    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    Operand *Zero =
+        legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex);
+    Operand *ThirtyTwo =
+        legalize(Ctx->getConstantInt32(32), Legal_Reg | Legal_Flex);
+    _cmp(ValHiR, Zero);
+    Variable *T2 = makeReg(IceType_i32);
+    _add(T2, T, ThirtyTwo);
+    _clz(T2, ValHiR, CondARM32::NE);
+    // T2 is actually a source as well when the predicate is not AL
+    // (since it may leave T2 alone). We use set_dest_nonkillable to
+    // prolong the liveness of T2 as if it were used as a source.
+    _set_dest_nonkillable();
+    _mov(DestLo, T2);
+    _mov(DestHi, Ctx->getConstantZero(IceType_i32));
+    return;
+  }
+  _mov(Dest, T);
+  return;
+}
+
 void TargetARM32::lowerLoad(const InstLoad *Load) {
   // A Load instruction can be treated the same as an Assign
   // instruction, after the source operand is transformed into an
@@ -2186,7 +2292,7 @@
   // eliminated.  TODO: Are there more places where the fake use
   // should be inserted?  E.g. "void f(int n){while(1) g(n);}" may not
   // have a ret instruction.
-  Variable *SP = Func->getTarget()->getPhysicalRegister(RegARM32::Reg_sp);
+  Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
   Context.insert(InstFakeUse::create(Func, SP));
 }
 
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 8aa3e11..becb615 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -172,6 +172,8 @@
                     ExtInstr ExtFunc, DivInstr DivFunc,
                     const char *DivHelperName, bool IsRemainder);
 
+  void lowerCLZ(Variable *Dest, Variable *ValLo, Variable *ValHi);
+
   // The following are helpers that insert lowered ARM32 instructions
   // with minimal syntactic overhead, so that the lowering code can
   // look as close to assembly as practical.
@@ -224,6 +226,10 @@
             CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert(InstARM32Cmp::create(Func, Src0, Src1, Pred));
   }
+  void _clz(Variable *Dest, Variable *Src0,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Clz::create(Func, Dest, Src0, Pred));
+  }
   void _eor(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert(InstARM32Eor::create(Func, Dest, Src0, Src1, Pred));
@@ -301,6 +307,14 @@
     for (Variable *Dest : Dests)
       Context.insert(InstFakeDef::create(Func, Dest));
   }
+  void _rbit(Variable *Dest, Variable *Src0,
+             CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Rbit::create(Func, Dest, Src0, Pred));
+  }
+  void _rev(Variable *Dest, Variable *Src0,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Rev::create(Func, Dest, Src0, Pred));
+  }
   void _ret(Variable *LR, Variable *Src0 = nullptr) {
     Context.insert(InstARM32Ret::create(Func, LR, Src0));
   }
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 32d7d6b..c00f0b0 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -96,6 +96,7 @@
 
   using Machine::_bundle_lock;
   using Machine::_bundle_unlock;
+  using Machine::_set_dest_nonkillable;
   using Machine::getContext;
   using Machine::getStackAdjustment;
   using Machine::regAlloc;
@@ -587,9 +588,6 @@
   void _xor_rmw(typename Traits::X86OperandMem *DestSrc0, Operand *Src1) {
     Context.insert(Traits::Insts::XorRMW::create(Func, DestSrc0, Src1));
   }
-  void _set_dest_nonkillable() {
-    Context.getLastInserted()->setDestNonKillable();
-  }
 
   bool optimizeScalarMul(Variable *Dest, Operand *Src0, int32_t Src1);
   void findRMW();
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 68cbf94..a277db2 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -3521,9 +3521,8 @@
     return;
   }
   case Intrinsics::Memset: {
-    // The value operand needs to be extended to a stack slot size
-    // because the PNaCl ABI requires arguments to be at least 32 bits
-    // wide.
+    // The value operand needs to be extended to a stack slot size because the
+    // PNaCl ABI requires arguments to be at least 32 bits wide.
     Operand *ValOp = Instr->getArg(1);
     assert(ValOp->getType() == IceType_i8);
     Variable *ValExt = Func->template makeVariable(stackSlotType());
@@ -5257,8 +5256,7 @@
       _lea(Reg, Traits::X86OperandMem::create(Func, IceType_i32, Reg, Offset,
                                               nullptr, 0));
       // make sure liveness analysis won't kill this variable, otherwise a
-      // liveness
-      // assertion will be triggered.
+      // liveness assertion will be triggered.
       _set_dest_nonkillable();
       if (Immediate->getType() != IceType_i32) {
         Variable *TruncReg = makeReg(Immediate->getType(), RegNum);
diff --git a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
index da56571..ef72d6e 100644
--- a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
@@ -1,21 +1,34 @@
 ; This tests the NaCl intrinsics not related to atomic operations.
 
-; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 -sandbox \
-; RUN:   | FileCheck %s
-; RUN: %p2i -i %s --filetype=obj --disassemble --args -Om1 -sandbox \
-; RUN:   | FileCheck %s
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 -sandbox \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -Om1 -sandbox \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
 
 ; Do another run w/ O2 and a different check-prefix (otherwise O2 and Om1
 ; share the same "CHECK" prefix). This separate run helps check that
 ; some code is optimized out.
-; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 -sandbox \
-; RUN:   | FileCheck --check-prefix=CHECKO2REM %s
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 -sandbox \
+; RUN:   | %if --need=target_X8632 \
+; RUN:   --command FileCheck --check-prefix=CHECKO2REM %s
 
 ; Do O2 runs without -sandbox to make sure llvm.nacl.read.tp gets
 ; lowered to __nacl_read_tp instead of gs:0x0.
 ; We also know that because it's O2, it'll have the O2REM optimizations.
-; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 \
-; RUN:   | FileCheck --check-prefix=CHECKO2UNSANDBOXEDREM %s
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 \
+; RUN:   | %if --need=target_X8632 \
+; RUN:   --command FileCheck --check-prefix=CHECKO2UNSANDBOXEDREM %s
+
+; RUN: %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
+; RUN:   -i %s --args -O2 --skip-unimplemented \
+; RUN:   | %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command FileCheck --check-prefix ARM32 %s
+
 
 declare i8* @llvm.nacl.read.tp()
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
@@ -106,6 +119,8 @@
 ; CHECK: call {{.*}} R_{{.*}} memcpy
 ; CHECKO2REM-LABEL: test_memcpy
 ; CHECKO2UNSANDBOXEDREM-LABEL: test_memcpy
+; ARM32-LABEL: test_memcpy
+; ARM32: bl {{.*}} memcpy
 
 ; TODO(jvoung) -- if we want to be clever, we can do this and the memmove,
 ; memset without a function call.
@@ -114,11 +129,13 @@
   %dst = inttoptr i32 %iptr_dst to i8*
   %src = inttoptr i32 %iptr_src to i8*
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
-                                       i32 8, i32 1, i1 false)
+                                       i32 32, i32 1, i1 false)
   ret void
 }
 ; CHECK-LABEL: test_memcpy_const_len_align
 ; CHECK: call {{.*}} R_{{.*}} memcpy
+; ARM32-LABEL: test_memcpy_const_len_align
+; ARM32: bl {{.*}} memcpy
 
 define void @test_memmove(i32 %iptr_dst, i32 %iptr_src, i32 %len) {
 entry:
@@ -130,17 +147,21 @@
 }
 ; CHECK-LABEL: test_memmove
 ; CHECK: call {{.*}} R_{{.*}} memmove
+; ARM32-LABEL: test_memmove
+; ARM32: bl {{.*}} memmove
 
 define void @test_memmove_const_len_align(i32 %iptr_dst, i32 %iptr_src) {
 entry:
   %dst = inttoptr i32 %iptr_dst to i8*
   %src = inttoptr i32 %iptr_src to i8*
   call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src,
-                                        i32 8, i32 1, i1 false)
+                                        i32 32, i32 1, i1 false)
   ret void
 }
 ; CHECK-LABEL: test_memmove_const_len_align
 ; CHECK: call {{.*}} R_{{.*}} memmove
+; ARM32-LABEL: test_memmove_const_len_align
+; ARM32: bl {{.*}} memmove
 
 define void @test_memset(i32 %iptr_dst, i32 %wide_val, i32 %len) {
 entry:
@@ -153,18 +174,24 @@
 ; CHECK-LABEL: test_memset
 ; CHECK: movzx
 ; CHECK: call {{.*}} R_{{.*}} memset
+; ARM32-LABEL: test_memset
+; ARM32: uxtb
+; ARM32: bl {{.*}} memset
 
 define void @test_memset_const_len_align(i32 %iptr_dst, i32 %wide_val) {
 entry:
   %val = trunc i32 %wide_val to i8
   %dst = inttoptr i32 %iptr_dst to i8*
   call void @llvm.memset.p0i8.i32(i8* %dst, i8 %val,
-                                  i32 8, i32 1, i1 false)
+                                  i32 32, i32 1, i1 false)
   ret void
 }
 ; CHECK-LABEL: test_memset_const_len_align
 ; CHECK: movzx
 ; CHECK: call {{.*}} R_{{.*}} memset
+; ARM32-LABEL: test_memset_const_len_align
+; ARM32: uxtb
+; ARM32: bl {{.*}} memset
 
 define void @test_memset_const_val(i32 %iptr_dst, i32 %len) {
 entry:
@@ -176,7 +203,9 @@
 ; Make sure the argument is legalized (can't movzx reg, 0).
 ; CHECK: movzx {{.*}},{{[^0]}}
 ; CHECK: call {{.*}} R_{{.*}} memset
-
+; ARM32-LABEL: test_memset_const_val
+; ARM32: uxtb
+; ARM32: bl {{.*}} memset
 
 define i32 @test_setjmplongjmp(i32 %iptr_env) {
 entry:
@@ -198,6 +227,9 @@
 ; CHECKO2REM-LABEL: test_setjmplongjmp
 ; CHECKO2REM: call {{.*}} R_{{.*}} setjmp
 ; CHECKO2REM: call {{.*}} R_{{.*}} longjmp
+; ARM32-LABEL: test_setjmplongjmp
+; ARM32: bl {{.*}} setjmp
+; ARM32: bl {{.*}} longjmp
 
 define i32 @test_setjmp_unused(i32 %iptr_env, i32 %i_other) {
 entry:
@@ -344,6 +376,8 @@
 }
 ; CHECK-LABEL: test_trap
 ; CHECK: ud2
+; ARM32-LABEL: test_trap
+; ARM32: .word 0xe7fedef0
 
 define i32 @test_bswap_16(i32 %x) {
 entry:
@@ -356,6 +390,9 @@
 ; Make sure this is the right operand size so that the most significant bit
 ; to least significant bit rotation happens at the right boundary.
 ; CHECK: rol {{[abcd]x|si|di|bp|word ptr}},0x8
+; ARM32-LABEL: test_bswap_16
+; ARM32: rev
+; ARM32: lsr {{.*}} #16
 
 define i32 @test_bswap_32(i32 %x) {
 entry:
@@ -364,6 +401,8 @@
 }
 ; CHECK-LABEL: test_bswap_32
 ; CHECK: bswap e{{.*}}
+; ARM32-LABEL: test_bswap_32
+; ARM32: rev
 
 define i64 @test_bswap_64(i64 %x) {
 entry:
@@ -373,6 +412,9 @@
 ; CHECK-LABEL: test_bswap_64
 ; CHECK: bswap e{{.*}}
 ; CHECK: bswap e{{.*}}
+; ARM32-LABEL: test_bswap_64
+; ARM32: rev
+; ARM32: rev
 
 define i32 @test_ctlz_32(i32 %x) {
 entry:
@@ -387,6 +429,8 @@
 ; CHECK: mov [[REG_RES:e.*]],0x3f
 ; CHECK: cmovne [[REG_RES]],[[REG_TMP]]
 ; CHECK: xor [[REG_RES]],0x1f
+; ARM32-LABEL: test_ctlz_32
+; ARM32: clz
 
 define i32 @test_ctlz_32_const() {
 entry:
@@ -398,6 +442,8 @@
 ; or memory.
 ; CHECK-LABEL: test_ctlz_32_const
 ; CHECK: bsr e{{.*}},{{.*}}e{{.*}}
+; ARM32-LABEL: test_ctlz_32_const
+; ARM32: clz
 
 define i32 @test_ctlz_32_ignored(i32 %x) {
 entry:
@@ -424,6 +470,12 @@
 ; CHECK: test [[REG_UPPER:.*]],[[REG_UPPER]]
 ; CHECK: cmove [[REG_RES2]],[[REG_RES1]]
 ; CHECK: mov {{.*}},0x0
+; ARM32-LABEL: test_ctlz_64
+; ARM32: clz
+; ARM32: cmp {{.*}}, #0
+; ARM32: add {{.*}}, #32
+; ARM32: clzne
+; ARM32: mov {{.*}}, #0
 
 define i32 @test_ctlz_64_const(i64 %x) {
 entry:
@@ -434,7 +486,9 @@
 ; CHECK-LABEL: test_ctlz_64_const
 ; CHECK: bsr e{{.*}},{{.*}}e{{.*}}
 ; CHECK: bsr e{{.*}},{{.*}}e{{.*}}
-
+; ARM32-LABEL: test_ctlz_64_const
+; ARM32: clz
+; ARM32: clzne
 
 define i32 @test_ctlz_64_ignored(i64 %x) {
 entry:
@@ -453,6 +507,9 @@
 ; CHECK: bsf [[REG_IF_NOTZERO:e.*]],{{.*}}
 ; CHECK: mov [[REG_IF_ZERO:e.*]],0x20
 ; CHECK: cmovne [[REG_IF_ZERO]],[[REG_IF_NOTZERO]]
+; ARM32-LABEL: test_cttz_32
+; ARM32: rbit
+; ARM32: clz
 
 define i64 @test_cttz_64(i64 %x) {
 entry:
@@ -468,6 +525,14 @@
 ; CHECK: test [[REG_LOWER]],[[REG_LOWER]]
 ; CHECK: cmove [[REG_RES2]],[[REG_RES1]]
 ; CHECK: mov {{.*}},0x0
+; ARM32-LABEL: test_cttz_64
+; ARM32: rbit
+; ARM32: rbit
+; ARM32: clz
+; ARM32: cmp {{.*}}, #0
+; ARM32: add {{.*}}, #32
+; ARM32: clzne
+; ARM32: mov {{.*}}, #0
 
 define i32 @test_popcount_32(i32 %x) {
 entry:
@@ -476,6 +541,8 @@
 }
 ; CHECK-LABEL: test_popcount_32
 ; CHECK: call {{.*}} R_{{.*}} __popcountsi2
+; ARM32-LABEL: test_popcount_32
+; ARM32: bl {{.*}} __popcountsi2
 
 define i64 @test_popcount_64(i64 %x) {
 entry:
@@ -487,7 +554,9 @@
 ; __popcountdi2 only returns a 32-bit result, so clear the upper bits of
 ; the return value just in case.
 ; CHECK: mov {{.*}},0x0
-
+; ARM32-LABEL: test_popcount_64
+; ARM32: bl {{.*}} __popcountdi2
+; ARM32: mov {{.*}}, #0
 
 define i32 @test_popcount_64_ret_i32(i64 %x) {
 entry:
@@ -509,6 +578,9 @@
 ; CHECK-LABEL: test_stacksave_noalloca
 ; CHECK: mov {{.*}},esp
 ; CHECK: mov esp,{{.*}}
+; ARM32-LABEL: test_stacksave_noalloca
+; ARM32: mov {{.*}}, sp
+; ARM32: mov sp, {{.*}}
 
 declare i32 @foo(i32 %x)
 
@@ -544,3 +616,8 @@
 ; CHECK: mov {{.*}},esp
 ; CHECK: mov {{.*}},esp
 ; CHECK: mov esp,{{.*}}
+; ARM32-LABEL: test_stacksave_multiple
+; ARM32: mov {{.*}}, sp
+; ARM32: mov {{.*}}, sp
+; ARM32: mov {{.*}}, sp
+; ARM32: mov sp, {{.*}}