ARM32: Lower more integer intrinsics and test.

Lower stacksave/restore.
Lower ctlz, cttz, bswap, and popcount. Popcount is just
done with a helper call. Cttz can use the clz instruction
after reversing the bits (rbit).

For now we can only crosstest stacksave/restore, whose test
happens to be written in C (it uses C99 VLAs). I can't seem
to compile the C++ crosstests with the arm-cross-g++ (missing
headers), so I will check those later, after resolving the
cross-compilation issue.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=jpp@chromium.org

Review URL: https://codereview.chromium.org/1222943003 .
diff --git a/src/IceInstARM32.cpp b/src/IceInstARM32.cpp
index 72178ac..e95f6b1 100644
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -80,18 +80,15 @@
 }
 
 void InstARM32Pred::emitUnaryopGPR(const char *Opcode,
-                                   const InstARM32Pred *Inst, const Cfg *Func) {
+                                   const InstARM32Pred *Inst, const Cfg *Func,
+                                   bool NeedsWidthSuffix) {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(Inst->getSrcSize() == 1);
   Type SrcTy = Inst->getSrc(0)->getType();
-  Type DestTy = Inst->getDest()->getType();
   Str << "\t" << Opcode;
-  // Sxt and Uxt need source type width letter to define the operation.
-  // The other unary operations have the same source and dest type and
-  // as a result need only one letter.
-  if (SrcTy != DestTy)
+  if (NeedsWidthSuffix)
     Str << getWidthString(SrcTy);
-  Str << "\t";
+  Str << Inst->getPredicate() << "\t";
   Inst->getDest()->emit(Func);
   Str << ", ";
   Inst->getSrc(0)->emit(Func);
@@ -358,7 +355,10 @@
 template <> const char *InstARM32Movt::Opcode = "movt";
 // Unary ops
 template <> const char *InstARM32Movw::Opcode = "movw";
+template <> const char *InstARM32Clz::Opcode = "clz";
 template <> const char *InstARM32Mvn::Opcode = "mvn";
+template <> const char *InstARM32Rbit::Opcode = "rbit";
+template <> const char *InstARM32Rev::Opcode = "rev";
 template <> const char *InstARM32Sxt::Opcode = "sxt"; // still requires b/h
 template <> const char *InstARM32Uxt::Opcode = "uxt"; // still requires b/h
 // Mov-like ops
diff --git a/src/IceInstARM32.h b/src/IceInstARM32.h
index d12c4ff..8a7e1da 100644
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -262,6 +262,7 @@
     Br,
     Call,
     Cmp,
+    Clz,
     Eor,
     Label,
     Ldr,
@@ -277,7 +278,9 @@
     Orr,
     Pop,
     Push,
+    Rbit,
     Ret,
+    Rev,
     Rsb,
     Sbc,
     Sdiv,
@@ -324,7 +327,7 @@
 
   /// Shared emit routines for common forms of instructions.
   static void emitUnaryopGPR(const char *Opcode, const InstARM32Pred *Inst,
-                             const Cfg *Func);
+                             const Cfg *Func, bool NeedsWidthSuffix);
   static void emitTwoAddr(const char *Opcode, const InstARM32Pred *Inst,
                           const Cfg *Func);
   static void emitThreeAddr(const char *Opcode, const InstARM32Pred *Inst,
@@ -345,7 +348,7 @@
 }
 
 /// Instructions of the form x := op(y).
-template <InstARM32::InstKindARM32 K>
+template <InstARM32::InstKindARM32 K, bool NeedsWidthSuffix>
 class InstARM32UnaryopGPR : public InstARM32Pred {
   InstARM32UnaryopGPR() = delete;
   InstARM32UnaryopGPR(const InstARM32UnaryopGPR &) = delete;
@@ -360,7 +363,7 @@
   void emit(const Cfg *Func) const override {
     if (!BuildDefs::dump())
       return;
-    emitUnaryopGPR(Opcode, this, Func);
+    emitUnaryopGPR(Opcode, this, Func, NeedsWidthSuffix);
   }
   void emitIAS(const Cfg *Func) const override {
     (void)Func;
@@ -641,13 +644,16 @@
 /// MovT leaves the bottom bits alone so dest is also a source.
 /// This helps indicate that a previous MovW setting dest is not dead code.
 typedef InstARM32TwoAddrGPR<InstARM32::Movt> InstARM32Movt;
-typedef InstARM32UnaryopGPR<InstARM32::Movw> InstARM32Movw;
-typedef InstARM32UnaryopGPR<InstARM32::Mvn> InstARM32Mvn;
+typedef InstARM32UnaryopGPR<InstARM32::Movw, false> InstARM32Movw;
+typedef InstARM32UnaryopGPR<InstARM32::Clz, false> InstARM32Clz;
+typedef InstARM32UnaryopGPR<InstARM32::Mvn, false> InstARM32Mvn;
+typedef InstARM32UnaryopGPR<InstARM32::Rbit, false> InstARM32Rbit;
+typedef InstARM32UnaryopGPR<InstARM32::Rev, false> InstARM32Rev;
 // Technically, the uxt{b,h} and sxt{b,h} instructions have a rotation
 // operand as well (rotate source by 8, 16, 24 bits prior to extending),
 // but we aren't using that for now, so just model as a Unaryop.
-typedef InstARM32UnaryopGPR<InstARM32::Sxt> InstARM32Sxt;
-typedef InstARM32UnaryopGPR<InstARM32::Uxt> InstARM32Uxt;
+typedef InstARM32UnaryopGPR<InstARM32::Sxt, true> InstARM32Sxt;
+typedef InstARM32UnaryopGPR<InstARM32::Uxt, true> InstARM32Uxt;
 typedef InstARM32FourAddrGPR<InstARM32::Mla> InstARM32Mla;
 typedef InstARM32FourAddrGPR<InstARM32::Mls> InstARM32Mls;
 typedef InstARM32CmpLike<InstARM32::Cmp> InstARM32Cmp;
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 64672c4..a5b52ce 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -321,6 +321,9 @@
     Context.insert(InstBundleLock::create(Func, BundleOption));
   }
   void _bundle_unlock() { Context.insert(InstBundleUnlock::create(Func)); }
+  void _set_dest_nonkillable() {
+    Context.getLastInserted()->setDestNonKillable();
+  }
 
   Cfg *Func;
   GlobalContext *Ctx;
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 6639da8..10fdfe1 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -1575,7 +1575,7 @@
 
   // Copy arguments that are passed on the stack to the appropriate
   // stack locations.
-  Variable *SP = Func->getTarget()->getPhysicalRegister(RegARM32::Reg_sp);
+  Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
   for (auto &StackArg : StackArgs) {
     ConstantInteger32 *Loc =
         llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(StackArg.second));
@@ -1662,7 +1662,7 @@
   if (ParameterAreaSizeBytes) {
     Operand *AddAmount = legalize(Ctx->getConstantInt32(ParameterAreaSizeBytes),
                                   Legal_Reg | Legal_Flex);
-    Variable *SP = Func->getTarget()->getPhysicalRegister(RegARM32::Reg_sp);
+    Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
     _add(SP, SP, AddAmount);
   }
 
@@ -2032,19 +2032,91 @@
     return;
   }
   case Intrinsics::Bswap: {
-    UnimplementedError(Func->getContext()->getFlags());
+    Variable *Dest = Instr->getDest();
+    Operand *Val = Instr->getArg(0);
+    Type Ty = Val->getType();
+    if (Ty == IceType_i64) {
+      Variable *Val_Lo = legalizeToVar(loOperand(Val));
+      Variable *Val_Hi = legalizeToVar(hiOperand(Val));
+      Variable *T_Lo = makeReg(IceType_i32);
+      Variable *T_Hi = makeReg(IceType_i32);
+      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      _rev(T_Lo, Val_Lo);
+      _rev(T_Hi, Val_Hi);
+      _mov(DestLo, T_Hi);
+      _mov(DestHi, T_Lo);
+    } else {
+      assert(Ty == IceType_i32 || Ty == IceType_i16);
+      Variable *ValR = legalizeToVar(Val);
+      Variable *T = makeReg(Ty);
+      _rev(T, ValR);
+      if (Val->getType() == IceType_i16) {
+        Operand *Sixteen =
+            legalize(Ctx->getConstantInt32(16), Legal_Reg | Legal_Flex);
+        _lsr(T, T, Sixteen);
+      }
+      _mov(Dest, T);
+    }
     return;
   }
   case Intrinsics::Ctpop: {
-    UnimplementedError(Func->getContext()->getFlags());
+    Variable *Dest = Instr->getDest();
+    Operand *Val = Instr->getArg(0);
+    InstCall *Call = makeHelperCall(isInt32Asserting32Or64(Val->getType())
+                                        ? H_call_ctpop_i32
+                                        : H_call_ctpop_i64,
+                                    Dest, 1);
+    Call->addArg(Val);
+    lowerCall(Call);
+    // The popcount helpers always return 32-bit values, while the intrinsic's
+    // signature matches some 64-bit platform's native instructions and
+    // expect to fill a 64-bit reg. Thus, clear the upper bits of the dest
+    // just in case the user doesn't do that in the IR or doesn't toss the bits
+    // via truncate.
+    if (Val->getType() == IceType_i64) {
+      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      Constant *Zero = Ctx->getConstantZero(IceType_i32);
+      _mov(DestHi, Zero);
+    }
     return;
   }
   case Intrinsics::Ctlz: {
-    UnimplementedError(Func->getContext()->getFlags());
+    // The "is zero undef" parameter is ignored and we always return
+    // a well-defined value.
+    Operand *Val = Instr->getArg(0);
+    Variable *ValLoR;
+    Variable *ValHiR = nullptr;
+    if (Val->getType() == IceType_i64) {
+      ValLoR = legalizeToVar(loOperand(Val));
+      ValHiR = legalizeToVar(hiOperand(Val));
+    } else {
+      ValLoR = legalizeToVar(Val);
+    }
+    lowerCLZ(Instr->getDest(), ValLoR, ValHiR);
     return;
   }
   case Intrinsics::Cttz: {
-    UnimplementedError(Func->getContext()->getFlags());
+    // Essentially like Clz, but reverse the bits first.
+    Operand *Val = Instr->getArg(0);
+    Variable *ValLoR;
+    Variable *ValHiR = nullptr;
+    if (Val->getType() == IceType_i64) {
+      ValLoR = legalizeToVar(loOperand(Val));
+      ValHiR = legalizeToVar(hiOperand(Val));
+      Variable *TLo = makeReg(IceType_i32);
+      Variable *THi = makeReg(IceType_i32);
+      _rbit(TLo, ValLoR);
+      _rbit(THi, ValHiR);
+      ValLoR = THi;
+      ValHiR = TLo;
+    } else {
+      ValLoR = legalizeToVar(Val);
+      Variable *T = makeReg(IceType_i32);
+      _rbit(T, ValLoR);
+      ValLoR = T;
+    }
+    lowerCLZ(Instr->getDest(), ValLoR, ValHiR);
     return;
   }
   case Intrinsics::Fabs: {
@@ -2077,13 +2149,15 @@
     return;
   }
   case Intrinsics::Memset: {
-    // The value operand needs to be extended to a stack slot size
-    // because the PNaCl ABI requires arguments to be at least 32 bits
-    // wide.
+    // The value operand needs to be extended to a stack slot size because the
+    // PNaCl ABI requires arguments to be at least 32 bits wide.
     Operand *ValOp = Instr->getArg(1);
     assert(ValOp->getType() == IceType_i8);
     Variable *ValExt = Func->makeVariable(stackSlotType());
     lowerCast(InstCast::create(Func, InstCast::Zext, ValExt, ValOp));
+    // Technically, ARM has their own __aeabi_memset, but we can use plain
+    // memset too. The value and size argument need to be flipped if we ever
+    // decide to use __aeabi_memset.
     InstCall *Call = makeHelperCall(H_call_memset, nullptr, 3);
     Call->addArg(Instr->getArg(0));
     Call->addArg(ValExt);
@@ -2111,15 +2185,19 @@
     return;
   }
   case Intrinsics::Stacksave: {
-    UnimplementedError(Func->getContext()->getFlags());
+    Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
+    Variable *Dest = Instr->getDest();
+    _mov(Dest, SP);
     return;
   }
   case Intrinsics::Stackrestore: {
-    UnimplementedError(Func->getContext()->getFlags());
+    Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
+    Operand *Val = legalize(Instr->getArg(0), Legal_Reg | Legal_Flex);
+    _mov_nonkillable(SP, Val);
     return;
   }
   case Intrinsics::Trap:
-    UnimplementedError(Func->getContext()->getFlags());
+    _trap();
     return;
   case Intrinsics::UnknownIntrinsic:
     Func->setError("Should not be lowering UnknownIntrinsic");
@@ -2128,6 +2206,34 @@
   return;
 }
 
+void TargetARM32::lowerCLZ(Variable *Dest, Variable *ValLoR, Variable *ValHiR) {
+  Type Ty = Dest->getType();
+  assert(Ty == IceType_i32 || Ty == IceType_i64);
+  Variable *T = makeReg(IceType_i32);
+  _clz(T, ValLoR);
+  if (Ty == IceType_i64) {
+    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    Operand *Zero =
+        legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex);
+    Operand *ThirtyTwo =
+        legalize(Ctx->getConstantInt32(32), Legal_Reg | Legal_Flex);
+    _cmp(ValHiR, Zero);
+    Variable *T2 = makeReg(IceType_i32);
+    _add(T2, T, ThirtyTwo);
+    _clz(T2, ValHiR, CondARM32::NE);
+    // T2 is actually a source as well when the predicate is not AL
+    // (since it may leave T2 alone). We use set_dest_nonkillable to
+    // prolong the liveness of T2 as if it was used as a source.
+    _set_dest_nonkillable();
+    _mov(DestLo, T2);
+    _mov(DestHi, Ctx->getConstantZero(IceType_i32));
+    return;
+  }
+  _mov(Dest, T);
+  return;
+}
+
 void TargetARM32::lowerLoad(const InstLoad *Load) {
   // A Load instruction can be treated the same as an Assign
   // instruction, after the source operand is transformed into an
@@ -2186,7 +2292,7 @@
   // eliminated.  TODO: Are there more places where the fake use
   // should be inserted?  E.g. "void f(int n){while(1) g(n);}" may not
   // have a ret instruction.
-  Variable *SP = Func->getTarget()->getPhysicalRegister(RegARM32::Reg_sp);
+  Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
   Context.insert(InstFakeUse::create(Func, SP));
 }
 
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 8aa3e11..becb615 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -172,6 +172,8 @@
                     ExtInstr ExtFunc, DivInstr DivFunc,
                     const char *DivHelperName, bool IsRemainder);
 
+  void lowerCLZ(Variable *Dest, Variable *ValLo, Variable *ValHi);
+
   // The following are helpers that insert lowered ARM32 instructions
   // with minimal syntactic overhead, so that the lowering code can
   // look as close to assembly as practical.
@@ -224,6 +226,10 @@
             CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert(InstARM32Cmp::create(Func, Src0, Src1, Pred));
   }
+  void _clz(Variable *Dest, Variable *Src0,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Clz::create(Func, Dest, Src0, Pred));
+  }
   void _eor(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert(InstARM32Eor::create(Func, Dest, Src0, Src1, Pred));
@@ -301,6 +307,14 @@
     for (Variable *Dest : Dests)
       Context.insert(InstFakeDef::create(Func, Dest));
   }
+  void _rbit(Variable *Dest, Variable *Src0,
+             CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Rbit::create(Func, Dest, Src0, Pred));
+  }
+  void _rev(Variable *Dest, Variable *Src0,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Rev::create(Func, Dest, Src0, Pred));
+  }
   void _ret(Variable *LR, Variable *Src0 = nullptr) {
     Context.insert(InstARM32Ret::create(Func, LR, Src0));
   }
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 32d7d6b..c00f0b0 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -96,6 +96,7 @@
 
   using Machine::_bundle_lock;
   using Machine::_bundle_unlock;
+  using Machine::_set_dest_nonkillable;
   using Machine::getContext;
   using Machine::getStackAdjustment;
   using Machine::regAlloc;
@@ -587,9 +588,6 @@
   void _xor_rmw(typename Traits::X86OperandMem *DestSrc0, Operand *Src1) {
     Context.insert(Traits::Insts::XorRMW::create(Func, DestSrc0, Src1));
   }
-  void _set_dest_nonkillable() {
-    Context.getLastInserted()->setDestNonKillable();
-  }
 
   bool optimizeScalarMul(Variable *Dest, Operand *Src0, int32_t Src1);
   void findRMW();
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 68cbf94..a277db2 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -3521,9 +3521,8 @@
     return;
   }
   case Intrinsics::Memset: {
-    // The value operand needs to be extended to a stack slot size
-    // because the PNaCl ABI requires arguments to be at least 32 bits
-    // wide.
+    // The value operand needs to be extended to a stack slot size because the
+    // PNaCl ABI requires arguments to be at least 32 bits wide.
     Operand *ValOp = Instr->getArg(1);
     assert(ValOp->getType() == IceType_i8);
     Variable *ValExt = Func->template makeVariable(stackSlotType());
@@ -5257,8 +5256,7 @@
       _lea(Reg, Traits::X86OperandMem::create(Func, IceType_i32, Reg, Offset,
                                               nullptr, 0));
       // make sure liveness analysis won't kill this variable, otherwise a
-      // liveness
-      // assertion will be triggered.
+      // liveness assertion will be triggered.
       _set_dest_nonkillable();
       if (Immediate->getType() != IceType_i32) {
         Variable *TruncReg = makeReg(Immediate->getType(), RegNum);