ARM: Assign "actuals" at call site to the appropriate GPR/stack slot.

Actually assign arguments to r0-r3 at the call site. Previously
this was left unhandled. There was only logic for pulling
formal parameters out of r0-r3.

Refactor the GPR counter and move it into a class so that the
rounding up to an even register for i64 arguments lives in one
place, shared by call sites and by the code that pulls formal
parameters out of registers. We might be able to use a similar
pattern to count the FP/SIMD registers later.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1187513006.
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 9bb2386..571e2a3 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -120,9 +120,6 @@
 #undef X
 } // end of namespace dummy1
 
-// The maximum number of arguments to pass in GPR registers.
-const uint32_t ARM32_MAX_GPR_ARG = 4;
-
 // Stack alignment
 const uint32_t ARM32_STACK_ALIGNMENT_BYTES = 16;
 
@@ -132,6 +129,18 @@
   return Utils::applyAlignment(Value, ARM32_STACK_ALIGNMENT_BYTES);
 }
 
+// Value is in bytes. Return Value adjusted to the next highest multiple
+// of the stack alignment required for the given type.
+uint32_t applyStackAlignmentTy(uint32_t Value, Type Ty) {
+  // Use natural alignment, except that normally (non-NaCl) ARM only
+  // aligns vectors to 8 bytes.
+  // TODO(jvoung): Check this ...
+  size_t typeAlignInBytes = typeWidthInBytes(Ty);
+  if (isVectorType(Ty))
+    typeAlignInBytes = 8;
+  return Utils::applyAlignment(Value, typeAlignInBytes);
+}
+
 } // end of anonymous namespace
 
 TargetARM32::TargetARM32(Cfg *Func)
@@ -377,7 +386,7 @@
     Offset += getStackAdjustment();
   // TODO(jvoung): Handle out of range. Perhaps we need a scratch register
   // to materialize a larger offset.
-  const bool SignExt = false;
+  constexpr bool SignExt = false;
   if (!OperandARM32Mem::canHoldOffset(Var->getType(), SignExt, Offset)) {
     llvm::report_fatal_error("Illegal stack offset");
   }
@@ -389,13 +398,39 @@
   Str << "]";
 }
 
+bool TargetARM32::CallingConv::I64InRegs(std::pair<int32_t, int32_t> *Regs) {
+  if (NumGPRRegsUsed >= ARM32_MAX_GPR_ARG)
+    return false;
+  int32_t RegLo, RegHi;
+  // Always start i64 registers at an even register, so this may end
+  // up padding away a register.
+  if (NumGPRRegsUsed % 2 != 0) {
+    ++NumGPRRegsUsed;
+  }
+  RegLo = RegARM32::Reg_r0 + NumGPRRegsUsed;
+  ++NumGPRRegsUsed;
+  RegHi = RegARM32::Reg_r0 + NumGPRRegsUsed;
+  ++NumGPRRegsUsed;
+  // If this bumps us past the boundary, don't allocate to a register
+  // and leave any previously speculatively consumed registers as consumed.
+  if (NumGPRRegsUsed > ARM32_MAX_GPR_ARG)
+    return false;
+  Regs->first = RegLo;
+  Regs->second = RegHi;
+  return true;
+}
+
+bool TargetARM32::CallingConv::I32InReg(int32_t *Reg) {
+  if (NumGPRRegsUsed >= ARM32_MAX_GPR_ARG)
+    return false;
+  *Reg = RegARM32::Reg_r0 + NumGPRRegsUsed;
+  ++NumGPRRegsUsed;
+  return true;
+}
+
 void TargetARM32::lowerArguments() {
   VarList &Args = Func->getArgs();
-  // The first few integer type parameters can use r0-r3, regardless of their
-  // position relative to the floating-point/vector arguments in the argument
-  // list. Floating-point and vector arguments can use q0-q3 (aka d0-d7,
-  // s0-s15).
-  unsigned NumGPRRegsUsed = 0;
+  TargetARM32::CallingConv CC;
 
   // For each register argument, replace Arg in the argument list with the
   // home register.  Then generate an instruction in the prolog to copy the
@@ -414,22 +449,8 @@
       UnimplementedError(Func->getContext()->getFlags());
       continue;
     } else if (Ty == IceType_i64) {
-      if (NumGPRRegsUsed >= ARM32_MAX_GPR_ARG)
-        continue;
-      int32_t RegLo;
-      int32_t RegHi;
-      // Always start i64 registers at an even register, so this may end
-      // up padding away a register.
-      if (NumGPRRegsUsed % 2 != 0) {
-        ++NumGPRRegsUsed;
-      }
-      RegLo = RegARM32::Reg_r0 + NumGPRRegsUsed;
-      ++NumGPRRegsUsed;
-      RegHi = RegARM32::Reg_r0 + NumGPRRegsUsed;
-      ++NumGPRRegsUsed;
-      // If this bumps us past the boundary, don't allocate to a register
-      // and leave any previously speculatively consumed registers as consumed.
-      if (NumGPRRegsUsed > ARM32_MAX_GPR_ARG)
+      std::pair<int32_t, int32_t> RegPair;
+      if (!CC.I64InRegs(&RegPair))
         continue;
       Variable *RegisterArg = Func->makeVariable(Ty);
       Variable *RegisterLo = Func->makeVariable(IceType_i32);
@@ -439,9 +460,9 @@
         RegisterLo->setName(Func, "home_reg_lo:" + Arg->getName(Func));
         RegisterHi->setName(Func, "home_reg_hi:" + Arg->getName(Func));
       }
-      RegisterLo->setRegNum(RegLo);
+      RegisterLo->setRegNum(RegPair.first);
       RegisterLo->setIsArg();
-      RegisterHi->setRegNum(RegHi);
+      RegisterHi->setRegNum(RegPair.second);
       RegisterHi->setIsArg();
       RegisterArg->setLoHi(RegisterLo, RegisterHi);
       RegisterArg->setIsArg();
@@ -452,10 +473,9 @@
       continue;
     } else {
       assert(Ty == IceType_i32);
-      if (NumGPRRegsUsed >= ARM32_MAX_GPR_ARG)
+      int32_t RegNum;
+      if (!CC.I32InReg(&RegNum))
         continue;
-      int32_t RegNum = RegARM32::Reg_r0 + NumGPRRegsUsed;
-      ++NumGPRRegsUsed;
       Variable *RegisterArg = Func->makeVariable(Ty);
       if (ALLOW_DUMP) {
         RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
@@ -492,9 +512,7 @@
     finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes);
     return;
   }
-  if (isVectorType(Ty)) {
-    InArgsSizeBytes = applyStackAlignment(InArgsSizeBytes);
-  }
+  InArgsSizeBytes = applyStackAlignmentTy(InArgsSizeBytes, Ty);
   Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
   InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
   // If the argument variable has been assigned a register, we need to load
@@ -672,9 +690,10 @@
 
   const VarList &Args = Func->getArgs();
   size_t InArgsSizeBytes = 0;
-  unsigned NumGPRArgs = 0;
+  TargetARM32::CallingConv CC;
   for (Variable *Arg : Args) {
     Type Ty = Arg->getType();
+    bool InRegs = false;
     // Skip arguments passed in registers.
     if (isVectorType(Ty)) {
       UnimplementedError(Func->getContext()->getFlags());
@@ -682,19 +701,16 @@
     } else if (isFloatingType(Ty)) {
       UnimplementedError(Func->getContext()->getFlags());
       continue;
-    } else if (Ty == IceType_i64 && NumGPRArgs < ARM32_MAX_GPR_ARG) {
-      // Start at an even register.
-      if (NumGPRArgs % 2 == 1) {
-        ++NumGPRArgs;
-      }
-      NumGPRArgs += 2;
-      if (NumGPRArgs <= ARM32_MAX_GPR_ARG)
-        continue;
-    } else if (NumGPRArgs < ARM32_MAX_GPR_ARG) {
-      ++NumGPRArgs;
-      continue;
+    } else if (Ty == IceType_i64) {
+      std::pair<int32_t, int32_t> DummyRegs;
+      InRegs = CC.I64InRegs(&DummyRegs);
+    } else {
+      assert(Ty == IceType_i32);
+      int32_t DummyReg;
+      InRegs = CC.I32InReg(&DummyReg);
     }
-    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+    if (!InRegs)
+      finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
   }
 
   // Fill in stack offsets for locals.
@@ -1314,10 +1330,97 @@
 
 void TargetARM32::lowerCall(const InstCall *Instr) {
   MaybeLeafFunc = false;
+  NeedsStackAlignment = true;
 
-  // TODO(jvoung): assign arguments to registers and stack. Also reserve stack.
-  if (Instr->getNumArgs()) {
-    UnimplementedError(Func->getContext()->getFlags());
+  // Assign arguments to registers and stack. Also reserve stack.
+  TargetARM32::CallingConv CC;
+  // Pair of Arg Operand -> GPR number assignments.
+  llvm::SmallVector<std::pair<Operand *, int32_t>,
+                    TargetARM32::CallingConv::ARM32_MAX_GPR_ARG> GPRArgs;
+  // Pair of Arg Operand -> stack offset.
+  llvm::SmallVector<std::pair<Operand *, int32_t>, 8> StackArgs;
+  int32_t ParameterAreaSizeBytes = 0;
+
+  // Classify each argument operand according to the location where the
+  // argument is passed.
+  for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
+    Operand *Arg = Instr->getArg(i);
+    Type Ty = Arg->getType();
+    bool InRegs = false;
+    if (isVectorType(Ty)) {
+      UnimplementedError(Func->getContext()->getFlags());
+    } else if (isFloatingType(Ty)) {
+      UnimplementedError(Func->getContext()->getFlags());
+    } else if (Ty == IceType_i64) {
+      std::pair<int32_t, int32_t> Regs;
+      if (CC.I64InRegs(&Regs)) {
+        InRegs = true;
+        Operand *Lo = loOperand(Arg);
+        Operand *Hi = hiOperand(Arg);
+        GPRArgs.push_back(std::make_pair(Lo, Regs.first));
+        GPRArgs.push_back(std::make_pair(Hi, Regs.second));
+      }
+    } else {
+      assert(Ty == IceType_i32);
+      int32_t Reg;
+      if (CC.I32InReg(&Reg)) {
+        InRegs = true;
+        GPRArgs.push_back(std::make_pair(Arg, Reg));
+      }
+    }
+
+    if (!InRegs) {
+      ParameterAreaSizeBytes =
+          applyStackAlignmentTy(ParameterAreaSizeBytes, Ty);
+      StackArgs.push_back(std::make_pair(Arg, ParameterAreaSizeBytes));
+      ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
+    }
+  }
+
+  // Adjust the parameter area so that the stack is aligned.  It is
+  // assumed that the stack is already aligned at the start of the
+  // calling sequence.
+  ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
+
+  // Subtract the appropriate amount for the argument area.  This also
+  // takes care of setting the stack adjustment during emission.
+  //
+  // TODO: If for some reason the call instruction gets dead-code
+  // eliminated after lowering, we would need to ensure that the
+  // pre-call and the post-call sp adjustments get eliminated as well.
+  if (ParameterAreaSizeBytes) {
+    Operand *SubAmount = legalize(Ctx->getConstantInt32(ParameterAreaSizeBytes),
+                                  Legal_Reg | Legal_Flex);
+    _adjust_stack(ParameterAreaSizeBytes, SubAmount);
+  }
+
+  // Copy arguments that are passed on the stack to the appropriate
+  // stack locations.
+  Variable *SP = Func->getTarget()->getPhysicalRegister(RegARM32::Reg_sp);
+  for (auto &StackArg : StackArgs) {
+    ConstantInteger32 *Loc =
+        llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(StackArg.second));
+    Type Ty = StackArg.first->getType();
+    OperandARM32Mem *Addr;
+    constexpr bool SignExt = false;
+    if (OperandARM32Mem::canHoldOffset(Ty, SignExt, StackArg.second)) {
+      Addr = OperandARM32Mem::create(Func, Ty, SP, Loc);
+    } else {
+      Variable *NewBase = Func->makeVariable(SP->getType());
+      lowerArithmetic(
+          InstArithmetic::create(Func, InstArithmetic::Add, NewBase, SP, Loc));
+      Addr = formMemoryOperand(NewBase, Ty);
+    }
+    lowerStore(InstStore::create(Func, StackArg.first, Addr));
+  }
+
+  // Copy arguments to be passed in registers to the appropriate registers.
+  for (auto &GPRArg : GPRArgs) {
+    Variable *Reg = legalizeToVar(GPRArg.first, GPRArg.second);
+    // Generate a FakeUse of register arguments so that they do not get
+    // dead code eliminated as a result of the FakeKill of scratch
+    // registers after the call.
+    Context.insert(InstFakeUse::create(Func, Reg));
   }
 
   // Generate the call instruction.  Assign its result to a temporary
@@ -1361,6 +1464,9 @@
     }
   }
   Operand *CallTarget = Instr->getCallTarget();
+  // TODO(jvoung): Handle sandboxing.
+  // const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
+
   // Allow ConstantRelocatable to be left alone as a direct call,
   // but force other constants like ConstantInteger32 to be in
   // a register and make it an indirect call.
@@ -1372,6 +1478,15 @@
   if (ReturnRegHi)
     Context.insert(InstFakeDef::create(Func, ReturnRegHi));
 
+  // Add the appropriate offset to SP.  The call instruction takes care
+  // of resetting the stack offset during emission.
+  if (ParameterAreaSizeBytes) {
+    Operand *AddAmount = legalize(Ctx->getConstantInt32(ParameterAreaSizeBytes),
+                                  Legal_Reg | Legal_Flex);
+    Variable *SP = Func->getTarget()->getPhysicalRegister(RegARM32::Reg_sp);
+    _add(SP, SP, AddAmount);
+  }
+
   // Insert a register-kill pseudo instruction.
   Context.insert(InstFakeKill::create(Func, NewCall));