Subzero. ARM32. Combine allocas.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1465213002 .
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 12810f6..f23609b 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -265,7 +265,7 @@
 }
 
 void TargetARM32::findMaxStackOutArgsSize() {
-  // MinNeededOutArgsBytes should be updated if the Target ever creates an
+  // MinNeededOutArgsBytes should be updated if the Target ever creates a
   // high-level InstCall that requires more stack bytes.
   constexpr size_t MinNeededOutArgsBytes = 0;
   MaxOutArgsSizeBytes = MinNeededOutArgsBytes;
@@ -291,7 +291,7 @@
   findMaxStackOutArgsSize();
 
   // Do not merge Alloca instructions, and lay out the stack.
-  static constexpr bool SortAndCombineAllocas = false;
+  static constexpr bool SortAndCombineAllocas = true;
   Func->processAllocas(SortAndCombineAllocas);
   Func->dump("After Alloca processing");
 
@@ -356,6 +356,7 @@
   regAlloc(RAK_Global);
   if (Func->hasError())
     return;
+
   copyRegAllocFromInfWeightVariable64On32(Func->getVariables());
   Func->dump("After linear scan regalloc");
 
@@ -364,6 +365,8 @@
     Func->dump("After advanced Phi lowering");
   }
 
+  ForbidTemporaryWithoutReg _(this);
+
   // Stack frame mapping.
   Func->genFrame();
   if (Func->hasError())
@@ -399,8 +402,8 @@
   findMaxStackOutArgsSize();
 
   // Do not merge Alloca instructions, and lay out the stack.
-  static constexpr bool SortAndCombineAllocas = false;
-  Func->processAllocas(SortAndCombineAllocas);
+  static constexpr bool DontSortAndCombineAllocas = false;
+  Func->processAllocas(DontSortAndCombineAllocas);
   Func->dump("After Alloca processing");
 
   Func->placePhiLoads();
@@ -424,9 +427,12 @@
   regAlloc(RAK_InfOnly);
   if (Func->hasError())
     return;
+
   copyRegAllocFromInfWeightVariable64On32(Func->getVariables());
   Func->dump("After regalloc of infinite-weight variables");
 
+  ForbidTemporaryWithoutReg _(this);
+
   Func->genFrame();
   if (Func->hasError())
     return;
@@ -520,6 +526,7 @@
     llvm::report_fatal_error(
         "Infinite-weight Variable has no register assigned");
   }
+  assert(!Var->isRematerializable());
   int32_t Offset = Var->getStackOffset();
   int32_t BaseRegNum = Var->getBaseRegNum();
   if (BaseRegNum == Variable::NoRegister) {
@@ -850,6 +857,9 @@
     SpillAreaSizeBytes = StackSize - StackOffset;
   }
 
+  // Combine fixed alloca with SpillAreaSize.
+  SpillAreaSizeBytes += FixedAllocaSizeBytes;
+
   // Generate "sub sp, SpillAreaSizeBytes"
   if (SpillAreaSizeBytes) {
     // Use the scratch register if needed to legalize the immediate.
@@ -857,7 +867,11 @@
                                   Legal_Reg | Legal_Flex, getReservedTmpReg());
     Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
     _sub(SP, SP, SubAmount);
+    if (FixedAllocaAlignBytes > ARM32_STACK_ALIGNMENT_BYTES) {
+      alignRegisterPow2(SP, FixedAllocaAlignBytes);
+    }
   }
+
   Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
 
   // Fill in stack offsets for stack args, and copy args into registers for
@@ -1034,6 +1048,7 @@
                                                Variable *OrigBaseReg,
                                                Variable **NewBaseReg,
                                                int32_t *NewBaseOffset) {
+  assert(!OrigBaseReg->isRematerializable());
   if (isLegalMemOffset(Ty, Offset)) {
     return OperandARM32Mem::create(
         Func, Ty, OrigBaseReg,
@@ -1053,6 +1068,7 @@
     OffsetDiff = 0;
   }
 
+  assert(!(*NewBaseReg)->isRematerializable());
   return OperandARM32Mem::create(
       Func, Ty, *NewBaseReg,
       llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(OffsetDiff)),
@@ -1076,8 +1092,9 @@
 
   bool Legalized = false;
   if (!Dest->hasReg()) {
-    auto *const SrcR = llvm::cast<Variable>(Src);
+    auto *SrcR = llvm::cast<Variable>(Src);
     assert(SrcR->hasReg());
+    assert(!SrcR->isRematerializable());
     const int32_t Offset = Dest->getStackOffset();
     // This is a _mov(Mem(), Variable), i.e., a store.
     _str(SrcR, createMemOperand(DestTy, Offset, OrigBaseReg, NewBaseReg,
@@ -1087,12 +1104,26 @@
     Context.insert(InstFakeDef::create(Func, Dest));
     Legalized = true;
   } else if (auto *Var = llvm::dyn_cast<Variable>(Src)) {
-    if (!Var->hasReg()) {
-      const int32_t Offset = Var->getStackOffset();
-      _ldr(Dest, createMemOperand(DestTy, Offset, OrigBaseReg, NewBaseReg,
-                                  NewBaseOffset),
-           MovInstr->getPredicate());
+    if (Var->isRematerializable()) {
+      // Rematerialization arithmetic.
+      const int32_t ExtraOffset =
+          (static_cast<SizeT>(Var->getRegNum()) == getFrameReg())
+              ? getFrameFixedAllocaOffset()
+              : 0;
+
+      const int32_t Offset = Var->getStackOffset() + ExtraOffset;
+      Operand *OffsetRF = legalize(Ctx->getConstantInt32(Offset),
+                                   Legal_Reg | Legal_Flex, Dest->getRegNum());
+      _add(Dest, Var, OffsetRF);
       Legalized = true;
+    } else {
+      if (!Var->hasReg()) {
+        const int32_t Offset = Var->getStackOffset();
+        _ldr(Dest, createMemOperand(DestTy, Offset, OrigBaseReg, NewBaseReg,
+                                    NewBaseOffset),
+             MovInstr->getPredicate());
+        Legalized = true;
+      }
     }
   }
 
@@ -1163,13 +1194,15 @@
     // increment) in case of duplication.
     assert(Mem->getAddrMode() == OperandARM32Mem::Offset ||
            Mem->getAddrMode() == OperandARM32Mem::NegOffset);
+    Variable *BaseR = legalizeToReg(Mem->getBase());
     if (Mem->isRegReg()) {
-      return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(),
-                                     Mem->getIndex(), Mem->getShiftOp(),
-                                     Mem->getShiftAmt(), Mem->getAddrMode());
+      Variable *IndexR = legalizeToReg(Mem->getIndex());
+      return OperandARM32Mem::create(Func, IceType_i32, BaseR, IndexR,
+                                     Mem->getShiftOp(), Mem->getShiftAmt(),
+                                     Mem->getAddrMode());
     } else {
-      return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(),
-                                     Mem->getOffset(), Mem->getAddrMode());
+      return OperandARM32Mem::create(Func, IceType_i32, BaseR, Mem->getOffset(),
+                                     Mem->getAddrMode());
     }
   }
   llvm_unreachable("Unsupported operand type");
@@ -1201,7 +1234,9 @@
       Variable *NewBase = Func->makeVariable(Base->getType());
       lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add, NewBase,
                                              Base, Four));
-      return OperandARM32Mem::create(Func, SplitType, NewBase, Mem->getIndex(),
+      Variable *BaseR = legalizeToReg(NewBase);
+      Variable *IndexR = legalizeToReg(Mem->getIndex());
+      return OperandARM32Mem::create(Func, SplitType, BaseR, IndexR,
                                      Mem->getShiftOp(), Mem->getShiftAmt(),
                                      Mem->getAddrMode());
     } else {
@@ -1216,16 +1251,17 @@
         // mode into a RegReg addressing mode. Since NaCl sandboxing disallows
         // RegReg addressing modes, prefer adding to base and replacing
         // instead. Thus we leave the old offset alone.
-        Constant *Four = Ctx->getConstantInt32(4);
+        Constant *_4 = Ctx->getConstantInt32(4);
         Variable *NewBase = Func->makeVariable(Base->getType());
         lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add,
-                                               NewBase, Base, Four));
+                                               NewBase, Base, _4));
         Base = NewBase;
       } else {
         Offset =
             llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(NextOffsetVal));
       }
-      return OperandARM32Mem::create(Func, SplitType, Base, Offset,
+      Variable *BaseR = legalizeToReg(Base);
+      return OperandARM32Mem::create(Func, SplitType, BaseR, Offset,
                                      Mem->getAddrMode());
     }
   }
@@ -1264,7 +1300,6 @@
 }
 
 void TargetARM32::lowerAlloca(const InstAlloca *Inst) {
-  UsesFramePointer = true;
   // Conservatively require the stack to be aligned. Some stack adjustment
   // operations implemented below assume that the stack is aligned before the
   // alloca. All the alloca code ensures that the stack alignment is preserved
@@ -1272,29 +1307,53 @@
   // cases.
   NeedsStackAlignment = true;
 
-  // TODO(stichnot): minimize the number of adjustments of SP, etc.
-  Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
-  Variable *Dest = Inst->getDest();
-  uint32_t AlignmentParam = Inst->getAlignInBytes();
   // For default align=0, set it to the real value 1, to avoid any
   // bit-manipulation problems below.
-  AlignmentParam = std::max(AlignmentParam, 1u);
+  const uint32_t AlignmentParam = std::max(1u, Inst->getAlignInBytes());
 
   // LLVM enforces power of 2 alignment.
   assert(llvm::isPowerOf2_32(AlignmentParam));
   assert(llvm::isPowerOf2_32(ARM32_STACK_ALIGNMENT_BYTES));
 
-  uint32_t Alignment = std::max(AlignmentParam, ARM32_STACK_ALIGNMENT_BYTES);
-  if (Alignment > ARM32_STACK_ALIGNMENT_BYTES) {
+  const uint32_t Alignment =
+      std::max(AlignmentParam, ARM32_STACK_ALIGNMENT_BYTES);
+  const bool OverAligned = Alignment > ARM32_STACK_ALIGNMENT_BYTES;
+  const bool OptM1 = Ctx->getFlags().getOptLevel() == Opt_m1;
+  const bool AllocaWithKnownOffset = Inst->getKnownFrameOffset();
+  const bool UseFramePointer =
+      hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;
+
+  if (UseFramePointer)
+    setHasFramePointer();
+
+  Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
+  if (OverAligned) {
     alignRegisterPow2(SP, Alignment);
   }
+
+  Variable *Dest = Inst->getDest();
   Operand *TotalSize = Inst->getSizeInBytes();
+
   if (const auto *ConstantTotalSize =
           llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
-    uint32_t Value = ConstantTotalSize->getValue();
-    Value = Utils::applyAlignment(Value, Alignment);
-    Operand *SubAmount = legalize(Ctx->getConstantInt32(Value));
-    _sub(SP, SP, SubAmount);
+    const uint32_t Value =
+        Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
+    // Constant size alloca.
+    if (!UseFramePointer) {
+      // If we don't need a Frame Pointer, this alloca has a known offset to the
+      // stack pointer. We don't need to adjust the stack pointer, nor assign any
+      // value to Dest, as Dest is rematerializable.
+      assert(Dest->isRematerializable());
+      FixedAllocaSizeBytes += Value;
+      Context.insert(InstFakeDef::create(Func, Dest));
+      return;
+    }
+
+    // If a frame pointer is required, then we need to store the alloca'd result
+    // in Dest.
+    Operand *SubAmountRF =
+        legalize(Ctx->getConstantInt32(Value), Legal_Reg | Legal_Flex);
+    _sub(SP, SP, SubAmountRF);
   } else {
     // Non-constant sizes need to be adjusted to the next highest multiple of
     // the required alignment at runtime.
@@ -1306,6 +1365,8 @@
     alignRegisterPow2(T, Alignment);
     _sub(SP, SP, T);
   }
+
+  // Adds back a few bytes to SP to account for the out args area.
   Variable *T = SP;
   if (MaxOutArgsSizeBytes != 0) {
     T = makeReg(getPointerType());
@@ -1313,6 +1374,7 @@
         Ctx->getConstantInt32(MaxOutArgsSizeBytes), Legal_Reg | Legal_Flex);
     _add(T, SP, OutArgsSizeRF);
   }
+
   _mov(Dest, T);
 }
 
@@ -1976,6 +2038,12 @@
 
 void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
   Variable *Dest = Inst->getDest();
+
+  if (Dest->isRematerializable()) {
+    Context.insert(InstFakeDef::create(Func, Dest));
+    return;
+  }
+
   if (Dest->getType() == IceType_i1) {
     lowerInt1Arithmetic(Inst);
     return;
@@ -2139,8 +2207,8 @@
     if (Srcs.hasConstOperand()) {
       // TODO(jpp): lowering Src0R here is wrong -- Src0R it is not guaranteed
       // to be used.
-      Variable *Src0R = Srcs.src0R(this);
       if (Srcs.immediateIsFlexEncodable()) {
+        Variable *Src0R = Srcs.src0R(this);
         Operand *Src1RF = Srcs.src1RF(this);
         if (Srcs.swappedOperands()) {
           _rsb(T, Src0R, Src1RF);
@@ -2151,6 +2219,7 @@
         return;
       }
       if (!Srcs.swappedOperands() && Srcs.negatedImmediateIsFlexEncodable()) {
+        Variable *Src0R = Srcs.src0R(this);
         Operand *Src1F = Srcs.negatedSrc1F(this);
         _add(T, Src0R, Src1F);
         _mov(Dest, T);
@@ -2215,6 +2284,12 @@
 
 void TargetARM32::lowerAssign(const InstAssign *Inst) {
   Variable *Dest = Inst->getDest();
+
+  if (Dest->isRematerializable()) {
+    Context.insert(InstFakeDef::create(Func, Dest));
+    return;
+  }
+
   Operand *Src0 = Inst->getSrc(0);
   assert(Dest->getType() == Src0->getType());
   if (Dest->getType() == IceType_i64) {
@@ -4425,13 +4500,17 @@
   assert(OffsetImm < 0 ? (ValidImmMask & -OffsetImm) == -OffsetImm
                        : (ValidImmMask & OffsetImm) == OffsetImm);
 
+  Variable *BaseR = makeReg(getPointerType());
+  Context.insert(InstAssign::create(Func, BaseR, BaseVar));
   if (OffsetReg != nullptr) {
-    return OperandARM32Mem::create(Func, Ty, BaseVar, OffsetReg, ShiftKind,
+    Variable *OffsetR = makeReg(getPointerType());
+    Context.insert(InstAssign::create(Func, OffsetR, OffsetReg));
+    return OperandARM32Mem::create(Func, Ty, BaseR, OffsetR, ShiftKind,
                                    OffsetRegShamt);
   }
 
   return OperandARM32Mem::create(
-      Func, Ty, BaseVar,
+      Func, Ty, BaseR,
       llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(OffsetImm)));
 }
 
@@ -4630,7 +4709,8 @@
   if (RegNum == Variable::NoRegister) {
     if (Variable *Subst = getContext().availabilityGet(From)) {
       // At this point we know there is a potential substitution available.
-      if (Subst->mustHaveReg() && !Subst->hasReg()) {
+      if (!Subst->isRematerializable() && Subst->mustHaveReg() &&
+          !Subst->hasReg()) {
         // At this point we know the substitution will have a register.
         if (From->getType() == Subst->getType()) {
           // At this point we know the substitution's register is compatible.
@@ -4788,6 +4868,13 @@
   }
 
   if (auto *Var = llvm::dyn_cast<Variable>(From)) {
+    if (Var->isRematerializable()) {
+      // TODO(jpp): We don't need to rematerialize Var if legalize() was invoked
+      // for a Variable in a Mem operand.
+      Variable *T = makeReg(Var->getType(), RegNum);
+      _mov(T, Var);
+      return T;
+    }
     // Check if the variable is guaranteed a physical register. This can happen
     // either when the variable is pre-colored or when it is assigned infinite
     // weight.
@@ -4844,9 +4931,9 @@
   // If we didn't do address mode optimization, then we only have a
   // base/offset to work with. ARM always requires a base register, so
   // just use that to hold the operand.
-  Variable *Base = legalizeToReg(Operand);
+  Variable *BaseR = legalizeToReg(Operand);
   return OperandARM32Mem::create(
-      Func, Ty, Base,
+      Func, Ty, BaseR,
       llvm::cast<ConstantInteger32>(Ctx->getConstantZero(IceType_i32)));
 }
 
@@ -4863,6 +4950,7 @@
 Variable *TargetARM32::makeReg(Type Type, int32_t RegNum) {
   // There aren't any 64-bit integer registers for ARM32.
   assert(Type != IceType_i64);
+  assert(AllowTemporaryWithNoReg || RegNum != Variable::NoRegister);
   Variable *Reg = Func->makeVariable(Type);
   if (RegNum == Variable::NoRegister)
     Reg->setMustHaveReg();
@@ -4871,7 +4959,8 @@
   return Reg;
 }
 
-void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align) {
+void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align,
+                                    int32_t TmpRegNum) {
   assert(llvm::isPowerOf2_32(Align));
   uint32_t RotateAmt;
   uint32_t Immed_8;
@@ -4880,10 +4969,12 @@
   // it fits at all). Assume Align is usually small, in which case BIC works
   // better. Thus, this rounds down to the alignment.
   if (OperandARM32FlexImm::canHoldImm(Align - 1, &RotateAmt, &Immed_8)) {
-    Mask = legalize(Ctx->getConstantInt32(Align - 1), Legal_Reg | Legal_Flex);
+    Mask = legalize(Ctx->getConstantInt32(Align - 1), Legal_Reg | Legal_Flex,
+                    TmpRegNum);
     _bic(Reg, Reg, Mask);
   } else {
-    Mask = legalize(Ctx->getConstantInt32(-Align), Legal_Reg | Legal_Flex);
+    Mask = legalize(Ctx->getConstantInt32(-Align), Legal_Reg | Legal_Flex,
+                    TmpRegNum);
     _and(Reg, Reg, Mask);
   }
 }