Subzero ARM: lower alloca instruction.
Lower alloca in a way similar to x86. Subtract the stack pointer
and align it if needed, then copy the resulting stack address to dest.
Sometimes use "bic" for the mask, sometimes use "and",
depending on which form's immediate encoding fits.
BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org
Review URL: https://codereview.chromium.org/1156713003
diff --git a/src/IceInstARM32.cpp b/src/IceInstARM32.cpp
index 16d3ac1..2c33c43 100644
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -290,6 +290,7 @@
template <> const char *InstARM32Adc::Opcode = "adc";
template <> const char *InstARM32Add::Opcode = "add";
template <> const char *InstARM32And::Opcode = "and";
+template <> const char *InstARM32Bic::Opcode = "bic";
template <> const char *InstARM32Eor::Opcode = "eor";
template <> const char *InstARM32Lsl::Opcode = "lsl";
template <> const char *InstARM32Mul::Opcode = "mul";
diff --git a/src/IceInstARM32.h b/src/IceInstARM32.h
index 1ee1831..e65ef4a 100644
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -252,6 +252,7 @@
Adc,
Add,
And,
+ Bic,
Br,
Call,
Cmp,
@@ -510,6 +511,7 @@
typedef InstARM32ThreeAddrGPR<InstARM32::Adc> InstARM32Adc;
typedef InstARM32ThreeAddrGPR<InstARM32::Add> InstARM32Add;
typedef InstARM32ThreeAddrGPR<InstARM32::And> InstARM32And;
+typedef InstARM32ThreeAddrGPR<InstARM32::Bic> InstARM32Bic;
typedef InstARM32ThreeAddrGPR<InstARM32::Eor> InstARM32Eor;
typedef InstARM32ThreeAddrGPR<InstARM32::Lsl> InstARM32Lsl;
typedef InstARM32ThreeAddrGPR<InstARM32::Mul> InstARM32Mul;
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 26f01f9..2305a1b 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -123,6 +123,9 @@
// The maximum number of arguments to pass in GPR registers.
const uint32_t ARM32_MAX_GPR_ARG = 4;
+// Stack alignment
+const uint32_t ARM32_STACK_ALIGNMENT_BYTES = 16;
+
} // end of anonymous namespace
TargetARM32::TargetARM32(Cfg *Func)
@@ -607,8 +610,42 @@
// stack alignment is preserved after the alloca. The stack alignment
// restriction can be relaxed in some cases.
NeedsStackAlignment = true;
- (void)Inst;
- UnimplementedError(Func->getContext()->getFlags());
+
+ // TODO(stichnot): minimize the number of adjustments of SP, etc.
+ Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
+ Variable *Dest = Inst->getDest();
+ uint32_t AlignmentParam = Inst->getAlignInBytes();
+ // For default align=0, set it to the real value 1, to avoid any
+ // bit-manipulation problems below.
+ AlignmentParam = std::max(AlignmentParam, 1u);
+
+ // LLVM enforces power of 2 alignment.
+ assert(llvm::isPowerOf2_32(AlignmentParam));
+ assert(llvm::isPowerOf2_32(ARM32_STACK_ALIGNMENT_BYTES));
+
+ uint32_t Alignment = std::max(AlignmentParam, ARM32_STACK_ALIGNMENT_BYTES);
+ if (Alignment > ARM32_STACK_ALIGNMENT_BYTES) {
+ alignRegisterPow2(SP, Alignment);
+ }
+ Operand *TotalSize = Inst->getSizeInBytes();
+ if (const auto *ConstantTotalSize =
+ llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
+ uint32_t Value = ConstantTotalSize->getValue();
+ Value = Utils::applyAlignment(Value, Alignment);
+ Operand *SubAmount = legalize(Ctx->getConstantInt32(Value));
+ _sub(SP, SP, SubAmount);
+ } else {
+ // Non-constant sizes need to be adjusted to the next highest
+ // multiple of the required alignment at runtime.
+ TotalSize = legalize(TotalSize);
+ Variable *T = makeReg(IceType_i32);
+ _mov(T, TotalSize);
+ Operand *AddAmount = legalize(Ctx->getConstantInt32(Alignment - 1));
+ _add(T, T, AddAmount);
+ alignRegisterPow2(T, Alignment);
+ _sub(SP, SP, T);
+ }
+ _mov(Dest, SP);
}
void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
@@ -1528,6 +1565,23 @@
return Reg;
}
+void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align) {
+ assert(llvm::isPowerOf2_32(Align));
+ uint32_t RotateAmt = 0;
+ uint32_t Immed_8;
+ Operand *Mask;
+ // Use AND or BIC to mask off the bits, depending on which immediate fits
+ // (if it fits at all). Assume Align is usually small, in which case BIC
+ // works better.
+ if (OperandARM32FlexImm::canHoldImm(Align - 1, &RotateAmt, &Immed_8)) {
+ Mask = legalize(Ctx->getConstantInt32(Align - 1), Legal_Reg | Legal_Flex);
+ _bic(Reg, Reg, Mask);
+ } else {
+ Mask = legalize(Ctx->getConstantInt32(-Align), Legal_Reg | Legal_Flex);
+ _and(Reg, Reg, Mask);
+ }
+}
+
void TargetARM32::postLower() {
if (Ctx->getFlags().getOptLevel() == Opt_m1)
return;
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 04d5984..019a3e0 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -118,6 +118,7 @@
Variable *makeReg(Type Ty, int32_t RegNum = Variable::NoRegister);
static Type stackSlotType();
Variable *copyToReg(Operand *Src, int32_t RegNum = Variable::NoRegister);
+ void alignRegisterPow2(Variable *Reg, uint32_t Align);
// Returns a vector in a register with the given constant entries.
Variable *makeVectorOfZeros(Type Ty, int32_t RegNum = Variable::NoRegister);
@@ -148,6 +149,10 @@
CondARM32::Cond Pred = CondARM32::AL) {
Context.insert(InstARM32And::create(Func, Dest, Src0, Src1, Pred));
}
+ void _bic(Variable *Dest, Variable *Src0, Operand *Src1,
+ CondARM32::Cond Pred = CondARM32::AL) {
+ Context.insert(InstARM32Bic::create(Func, Dest, Src0, Src1, Pred));
+ }
void _br(CondARM32::Cond Condition, CfgNode *TargetTrue,
CfgNode *TargetFalse) {
Context.insert(
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 0896c47..e7f5c5d 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -138,18 +138,10 @@
// The number of different NOP instructions
const uint32_t X86_NUM_NOP_VARIANTS = 5;
-// Value and Alignment are in bytes. Return Value adjusted to the next
-// highest multiple of Alignment.
-uint32_t applyAlignment(uint32_t Value, uint32_t Alignment) {
- // power of 2
- assert((Alignment & (Alignment - 1)) == 0);
- return (Value + Alignment - 1) & -Alignment;
-}
-
// Value is in bytes. Return Value adjusted to the next highest multiple
// of the stack alignment.
uint32_t applyStackAlignment(uint32_t Value) {
- return applyAlignment(Value, X86_STACK_ALIGNMENT_BYTES);
+ return Utils::applyAlignment(Value, X86_STACK_ALIGNMENT_BYTES);
}
// In some cases, there are x-macros tables for both high-level and
@@ -957,7 +949,7 @@
assert(SpillAreaAlignmentBytes <= X86_STACK_ALIGNMENT_BYTES);
uint32_t PaddingStart = X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
uint32_t SpillAreaStart =
- applyAlignment(PaddingStart, SpillAreaAlignmentBytes);
+ Utils::applyAlignment(PaddingStart, SpillAreaAlignmentBytes);
SpillAreaPaddingBytes = SpillAreaStart - PaddingStart;
SpillAreaSizeBytes += SpillAreaPaddingBytes;
}
@@ -968,7 +960,7 @@
if (LocalsSlotsAlignmentBytes) {
assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
GlobalsAndSubsequentPaddingSize =
- applyAlignment(GlobalsSize, LocalsSlotsAlignmentBytes);
+ Utils::applyAlignment(GlobalsSize, LocalsSlotsAlignmentBytes);
SpillAreaSizeBytes += GlobalsAndSubsequentPaddingSize - GlobalsSize;
}
@@ -1261,7 +1253,7 @@
// restriction can be relaxed in some cases.
NeedsStackAlignment = true;
- // TODO(sehr,stichnot): minimize the number of adjustments of esp, etc.
+ // TODO(stichnot): minimize the number of adjustments of esp, etc.
Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
Operand *TotalSize = legalize(Inst->getSizeInBytes());
Variable *Dest = Inst->getDest();
@@ -1271,17 +1263,17 @@
AlignmentParam = std::max(AlignmentParam, 1u);
// LLVM enforces power of 2 alignment.
- assert((AlignmentParam & (AlignmentParam - 1)) == 0);
- assert((X86_STACK_ALIGNMENT_BYTES & (X86_STACK_ALIGNMENT_BYTES - 1)) == 0);
+ assert(llvm::isPowerOf2_32(AlignmentParam));
+ assert(llvm::isPowerOf2_32(X86_STACK_ALIGNMENT_BYTES));
uint32_t Alignment = std::max(AlignmentParam, X86_STACK_ALIGNMENT_BYTES);
if (Alignment > X86_STACK_ALIGNMENT_BYTES) {
_and(esp, Ctx->getConstantInt32(-Alignment));
}
- if (ConstantInteger32 *ConstantTotalSize =
+ if (const auto *ConstantTotalSize =
llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
uint32_t Value = ConstantTotalSize->getValue();
- Value = applyAlignment(Value, Alignment);
+ Value = Utils::applyAlignment(Value, Alignment);
_sub(esp, Ctx->getConstantInt32(Value));
} else {
// Non-constant sizes need to be adjusted to the next highest
diff --git a/src/IceUtils.h b/src/IceUtils.h
index cf0be63..a3b5462 100644
--- a/src/IceUtils.h
+++ b/src/IceUtils.h
@@ -61,16 +61,27 @@
return IsUint(N, Value);
}
+ // Return true if the addition X + Y will cause integer overflow for
+ // integers of type T.
template <typename T> static inline bool WouldOverflowAdd(T X, T Y) {
return ((X > 0 && Y > 0 && (X > std::numeric_limits<T>::max() - Y)) ||
(X < 0 && Y < 0 && (X < std::numeric_limits<T>::min() - Y)));
}
+ // Return true if X is already aligned by N, where N is a power of 2.
template <typename T> static inline bool IsAligned(T X, intptr_t N) {
assert(llvm::isPowerOf2_64(N));
return (X & (N - 1)) == 0;
}
+ // Return Value adjusted to the next highest multiple of Alignment.
+ static inline uint32_t applyAlignment(uint32_t Value, uint32_t Alignment) {
+ assert(llvm::isPowerOf2_32(Alignment));
+ return (Value + Alignment - 1) & -Alignment;
+ }
+
+ // Return amount which must be added to adjust Pos to the next highest
+ // multiple of Align.
static inline uint64_t OffsetToAlignment(uint64_t Pos, uint64_t Align) {
assert(llvm::isPowerOf2_64(Align));
uint64_t Mod = Pos & (Align - 1);
@@ -79,6 +90,7 @@
return Align - Mod;
}
+ // Rotate the value bit pattern to the left by shift bits.
// Precondition: 0 <= shift < 32
static inline uint32_t rotateLeft32(uint32_t value, uint32_t shift) {
if (shift == 0)
@@ -86,6 +98,7 @@
return (value << shift) | (value >> (32 - shift));
}
+ // Rotate the value bit pattern to the right by shift bits.
static inline uint32_t rotateRight32(uint32_t value, uint32_t shift) {
if (shift == 0)
return value;
diff --git a/tests_lit/llvm2ice_tests/alloc.ll b/tests_lit/llvm2ice_tests/alloc.ll
index ce197cc..e3a0e39 100644
--- a/tests_lit/llvm2ice_tests/alloc.ll
+++ b/tests_lit/llvm2ice_tests/alloc.ll
@@ -1,7 +1,19 @@
; This is a basic test of the alloca instruction.
-; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
-; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 | FileCheck %s
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN: --target x8632 -i %s --args -O2 \
+; RUN: | %if --need=target_X8632 --command FileCheck %s
+
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN: --target x8632 -i %s --args -Om1 \
+; RUN: | %if --need=target_X8632 --command FileCheck %s
+
+; TODO(jvoung): Stop skipping unimplemented parts (via --skip-unimplemented)
+; once enough infrastructure is in. Also, switch to --filetype=obj
+; when possible.
+; RUN: %if --need=target_ARM32 --command %p2i --filetype=asm --assemble \
+; RUN: --disassemble --target arm32 -i %s --args -O2 --skip-unimplemented \
+; RUN: | %if --need=target_ARM32 --command FileCheck --check-prefix ARM32 %s
define void @fixed_416_align_16(i32 %n) {
entry:
@@ -16,6 +28,10 @@
; CHECK: mov DWORD PTR [esp],eax
; CHECK: call {{.*}} R_{{.*}} f1
+; ARM32-LABEL: fixed_416_align_16
+; ARM32: sub sp, sp, #416
+; ARM32: bl {{.*}} R_{{.*}} f1
+
define void @fixed_416_align_32(i32 %n) {
entry:
%array = alloca i8, i32 400, align 32
@@ -30,6 +46,12 @@
; CHECK: mov DWORD PTR [esp],eax
; CHECK: call {{.*}} R_{{.*}} f1
+; ARM32-LABEL: fixed_416_align_32
+; ARM32: bic sp, sp, #31
+; ARM32: sub sp, sp, #416
+; ARM32: bl {{.*}} R_{{.*}} f1
+
+; Show that the amount to allocate will be rounded up.
define void @fixed_351_align_16(i32 %n) {
entry:
%array = alloca i8, i32 351, align 16
@@ -43,6 +65,10 @@
; CHECK: mov DWORD PTR [esp],eax
; CHECK: call {{.*}} R_{{.*}} f1
+; ARM32-LABEL: fixed_351_align_16
+; ARM32: sub sp, sp, #352
+; ARM32: bl {{.*}} R_{{.*}} f1
+
define void @fixed_351_align_32(i32 %n) {
entry:
%array = alloca i8, i32 351, align 32
@@ -57,8 +83,15 @@
; CHECK: mov DWORD PTR [esp],eax
; CHECK: call {{.*}} R_{{.*}} f1
+; ARM32-LABEL: fixed_351_align_32
+; ARM32: bic sp, sp, #31
+; ARM32: sub sp, sp, #352
+; ARM32: bl {{.*}} R_{{.*}} f1
+
declare void @f1(i32 %ignored)
+declare void @f2(i32 %ignored)
+
define void @variable_n_align_16(i32 %n) {
entry:
%array = alloca i8, i32 %n, align 16
@@ -75,6 +108,12 @@
; CHECK: mov DWORD PTR [esp],eax
; CHECK: call {{.*}} R_{{.*}} f2
+; ARM32-LABEL: variable_n_align_16
+; ARM32: add r0, r0, #15
+; ARM32: bic r0, r0, #15
+; ARM32: sub sp, sp, r0
+; ARM32: bl {{.*}} R_{{.*}} f2
+
define void @variable_n_align_32(i32 %n) {
entry:
%array = alloca i8, i32 %n, align 32
@@ -93,6 +132,13 @@
; CHECK: mov DWORD PTR [esp],eax
; CHECK: call {{.*}} R_{{.*}} f2
+; ARM32-LABEL: variable_n_align_32
+; ARM32: bic sp, sp, #31
+; ARM32: add r0, r0, #31
+; ARM32: bic r0, r0, #31
+; ARM32: sub sp, sp, r0
+; ARM32: bl {{.*}} R_{{.*}} f2
+
; Test alloca with default (0) alignment.
define void @align0(i32 %n) {
entry:
@@ -106,4 +152,56 @@
; CHECK: and [[REG]],0xfffffff0
; CHECK: sub esp,[[REG]]
-declare void @f2(i32 %ignored)
\ No newline at end of file
+; ARM32-LABEL: align0
+; ARM32: add r0, r0, #15
+; ARM32: bic r0, r0, #15
+; ARM32: sub sp, sp, r0
+
+; Test a large alignment where a mask might not fit in an immediate
+; field of an instruction for some architectures.
+define void @align1MB(i32 %n) {
+entry:
+ %array = alloca i8, i32 %n, align 1048576
+ %__2 = ptrtoint i8* %array to i32
+ call void @f2(i32 %__2)
+ ret void
+}
+; CHECK-LABEL: align1MB
+; CHECK: and esp,0xfff00000
+; CHECK: add [[REG:.*]],0xfffff
+; CHECK: and [[REG]],0xfff00000
+; CHECK: sub esp,[[REG]]
+
+; ARM32-LABEL: align1MB
+; ARM32: movw [[REG:.*]], #0
+; ARM32: movt [[REG]], #65520 ; 0xfff0
+; ARM32: and sp, sp, [[REG]]
+; ARM32: movw [[REG2:.*]], #65535 ; 0xffff
+; ARM32: movt [[REG2]], #15
+; ARM32: add r0, r0, [[REG2]]
+; ARM32: movw [[REG3:.*]], #0
+; ARM32: movt [[REG3]], #65520 ; 0xfff0
+; ARM32: and r0, r0, [[REG3]]
+; ARM32: sub sp, sp, r0
+
+; Test a large alignment where a mask might still fit in an immediate
+; field of an instruction for some architectures.
+define void @align512MB(i32 %n) {
+entry:
+ %array = alloca i8, i32 %n, align 536870912
+ %__2 = ptrtoint i8* %array to i32
+ call void @f2(i32 %__2)
+ ret void
+}
+; CHECK-LABEL: align512MB
+; CHECK: and esp,0xe0000000
+; CHECK: add [[REG:.*]],0x1fffffff
+; CHECK: and [[REG]],0xe0000000
+; CHECK: sub esp,[[REG]]
+
+; ARM32-LABEL: align512MB
+; ARM32: and sp, sp, #-536870912 ; 0xe0000000
+; ARM32: mvn [[REG:.*]], #-536870912 ; 0xe0000000
+; ARM32: add r0, r0, [[REG]]
+; ARM32: and r0, r0, #-536870912 ; 0xe0000000
+; ARM32: sub sp, sp, r0