//===- subzero/src/IceTargetLoweringARM32.cpp - ARM32 lowering ------------===//
//
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Implements the TargetLoweringARM32 class, which consists almost
/// entirely of the lowering sequence for each high-level instruction.
///
//===----------------------------------------------------------------------===//
#include "IceTargetLoweringARM32.h"
#include "IceCfg.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
#include "IceDefs.h"
#include "IceELFObjectWriter.h"
#include "IceGlobalInits.h"
#include "IceInstARM32.def"
#include "IceInstARM32.h"
#include "IceInstVarIter.h"
#include "IceLiveness.h"
#include "IceOperand.h"
#include "IcePhiLoweringImpl.h"
#include "IceRegistersARM32.h"
#include "IceTargetLoweringARM32.def"
#include "IceUtils.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <array>
#include <utility>
namespace ARM32 {
std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) {
return ::Ice::ARM32::TargetARM32::create(Func);
}
std::unique_ptr<::Ice::TargetDataLowering>
createTargetDataLowering(::Ice::GlobalContext *Ctx) {
return ::Ice::ARM32::TargetDataARM32::create(Ctx);
}
std::unique_ptr<::Ice::TargetHeaderLowering>
createTargetHeaderLowering(::Ice::GlobalContext *Ctx) {
return ::Ice::ARM32::TargetHeaderARM32::create(Ctx);
}
void staticInit(::Ice::GlobalContext *Ctx) {
::Ice::ARM32::TargetARM32::staticInit(Ctx);
}
bool shouldBePooled(const ::Ice::Constant *C) {
return ::Ice::ARM32::TargetARM32::shouldBePooled(C);
}
::Ice::Type getPointerType() {
return ::Ice::ARM32::TargetARM32::getPointerType();
}
} // end of namespace ARM32
namespace Ice {
namespace ARM32 {
namespace {
/// SizeOf is used to obtain the size of an initializer list as a constexpr
/// expression. This is only needed until our C++ library is updated to
/// C++14, which adds constexpr members to std::initializer_list.
class SizeOf {
SizeOf(const SizeOf &) = delete;
SizeOf &operator=(const SizeOf &) = delete;
public:
constexpr SizeOf() : Size(0) {}
template <typename... T>
explicit constexpr SizeOf(T...) : Size(__length<T...>::value) {}
constexpr SizeT size() const { return Size; }
private:
template <typename T, typename... U> struct __length {
static constexpr std::size_t value = 1 + __length<U...>::value;
};
template <typename T> struct __length<T> {
static constexpr std::size_t value = 1;
};
const std::size_t Size;
};
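// Illustrative compile-time check (not required by the lowering): SizeOf
// counts its constructor arguments.
static_assert(SizeOf(1, 2, 3).size() == 3,
"SizeOf should report the number of constructor arguments");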
} // end of anonymous namespace
// Defines the RegARM32::Table table with register information.
RegARM32::RegTableType RegARM32::RegTable[RegARM32::Reg_NUM] = {
#define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr, \
isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init) \
{name, encode, \
cc_arg, scratch, \
preserved, stackptr, \
frameptr, isGPR, \
isInt, isI64Pair, \
isFP32, isFP64, \
isVec128, (SizeOf alias_init).size(), \
alias_init},
REGARM32_TABLE
#undef X
};
namespace {
// The following table summarizes the logic for lowering the icmp instruction
// for i32 and narrower types. Each icmp condition has a clear mapping to an
// ARM32 conditional move instruction.
const struct TableIcmp32_ {
CondARM32::Cond Mapping;
} TableIcmp32[] = {
#define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V) \
{CondARM32::C_32},
ICMPARM32_TABLE
#undef X
};
// The following table summarizes the logic for lowering the icmp instruction
// for the i64 type. Two conditional moves are needed for setting to 1 or 0.
// The operands may need to be swapped, and there is a slight difference for
// signed vs unsigned (comparing hi vs lo first, and using cmp vs sbc).
const struct TableIcmp64_ {
bool IsSigned;
bool Swapped;
CondARM32::Cond C1, C2;
} TableIcmp64[] = {
#define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V) \
{is_signed, swapped64, CondARM32::C1_64, CondARM32::C2_64},
ICMPARM32_TABLE
#undef X
};
CondARM32::Cond getIcmp32Mapping(InstIcmp::ICond Cond) {
assert(Cond < llvm::array_lengthof(TableIcmp32));
return TableIcmp32[Cond].Mapping;
}
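// For example, ICMPARM32_TABLE maps InstIcmp::Eq to CondARM32::EQ, so an i32
// equality compare lowers to a cmp followed by a conditional move predicated
// on eq.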
// In some cases, there are x-macros tables for both high-level and low-level
// instructions/operands that use the same enum key value. The tables are kept
// separate to maintain a proper separation between abstraction layers. There
// is a risk that the tables could get out of sync if enum values are reordered
// or if entries are added or deleted. The following anonymous namespaces use
// static_asserts to ensure everything is kept in sync.
// Validate the enum values in ICMPARM32_TABLE.
namespace {
// Define a temporary set of enum values based on low-level table entries.
enum _icmp_ll_enum {
#define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V) \
_icmp_ll_##val,
ICMPARM32_TABLE
#undef X
_num
};
// Define a set of constants based on high-level table entries.
#define X(tag, reverse, str) \
static constexpr int _icmp_hl_##tag = InstIcmp::tag;
ICEINSTICMP_TABLE
#undef X
// Define a set of constants based on low-level table entries, and ensure the
// table entry keys are consistent.
#define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V) \
static_assert( \
_icmp_ll_##val == _icmp_hl_##val, \
"Inconsistency between ICMPARM32_TABLE and ICEINSTICMP_TABLE: " #val);
ICMPARM32_TABLE
#undef X
// Repeat the static asserts with respect to the high-level table entries in
// case the high-level table has extra entries.
#define X(tag, reverse, str) \
static_assert( \
_icmp_hl_##tag == _icmp_ll_##tag, \
"Inconsistency between ICMPARM32_TABLE and ICEINSTICMP_TABLE: " #tag);
ICEINSTICMP_TABLE
#undef X
} // end of anonymous namespace
// Stack alignment
const uint32_t ARM32_STACK_ALIGNMENT_BYTES = 16;
// Value is in bytes. Return Value adjusted to the next highest multiple of the
// stack alignment.
uint32_t applyStackAlignment(uint32_t Value) {
return Utils::applyAlignment(Value, ARM32_STACK_ALIGNMENT_BYTES);
}
// Value is in bytes. Return Value adjusted to the next highest multiple of the
// stack alignment required for the given type.
uint32_t applyStackAlignmentTy(uint32_t Value, Type Ty) {
// Use natural alignment, except that normally (non-NaCl) ARM only aligns
// vectors to 8 bytes.
// TODO(jvoung): Check this ...
size_t typeAlignInBytes = typeWidthInBytes(Ty);
if (isVectorType(Ty))
typeAlignInBytes = 8;
return Utils::applyAlignment(Value, typeAlignInBytes);
}
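// For example, applyStackAlignmentTy(4, IceType_f64) == 8 (natural 8-byte
// alignment), and applyStackAlignmentTy(4, IceType_v4i32) == 8 because
// vectors are only aligned to 8 bytes here.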
// Conservatively check if at compile time we know that the operand is
// definitely a non-zero integer.
bool isGuaranteedNonzeroInt(const Operand *Op) {
if (auto *Const = llvm::dyn_cast_or_null<ConstantInteger32>(Op)) {
return Const->getValue() != 0;
}
return false;
}
} // end of anonymous namespace
TargetARM32Features::TargetARM32Features(const ClFlags &Flags) {
static_assert(
(ARM32InstructionSet::End - ARM32InstructionSet::Begin) ==
(TargetInstructionSet::ARM32InstructionSet_End -
TargetInstructionSet::ARM32InstructionSet_Begin),
"ARM32InstructionSet range different from TargetInstructionSet");
if (Flags.getTargetInstructionSet() !=
TargetInstructionSet::BaseInstructionSet) {
InstructionSet = static_cast<ARM32InstructionSet>(
(Flags.getTargetInstructionSet() -
TargetInstructionSet::ARM32InstructionSet_Begin) +
ARM32InstructionSet::Begin);
}
}
namespace {
constexpr SizeT NumGPRArgs =
#define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr, \
isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init) \
+(((cc_arg) > 0) ? 1 : 0)
REGARM32_GPR_TABLE
#undef X
;
std::array<RegNumT, NumGPRArgs> GPRArgInitializer;
constexpr SizeT NumI64Args =
#define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr, \
isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init) \
+(((cc_arg) > 0) ? 1 : 0)
REGARM32_I64PAIR_TABLE
#undef X
;
std::array<RegNumT, NumI64Args> I64ArgInitializer;
constexpr SizeT NumFP32Args =
#define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr, \
isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init) \
+(((cc_arg) > 0) ? 1 : 0)
REGARM32_FP32_TABLE
#undef X
;
std::array<RegNumT, NumFP32Args> FP32ArgInitializer;
constexpr SizeT NumFP64Args =
#define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr, \
isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init) \
+(((cc_arg) > 0) ? 1 : 0)
REGARM32_FP64_TABLE
#undef X
;
std::array<RegNumT, NumFP64Args> FP64ArgInitializer;
constexpr SizeT NumVec128Args =
#define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr, \
isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init) \
+(((cc_arg) > 0) ? 1 : 0)
REGARM32_VEC128_TABLE
#undef X
;
std::array<RegNumT, NumVec128Args> Vec128ArgInitializer;
const char *getRegClassName(RegClass C) {
auto ClassNum = static_cast<RegARM32::RegClassARM32>(C);
assert(ClassNum < RegARM32::RCARM32_NUM);
switch (ClassNum) {
default:
assert(C < RC_Target);
return regClassString(C);
// Add handling of new register classes below.
case RegARM32::RCARM32_QtoS:
return "QtoS";
}
}
} // end of anonymous namespace
TargetARM32::TargetARM32(Cfg *Func)
: TargetLowering(Func), CPUFeatures(getFlags()) {}
void TargetARM32::staticInit(GlobalContext *Ctx) {
RegNumT::setLimit(RegARM32::Reg_NUM);
// TODO: Limit this size, or do all bitsets need to be the same width?
SmallBitVector IntegerRegisters(RegARM32::Reg_NUM);
SmallBitVector I64PairRegisters(RegARM32::Reg_NUM);
SmallBitVector Float32Registers(RegARM32::Reg_NUM);
SmallBitVector Float64Registers(RegARM32::Reg_NUM);
SmallBitVector VectorRegisters(RegARM32::Reg_NUM);
SmallBitVector QtoSRegisters(RegARM32::Reg_NUM);
SmallBitVector InvalidRegisters(RegARM32::Reg_NUM);
const unsigned EncodedReg_q8 = RegARM32::RegTable[RegARM32::Reg_q8].Encoding;
for (int i = 0; i < RegARM32::Reg_NUM; ++i) {
const auto &Entry = RegARM32::RegTable[i];
IntegerRegisters[i] = Entry.IsInt;
I64PairRegisters[i] = Entry.IsI64Pair;
Float32Registers[i] = Entry.IsFP32;
Float64Registers[i] = Entry.IsFP64;
VectorRegisters[i] = Entry.IsVec128;
RegisterAliases[i].resize(RegARM32::Reg_NUM);
// TODO(eholk): It would be better to store a QtoS flag in the
// IceRegistersARM32 table than to compare their encodings here.
QtoSRegisters[i] = Entry.IsVec128 && Entry.Encoding < EncodedReg_q8;
for (int j = 0; j < Entry.NumAliases; ++j) {
assert(i == j || !RegisterAliases[i][Entry.Aliases[j]]);
RegisterAliases[i].set(Entry.Aliases[j]);
}
assert(RegisterAliases[i][i]);
if (Entry.CCArg <= 0) {
continue;
}
const auto RegNum = RegNumT::fromInt(i);
if (Entry.IsGPR) {
GPRArgInitializer[Entry.CCArg - 1] = RegNum;
} else if (Entry.IsI64Pair) {
I64ArgInitializer[Entry.CCArg - 1] = RegNum;
} else if (Entry.IsFP32) {
FP32ArgInitializer[Entry.CCArg - 1] = RegNum;
} else if (Entry.IsFP64) {
FP64ArgInitializer[Entry.CCArg - 1] = RegNum;
} else if (Entry.IsVec128) {
Vec128ArgInitializer[Entry.CCArg - 1] = RegNum;
}
}
TypeToRegisterSet[IceType_void] = InvalidRegisters;
TypeToRegisterSet[IceType_i1] = IntegerRegisters;
TypeToRegisterSet[IceType_i8] = IntegerRegisters;
TypeToRegisterSet[IceType_i16] = IntegerRegisters;
TypeToRegisterSet[IceType_i32] = IntegerRegisters;
TypeToRegisterSet[IceType_i64] = I64PairRegisters;
TypeToRegisterSet[IceType_f32] = Float32Registers;
TypeToRegisterSet[IceType_f64] = Float64Registers;
TypeToRegisterSet[IceType_v4i1] = VectorRegisters;
TypeToRegisterSet[IceType_v8i1] = VectorRegisters;
TypeToRegisterSet[IceType_v16i1] = VectorRegisters;
TypeToRegisterSet[IceType_v16i8] = VectorRegisters;
TypeToRegisterSet[IceType_v8i16] = VectorRegisters;
TypeToRegisterSet[IceType_v4i32] = VectorRegisters;
TypeToRegisterSet[IceType_v4f32] = VectorRegisters;
TypeToRegisterSet[RegARM32::RCARM32_QtoS] = QtoSRegisters;
for (size_t i = 0; i < llvm::array_lengthof(TypeToRegisterSet); ++i)
TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
filterTypeToRegisterSet(
Ctx, RegARM32::Reg_NUM, TypeToRegisterSet,
llvm::array_lengthof(TypeToRegisterSet),
[](RegNumT RegNum) -> std::string {
// This function simply removes ", " from the
// register name.
std::string Name = RegARM32::getRegName(RegNum);
constexpr const char RegSeparator[] = ", ";
constexpr size_t RegSeparatorWidth =
llvm::array_lengthof(RegSeparator) - 1;
for (size_t Pos = Name.find(RegSeparator); Pos != std::string::npos;
Pos = Name.find(RegSeparator)) {
Name.replace(Pos, RegSeparatorWidth, "");
}
return Name;
},
getRegClassName);
}
namespace {
void copyRegAllocFromInfWeightVariable64On32(const VarList &Vars) {
for (Variable *Var : Vars) {
auto *Var64 = llvm::dyn_cast<Variable64On32>(Var);
if (!Var64) {
// This is not the variable we are looking for.
continue;
}
// Only allow infinite-weight i64 temporaries to be register allocated.
assert(!Var64->hasReg() || Var64->mustHaveReg());
if (!Var64->hasReg()) {
continue;
}
const auto FirstReg =
RegNumT::fixme(RegARM32::getI64PairFirstGPRNum(Var->getRegNum()));
// This assumes little endian.
Variable *Lo = Var64->getLo();
Variable *Hi = Var64->getHi();
assert(Lo->hasReg() == Hi->hasReg());
if (Lo->hasReg()) {
continue;
}
Lo->setRegNum(FirstReg);
Lo->setMustHaveReg();
Hi->setRegNum(RegNumT::fixme(FirstReg + 1));
Hi->setMustHaveReg();
}
}
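// For example, if an infinite-weight i64 temporary was allocated the r0r1
// pair, the loop above pins its Lo half to r0 and its Hi half to the next
// register, r1 (little-endian ordering).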
} // end of anonymous namespace
uint32_t TargetARM32::getCallStackArgumentsSizeBytes(const InstCall *Call) {
TargetARM32::CallingConv CC;
RegNumT DummyReg;
size_t OutArgsSizeBytes = 0;
for (SizeT i = 0, NumArgs = Call->getNumArgs(); i < NumArgs; ++i) {
Operand *Arg = legalizeUndef(Call->getArg(i));
const Type Ty = Arg->getType();
if (isScalarIntegerType(Ty)) {
if (CC.argInGPR(Ty, &DummyReg)) {
continue;
}
} else {
if (CC.argInVFP(Ty, &DummyReg)) {
continue;
}
}
OutArgsSizeBytes = applyStackAlignmentTy(OutArgsSizeBytes, Ty);
OutArgsSizeBytes += typeWidthInBytesOnStack(Ty);
}
return applyStackAlignment(OutArgsSizeBytes);
}
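// For example, a call with five i32 arguments passes the first four in r0-r3,
// leaving one 4-byte stack argument; the result is then rounded up to 16 by
// applyStackAlignment().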
void TargetARM32::genTargetHelperCallFor(Inst *Instr) {
constexpr bool NoTailCall = false;
constexpr bool IsTargetHelperCall = true;
switch (Instr->getKind()) {
default:
return;
case Inst::Arithmetic: {
Variable *Dest = Instr->getDest();
const Type DestTy = Dest->getType();
const InstArithmetic::OpKind Op =
llvm::cast<InstArithmetic>(Instr)->getOp();
if (isVectorType(DestTy)) {
switch (Op) {
default:
break;
case InstArithmetic::Fdiv:
case InstArithmetic::Frem:
case InstArithmetic::Sdiv:
case InstArithmetic::Srem:
case InstArithmetic::Udiv:
case InstArithmetic::Urem:
scalarizeArithmetic(Op, Dest, Instr->getSrc(0), Instr->getSrc(1));
Instr->setDeleted();
return;
}
}
switch (DestTy) {
default:
return;
case IceType_i64: {
// Technically, ARM has its own aeabi routines, but we can use the
// non-aeabi routine as well. LLVM uses __aeabi_ldivmod for div, but uses
// the more standard __moddi3 for rem.
RuntimeHelper HelperID = RuntimeHelper::H_Num;
switch (Op) {
default:
return;
case InstArithmetic::Udiv:
HelperID = RuntimeHelper::H_udiv_i64;
break;
case InstArithmetic::Sdiv:
HelperID = RuntimeHelper::H_sdiv_i64;
break;
case InstArithmetic::Urem:
HelperID = RuntimeHelper::H_urem_i64;
break;
case InstArithmetic::Srem:
HelperID = RuntimeHelper::H_srem_i64;
break;
}
Operand *TargetHelper = Ctx->getRuntimeHelperFunc(HelperID);
ARM32HelpersPreamble[TargetHelper] = &TargetARM32::preambleDivRem;
constexpr SizeT MaxArgs = 2;
auto *Call = Context.insert<InstCall>(MaxArgs, Dest, TargetHelper,
NoTailCall, IsTargetHelperCall);
Call->addArg(Instr->getSrc(0));
Call->addArg(Instr->getSrc(1));
Instr->setDeleted();
return;
}
case IceType_i32:
case IceType_i16:
case IceType_i8: {
const bool HasHWDiv = hasCPUFeature(TargetARM32Features::HWDivArm);
InstCast::OpKind CastKind;
RuntimeHelper HelperID = RuntimeHelper::H_Num;
switch (Op) {
default:
return;
case InstArithmetic::Udiv:
HelperID = HasHWDiv ? RuntimeHelper::H_Num : RuntimeHelper::H_udiv_i32;
CastKind = InstCast::Zext;
break;
case InstArithmetic::Sdiv:
HelperID = HasHWDiv ? RuntimeHelper::H_Num : RuntimeHelper::H_sdiv_i32;
CastKind = InstCast::Sext;
break;
case InstArithmetic::Urem:
HelperID = HasHWDiv ? RuntimeHelper::H_Num : RuntimeHelper::H_urem_i32;
CastKind = InstCast::Zext;
break;
case InstArithmetic::Srem:
HelperID = HasHWDiv ? RuntimeHelper::H_Num : RuntimeHelper::H_srem_i32;
CastKind = InstCast::Sext;
break;
}
if (HelperID == RuntimeHelper::H_Num) {
// HelperID should only ever be undefined when the processor does not
// have a hardware divider. If any other helpers are ever introduced,
// the following assert will have to be modified.
assert(HasHWDiv);
return;
}
Operand *Src0 = Instr->getSrc(0);
Operand *Src1 = Instr->getSrc(1);
if (DestTy != IceType_i32) {
// Src0 and Src1 have to be zero- or sign-extended to i32. For Src0, we
// just insert an InstCast right before the call to the helper.
Variable *Src0_32 = Func->makeVariable(IceType_i32);
Context.insert<InstCast>(CastKind, Src0_32, Src0);
Src0 = Src0_32;
// For extending Src1, we will just insert an InstCast if Src1 is not a
// Constant. If it is a Constant, we extend it here rather than at program
// runtime, which allows preambleDivRem to optimize out the div-by-0
// check.
if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
const int32_t ShAmt = (DestTy == IceType_i16) ? 16 : 24;
int32_t NewC = C->getValue();
if (CastKind == InstCast::Zext) {
NewC &= ~(0x80000000l >> ShAmt);
} else {
NewC = (NewC << ShAmt) >> ShAmt;
}
Src1 = Ctx->getConstantInt32(NewC);
} else {
Variable *Src1_32 = Func->makeVariable(IceType_i32);
Context.insert<InstCast>(CastKind, Src1_32, Src1);
Src1 = Src1_32;
}
}
Operand *TargetHelper = Ctx->getRuntimeHelperFunc(HelperID);
ARM32HelpersPreamble[TargetHelper] = &TargetARM32::preambleDivRem;
constexpr SizeT MaxArgs = 2;
auto *Call = Context.insert<InstCall>(MaxArgs, Dest, TargetHelper,
NoTailCall, IsTargetHelperCall);
assert(Src0->getType() == IceType_i32);
Call->addArg(Src0);
assert(Src1->getType() == IceType_i32);
Call->addArg(Src1);
Instr->setDeleted();
return;
}
case IceType_f64:
case IceType_f32: {
if (Op != InstArithmetic::Frem) {
return;
}
constexpr SizeT MaxArgs = 2;
Operand *TargetHelper = Ctx->getRuntimeHelperFunc(
DestTy == IceType_f32 ? RuntimeHelper::H_frem_f32
: RuntimeHelper::H_frem_f64);
auto *Call = Context.insert<InstCall>(MaxArgs, Dest, TargetHelper,
NoTailCall, IsTargetHelperCall);
Call->addArg(Instr->getSrc(0));
Call->addArg(Instr->getSrc(1));
Instr->setDeleted();
return;
}
}
llvm::report_fatal_error("Control flow should never have reached here.");
}
case Inst::Cast: {
Variable *Dest = Instr->getDest();
Operand *Src0 = Instr->getSrc(0);
const Type DestTy = Dest->getType();
const Type SrcTy = Src0->getType();
auto *CastInstr = llvm::cast<InstCast>(Instr);
const InstCast::OpKind CastKind = CastInstr->getCastKind();
switch (CastKind) {
default:
return;
case InstCast::Fptosi:
case InstCast::Fptoui: {
if (DestTy != IceType_i64) {
return;
}
const bool DestIsSigned = CastKind == InstCast::Fptosi;
const bool Src0IsF32 = isFloat32Asserting32Or64(SrcTy);
Operand *TargetHelper = Ctx->getRuntimeHelperFunc(
Src0IsF32 ? (DestIsSigned ? RuntimeHelper::H_fptosi_f32_i64
: RuntimeHelper::H_fptoui_f32_i64)
: (DestIsSigned ? RuntimeHelper::H_fptosi_f64_i64
: RuntimeHelper::H_fptoui_f64_i64));
static constexpr SizeT MaxArgs = 1;
auto *Call = Context.insert<InstCall>(MaxArgs, Dest, TargetHelper,
NoTailCall, IsTargetHelperCall);
Call->addArg(Src0);
Instr->setDeleted();
return;
}
case InstCast::Sitofp:
case InstCast::Uitofp: {
if (SrcTy != IceType_i64) {
return;
}
const bool SourceIsSigned = CastKind == InstCast::Sitofp;
const bool DestIsF32 = isFloat32Asserting32Or64(Dest->getType());
Operand *TargetHelper = Ctx->getRuntimeHelperFunc(
DestIsF32 ? (SourceIsSigned ? RuntimeHelper::H_sitofp_i64_f32
: RuntimeHelper::H_uitofp_i64_f32)
: (SourceIsSigned ? RuntimeHelper::H_sitofp_i64_f64
: RuntimeHelper::H_uitofp_i64_f64));
static constexpr SizeT MaxArgs = 1;
auto *Call = Context.insert<InstCall>(MaxArgs, Dest, TargetHelper,
NoTailCall, IsTargetHelperCall);
Call->addArg(Src0);
Instr->setDeleted();
return;
}
case InstCast::Bitcast: {
if (DestTy == SrcTy) {
return;
}
Variable *CallDest = Dest;
RuntimeHelper HelperID = RuntimeHelper::H_Num;
switch (DestTy) {
default:
return;
case IceType_i8:
assert(SrcTy == IceType_v8i1);
HelperID = RuntimeHelper::H_bitcast_8xi1_i8;
CallDest = Func->makeVariable(IceType_i32);
break;
case IceType_i16:
assert(SrcTy == IceType_v16i1);
HelperID = RuntimeHelper::H_bitcast_16xi1_i16;
CallDest = Func->makeVariable(IceType_i32);
break;
case IceType_v8i1: {
assert(SrcTy == IceType_i8);
HelperID = RuntimeHelper::H_bitcast_i8_8xi1;
Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
// Arguments to functions are required to be at least 32 bits wide.
Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
Src0 = Src0AsI32;
} break;
case IceType_v16i1: {
assert(SrcTy == IceType_i16);
HelperID = RuntimeHelper::H_bitcast_i16_16xi1;
Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
// Arguments to functions are required to be at least 32 bits wide.
Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
Src0 = Src0AsI32;
} break;
}
constexpr SizeT MaxSrcs = 1;
InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs);
Call->addArg(Src0);
Context.insert(Call);
// The PNaCl ABI disallows i8/i16 return types, so truncate the helper
// call result to the appropriate type as necessary.
if (CallDest->getType() != Dest->getType())
Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest);
Instr->setDeleted();
return;
}
case InstCast::Trunc: {
if (DestTy == SrcTy) {
return;
}
if (!isVectorType(SrcTy)) {
return;
}
assert(typeNumElements(DestTy) == typeNumElements(SrcTy));
assert(typeElementType(DestTy) == IceType_i1);
assert(isVectorIntegerType(SrcTy));
return;
}
case InstCast::Sext:
case InstCast::Zext: {
if (DestTy == SrcTy) {
return;
}
if (!isVectorType(DestTy)) {
return;
}
assert(typeNumElements(DestTy) == typeNumElements(SrcTy));
assert(typeElementType(SrcTy) == IceType_i1);
assert(isVectorIntegerType(DestTy));
return;
}
}
llvm::report_fatal_error("Control flow should never have reached here.");
}
case Inst::Intrinsic: {
Variable *Dest = Instr->getDest();
auto *Intrinsic = llvm::cast<InstIntrinsic>(Instr);
Intrinsics::IntrinsicID ID = Intrinsic->getIntrinsicID();
switch (ID) {
default:
return;
case Intrinsics::Ctpop: {
Operand *Src0 = Intrinsic->getArg(0);
Operand *TargetHelper =
Ctx->getRuntimeHelperFunc(isInt32Asserting32Or64(Src0->getType())
? RuntimeHelper::H_call_ctpop_i32
: RuntimeHelper::H_call_ctpop_i64);
static constexpr SizeT MaxArgs = 1;
auto *Call = Context.insert<InstCall>(MaxArgs, Dest, TargetHelper,
NoTailCall, IsTargetHelperCall);
Call->addArg(Src0);
Instr->setDeleted();
if (Src0->getType() == IceType_i64) {
ARM32HelpersPostamble[TargetHelper] = &TargetARM32::postambleCtpop64;
}
return;
}
case Intrinsics::Longjmp: {
static constexpr SizeT MaxArgs = 2;
static constexpr Variable *NoDest = nullptr;
Operand *TargetHelper =
Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_longjmp);
auto *Call = Context.insert<InstCall>(MaxArgs, NoDest, TargetHelper,
NoTailCall, IsTargetHelperCall);
Call->addArg(Intrinsic->getArg(0));
Call->addArg(Intrinsic->getArg(1));
Instr->setDeleted();
return;
}
case Intrinsics::Memcpy: {
// In the future, we could potentially emit an inline memcpy/memset, etc.,
// for intrinsic calls with a known length.
static constexpr SizeT MaxArgs = 3;
static constexpr Variable *NoDest = nullptr;
Operand *TargetHelper =
Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_memcpy);
auto *Call = Context.insert<InstCall>(MaxArgs, NoDest, TargetHelper,
NoTailCall, IsTargetHelperCall);
Call->addArg(Intrinsic->getArg(0));
Call->addArg(Intrinsic->getArg(1));
Call->addArg(Intrinsic->getArg(2));
Instr->setDeleted();
return;
}
case Intrinsics::Memmove: {
static constexpr SizeT MaxArgs = 3;
static constexpr Variable *NoDest = nullptr;
Operand *TargetHelper =
Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_memmove);
auto *Call = Context.insert<InstCall>(MaxArgs, NoDest, TargetHelper,
NoTailCall, IsTargetHelperCall);
Call->addArg(Intrinsic->getArg(0));
Call->addArg(Intrinsic->getArg(1));
Call->addArg(Intrinsic->getArg(2));
Instr->setDeleted();
return;
}
case Intrinsics::Memset: {
// The value operand needs to be extended to a stack slot size because the
// PNaCl ABI requires arguments to be at least 32 bits wide.
Operand *ValOp = Intrinsic->getArg(1);
assert(ValOp->getType() == IceType_i8);
Variable *ValExt = Func->makeVariable(stackSlotType());
Context.insert<InstCast>(InstCast::Zext, ValExt, ValOp);
// Technically, ARM has its own __aeabi_memset, but we can use plain
// memset too. The value and size arguments need to be flipped if we ever
// decide to use __aeabi_memset.
static constexpr SizeT MaxArgs = 3;
static constexpr Variable *NoDest = nullptr;
Operand *TargetHelper =
Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_memset);
auto *Call = Context.insert<InstCall>(MaxArgs, NoDest, TargetHelper,
NoTailCall, IsTargetHelperCall);
Call->addArg(Intrinsic->getArg(0));
Call->addArg(ValExt);
Call->addArg(Intrinsic->getArg(2));
Instr->setDeleted();
return;
}
case Intrinsics::Setjmp: {
static constexpr SizeT MaxArgs = 1;
Operand *TargetHelper =
Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_setjmp);
auto *Call = Context.insert<InstCall>(MaxArgs, Dest, TargetHelper,
NoTailCall, IsTargetHelperCall);
Call->addArg(Intrinsic->getArg(0));
Instr->setDeleted();
return;
}
}
llvm::report_fatal_error("Control flow should never have reached here.");
}
}
}
void TargetARM32::findMaxStackOutArgsSize() {
// MinNeededOutArgsBytes should be updated if the Target ever creates a
// high-level InstCall that requires more stack bytes.
constexpr size_t MinNeededOutArgsBytes = 0;
MaxOutArgsSizeBytes = MinNeededOutArgsBytes;
for (CfgNode *Node : Func->getNodes()) {
Context.init(Node);
while (!Context.atEnd()) {
PostIncrLoweringContext PostIncrement(Context);
Inst *CurInstr = iteratorToInst(Context.getCur());
if (auto *Call = llvm::dyn_cast<InstCall>(CurInstr)) {
SizeT OutArgsSizeBytes = getCallStackArgumentsSizeBytes(Call);
MaxOutArgsSizeBytes = std::max(MaxOutArgsSizeBytes, OutArgsSizeBytes);
}
}
}
}
GlobalString
TargetARM32::createGotoffRelocation(const ConstantRelocatable *CR) {
GlobalString CRName = CR->getName();
GlobalString CRGotoffName =
Ctx->getGlobalString("GOTOFF$" + Func->getFunctionName() + "$" + CRName);
if (KnownGotoffs.count(CRGotoffName) == 0) {
constexpr bool SuppressMangling = true;
auto *Global =
VariableDeclaration::create(Func->getGlobalPool(), SuppressMangling);
Global->setIsConstant(true);
Global->setName(CRName);
Func->getGlobalPool()->willNotBeEmitted(Global);
auto *Gotoff =
VariableDeclaration::create(Func->getGlobalPool(), SuppressMangling);
constexpr auto GotFixup = R_ARM_GOTOFF32;
Gotoff->setIsConstant(true);
Gotoff->addInitializer(VariableDeclaration::RelocInitializer::create(
Func->getGlobalPool(), Global, {RelocOffset::create(Ctx, 0)},
GotFixup));
Gotoff->setName(CRGotoffName);
Func->addGlobal(Gotoff);
KnownGotoffs.emplace(CRGotoffName);
}
return CRGotoffName;
}
void TargetARM32::translateO2() {
TimerMarker T(TimerStack::TT_O2, Func);
genTargetHelperCalls();
findMaxStackOutArgsSize();
// Merge Alloca instructions, and lay out the stack.
static constexpr bool SortAndCombineAllocas = true;
Func->processAllocas(SortAndCombineAllocas);
Func->dump("After Alloca processing");
if (!getFlags().getEnablePhiEdgeSplit()) {
// Lower Phi instructions.
Func->placePhiLoads();
if (Func->hasError())
return;
Func->placePhiStores();
if (Func->hasError())
return;
Func->deletePhis();
if (Func->hasError())
return;
Func->dump("After Phi lowering");
}
// Address mode optimization.
Func->getVMetadata()->init(VMK_SingleDefs);
Func->doAddressOpt();
Func->materializeVectorShuffles();
// Argument lowering
Func->doArgLowering();
// Target lowering. This requires liveness analysis for some parts of the
// lowering decisions, such as compare/branch fusing. If non-lightweight
// liveness analysis is used, the instructions need to be renumbered first.
// TODO: This renumbering should only be necessary if we're actually
// calculating live intervals, which we only do for register allocation.
Func->renumberInstructions();
if (Func->hasError())
return;
// TODO: It should be sufficient to use the fastest liveness calculation,
// i.e. livenessLightweight(). However, for some reason that slows down the
// rest of the translation. Investigate.
Func->liveness(Liveness_Basic);
if (Func->hasError())
return;
Func->dump("After ARM32 address mode opt");
Func->genCode();
if (Func->hasError())
return;
Func->dump("After ARM32 codegen");
// Register allocation. This requires instruction renumbering and full
// liveness analysis.
Func->renumberInstructions();
if (Func->hasError())
return;
Func->liveness(Liveness_Intervals);
if (Func->hasError())
return;
// The post-codegen dump is done here, after liveness analysis and associated
// cleanup, to make the dump cleaner and more useful.
Func->dump("After initial ARM32 codegen");
// Validate the live range computations. The expensive validation call is
// deliberately only made when assertions are enabled.
assert(Func->validateLiveness());
Func->getVMetadata()->init(VMK_All);
regAlloc(RAK_Global);
if (Func->hasError())
return;
copyRegAllocFromInfWeightVariable64On32(Func->getVariables());
Func->dump("After linear scan regalloc");
if (getFlags().getEnablePhiEdgeSplit()) {
Func->advancedPhiLowering();
Func->dump("After advanced Phi lowering");
}
ForbidTemporaryWithoutReg _(this);
// Stack frame mapping.
Func->genFrame();
if (Func->hasError())
return;
Func->dump("After stack frame mapping");
postLowerLegalization();
if (Func->hasError())
return;
Func->dump("After postLowerLegalization");
Func->contractEmptyNodes();
Func->reorderNodes();
// Branch optimization. This needs to be done just before code emission. In
// particular, no transformations that insert or reorder CfgNodes should be
// done after branch optimization. We go ahead and do it before nop insertion
// to reduce the amount of work needed for searching for opportunities.
Func->doBranchOpt();
Func->dump("After branch optimization");
}
void TargetARM32::translateOm1() {
TimerMarker T(TimerStack::TT_Om1, Func);
genTargetHelperCalls();
findMaxStackOutArgsSize();
// Do not merge Alloca instructions, and lay out the stack.
static constexpr bool DontSortAndCombineAllocas = false;
Func->processAllocas(DontSortAndCombineAllocas);
Func->dump("After Alloca processing");
Func->placePhiLoads();
if (Func->hasError())
return;
Func->placePhiStores();
if (Func->hasError())
return;
Func->deletePhis();
if (Func->hasError())
return;
Func->dump("After Phi lowering");
Func->doArgLowering();
Func->genCode();
if (Func->hasError())
return;
Func->dump("After initial ARM32 codegen");
regAlloc(RAK_InfOnly);
if (Func->hasError())
return;
copyRegAllocFromInfWeightVariable64On32(Func->getVariables());
Func->dump("After regalloc of infinite-weight variables");
ForbidTemporaryWithoutReg _(this);
Func->genFrame();
if (Func->hasError())
return;
Func->dump("After stack frame mapping");
postLowerLegalization();
if (Func->hasError())
return;
Func->dump("After postLowerLegalization");
}
uint32_t TargetARM32::getStackAlignment() const {
return ARM32_STACK_ALIGNMENT_BYTES;
}
bool TargetARM32::doBranchOpt(Inst *I, const CfgNode *NextNode) {
if (auto *Br = llvm::dyn_cast<InstARM32Br>(I)) {
return Br->optimizeBranch(NextNode);
}
return false;
}
const char *TargetARM32::getRegName(RegNumT RegNum, Type Ty) const {
(void)Ty;
return RegARM32::getRegName(RegNum);
}
Variable *TargetARM32::getPhysicalRegister(RegNumT RegNum, Type Ty) {
static const Type DefaultType[] = {
#define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr, \
isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init) \
(isFP32) \
? IceType_f32 \
: ((isFP64) ? IceType_f64 : ((isVec128 ? IceType_v4i32 : IceType_i32))),
REGARM32_TABLE
#undef X
};
if (Ty == IceType_void) {
assert(unsigned(RegNum) < llvm::array_lengthof(DefaultType));
Ty = DefaultType[RegNum];
}
if (PhysicalRegisters[Ty].empty())
PhysicalRegisters[Ty].resize(RegARM32::Reg_NUM);
assert(unsigned(RegNum) < PhysicalRegisters[Ty].size());
Variable *Reg = PhysicalRegisters[Ty][RegNum];
if (Reg == nullptr) {
Reg = Func->makeVariable(Ty);
Reg->setRegNum(RegNum);
PhysicalRegisters[Ty][RegNum] = Reg;
// Specially mark a named physical register as an "argument" so that it is
// considered live upon function entry. Otherwise it's possible to get
// liveness validation errors for saving callee-save registers.
Func->addImplicitArg(Reg);
// Don't bother tracking the live range of a named physical register.
Reg->setIgnoreLiveness();
}
return Reg;
}
void TargetARM32::emitJumpTable(const Cfg *Func,
const InstJumpTable *JumpTable) const {
(void)Func;
(void)JumpTable;
UnimplementedError(getFlags());
}
void TargetARM32::emitVariable(const Variable *Var) const {
if (!BuildDefs::dump())
return;
Ostream &Str = Ctx->getStrEmit();
if (Var->hasReg()) {
Str << getRegName(Var->getRegNum(), Var->getType());
return;
}
if (Var->mustHaveReg()) {
llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
") has no register assigned - function " +
Func->getFunctionName());
}
assert(!Var->isRematerializable());
int32_t Offset = Var->getStackOffset();
auto BaseRegNum = Var->getBaseRegNum();
if (BaseRegNum.hasNoValue()) {
BaseRegNum = getFrameOrStackReg();
}
const Type VarTy = Var->getType();
Str << "[" << getRegName(BaseRegNum, VarTy);
if (Offset != 0) {
Str << ", #" << Offset;
}
Str << "]";
}
TargetARM32::CallingConv::CallingConv()
: GPRegsUsed(RegARM32::Reg_NUM),
GPRArgs(GPRArgInitializer.rbegin(), GPRArgInitializer.rend()),
I64Args(I64ArgInitializer.rbegin(), I64ArgInitializer.rend()),
VFPRegsUsed(RegARM32::Reg_NUM),
FP32Args(FP32ArgInitializer.rbegin(), FP32ArgInitializer.rend()),
FP64Args(FP64ArgInitializer.rbegin(), FP64ArgInitializer.rend()),
Vec128Args(Vec128ArgInitializer.rbegin(), Vec128ArgInitializer.rend()) {}
bool TargetARM32::CallingConv::argInGPR(Type Ty, RegNumT *Reg) {
CfgVector<RegNumT> *Source;
switch (Ty) {
default: {
assert(isScalarIntegerType(Ty));
Source = &GPRArgs;
} break;
case IceType_i64: {
Source = &I64Args;
} break;
}
discardUnavailableGPRsAndTheirAliases(Source);
if (Source->empty()) {
GPRegsUsed.set();
return false;
}
*Reg = Source->back();
// Note that we don't Source->pop_back() here. This is intentional: because we
// mark all of Reg's aliases as used below, Source->back() becomes unavailable
// for the next argument and is implicitly discarded by
// discardUnavailableGPRsAndTheirAliases().
GPRegsUsed |= RegisterAliases[*Reg];
return true;
}
// GPRs are not packed when passing parameters. Thus, a function foo(i32, i64,
// i32) will have the first argument in r0, the second in r1-r2, and the third
// on the stack. To model this behavior, whenever we pop a register from Regs,
// we remove all of its aliases from the pool of available GPRs. This has the
// effect of computing the "closure" on the GPR registers.
void TargetARM32::CallingConv::discardUnavailableGPRsAndTheirAliases(
CfgVector<RegNumT> *Regs) {
while (!Regs->empty() && GPRegsUsed[Regs->back()]) {
GPRegsUsed |= RegisterAliases[Regs->back()];
Regs->pop_back();
}
}
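// For example, once r0 is consumed, RegisterAliases[r0] also covers the r0r1
// pair, so a subsequent i64 argument cannot be assigned r0r1 and must use the
// next available pair (or the stack).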
bool TargetARM32::CallingConv::argInVFP(Type Ty, RegNumT *Reg) {
CfgVector<RegNumT> *Source;
switch (Ty) {
default: {
assert(isVectorType(Ty));
Source = &Vec128Args;
} break;
case IceType_f32: {
Source = &FP32Args;
} break;
case IceType_f64: {
Source = &FP64Args;
} break;
}
discardUnavailableVFPRegs(Source);
if (Source->empty()) {
VFPRegsUsed.set();
return false;
}
*Reg = Source->back();
VFPRegsUsed |= RegisterAliases[*Reg];
return true;
}
// Arguments in VFP registers are not packed, so we don't mark the popped
// registers' aliases as unavailable.
void TargetARM32::CallingConv::discardUnavailableVFPRegs(
CfgVector<RegNumT> *Regs) {
while (!Regs->empty() && VFPRegsUsed[Regs->back()]) {
Regs->pop_back();
}
}
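// For example, for (f32, f64, f32) arguments: the first f32 takes s0 (marking
// d0/q0 used via aliases), the f64 then skips d0 and takes d1, and the second
// f32 can still back-fill s1 because s1 itself was never marked used.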
void TargetARM32::lowerArguments() {
VarList &Args = Func->getArgs();
TargetARM32::CallingConv CC;
// For each register argument, replace Arg in the argument list with the home
// register. Then generate an instruction in the prolog to copy the home
// register to the assigned location of Arg.
Context.init(Func->getEntryNode());
Context.setInsertPoint(Context.getCur());
for (SizeT I = 0, E = Args.size(); I < E; ++I) {
Variable *Arg = Args[I];
Type Ty = Arg->getType();
RegNumT RegNum;
if (isScalarIntegerType(Ty)) {
if (!CC.argInGPR(Ty, &RegNum)) {
continue;
}
} else {
if (!CC.argInVFP(Ty, &RegNum)) {
continue;
}
}
Variable *RegisterArg = Func->makeVariable(Ty);
if (BuildDefs::dump()) {
RegisterArg->setName(Func, "home_reg:" + Arg->getName());
}
RegisterArg->setIsArg();
Arg->setIsArg(false);
Args[I] = RegisterArg;
switch (Ty) {
default: {
RegisterArg->setRegNum(RegNum);
} break;
case IceType_i64: {
auto *RegisterArg64 = llvm::cast<Variable64On32>(RegisterArg);
RegisterArg64->initHiLo(Func);
RegisterArg64->getLo()->setRegNum(
RegNumT::fixme(RegARM32::getI64PairFirstGPRNum(RegNum)));
RegisterArg64->getHi()->setRegNum(
RegNumT::fixme(RegARM32::getI64PairSecondGPRNum(RegNum)));
} break;
}
Context.insert<InstAssign>(Arg, RegisterArg);
}
}
// Helper function for addProlog().
//
// This assumes Arg is an argument passed on the stack. This sets the frame
// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
// I64 arg that has been split into Lo and Hi components, it calls itself
// recursively on the components, taking care to handle Lo first because of the
// little-endian architecture. Lastly, this function generates an instruction
// to copy Arg into its assigned register if applicable.
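//
// For example, with stack-passed arguments (i32, i64): the i32 is placed at
// BasicFrameOffset + 0, *InArgsSizeBytes is then rounded up from 4 to 8 for
// the i64, and the i64's Lo and Hi halves land at offsets +8 and +12.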
void TargetARM32::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
size_t BasicFrameOffset,
size_t *InArgsSizeBytes) {
const Type Ty = Arg->getType();
*InArgsSizeBytes = applyStackAlignmentTy(*InArgsSizeBytes, Ty);
if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
Variable *const Lo = Arg64On32->getLo();
Variable *const Hi = Arg64On32->getHi();
finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes);
finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes);
return;
}
assert(Ty != IceType_i64);
const int32_t ArgStackOffset = BasicFrameOffset + *InArgsSizeBytes;
*InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
if (!Arg->hasReg()) {
Arg->setStackOffset(ArgStackOffset);
return;
}
// If the argument variable has been assigned a register, we need to copy the
// value from the stack slot.
Variable *Parameter = Func->makeVariable(Ty);
Parameter->setMustNotHaveReg();
Parameter->setStackOffset(ArgStackOffset);
_mov(Arg, Parameter);
}
Type TargetARM32::stackSlotType() { return IceType_i32; }
void TargetARM32::addProlog(CfgNode *Node) {
// Stack frame layout:
//
// +------------------------+
// | 1. preserved registers |
// +------------------------+
// | 2. padding |
// +------------------------+ <--- FramePointer (if used)
// | 3. global spill area |
// +------------------------+
// | 4. padding |
// +------------------------+
// | 5. local spill area |
// +------------------------+
// | 6. padding |
// +------------------------+
// | 7. allocas (variable) |
// +------------------------+
// | 8. padding |
// +------------------------+
// | 9. out args |
// +------------------------+ <--- StackPointer
//
// The following variables record the size in bytes of the given areas:
// * PreservedRegsSizeBytes: area 1
// * SpillAreaPaddingBytes: area 2
// * GlobalsSize: area 3
// * GlobalsAndSubsequentPaddingSize: areas 3 - 4
// * LocalsSpillAreaSize: area 5
// * SpillAreaSizeBytes: areas 2 - 6, and 9
// * MaxOutArgsSizeBytes: area 9
//
// Determine stack frame offsets for each Variable without a register
// assignment. This can be done as one variable per stack slot. Or, do
// coalescing by running the register allocator again with an infinite set of
// registers (as a side effect, this gives variables a second chance at
// physical register assignment).
//
// A middle ground approach is to leverage sparsity and allocate one block of
// space on the frame for globals (variables with multi-block lifetime), and
// one block to share for locals (single-block lifetime).
Context.init(Node);
Context.setInsertPoint(Context.getCur());
SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
RegsUsed = SmallBitVector(CalleeSaves.size());
VarList SortedSpilledVariables;
size_t GlobalsSize = 0;
// If there is a separate locals area, this represents that area. Otherwise
// it counts any variable not counted by GlobalsSize.
SpillAreaSizeBytes = 0;
// If there is a separate locals area, this specifies the alignment for it.
uint32_t LocalsSlotsAlignmentBytes = 0;
// The entire spill locations area gets aligned to largest natural alignment
// of the variables that have a spill slot.
uint32_t SpillAreaAlignmentBytes = 0;
// For now, we don't have target-specific variables that need special
// treatment (no stack-slot-linked SpillVariable type).
std::function<bool(Variable *)> TargetVarHook = [](Variable *Var) {
static constexpr bool AssignStackSlot = false;
static constexpr bool DontAssignStackSlot = !AssignStackSlot;
if (llvm::isa<Variable64On32>(Var)) {
return DontAssignStackSlot;
}
return AssignStackSlot;
};
// Compute the list of spilled variables and bounds for GlobalsSize, etc.
getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
&SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
&LocalsSlotsAlignmentBytes, TargetVarHook);
uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
SpillAreaSizeBytes += GlobalsSize;
// Add push instructions for preserved registers. On ARM, "push" can push a
// whole list of GPRs via a bitmask (0-15). Unlike x86, ARM also has
// callee-saved float/vector registers.
//
// The "vpush" instruction can handle a whole list of float/vector registers,
// but it only handles contiguous sequences of registers by specifying the
// start and the length.
PreservedGPRs.reserve(CalleeSaves.size());
PreservedSRegs.reserve(CalleeSaves.size());
// Consider FP and LR as callee-save / used as needed.
if (UsesFramePointer) {
if (RegsUsed[RegARM32::Reg_fp]) {
llvm::report_fatal_error("Frame pointer has been used.");
}
CalleeSaves[RegARM32::Reg_fp] = true;
RegsUsed[RegARM32::Reg_fp] = true;
}
if (!MaybeLeafFunc) {
CalleeSaves[RegARM32::Reg_lr] = true;
RegsUsed[RegARM32::Reg_lr] = true;
}
// Make two passes over the used registers. The first pass records all the
// used registers -- and their aliases. Then, we figure out which GPRs and
// VFP S registers should be saved. We don't bother saving D/Q registers
// because their uses are recorded as S regs uses.
SmallBitVector ToPreserve(RegARM32::Reg_NUM);
for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
if (CalleeSaves[i] && RegsUsed[i]) {
ToPreserve |= RegisterAliases[i];
}
}
uint32_t NumCallee = 0;
size_t PreservedRegsSizeBytes = 0;
// RegClasses is a tuple of
//
// <First Register in Class, Last Register in Class, Vector of Save Registers>
//
// We use this tuple to figure out which register we should push/pop during
// prolog/epilog.
using RegClassType = std::tuple<uint32_t, uint32_t, VarList *>;
const RegClassType RegClasses[] = {
RegClassType(RegARM32::Reg_GPR_First, RegARM32::Reg_GPR_Last,
&PreservedGPRs),
RegClassType(RegARM32::Reg_SREG_First, RegARM32::Reg_SREG_Last,
&PreservedSRegs)};
for (const auto &RegClass : RegClasses) {
const uint32_t FirstRegInClass = std::get<0>(RegClass);
const uint32_t LastRegInClass = std::get<1>(RegClass);
VarList *const PreservedRegsInClass = std::get<2>(RegClass);
for (uint32_t Reg = FirstRegInClass; Reg <= LastRegInClass; ++Reg) {
if (!ToPreserve[Reg]) {
continue;
}
++NumCallee;
Variable *PhysicalRegister = getPhysicalRegister(RegNumT::fromInt(Reg));
PreservedRegsSizeBytes +=
typeWidthInBytesOnStack(PhysicalRegister->getType());
PreservedRegsInClass->push_back(PhysicalRegister);
}
}
Ctx->statsUpdateRegistersSaved(NumCallee);
if (!PreservedSRegs.empty())
_push(PreservedSRegs);
if (!PreservedGPRs.empty())
_push(PreservedGPRs);
// Generate "mov FP, SP" if needed.
if (UsesFramePointer) {
Variable *FP = getPhysicalRegister(RegARM32::Reg_fp);
Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
_mov(FP, SP);
// Keep FP live for late-stage liveness analysis (e.g. asm-verbose mode).
Context.insert<InstFakeUse>(FP);
}
// Align the variables area. SpillAreaPaddingBytes is the size of the region
// after the preserved registers and before the spill areas.
// LocalsSlotsPaddingBytes is the amount of padding between the globals and
// locals area if they are separate.
assert(SpillAreaAlignmentBytes <= ARM32_STACK_ALIGNMENT_BYTES);
assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
uint32_t SpillAreaPaddingBytes = 0;
uint32_t LocalsSlotsPaddingBytes = 0;
alignStackSpillAreas(PreservedRegsSizeBytes, SpillAreaAlignmentBytes,
GlobalsSize, LocalsSlotsAlignmentBytes,
&SpillAreaPaddingBytes, &LocalsSlotsPaddingBytes);
SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
uint32_t GlobalsAndSubsequentPaddingSize =
GlobalsSize + LocalsSlotsPaddingBytes;
// Add the out args space to the stack, and align SP if necessary.
if (!NeedsStackAlignment) {
SpillAreaSizeBytes += MaxOutArgsSizeBytes;
} else {
uint32_t StackOffset = PreservedRegsSizeBytes;
uint32_t StackSize = applyStackAlignment(StackOffset + SpillAreaSizeBytes);
StackSize = applyStackAlignment(StackSize + MaxOutArgsSizeBytes);
SpillAreaSizeBytes = StackSize - StackOffset;
}
// Combine fixed alloca with SpillAreaSize.
SpillAreaSizeBytes += FixedAllocaSizeBytes;
// Generate "sub sp, SpillAreaSizeBytes"
if (SpillAreaSizeBytes) {
Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
// Use the scratch register if needed to legalize the immediate.
Operand *SubAmount = legalize(Ctx->getConstantInt32(SpillAreaSizeBytes),
Legal_Reg | Legal_Flex, getReservedTmpReg());
_sub(SP, SP, SubAmount);
if (FixedAllocaAlignBytes > ARM32_STACK_ALIGNMENT_BYTES) {
alignRegisterPow2(SP, FixedAllocaAlignBytes);
}
}
Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
// Fill in stack offsets for stack args, and copy args into registers for
// those that were register-allocated. Args are pushed right to left, so
// Arg[0] is closest to the stack/frame pointer.
Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg());
size_t BasicFrameOffset = PreservedRegsSizeBytes;
if (!UsesFramePointer)
BasicFrameOffset += SpillAreaSizeBytes;
const VarList &Args = Func->getArgs();
size_t InArgsSizeBytes = 0;
TargetARM32::CallingConv CC;
for (Variable *Arg : Args) {
RegNumT DummyReg;
const Type Ty = Arg->getType();
// Skip arguments passed in registers.
if (isScalarIntegerType(Ty)) {
if (CC.argInGPR(Ty, &DummyReg)) {
continue;
}
} else {
if (CC.argInVFP(Ty, &DummyReg)) {
continue;
}
}
finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, &InArgsSizeBytes);
}
// Fill in stack offsets for locals.
assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
UsesFramePointer);
this->HasComputedFrame = true;
if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
OstreamLocker _(Func->getContext());
Ostream &Str = Func->getContext()->getStrDump();
Str << "Stack layout:\n";
uint32_t SPAdjustmentPaddingSize =
SpillAreaSizeBytes - LocalsSpillAreaSize -
GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
MaxOutArgsSizeBytes;
Str << " in-args = " << InArgsSizeBytes << " bytes\n"
<< " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
<< " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
<< " globals spill area = " << GlobalsSize << " bytes\n"
<< " globals-locals spill areas intermediate padding = "
<< GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
<< " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
<< " SP alignment padding = " << SPAdjustmentPaddingSize << " bytes\n";
Str << "Stack details:\n"
<< " SP adjustment = " << SpillAreaSizeBytes << " bytes\n"
<< " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
<< " outgoing args size = " << MaxOutArgsSizeBytes << " bytes\n"
<< " locals spill area alignment = " << LocalsSlotsAlignmentBytes
<< " bytes\n"
<< " is FP based = " << UsesFramePointer << "\n";
}
}
void TargetARM32::addEpilog(CfgNode *Node) {
InstList &Insts = Node->getInsts();
InstList::reverse_iterator RI, E;
for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
if (llvm::isa<InstARM32Ret>(*RI))
break;
}
if (RI == E)
return;
// Convert the reverse_iterator position into its corresponding (forward)
// iterator position.
InstList::iterator InsertPoint = reverseToForwardIterator(RI);
--InsertPoint;
Context.init(Node);
Context.setInsertPoint(InsertPoint);
Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
if (UsesFramePointer) {
Variable *FP = getPhysicalRegister(RegARM32::Reg_fp);
// For late-stage liveness analysis (e.g. asm-verbose mode), adding a fake
// use of SP before the assignment of SP=FP keeps previous SP adjustments
// from being dead-code eliminated.
Context.insert<InstFakeUse>(SP);
_mov_redefined(SP, FP);
} else {
// add SP, SpillAreaSizeBytes
if (SpillAreaSizeBytes) {
// Use the scratch register if needed to legalize the immediate.
Operand *AddAmount =
legalize(Ctx->getConstantInt32(SpillAreaSizeBytes),
Legal_Reg | Legal_Flex, getReservedTmpReg());
_add(SP, SP, AddAmount);
}
}
if (!PreservedGPRs.empty())
_pop(PreservedGPRs);
if (!PreservedSRegs.empty())
_pop(PreservedSRegs);
}
bool TargetARM32::isLegalMemOffset(Type Ty, int32_t Offset) const {
constexpr bool ZeroExt = false;
return OperandARM32Mem::canHoldOffset(Ty, ZeroExt, Offset);
}
Variable *TargetARM32::PostLoweringLegalizer::newBaseRegister(
Variable *Base, int32_t Offset, RegNumT ScratchRegNum) {
// Legalizing the offset will likely need a movw/movt combination, but if the
// negated offset fits in 16 bits, it can be materialized with a single movw
// and subtracted instead.
const bool ShouldSub = Offset != 0 && (-Offset & 0xFFFF0000) == 0;
Variable *ScratchReg = Target->makeReg(IceType_i32, ScratchRegNum);
if (ShouldSub) {
Operand *OffsetVal =
Target->legalize(Target->Ctx->getConstantInt32(-Offset),
Legal_Reg | Legal_Flex, ScratchRegNum);
Target->_sub(ScratchReg, Base, OffsetVal);
} else {
Operand *OffsetVal =
Target->legalize(Target->Ctx->getConstantInt32(Offset),
Legal_Reg | Legal_Flex, ScratchRegNum);
Target->_add(ScratchReg, Base, OffsetVal);
}
if (ScratchRegNum == Target->getReservedTmpReg()) {
const bool BaseIsStackOrFramePtr =
Base->getRegNum() == Target->getFrameOrStackReg();
// No current code path should ever violate this assertion; we keep it here as
// a guard in case that ever changes. This is not a fatal error (thus the use
// of assert() and not llvm::report_fatal_error), as the program compiled by
// Subzero will still work correctly.
assert(BaseIsStackOrFramePtr);
// Side-effect: updates TempBase to reflect the new Temporary.
if (BaseIsStackOrFramePtr) {
TempBaseReg = ScratchReg;
TempBaseOffset = Offset;
} else {
TempBaseReg = nullptr;
TempBaseOffset = 0;
}
}
return ScratchReg;
}
OperandARM32Mem *TargetARM32::PostLoweringLegalizer::createMemOperand(
Type Ty, Variable *Base, int32_t Offset, bool AllowOffsets) {
assert(!Base->isRematerializable());
if (Offset == 0 || (AllowOffsets && Target->isLegalMemOffset(Ty, Offset))) {
return OperandARM32Mem::create(
Target->Func, Ty, Base,
llvm::cast<ConstantInteger32>(Target->Ctx->getConstantInt32(Offset)),
OperandARM32Mem::Offset);
}
if (!AllowOffsets || TempBaseReg == nullptr) {
newBaseRegister(Base, Offset, Target->getReservedTmpReg());
}
int32_t OffsetDiff = Offset - TempBaseOffset;
assert(AllowOffsets || OffsetDiff == 0);
if (!Target->isLegalMemOffset(Ty, OffsetDiff)) {
newBaseRegister(Base, Offset, Target->getReservedTmpReg());
OffsetDiff = 0;
}
assert(!TempBaseReg->isRematerializable());
return OperandARM32Mem::create(
Target->Func, Ty, TempBaseReg,
llvm::cast<ConstantInteger32>(Target->Ctx->getConstantInt32(OffsetDiff)),
OperandARM32Mem::Offset);
}
void TargetARM32::PostLoweringLegalizer::resetTempBaseIfClobberedBy(
const Inst *Instr) {
bool ClobbersTempBase = false;
if (TempBaseReg != nullptr) {
Variable *Dest = Instr->getDest();
if (llvm::isa<InstARM32Call>(Instr)) {
// The following assertion is an invariant, so we remove it from the if
// test. If the invariant is ever broken/invalidated/changed, remember
// to add it back to the if condition.
assert(TempBaseReg->getRegNum() == Target->getReservedTmpReg());
// The linker may need to clobber IP if the call is too far from PC. Thus,
// we assume IP will be overwritten.
ClobbersTempBase = true;
} else if (Dest != nullptr &&
Dest->getRegNum() == TempBaseReg->getRegNum()) {
// Register redefinition.
ClobbersTempBase = true;
}
}
if (ClobbersTempBase) {
TempBaseReg = nullptr;
TempBaseOffset = 0;
}
}
void TargetARM32::PostLoweringLegalizer::legalizeMov(InstARM32Mov *MovInstr) {
Variable *Dest = MovInstr->getDest();
assert(Dest != nullptr);
Type DestTy = Dest->getType();
assert(DestTy != IceType_i64);
Operand *Src = MovInstr->getSrc(0);
Type SrcTy = Src->getType();
(void)SrcTy;
assert(SrcTy != IceType_i64);
if (MovInstr->isMultiDest() || MovInstr->isMultiSource())
return;
bool Legalized = false;
if (!Dest->hasReg()) {
auto *SrcR = llvm::cast<Variable>(Src);
assert(SrcR->hasReg());
assert(!SrcR->isRematerializable());
const int32_t Offset = Dest->getStackOffset();
// This is a _mov(Mem(), Variable), i.e., a store.
Target->_str(SrcR, createMemOperand(DestTy, StackOrFrameReg, Offset),
MovInstr->getPredicate());
// _str() does not have a Dest, so we add a fake-def(Dest).
Target->Context.insert<InstFakeDef>(Dest);
Legalized = true;
} else if (auto *Var = llvm::dyn_cast<Variable>(Src)) {
if (Var->isRematerializable()) {
// This is equivalent to an x86 _lea(RematOffset(%esp/%ebp), Variable).
// ExtraOffset is only needed for frame-pointer based frames as we have
// to account for spill storage.
const int32_t ExtraOffset = (Var->getRegNum() == Target->getFrameReg())
? Target->getFrameFixedAllocaOffset()
: 0;
const int32_t Offset = Var->getStackOffset() + ExtraOffset;
Variable *Base = Target->getPhysicalRegister(Var->getRegNum());
Variable *T = newBaseRegister(Base, Offset, Dest->getRegNum());
Target->_mov(Dest, T);
Legalized = true;
} else {
if (!Var->hasReg()) {
// This is a _mov(Variable, Mem()), i.e., a load.
const int32_t Offset = Var->getStackOffset();
Target->_ldr(Dest, createMemOperand(DestTy, StackOrFrameReg, Offset),
MovInstr->getPredicate());
Legalized = true;
}
}
}
if (Legalized) {
if (MovInstr->isDestRedefined()) {
Target->_set_dest_redefined();
}
MovInstr->setDeleted();
}
}
// ARM32 address modes:
// ld/st i[8|16|32]: [reg], [reg +/- imm12], [pc +/- imm12],
// [reg +/- reg << shamt5]
// ld/st f[32|64] : [reg], [reg +/- imm8] , [pc +/- imm8]
// ld/st vectors : [reg]
//
// For now, we don't handle address modes with Relocatables.
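// For example, ldr r0, [sp, #4092] fits in the imm12 offset range, while an
// offset of 4096 does not and must be rewritten via a temporary base register
// (see postLowerLegalization() below).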
namespace {
// MemTraits contains per-type valid address mode information.
#define X(tag, elementty, int_width, fp_width, uvec_width, svec_width, sbits, \
ubits, rraddr, shaddr) \
static_assert(!(shaddr) || rraddr, "Check ICETYPEARM32_TABLE::" #tag);
ICETYPEARM32_TABLE
#undef X
static const struct {
int32_t ValidImmMask;
bool CanHaveImm;
bool CanHaveIndex;
bool CanHaveShiftedIndex;
} MemTraits[] = {
#define X(tag, elementty, int_width, fp_width, uvec_width, svec_width, sbits, \
ubits, rraddr, shaddr) \
{ \
(1 << ubits) - 1, \
(ubits) > 0, \
rraddr, \
shaddr, \
},
ICETYPEARM32_TABLE
#undef X
};
static constexpr SizeT MemTraitsSize = llvm::array_lengthof(MemTraits);
} // end of anonymous namespace
OperandARM32Mem *
TargetARM32::PostLoweringLegalizer::legalizeMemOperand(OperandARM32Mem *Mem,
bool AllowOffsets) {
assert(!Mem->isRegReg() || !Mem->getIndex()->isRematerializable());
assert(Mem->isRegReg() || Target->isLegalMemOffset(
Mem->getType(), Mem->getOffset()->getValue()));
bool Legalized = false;
Variable *Base = Mem->getBase();
int32_t Offset = Mem->isRegReg() ? 0 : Mem->getOffset()->getValue();
if (Base->isRematerializable()) {
const int32_t ExtraOffset = (Base->getRegNum() == Target->getFrameReg())
? Target->getFrameFixedAllocaOffset()
: 0;
Offset += Base->getStackOffset() + ExtraOffset;
Base = Target->getPhysicalRegister(Base->getRegNum());
assert(!Base->isRematerializable());
Legalized = true;
}
if (!Legalized) {
return nullptr;
}
if (!Mem->isRegReg()) {
return createMemOperand(Mem->getType(), Base, Offset, AllowOffsets);
}
assert(MemTraits[Mem->getType()].CanHaveIndex);
if (Offset != 0) {
if (TempBaseReg == nullptr) {
Base = newBaseRegister(Base, Offset, Target->getReservedTmpReg());
} else {
uint32_t Imm8, Rotate;
const int32_t OffsetDiff = Offset - TempBaseOffset;
if (OffsetDiff == 0) {
Base = TempBaseReg;
} else if (OperandARM32FlexImm::canHoldImm(OffsetDiff, &Rotate, &Imm8)) {
auto *OffsetDiffF = OperandARM32FlexImm::create(
Target->Func, IceType_i32, Imm8, Rotate);
Target->_add(TempBaseReg, TempBaseReg, OffsetDiffF);
TempBaseOffset += OffsetDiff;
Base = TempBaseReg;
} else if (OperandARM32FlexImm::canHoldImm(-OffsetDiff, &Rotate, &Imm8)) {
auto *OffsetDiffF = OperandARM32FlexImm::create(
Target->Func, IceType_i32, Imm8, Rotate);
Target->_sub(TempBaseReg, TempBaseReg, OffsetDiffF);
TempBaseOffset += OffsetDiff;
Base = TempBaseReg;
} else {
Base = newBaseRegister(Base, Offset, Target->getReservedTmpReg());
}
}
}
return OperandARM32Mem::create(Target->Func, Mem->getType(), Base,
Mem->getIndex(), Mem->getShiftOp(),
Mem->getShiftAmt(), Mem->getAddrMode());
}
void TargetARM32::postLowerLegalization() {
// If a stack variable's frame offset doesn't fit, convert from:
// ldr X, OFF[SP]
// to:
// movw/movt TMP, OFF_PART
// add TMP, TMP, SP
// ldr X, OFF_MORE[TMP]
//
// This is safe because we have reserved TMP, and add for ARM does not
// clobber the flags register.
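// For example (illustrative numbers): an i32 access at SP+70000 is outside
// the +/-4095 imm12 range, so it becomes roughly
//   movw TMP, #4464
//   movt TMP, #1
//   add TMP, TMP, SP
//   ldr X, [TMP]
// possibly with a small residual offset folded into the final ldr.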
Func->dump("Before postLowerLegalization");
assert(hasComputedFrame());
// Do a fairly naive greedy clustering for now. Pick the first stack slot
// that's out of bounds and make a new base reg using the architecture's temp
// register. If that works for the next slot, then great. Otherwise, create a
// new base register, clobbering the previous base register. Never share a
// base reg across different basic blocks. This isn't ideal if local and
// multi-block variables are far apart and their references are interspersed.
// It may help to be more coordinated about assigning stack slot numbers, and
// it may help to assign smaller offsets to higher-weight variables so that
// they don't depend on this legalization.
for (CfgNode *Node : Func->getNodes()) {
Context.init(Node);
// One legalizer per basic block, otherwise we would share the Temporary
// Base Register between basic blocks.
PostLoweringLegalizer Legalizer(this);
while (!Context.atEnd()) {
PostIncrLoweringContext PostIncrement(Context);
Inst *CurInstr = iteratorToInst(Context.getCur());
// Check if the previous TempBaseReg is clobbered, and reset if needed.
Legalizer.resetTempBaseIfClobberedBy(CurInstr);
if (auto *MovInstr = llvm::dyn_cast<InstARM32Mov>(CurInstr)) {
Legalizer.legalizeMov(MovInstr);
} else if (auto *LdrInstr = llvm::dyn_cast<InstARM32Ldr>(CurInstr)) {
if (OperandARM32Mem *LegalMem = Legalizer.legalizeMemOperand(
llvm::cast<OperandARM32Mem>(LdrInstr->getSrc(0)))) {
_ldr(CurInstr->getDest(), LegalMem, LdrInstr->getPredicate());
CurInstr->setDeleted();
}
} else if (auto *LdrexInstr = llvm::dyn_cast<InstARM32Ldrex>(CurInstr)) {
constexpr bool DisallowOffsetsBecauseLdrex = false;
if (OperandARM32Mem *LegalMem = Legalizer.legalizeMemOperand(
llvm::cast<OperandARM32Mem>(LdrexInstr->getSrc(0)),
DisallowOffsetsBecauseLdrex)) {
_ldrex(CurInstr->getDest(), LegalMem, LdrexInstr->getPredicate());
CurInstr->setDeleted();
}
} else if (auto *StrInstr = llvm::dyn_cast<InstARM32Str>(CurInstr)) {
if (OperandARM32Mem *LegalMem = Legalizer.legalizeMemOperand(
llvm::cast<OperandARM32Mem>(StrInstr->getSrc(1)))) {
_str(llvm::cast<Variable>(CurInstr->getSrc(0)), LegalMem,
StrInstr->getPredicate());
CurInstr->setDeleted();
}
} else if (auto *StrexInstr = llvm::dyn_cast<InstARM32Strex>(CurInstr)) {
constexpr bool DisallowOffsetsBecauseStrex = false;
if (OperandARM32Mem *LegalMem = Legalizer.legalizeMemOperand(
llvm::cast<OperandARM32Mem>(StrexInstr->getSrc(1)),
DisallowOffsetsBecauseStrex)) {
_strex(CurInstr->getDest(), llvm::cast<Variable>(CurInstr->getSrc(0)),
LegalMem, StrexInstr->getPredicate());
CurInstr->setDeleted();
}
}
// Sanity-check: the Legalizer will either have no Temp, or it will be
// bound to IP.
Legalizer.assertNoTempOrAssignedToIP();
}
}
}
Operand *TargetARM32::loOperand(Operand *Operand) {
assert(Operand->getType() == IceType_i64);
if (Operand->getType() != IceType_i64)
return Operand;
if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
return Var64On32->getLo();
if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand))
return Ctx->getConstantInt32(static_cast<uint32_t>(Const->getValue()));
if (auto *Mem = llvm::dyn_cast<OperandARM32Mem>(Operand)) {
// Conservatively disallow memory operands with side-effects (pre/post
// increment) in case of duplication.
assert(Mem->getAddrMode() == OperandARM32Mem::Offset ||
Mem->getAddrMode() == OperandARM32Mem::NegOffset);
if (Mem->isRegReg()) {
Variable *IndexR = legalizeToReg(Mem->getIndex());
return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(), IndexR,
Mem->getShiftOp(), Mem->getShiftAmt(),
Mem->getAddrMode());
} else {
return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(),
Mem->getOffset(), Mem->getAddrMode());
}
}
llvm::report_fatal_error("Unsupported operand type");
return nullptr;
}
Operand *TargetARM32::hiOperand(Operand *Operand) {
assert(Operand->getType() == IceType_i64);
if (Operand->getType() != IceType_i64)
return Operand;
if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
return Var64On32->getHi();
if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
return Ctx->getConstantInt32(
static_cast<uint32_t>(Const->getValue() >> 32));
}
if (auto *Mem = llvm::dyn_cast<OperandARM32Mem>(Operand)) {
// Conservatively disallow memory operands with side-effects in case of
// duplication.
assert(Mem->getAddrMode() == OperandARM32Mem::Offset ||
Mem->getAddrMode() == OperandARM32Mem::NegOffset);
const Type SplitType = IceType_i32;
if (Mem->isRegReg()) {
// We have to make a temp variable T, and add 4 to either Base or Index.
// The Index may be shifted, so adding 4 can mean something else. Thus,
// prefer T := Base + 4, and use T as the new Base.
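// For example, with [r1, r2, lsl #2], adding 4 to the index r2 would advance
// the address by 16 because of the shift, whereas T := r1 + 4 with
// [T, r2, lsl #2] advances it by exactly 4. (Illustrative registers.)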
Variable *Base = Mem->getBase();
Constant *Four = Ctx->getConstantInt32(4);
Variable *NewBase = Func->makeVariable(Base->getType());
lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add, NewBase,
Base, Four));
Variable *BaseR = legalizeToReg(NewBase);
Variable *IndexR = legalizeToReg(Mem->getIndex());
return OperandARM32Mem::create(Func, SplitType, BaseR, IndexR,
Mem->getShiftOp(), Mem->getShiftAmt(),
Mem->getAddrMode());
} else {
Variable *Base = Mem->getBase();
ConstantInteger32 *Offset = Mem->getOffset();
assert(!Utils::WouldOverflowAdd(Offset->getValue(), 4));
int32_t NextOffsetVal = Offset->getValue() + 4;
constexpr bool ZeroExt = false;
if (!OperandARM32Mem::canHoldOffset(SplitType, ZeroExt, NextOffsetVal)) {
// We have to make a temp variable and add 4 to either Base or Offset. If we
// added 4 to Offset, the new offset would no longer fit as an immediate
// (canHoldOffset just failed), which would turn this non-RegReg addressing
// mode into a RegReg addressing mode. Since NaCl sandboxing disallows RegReg
// addressing modes, prefer adding to the base and replacing it instead,
// leaving the old offset alone.
Constant *_4 = Ctx->getConstantInt32(4);
Variable *NewBase = Func->makeVariable(Base->getType());
lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add,
NewBase, Base, _4));
Base = NewBase;
} else {
Offset =
llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(NextOffsetVal));
}
Variable *BaseR = legalizeToReg(Base);
return OperandARM32Mem::create(Func, SplitType, BaseR, Offset,
Mem->getAddrMode());
}
}
llvm::report_fatal_error("Unsupported operand type");
return nullptr;
}
SmallBitVector TargetARM32::getRegisterSet(RegSetMask Include,
RegSetMask Exclude) const {
SmallBitVector Registers(RegARM32::Reg_NUM);
for (uint32_t i = 0; i < RegARM32::Reg_NUM; ++i) {
const auto &Entry = RegARM32::RegTable[i];
if (Entry.Scratch && (Include & RegSet_CallerSave))
Registers[i] = true;
if (Entry.Preserved && (Include & RegSet_CalleeSave))
Registers[i] = true;
if (Entry.StackPtr && (Include & RegSet_StackPointer))
Registers[i] = true;
if (Entry.FramePtr && (Include & RegSet_FramePointer))
Registers[i] = true;
if (Entry.Scratch && (Exclude & RegSet_CallerSave))
Registers[i] = false;
if (Entry.Preserved && (Exclude & RegSet_CalleeSave))
Registers[i] = false;
if (Entry.StackPtr && (Exclude & RegSet_StackPointer))
Registers[i] = false;
if (Entry.FramePtr && (Exclude & RegSet_FramePointer))
Registers[i] = false;
}
return Registers;
}
void TargetARM32::lowerAlloca(const InstAlloca *Instr) {
// Conservatively require the stack to be aligned. Some stack adjustment
// operations implemented below assume that the stack is aligned before the
// alloca. All the alloca code ensures that the stack alignment is preserved
// after the alloca. The stack alignment restriction can be relaxed in some
// cases.
NeedsStackAlignment = true;
// For default align=0, set it to the real value 1, to avoid any
// bit-manipulation problems below.
const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes());
// LLVM enforces power of 2 alignment.
assert(llvm::isPowerOf2_32(AlignmentParam));
assert(llvm::isPowerOf2_32(ARM32_STACK_ALIGNMENT_BYTES));
const uint32_t Alignment =
std::max(AlignmentParam, ARM32_STACK_ALIGNMENT_BYTES);
const bool OverAligned = Alignment > ARM32_STACK_ALIGNMENT_BYTES;
const bool OptM1 = Func->getOptLevel() == Opt_m1;
const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset();
const bool UseFramePointer =
hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;
if (UseFramePointer)
setHasFramePointer();
Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
if (OverAligned) {
alignRegisterPow2(SP, Alignment);
}
Variable *Dest = Instr->getDest();
Operand *TotalSize = Instr->getSizeInBytes();
if (const auto *ConstantTotalSize =
llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
const uint32_t Value =
Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
// Constant size alloca.
if (!UseFramePointer) {
// If we don't need a Frame Pointer, this alloca has a known offset to the
// stack pointer. We don't need to adjust the stack pointer, nor assign any
// value to Dest, as Dest is rematerializable.
assert(Dest->isRematerializable());
FixedAllocaSizeBytes += Value;
Context.insert<InstFakeDef>(Dest);
return;
}
// If a frame pointer is required, then we need to store the alloca'd result
// in Dest.
Operand *SubAmountRF =
legalize(Ctx->getConstantInt32(Value), Legal_Reg | Legal_Flex);
_sub(SP, SP, SubAmountRF);
} else {
// Non-constant sizes need to be adjusted to the next highest multiple of
// the required alignment at runtime.
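// In effect (assuming alignRegisterPow2 clears the low log2(Alignment) bits):
//   T = (TotalSize + Alignment - 1) & ~(Alignment - 1)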
TotalSize = legalize(TotalSize, Legal_Reg | Legal_Flex);
Variable *T = makeReg(IceType_i32);
_mov(T, TotalSize);
Operand *AddAmount = legalize(Ctx->getConstantInt32(Alignment - 1));
_add(T, T, AddAmount);
alignRegisterPow2(T, Alignment);
_sub(SP, SP, T);
}
// Add the out-args area size back so that Dest points just above the out-args
// area rather than at SP (SP itself is left unchanged here).
Variable *T = SP;
if (MaxOutArgsSizeBytes != 0) {
T = makeReg(getPointerType());
Operand *OutArgsSizeRF = legalize(
Ctx->getConstantInt32(MaxOutArgsSizeBytes), Legal_Reg | Legal_Flex);
_add(T, SP, OutArgsSizeRF);
}
_mov(Dest, T);
}
void TargetARM32::div0Check(Type Ty, Operand *SrcLo, Operand *SrcHi) {
if (isGuaranteedNonzeroInt(SrcLo) || isGuaranteedNonzeroInt(SrcHi))
return;
Variable *SrcLoReg = legalizeToReg(SrcLo);
switch (Ty) {
default:
llvm_unreachable(
("Unexpected type in div0Check: " + typeStdString(Ty)).c_str());
case IceType_i8:
case IceType_i16: {
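// Shifting left by (32 - width) discards everything above the operand's
// width, so the flag-setting lsls below sets Z iff the low `width` bits (the
// narrow divisor) are all zero.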
Operand *ShAmtImm = shAmtImm(32 - getScalarIntBitWidth(Ty));
Variable *T = makeReg(IceType_i32);
_lsls(T, SrcLoReg, ShAmtImm);
Context.insert<InstFakeUse>(T);
} break;
case IceType_i32: {
_tst(SrcLoReg, SrcLoReg);
break;
}
case IceType_i64: {
Variable *T = makeReg(IceType_i32);
_orrs(T, SrcLoReg, legalize(SrcHi, Legal_Reg | Legal_Flex));
// T isn't going to be used, but we need the side-effect of setting flags
// from this operation.
Context.insert<InstFakeUse>(T);
}
}
auto *Label = InstARM32Label::create(Func, this);
_br(Label, CondARM32::NE);
_trap();
Context.insert(Label);
}
void TargetARM32::lowerIDivRem(Variable *Dest, Variable *T, Variable *Src0R,
Operand *Src1, ExtInstr ExtFunc,
DivInstr DivFunc, bool IsRemainder) {
div0Check(Dest->getType(), Src1, nullptr);
Variable *Src1R = legalizeToReg(Src1);
Variable *T0R = Src0R;
Variable *T1R = Src1R;
if (Dest->getType() != IceType_i32) {
T0R = makeReg(IceType_i32);
(this->*ExtFunc)(T0R, Src0R, CondARM32::AL);
T1R = makeReg(IceType_i32);
(this->*ExtFunc)(T1R, Src1R, CondARM32::AL);
}
if (hasCPUFeature(TargetARM32Features::HWDivArm)) {
(this->*DivFunc)(T, T0R, T1R, CondARM32::AL);
if (IsRemainder) {
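// Recover the remainder from the quotient: the mls below computes
// remainder = dividend - quotient * divisor.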
Variable *T2 = makeReg(IceType_i32);
_mls(T2, T, T1R, T0R);
T = T2;
}
_mov(Dest, T);
} else {
llvm::report_fatal_error("div should have already been turned into a call");
}
}
TargetARM32::SafeBoolChain
TargetARM32::lowerInt1Arithmetic(const InstArithmetic *Instr) {
Variable *Dest = Instr->getDest();
assert(Dest->getType() == IceType_i1);
// So folding didn't work for Instr. Not a problem: We just need to
// materialize the Sources, and perform the operation. We create regular
// Variables (and not infinite-weight ones) because this call might recurse a
// lot, and we might end up with tons of infinite weight temporaries.
assert(Instr->getSrcSize() == 2);
Variable *Src0 = Func->makeVariable(IceType_i1);
SafeBoolChain Src0Safe = lowerInt1(Src0, Instr->getSrc(0));
Operand *Src1 = Instr->getSrc(1);
SafeBoolChain Src1Safe = SBC_Yes;
if (!llvm::isa<Constant>(Src1)) {
Variable *Src1V = Func->makeVariable(IceType_i1);
Src1Safe = lowerInt1(Src1V, Src1);
Src1 = Src1V;
}
Variable *T = makeReg(IceType_i1);
Src0 = legalizeToReg(Src0);
Operand *Src1RF = legalize(Src1, Legal_Reg | Legal_Flex);
switch (Instr->getOp()) {
default:
// If this fatal error is ever hit, add the offending operation to the list
// of valid consumers.
llvm::report_fatal_error("Unhandled i1 Op");
case InstArithmetic::And:
_and(T, Src0, Src1RF);
break;
case InstArithmetic::Or:
_orr(T, Src0, Src1RF);
break;
case InstArithmetic::Xor:
_eor(T, Src0, Src1RF);
break;
}
_mov(Dest, T);
return Src0Safe == SBC_Yes && Src1Safe == SBC_Yes ? SBC_Yes : SBC_No;
}
namespace {
// NumericOperands is used during arithmetic/icmp lowering for constant folding.
// It holds the two source operands, and maintains some state as to whether one
// of them is a constant. If one of the operands is a constant, then it will be
// stored as the operation's second source, with a bit indicating whether the
// operands were swapped.
//
// The class is split into a base class with operand type-independent methods,
// and a derived, templated class, for each type of operand we want to fold
// constants for:
//
// NumericOperandsBase --> NumericOperands<ConstantFloat>
// --> NumericOperands<ConstantDouble>
// --> NumericOperands<ConstantInt32>
//
// NumericOperands<ConstantInt32> also exposes helper methods for emitting
// inverted/negated immediates.
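// A quick illustration of the canonicalization: NumericOperands(Var, Const)
// keeps Var as Src0 and Const as Src1 with Swapped == false, while
// NumericOperands(Const, Var) stores Var as Src0, Const as Src1, and
// Swapped == true, so callers can still recover the original operand order.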
class NumericOperandsBase {
NumericOperandsBase() = delete;
NumericOperandsBase(const NumericOperandsBase &) = delete;
NumericOperandsBase &operator=(const NumericOperandsBase &) = delete;
public:
NumericOperandsBase(Operand *S0, Operand *S1)
: Src0(NonConstOperand(S0, S1)), Src1(ConstOperand(S0, S1)),
Swapped(Src0 == S1 && S0 != S1) {
assert(Src0 != nullptr);
assert(Src1 != nullptr);
assert(Src0 != Src1 || S0 == S1);
}
bool hasConstOperand() const {
return llvm::isa<Constant>(Src1) && !llvm::isa<ConstantRelocatable>(Src1);
}
bool swappedOperands() const { return Swapped; }
Variable *src0R(TargetARM32 *Target) const {
return legalizeToReg(Target, Src0);
}
Variable *unswappedSrc0R(TargetARM32 *Target) const {
return legalizeToReg(Target, Swapped ? Src1 : Src0);
}
Operand *src1RF(TargetARM32 *Target) const {
return legalizeToRegOrFlex(Target, Src1);
}
Variable *unswappedSrc1R(TargetARM32 *Target) const {
return legalizeToReg(Target, Swapped ? Src0 : Src1);
}
Operand *src1() const { return Src1; }
protected:
Operand *const Src0;
Operand *const Src1;
const bool Swapped;
static Variable *legalizeToReg(TargetARM32 *Target, Operand *Src) {
return Target->legalizeToReg(Src);
}
static Operand *legalizeToRegOrFlex(TargetARM32 *Target, Operand *Src) {
return Target->legalize(Src,
TargetARM32::Legal_Reg | TargetARM32::Legal_Flex);
}
private:
static Operand *NonConstOperand(Operand *S0, Operand *S1) {
if (!llvm::isa<Constant>(S0))
return S0;
if (!llvm::isa<Constant>(S1))
return S1;
if (llvm::isa<ConstantRelocatable>(S1) &&
!llvm::isa<ConstantRelocatable>(S0))
return S1;
return S0;
}
static Operand *ConstOperand(Operand *S0, Operand *S1) {
if (!llvm::isa<Constant>(S0))
return S1;
if (!llvm::isa<Constant>(S1))
return S0;
if (llvm::isa<ConstantRelocatable>(S1) &&
!llvm::isa<ConstantRelocatable>(S0))
return S0;
return S1;
}
};
template <typename C> class NumericOperands : public NumericOperandsBase {
NumericOperands() = delete;
NumericOperands(const NumericOperands &) = delete;
NumericOperands &operator=(const NumericOperands &) = delete;
public:
NumericOperands(Operand *S0, Operand *S1) : NumericOperandsBase(S0, S1) {
assert(!hasConstOperand() || llvm::isa<C>(this->Src1));
}
typename C::PrimType getConstantValue() const {
return llvm::cast<C>(Src1)->getValue();
}
};
using FloatOperands = NumericOperands<ConstantFloat>;
using DoubleOperands = NumericOperands<ConstantDouble>;
class Int32Operands : public NumericOperands<ConstantInteger32> {
Int32Operands() = delete;
Int32Operands(const Int32Operands &) = delete;
Int32Operands &operator=(const Int32Operands &) = delete;
public:
Int32Operands(Operand *S0, Operand *S1) : NumericOperands(S0, S1) {}
Operand *unswappedSrc1RShAmtImm(TargetARM32 *Target) const {
if (!swappedOperands() && hasConstOperand()) {
return Target->shAmtImm(getConstantValue() & 0x1F);
}
return legalizeToReg(Target, Swapped ? Src0 : Src1);
}
bool isSrc1ImmediateZero() const {
if (!swappedOperands() && hasConstOperand()) {
return getConstantValue() == 0;
}
return false;
}
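// Reminder on ARM's flexible second operand: an immediate is encodable if it
// is an 8-bit value rotated right by an even amount; e.g., 0xFF00 (0xFF
// rotated right by 24) is encodable, while 0x101 is not. When the value
// itself is not encodable, its negation or bitwise inverse sometimes is,
// which the helpers below check (callers can then, e.g., use sub instead of
// add, or bic instead of and).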
bool immediateIsFlexEncodable() const {
uint32_t Rotate, Imm8;
return OperandARM32FlexImm::canHoldImm(getConstantValue(), &Rotate, &Imm8);
}
bool negatedImmediateIsFlexEncodable() const {
uint32_t Rotate, Imm8;
return OperandARM32FlexImm::canHoldImm(
-static_cast<int32_t>(getConstantValue()), &Rotate, &Imm8);
}
Operand *negatedSrc1F(TargetARM32 *Target) const {
return legalizeToRegOrFlex(Target,
Target->getCtx()->getConstantInt32(
-static_cast<int32_t>(getConstantValue())));
}
bool invertedImmediateIsFlexEncodable() const {
uint32_t Rotate, Imm8;
return OperandARM32FlexImm::canHoldImm(
~static_cast<uint32_t>(getConstantValue()), &Rotate, &Imm8);
}
Operand *invertedSrc1F(TargetARM32 *Target) const {
return legalizeToRegOrFlex(Target,
Target->getCtx()->getConstantInt32(
~static_cast<uint32_t>(getConstantValue())));
}
};
} // end of anonymous namespace
void TargetARM32::preambleDivRem(const InstCall *Instr) {
Operand *Src1 = Instr->getArg(1);
switch (Src1->getType()) {
default:
llvm::report_fatal_error("Invalid type for idiv.");
case IceType_i64: {
if (auto *C = llvm::dyn_cast<ConstantInteger64>(Src1)) {
if (C->getValue() == 0) {
_trap();
return;
}
}
div0Check(IceType_i64, loOperand(Src1), hiOperand(Src1));
return;
}
case IceType_i32: {
// Src0 and Src1 have already been appropriately extended to an i32, so we
// don't check for i8 and i16.
if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
if (C->getValue() == 0) {
_trap();
return;
}
}
div0Check(IceType_i32, Src1, nullptr);
return;
}
}
}
void TargetARM32::lowerInt64Arithmetic(InstArithmetic::OpKind Op,
Variable *Dest, Operand *Src0,
Operand *Src1) {
Int32Operands SrcsLo(loOperand(Src0), loOperand(Src1));
Int32Operands SrcsHi(hiOperand(Src0), hiOperand(Src1));
assert(SrcsLo.swappedOperands() == SrcsHi.swappedOperands());
assert(SrcsLo.hasConstOperand() == SrcsHi.hasConstOperand());
auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Variable *T_Lo = makeReg(DestLo->getType());
Variable *T_Hi = makeReg(DestHi->getType());
switch (Op) {
case InstArithmetic::_num:
llvm::report_fatal_error("Unknown arithmetic operator");
return;
case InstArithmetic::Add: {
Variable *Src0LoR = SrcsLo.src0R(this);
Operand *Src1LoRF = SrcsLo.src1RF(this);
Variable *Src0HiR = SrcsHi.src0R(this);
Operand *Src1HiRF = SrcsHi.src1RF(this);
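// The low halves are added with adds so the carry flag is set, and adc then
// folds that carry into the high-half addition.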
_adds(T_Lo, Src0LoR, Src1LoRF);
_mov(DestLo, T_Lo);
_adc(T_Hi, Src0HiR, Src1HiRF);
_mov(DestHi, T_Hi);
return;
}
case InstArithmetic::And: {
Variable *Src0LoR = SrcsLo.src0R(this);
Operand *Src1LoRF = SrcsLo.src1RF(this);
Variable *Src0HiR = SrcsHi.src0R(this);
Operand *Src1HiRF = SrcsHi.src1RF(this);
_and(T_Lo, Src0LoR, Src1LoRF);
_mov(DestLo, T_Lo);
_and(T_Hi, Src0HiR, Src1HiRF);
_mov(DestHi, T_Hi);
return;
}
case InstArithmetic::Or: {
Variable *Src0LoR = SrcsLo.src0R(this);
Operand *Src1LoRF = SrcsLo.src1RF(this);
Variable *Src0HiR = SrcsHi.src0R(this);
Operand *Src1HiRF = SrcsHi.src1RF(this);
_orr(T_Lo, Src0LoR, Src1LoRF);
_mov(DestLo, T_Lo);
_orr(T_Hi, Src0HiR, Src1HiRF);
_mov(DestHi, T_Hi);
return;
}
case InstArithmetic::Xor: {
Variable *Src0LoR = SrcsLo.src0R(this);
Operand *Src1LoRF = SrcsLo.src1RF(this);
Variable *Src0HiR = SrcsHi.src0R(this);
Operand *Src1HiRF = SrcsHi.src1RF(this);
_eor(T_Lo, Src0LoR, Src1LoRF);
_mov(DestLo, T_Lo);
_eor(T_Hi, Src0HiR, Src1HiRF);
_mov(DestHi, T_Hi);
return;
}
case InstArithmetic::Sub: {
Variable *Src0LoR = SrcsLo.src0R(this);
Operand *Src1LoRF = SrcsLo.src1RF(this);
Variable *Src0HiR = SrcsHi.src0R(this);
Operand *Src1HiRF = SrcsHi.src1RF(this);
if (SrcsLo.swappedOperands()) {
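// The constant ended up as Src1 even though it was the original first
// operand, so use reverse-subtract: rsbs/rsc compute Src1 - Src0, and the
// borrow chains through rsc for the high half.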
_rsbs(T_Lo, Src0LoR, Src1LoRF);
_mov(DestLo, T_Lo);
_rsc(T_Hi, Src0HiR, Src1HiRF);
_mov(DestHi, T_Hi);
} else {
_subs(T_Lo, Src0LoR, Src1LoRF);
_mov(DestLo, T_Lo);
_sbc(T_Hi, Src0HiR, Src1HiRF);
_mov(DestHi, T_Hi);
}
return;
}
case InstArithmetic::Mul: {
// GCC 4.8 does:
// a=b*c ==>
// t_acc =(mul) (b.lo * c.hi)
// t_acc =(mla) (c.lo * b.hi) + t_acc
// t.hi,t.lo =(umull) b.lo * c.lo
// t.hi += t_acc
// a.lo = t.lo
// a.hi = t.hi
//
// LLVM does:
// t.hi,t.lo =(umull) b.lo * c.lo
// t.hi =(mla) (b.lo * c.hi) + t.hi
// t.hi =(mla) (b.hi * c.lo) + t.hi
// a.lo = t.lo
// a.hi = t.hi
//
// LLVM's lowering has fewer instructions, but more register pressure:
// t.lo is live from beginning to end, while GCC delays the two-dest
// instruction till the end, and kills c.hi immediately.
Variable *T_Acc = makeReg(IceType_i32);
Variable *T_Acc1 = makeReg(IceType_i32);
Variable *T_Hi1 = makeReg(IceType_i32);
Variable *Src0RLo = SrcsLo.unswappedSrc0R(this);
Variable *Src0RHi = SrcsHi.unswappedSrc0R(this);
Variable *Src1RLo = SrcsLo.unswappedSrc1R(this);
Variable *Src1RHi = SrcsHi.unswappedSrc1R(this);
_mul(T_Acc, Src0RLo, Src1RHi);
_mla(T_Acc1, Src1RLo, Src0RHi, T_Acc);
_umull(T_Lo, T_Hi1, Src0RLo, Src1RLo);
_add(T_Hi, T_Hi1, T_Acc1);
_mov(DestLo, T_Lo);
_mov(DestHi, T_Hi);
return;
}
case InstArithmetic::Shl: {
if (!SrcsLo.swappedOperands() && SrcsLo.hasConstOperand()) {
Variable *Src0RLo = SrcsLo.src0R(this);
// Truncating the ShAmt to [0, 63] because that's what ARM does anyway.
const int32_t ShAmtImm = SrcsLo.getConstantValue() & 0x3F;
if (ShAmtImm == 0) {
_mov(DestLo, Src0RLo);
_mov(DestHi, SrcsHi.src0R(this));
return;
}
if (ShAmtImm >= 32) {
if (ShAmtImm == 32) {
_mov(DestHi, Src0RLo);
} else {
Operand *ShAmtOp = shAmtImm(ShAmtImm - 32);
_lsl(T_Hi, Src0RLo, ShAmtOp);
_mov(DestHi, T_Hi);
}
Operand *_0 =
legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex);
_mov(T_Lo, _0);
_mov(DestLo, T_Lo);
return;
}
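// For 0 < ShAmtImm < 32 the halves combine as
//   DestHi = (Src0Hi << ShAmtImm) | (Src0Lo >> (32 - ShAmtImm))
//   DestLo = Src0Lo << ShAmtImm
// which is what the lsl/orr/lsl sequence below implements.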
Variable *Src0RHi = SrcsHi.src0R(this);
Operand *ShAmtOp = shAmtImm(ShAmtImm);
Operand *ComplShAmtOp = shAmtImm(32 - ShAmtImm);
_lsl(T_Hi, Src0RHi, ShAmtOp);
_orr(T_Hi, T_Hi,
OperandARM32FlexReg::create(Func, IceType_i32, Src0RLo,
OperandARM32::LSR, ComplShAmtOp));
_mov(DestHi, T_Hi);
_lsl(T_Lo, Src0RLo, ShAmtOp);
_mov(DestLo, T_Lo);
return;
}
// a=b<<c ==>
// pnacl-llc does:
// mov t_b.lo, b.lo
// mov t_b.hi, b.hi
// mov t_c.lo, c.lo
// rsb T0, t_c.lo, #32
// lsr T1, t_b.lo, T0
// orr t_a.hi, T1, t_b.hi, lsl t_c.lo
// sub T2, t_c.lo, #32
// cmp T2, #0
// lslge t_a.hi, t_b.lo, T2
// lsl t_a.lo, t_b.lo, t_c.lo
// mov a.lo, t_a.lo
// mov a.hi, t_a.hi
//
// GCC 4.8 does:
// sub t_c1, c.lo, #32
// lsl t_hi, b.hi, c.lo
// orr t_hi, t_hi, b.lo, lsl t_c1
// rsb t_c2, c.lo, #32
// orr t_hi, t_hi, b.lo, lsr t_c2
// lsl t_lo, b.lo, c.lo
// a.lo = t_lo
// a.hi = t_hi
//
// These are incompatible, therefore we mimic pnacl-llc.
// Can be strength-reduced for constant-shifts, but we don't do that for
// now.
// Given the sub/rsb T_C, C.lo, #32, one of the T_C values will be negative.
// On ARM, register-specified shifts only use the bottom byte of the shift
// register, and any amount of 32 or more produces 0 for LSL/LSR, so the
// negative value behaves like an over-large shift and contributes nothing.
Operand *_32 = legalize(Ctx->getConstantInt32(32), Legal_Reg | Legal_Flex);
Operand *_0 =
legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex);
Variable *T0 = makeReg(IceType_i32);
Variable *T1 = makeReg(IceType_i32);
Variable *T2 = makeReg(IceType_i32);
Variable *TA_Hi = makeReg(IceType_i32);
Variable *TA_Lo = makeReg(IceType_i32);
Variable *Src0RLo = SrcsLo.unswappedSrc0R(this);
Variable *Src0RHi = SrcsHi.unswappedSrc0R(this);
Variable *Src1RLo = SrcsLo.unswappedSrc1R(this);
_rsb(T0, Src1RLo, _32);
_lsr(T1, Src0RLo, T0);
_orr(TA_Hi, T1,
OperandARM32FlexReg::create(Func, IceType_i32, Src0RHi,
OperandARM32::LSL, Src1RLo));
_sub(T2, Src1RLo, _32);
_cmp(T2, _0);
_lsl(TA_Hi, Src0RLo, T2, CondARM32::GE);
_set_dest_redefined();
_lsl(TA_Lo, Src0RLo, Src1RLo);
_mov(DestLo, TA_Lo);
_mov(DestHi, TA_Hi);
return;
}
case InstArithmetic::Lshr:
case InstArithmetic::Ashr: {
const bool ASR = Op == InstArithmetic::Ashr;
if (!SrcsLo.swappedOperands() && SrcsLo.hasConstOperand()) {
Variable *Src0RHi = SrcsHi.src0R(this);
// Truncating the ShAmt to [0, 63] because that's what ARM does anyway.
const int32_t ShAmt = SrcsLo.getConstantValue() & 0x3F;
if (ShAmt == 0) {
_mov(DestHi, Src0RHi);
_mov(DestLo, SrcsLo.src0R(this));
return;
}
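// For ShAmt >= 32 the low destination half comes entirely from the high
// source half, DestLo = Src0Hi >> (ShAmt - 32) (arithmetic or logical as
// appropriate), while DestHi becomes the replicated sign (ASR, via
// Src0Hi >> 31) or zero (LSR).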
if (ShAmt >= 32) {
if (ShAmt == 32) {
_mov(DestLo, Src0RHi);
} else {
Operand *ShAmtImm = shAmtImm(ShAmt - 32);
if (ASR) {
_asr(T_Lo, Src0RHi, ShAmtImm);
} else {
_lsr(T_Lo, Src0RHi, ShAmtImm);
}
_mov(DestLo, T_Lo);
}
if (ASR) {
Operand *_31 = shAmtImm(31);
_asr(T_Hi, Src0RHi, _31);
} else {