// blob: 401b55ef5129084312930e1865aed0da6790b7e8 [file] [log] [blame]
//===- subzero/src/IceTargetLoweringX8664.h - lowering for x86-64 -*- C++ -*-=//
//
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Declares the TargetLoweringX8664 class, which implements the
/// TargetLowering interface for the X86 64-bit architecture.
///
//===----------------------------------------------------------------------===//
#ifndef SUBZERO_SRC_ICETARGETLOWERINGX8664_H
#define SUBZERO_SRC_ICETARGETLOWERINGX8664_H
#include "IceAssemblerX8664.h"
#include "IceCfg.h"
#include "IceDefs.h"
#include "IceGlobalContext.h"
#include "IceInst.h"
#include "IceInstX8664.h"
#include "IceSwitchLowering.h"
#include "IceTargetLoweringX86.h"
#include "IceTargetLoweringX86RegClass.h"
#include "IceUtils.h"
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>
#include <type_traits>
#include <utility>
namespace Ice {
namespace X8664 {
using namespace ::Ice::X86;
constexpr Type WordType = IceType_i64;
/// Per-variable bookkeeping record used by BoolFolding; it is the mapped type
/// of BoolFolding::Producers. Copy construction is deleted, while default
/// construction, construction from an Inst*, and copy assignment are available.
class BoolFoldingEntry {
BoolFoldingEntry(const BoolFoldingEntry &) = delete;
public:
BoolFoldingEntry() = default;
explicit BoolFoldingEntry(Inst *I);
BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;
/// Instr is the instruction producing the i1-type variable of interest.
Inst *Instr = nullptr;
/// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr).
bool IsComplex = false;
/// IsLiveOut is initialized conservatively to true, and is set to false when
/// we encounter an instruction that ends Var's live range. We disable the
/// folding optimization when Var is live beyond this basic block. Note that
/// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will
/// always be true and the folding optimization will never be performed.
bool IsLiveOut = true;
/// NumUses counts the number of times Var is used as a source operand in the
/// basic block. If IsComplex is true and there is more than one use of Var,
/// then the folding optimization is disabled for Var.
uint32_t NumUses = 0;
};
class BoolFolding {
public:
enum BoolFoldingProducerKind {
PK_None,
// TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative.
PK_Icmp32,
PK_Icmp64,
PK_Fcmp,
PK_Trunc,
PK_Arith // A flag-setting arithmetic instruction.
};
/// Currently the actual enum values are not used (other than CK_None), but we
/// go ahead and produce them anyway for symmetry with the
/// BoolFoldingProducerKind.
enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };
private:
BoolFolding(const BoolFolding &) = delete;
BoolFolding &operator=(const BoolFolding &) = delete;
public:
BoolFolding() = default;
static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
static bool hasComplexLowering(const Inst *Instr);
static bool isValidFolding(BoolFoldingProducerKind ProducerKind,
BoolFoldingConsumerKind ConsumerKind);
void init(CfgNode *Node);
const Inst *getProducerFor(const Operand *Opnd) const;
void dump(const Cfg *Func) const;
private:
/// Returns true if Producers contains a valid entry for the given VarNum.
bool containsValid(SizeT VarNum) const {
auto Element = Producers.find(VarNum);
return Element != Producers.end() && Element->second.Instr != nullptr;
}
void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
void invalidateProducersOnStore(const Inst *Instr);
/// Producers maps Variable::Number to a BoolFoldingEntry.
CfgUnorderedMap<SizeT, BoolFoldingEntry> Producers;
};
class TargetX8664 : public TargetX86 {
// Default construction, copy construction and copy assignment are all
// disabled for this class.
TargetX8664() = delete;
TargetX8664(const TargetX8664 &) = delete;
TargetX8664 &operator=(const TargetX8664 &) = delete;
// BoolFolding is granted access to the non-public members of this class.
friend class BoolFolding;
public:
// Short aliases for condition-code, segment-register and instruction types
// that are referenced throughout the helpers declared in this class.
using BrCond = CondX86::BrCond;
using CmppsCond = CondX86::CmppsCond;
using SegmentRegisters = X86OperandMem::SegmentRegisters;
using InstX86Br = Insts::Br;
using InstX86FakeRMW = Insts::FakeRMW;
using InstX86Label = Insts::Label;
~TargetX8664() override = default;
// Static entry points; their definitions live outside this header.
static void staticInit(GlobalContext *Ctx);
static bool shouldBePooled(const Constant *C);
static ::Ice::Type getPointerType();
// Translation drivers and optimization passes (defined out of line).
void translateOm1() override;
void translateO2() override;
void doLoadOpt();
bool doBranchOpt(Inst *I, const CfgNode *NextNode) override;
/// Returns the register count, RegisterSet::Reg_NUM.
SizeT getNumRegisters() const override { return RegisterSet::Reg_NUM; }
/// Creates (without inserting) the target instruction that copies SrcVar into
/// Dest: an Insts::Movp when Dest has a vector type, an Insts::Mov otherwise.
/// \param Dest   destination variable; its type selects the instruction.
/// \param SrcVar source variable.
/// \return the newly created instruction.
Inst *createLoweredMove(Variable *Dest, Variable *SrcVar) override {
  if (isVectorType(Dest->getType())) {
    return Insts::Movp::create(Func, Dest, SrcVar);
  }
  return Insts::Mov::create(Func, Dest, SrcVar);
}
// Physical-register lookup and register naming (defined out of line). Ty
// defaults to IceType_void for getPhysicalRegister.
Variable *getPhysicalRegister(RegNumT RegNum,
Type Ty = IceType_void) override;
const char *getRegName(RegNumT RegNum, Type Ty) const override;
/// Returns a printable name for register class C. The x86-specific classes
/// are named here; any other class must be below RC_Target and is named by
/// regClassString().
static const char *getRegClassName(RegClass C) {
  const auto X86Class = static_cast<RegClassX86>(C);
  assert(X86Class < RCX86_NUM);
  switch (X86Class) {
  case RCX86_Is64To8:
    return "i64to8"; // 64-bit GPR truncable to i8
  case RCX86_Is32To8:
    return "i32to8"; // 32-bit GPR truncable to i8
  case RCX86_Is16To8:
    return "i16to8"; // 16-bit GPR truncable to i8
  case RCX86_IsTrunc8Rcvr:
    return "i8from"; // 8-bit GPR truncable from wider GPRs
  case RCX86_IsAhRcvr:
    return "i8fromah"; // 8-bit GPR that ah can be assigned to
  default:
    break;
  }
  // Not one of the x86-specific classes: defer to the generic naming.
  assert(C < RC_Target);
  return regClassString(C);
}
// Returns a register bit vector selected by the Include/Exclude masks
// (defined out of line).
SmallBitVector getRegisterSet(RegSetMask Include,
RegSetMask Exclude) const override;
/// Returns the TypeToRegisterSet entry selected by Var's register class.
const SmallBitVector &
getRegistersForVariable(const Variable *Var) const override {
  const RegClass VarClass = Var->getRegClass();
  assert(static_cast<RegClassX86>(VarClass) < RCX86_NUM);
  return TypeToRegisterSet[VarClass];
}
/// Returns the TypeToRegisterSetUnfiltered entry selected by Var's register
/// class.
const SmallBitVector &
getAllRegistersForVariable(const Variable *Var) const override {
  const RegClass VarClass = Var->getRegClass();
  assert(static_cast<RegClassX86>(VarClass) < RCX86_NUM);
  return TypeToRegisterSetUnfiltered[VarClass];
}
/// Returns the RegisterAliases entry for Reg, after asserting Reg is valid.
const SmallBitVector &getAliasesForRegister(RegNumT Reg) const override {
Reg.assertIsValid();
return RegisterAliases[Reg];
}
/// Frame-pointer state is tracked in IsEbpBasedFrame.
bool hasFramePointer() const override { return IsEbpBasedFrame; }
void setHasFramePointer() override { IsEbpBasedFrame = true; }
/// The stack register is rsp and the frame register is rbp.
RegNumT getStackReg() const override { return RegX8664::Reg_rsp; }
RegNumT getFrameReg() const override { return RegX8664::Reg_rbp; }
/// Picks the register used to address stack variables: the frame register
/// only when the function has a frame pointer and no stack pointer
/// realignment is needed; the stack register in every other case.
RegNumT getFrameOrStackReg() const override {
  // When the stack pointer has to be realigned the frame pointer is left
  // unaligned, so the stack pointer is the only usable base.
  if (needsStackPointerAlignment() || !IsEbpBasedFrame)
    return getStackReg();
  return getFrameReg();
}
/// Returns the byte width of Ty rounded up, via Utils::applyAlignment, to a
/// multiple of the WordType byte width.
size_t typeWidthInBytesOnStack(Type Ty) const override {
  const uint32_t SlotGranule = typeWidthInBytes(WordType);
  const auto RawWidth = typeWidthInBytes(Ty);
  return Utils::applyAlignment(RawWidth, SlotGranule);
}
/// Returns the ABI stack alignment, X86_STACK_ALIGNMENT_BYTES.
uint32_t getStackAlignment() const override {
return X86_STACK_ALIGNMENT_BYTES;
}
/// Returns true when RequiredStackAlignment exceeds the ABI stack alignment.
bool needsStackPointerAlignment() const override {
// If the ABI's stack alignment is smaller than the vector size,
// use the (realigned) stack pointer for addressing any stack variables.
return X86_STACK_ALIGNMENT_BYTES < RequiredStackAlignment;
}
/// Records the size and alignment of the fixed alloca area and flags that the
/// prolog emits the fixed allocas. Align must be a power of two.
void reserveFixedAllocaArea(size_t Size, size_t Align) override {
  assert(llvm::isPowerOf2_32(Align));
  FixedAllocaSizeBytes = Size;
  FixedAllocaAlignBytes = Align;
  PrologEmitsFixedAllocas = true;
}
/// Returns the (negative) offset from ebp/rbp where the fixed Allocas start,
/// computed as FixedAllocaSizeBytes minus the portion of the spill area that
/// is not reserved for outgoing arguments.
int32_t getFrameFixedAllocaOffset() const override {
  const size_t SpillMinusOutArgs = SpillAreaSizeBytes - maxOutArgsSizeBytes();
  return FixedAllocaSizeBytes - SpillMinusOutArgs;
}
/// Returns MaxOutArgsSizeBytes, the running maximum recorded through
/// updateMaxOutArgsSizeBytes().
uint32_t maxOutArgsSizeBytes() const override { return MaxOutArgsSizeBytes; }
/// Raises MaxOutArgsSizeBytes to Size when Size is larger; otherwise leaves
/// it unchanged.
virtual void updateMaxOutArgsSizeBytes(uint32_t Size) {
  const uint32_t Widest = std::max(MaxOutArgsSizeBytes, Size);
  MaxOutArgsSizeBytes = Widest;
}
/// Always false for this target; the Ty argument is not consulted.
bool shouldSplitToVariable64On32(Type Ty) const override { return false; }
/// Minimum jump table size for this target is 4.
SizeT getMinJumpTableSize() const override { return 4; }
// Emission routines for variables and for each constant kind (defined out of
// line).
void emitVariable(const Variable *Var) const override;
void emit(const ConstantInteger32 *C) const final;
void emit(const ConstantInteger64 *C) const final;
void emit(const ConstantFloat *C) const final;
void emit(const ConstantDouble *C) const final;
void emit(const ConstantUndef *C) const final;
void emit(const ConstantRelocatable *C) const final;
// Per-node setup and prolog/epilog generation (defined out of line).
void initNodeForLowering(CfgNode *Node) override;
void addProlog(CfgNode *Node) override;
// InArgsSizeBytes is passed by non-const reference, so it may be updated by
// the callee.
void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
size_t BasicFrameOffset, size_t StackAdjBytes,
size_t &InArgsSizeBytes);
void addEpilog(CfgNode *Node) override;
// RegNum defaults to a default-constructed RegNumT.
Operand *legalizeUndef(Operand *From, RegNumT RegNum = RegNumT());
protected:
// Overrides of the lowering hooks, one per high-level instruction kind; all
// are defined out of line.
void postLower() override;
void lowerAlloca(const InstAlloca *Instr) override;
void lowerArguments() override;
void lowerArithmetic(const InstArithmetic *Instr) override;
void lowerAssign(const InstAssign *Instr) override;
void lowerBr(const InstBr *Instr) override;
void lowerBreakpoint(const InstBreakpoint *Instr) override;
void lowerCall(const InstCall *Instr) override;
void lowerCast(const InstCast *Instr) override;
void lowerExtractElement(const InstExtractElement *Instr) override;
void lowerFcmp(const InstFcmp *Instr) override;
void lowerIcmp(const InstIcmp *Instr) override;
void lowerIntrinsic(const InstIntrinsic *Instr) override;
void lowerInsertElement(const InstInsertElement *Instr) override;
void lowerLoad(const InstLoad *Instr) override;
void lowerPhi(const InstPhi *Instr) override;
void lowerRet(const InstRet *Instr) override;
void lowerSelect(const InstSelect *Instr) override;
void lowerShuffleVector(const InstShuffleVector *Instr) override;
void lowerStore(const InstStore *Instr) override;
void lowerSwitch(const InstSwitch *Instr) override;
void lowerUnreachable(const InstUnreachable *Instr) override;
void lowerOther(const Inst *Instr) override;
// Target-specific (non-override) lowering of the x86 FakeRMW instruction.
void lowerRMW(const InstX86FakeRMW *RMW);
void prelowerPhis() override;
// Two overloads: one taking explicit argument/return types, and the override
// taking the call instruction itself.
uint32_t getCallStackArgumentsSizeBytes(const CfgVector<Type> &ArgTypes,
Type ReturnType);
uint32_t getCallStackArgumentsSizeBytes(const InstCall *Instr) override;
void genTargetHelperCallFor(Inst *Instr) override;
/// OptAddr wraps all the possible operands that an x86 address might have.
/// Every field has a default: null pointers for Base, Index and Relocatable,
/// and zero for Shift and Offset.
struct OptAddr {
Variable *Base = nullptr;
Variable *Index = nullptr;
uint16_t Shift = 0;
int32_t Offset = 0;
ConstantRelocatable *Relocatable = nullptr;
};
// Builds information for a canonical address expression:
// <Relocatable + Offset>(Base, Index, Shift)
X86OperandMem *computeAddressOpt(const Inst *Instr, Type MemType,
Operand *Addr);
// Overrides of the address-optimization hooks (defined out of line).
void doAddressOptOther() override;
void doAddressOptLoad() override;
void doAddressOptStore() override;
void doAddressOptLoadSubVector() override;
void doAddressOptStoreSubVector() override;
void doMockBoundsCheck(Operand *Opnd) override;
/// Naive lowering of cmpxchg.
void lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr, Operand *Expected,
Operand *Desired);
/// Attempt a more optimized lowering of cmpxchg. Returns true if optimized.
bool tryOptimizedCmpxchgCmpBr(Variable *DestPrev, Operand *Ptr,
Operand *Expected, Operand *Desired);
/// Lowering of an atomic read-modify-write; Operation selects which one.
void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
Operand *Val);
/// Lowering of a count-zeros operation; Cttz presumably selects trailing
/// rather than leading zeros (confirm against the definition).
void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal,
Operand *SecondVal);
/// Load from memory for a given type.
void typedLoad(Type Ty, Variable *Dest, Variable *Base, Constant *Offset);
/// Store to memory for a given type.
void typedStore(Type Ty, Variable *Value, Variable *Base, Constant *Offset);
/// Copy memory of given type from Src to Dest using OffsetAmt on both.
void copyMemory(Type Ty, Variable *Dest, Variable *Src, int32_t OffsetAmt);
/// Replace some calls to memcpy with inline instructions.
void lowerMemcpy(Operand *Dest, Operand *Src, Operand *Count);
/// Replace some calls to memmove with inline instructions.
void lowerMemmove(Operand *Dest, Operand *Src, Operand *Count);
/// Replace some calls to memset with inline instructions.
void lowerMemset(Operand *Dest, Operand *Val, Operand *Count);
/// Lower an indirect jump.
void lowerIndirectJump(Variable *JumpTarget);
/// Check the comparison is in [Min,Max]. The flags register will be modified
/// with:
/// - below equal, if in range
/// - above, set if not in range
/// The index into the range is returned.
Operand *lowerCmpRange(Operand *Comparison, uint64_t Min, uint64_t Max);
/// Lowering of a cluster of switch cases. If the case is not matched control
/// will pass to the default label provided. If the default label is nullptr
/// then control will fall through to the next instruction. DoneCmp should be
/// true if the flags contain the result of a comparison with the Comparison.
void lowerCaseCluster(const CaseCluster &Case, Operand *Src0, bool DoneCmp,
CfgNode *DefaultLabel = nullptr);
/// Pointer to a member function of this class taking (Variable *, Operand *);
/// the inline two-operand helpers such as _add and _and have this shape.
using LowerBinOp = void (TargetX8664::*)(Variable *, Operand *);
void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi,
Variable *Dest, Operand *Ptr, Operand *Val);
void eliminateNextVectorSextInstruction(Variable *SignExtendedResult);
void emitStackProbe(size_t StackSizeBytes);
/// Emit just the call instruction (without argument or return variable
/// processing).
Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg,
size_t NumVariadicFpArgs = 0);
/// Materialize the moves needed to return a value of the specified type.
Variable *moveReturnValueToRegister(Operand *Value, Type ReturnType);
/// Emit a jump table to the constant pool.
void emitJumpTable(const Cfg *Func,
const InstJumpTable *JumpTable) const override;
/// Emit a fake use of esp to make sure esp stays alive for the entire
/// function. Otherwise some esp adjustments get dead-code eliminated.
void keepEspLiveAtExit() {
Variable *esp =
Func->getTarget()->getPhysicalRegister(getStackReg(), WordType);
Context.insert<InstFakeUse>(esp);
}
/// Operand legalization helpers. To deal with address mode constraints, the
/// helpers will create a new Operand and emit instructions that guarantee
/// that the Operand kind is one of those indicated by the LegalMask (a
/// bitmask of allowed kinds). If the input Operand is known to already meet
/// the constraints, it may be simply returned as the result, without creating
/// any new instructions or operands.
///
/// Each named kind below occupies its own bit so that kinds can be OR-ed
/// together. Legal_Default sets every bit except Legal_Rematerializable and
/// Legal_AddrAbs.
enum OperandLegalization {
Legal_None = 0,
Legal_Reg = 1 << 0, // physical register, not stack location
Legal_Imm = 1 << 1,
Legal_Mem = 1 << 2, // includes [eax+4*ecx] as well as [esp+12]
Legal_Rematerializable = 1 << 3,
Legal_AddrAbs = 1 << 4, // ConstantRelocatable doesn't have to add RebasePtr
Legal_Default = ~(Legal_Rematerializable | Legal_AddrAbs)
// TODO(stichnot): Figure out whether this default works for x86-64.
};
/// Integer type holding a combination of OperandLegalization bits.
using LegalMask = uint32_t;
/// Legalizes From against the Allowed mask (Legal_Default unless specified).
/// RegNum defaults to a default-constructed RegNumT.
Operand *legalize(Operand *From, LegalMask Allowed = Legal_Default,
RegNumT RegNum = RegNumT());
Variable *legalizeToReg(Operand *From, RegNumT RegNum = RegNumT());
/// Legalize the first source operand for use in the cmp instruction.
Operand *legalizeSrc0ForCmp(Operand *Src0, Operand *Src1);
/// Turn a pointer operand into a memory operand that can be used by a real
/// load/store operation. Legalizes the operand as well. This is a nop if the
/// operand is already a legal memory operand.
X86OperandMem *formMemoryOperand(Operand *Ptr, Type Ty,
bool DoLegalize = true);
Variable *makeReg(Type Ty, RegNumT RegNum = RegNumT());
static Type stackSlotType();
/// Default MaxSize value for the two type-selection helpers below.
static constexpr uint32_t NoSizeLimit = 0;
/// Returns the largest type which is equal to or larger than Size bytes. The
/// type is suitable for copying memory i.e. a load and store will be a single
/// instruction (for example x86 will get f64 not i64).
static Type largestTypeInSize(uint32_t Size, uint32_t MaxSize = NoSizeLimit);
/// Returns the smallest type which is equal to or larger than Size bytes. If
/// one doesn't exist then the largest type smaller than Size bytes is
/// returned. The type is suitable for memory copies as described at
/// largestTypeInSize.
static Type firstTypeThatFitsSize(uint32_t Size,
uint32_t MaxSize = NoSizeLimit);
Variable *copyToReg8(Operand *Src, RegNumT RegNum = RegNumT());
Variable *copyToReg(Operand *Src, RegNumT RegNum = RegNumT());
/// Returns a register containing all zeros, without affecting the FLAGS
/// register, using the best instruction for the type.
Variable *makeZeroedRegister(Type Ty, RegNumT RegNum = RegNumT());
/// \name Returns a vector in a register with the given constant entries.
/// @{
Variable *makeVectorOfZeros(Type Ty, RegNumT RegNum = RegNumT());
Variable *makeVectorOfOnes(Type Ty, RegNumT RegNum = RegNumT());
Variable *makeVectorOfMinusOnes(Type Ty, RegNumT RegNum = RegNumT());
Variable *makeVectorOfHighOrderBits(Type Ty, RegNumT RegNum = RegNumT());
Variable *makeVectorOfFabsMask(Type Ty, RegNumT RegNum = RegNumT());
/// @}
/// Return a memory operand corresponding to a stack allocated Variable.
X86OperandMem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
uint32_t Offset = 0);
/// The following are helpers that insert lowered x86 instructions with
/// minimal syntactic overhead, so that the lowering code can look as close to
/// assembly as practical.
///
/// Unless noted otherwise, each helper simply forwards its arguments to
/// Context.insert<> for the correspondingly named Insts:: class. The *_rmw
/// variants take an X86OperandMem named DestSrc0 in place of a Variable
/// destination.
void _adc(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Adc>(Dest, Src0);
}
void _adc_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
Context.insert<Insts::AdcRMW>(DestSrc0, Src1);
}
void _add(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Add>(Dest, Src0);
}
void _add_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
Context.insert<Insts::AddRMW>(DestSrc0, Src1);
}
void _addps(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Addps>(Dest, Src0);
}
void _addss(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Addss>(Dest, Src0);
}
// Declared only; defined out of line.
void _add_sp(Operand *Adjustment);
void _and(Variable *Dest, Operand *Src0) {
Context.insert<Insts::And>(Dest, Src0);
}
void _andnps(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Andnps>(Dest, Src0);
}
void _andps(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Andps>(Dest, Src0);
}
void _and_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
Context.insert<Insts::AndRMW>(DestSrc0, Src1);
}
void _blendvps(Variable *Dest, Operand *Src0, Operand *Src1) {
Context.insert<Insts::Blendvps>(Dest, Src0, Src1);
}
// Branch helpers. The overloads that target a CfgNode always request
// InstX86Br::Far; the overload that targets an InstX86Label defaults to
// InstX86Br::Near and lets the caller override the mode.
void _br(BrCond Condition, CfgNode *TargetTrue, CfgNode *TargetFalse) {
Context.insert<InstX86Br>(TargetTrue, TargetFalse, Condition,
InstX86Br::Far);
}
void _br(CfgNode *Target) {
Context.insert<InstX86Br>(Target, InstX86Br::Far);
}
void _br(BrCond Condition, CfgNode *Target) {
Context.insert<InstX86Br>(Target, Condition, InstX86Br::Far);
}
void _br(BrCond Condition, InstX86Label *Label,
InstX86Br::Mode Kind = InstX86Br::Near) {
Context.insert<InstX86Br>(Label, Condition, Kind);
}
void _bsf(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Bsf>(Dest, Src0);
}
void _bsr(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Bsr>(Dest, Src0);
}
void _bswap(Variable *SrcDest) { Context.insert<Insts::Bswap>(SrcDest); }
void _cbwdq(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Cbwdq>(Dest, Src0);
}
void _cmov(Variable *Dest, Operand *Src0, BrCond Condition) {
Context.insert<Insts::Cmov>(Dest, Src0, Condition);
}
// Note: _cmp inserts Insts::Icmp (there is no Insts::Cmp used here).
void _cmp(Operand *Src0, Operand *Src1) {
Context.insert<Insts::Icmp>(Src0, Src1);
}
void _cmpps(Variable *Dest, Operand *Src0, CmppsCond Condition) {
Context.insert<Insts::Cmpps>(Dest, Src0, Condition);
}
/// Inserts an Insts::Cmpxchg and then models the possible write to Eax with
/// an InstFakeDef followed by an InstFakeUse. The FakeDef's source is
/// DestOrAddr when it is a Variable; llvm::dyn_cast yields nullptr otherwise.
/// The statement order matters: _set_dest_redefined() is called immediately
/// after the FakeDef is inserted (presumably it applies to the most recently
/// inserted instruction -- confirm against its definition).
void _cmpxchg(Operand *DestOrAddr, Variable *Eax, Variable *Desired,
bool Locked) {
Context.insert<Insts::Cmpxchg>(DestOrAddr, Eax, Desired, Locked);
// Mark eax as possibly modified by cmpxchg.
Context.insert<InstFakeDef>(Eax, llvm::dyn_cast<Variable>(DestOrAddr));
_set_dest_redefined();
Context.insert<InstFakeUse>(Eax);
}
/// Inserts an Insts::Cmpxchg8b and then, for Edx and for Eax in turn, an
/// InstFakeDef / _set_dest_redefined() / InstFakeUse sequence that models the
/// possible write to that register. Ecx and Ebx are only passed to the
/// instruction itself.
void _cmpxchg8b(X86OperandMem *Addr, Variable *Edx, Variable *Eax,
Variable *Ecx, Variable *Ebx, bool Locked) {
Context.insert<Insts::Cmpxchg8b>(Addr, Edx, Eax, Ecx, Ebx, Locked);
// Mark edx, and eax as possibly modified by cmpxchg8b.
Context.insert<InstFakeDef>(Edx);
_set_dest_redefined();
Context.insert<InstFakeUse>(Edx);
Context.insert<InstFakeDef>(Eax);
_set_dest_redefined();
Context.insert<InstFakeUse>(Eax);
}
// Each helper below forwards its arguments to Context.insert<> for the
// correspondingly named Insts:: class.
void _cvt(Variable *Dest, Operand *Src0, Insts::Cvt::CvtVariant Variant) {
Context.insert<Insts::Cvt>(Dest, Src0, Variant);
}
// NOTE(review): a second _round overload taking `Constant *Imm` is defined
// further down in this class; both insert Insts::Round.
void _round(Variable *Dest, Operand *Src0, Operand *Imm) {
Context.insert<Insts::Round>(Dest, Src0, Imm);
}
void _div(Variable *Dest, Operand *Src0, Operand *Src1) {
Context.insert<Insts::Div>(Dest, Src0, Src1);
}
void _divps(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Divps>(Dest, Src0);
}
void _divss(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Divss>(Dest, Src0);
}
void _idiv(Variable *Dest, Operand *Src0, Operand *Src1) {
Context.insert<Insts::Idiv>(Dest, Src0, Src1);
}
void _imul(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Imul>(Dest, Src0);
}
void _imul_imm(Variable *Dest, Operand *Src0, Constant *Imm) {
Context.insert<Insts::ImulImm>(Dest, Src0, Imm);
}
void _insertps(Variable *Dest, Operand *Src0, Operand *Src1) {
Context.insert<Insts::Insertps>(Dest, Src0, Src1);
}
void _int3() { Context.insert<Insts::Int3>(); }
void _jmp(Operand *Target) { Context.insert<Insts::Jmp>(Target); }
void _lea(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Lea>(Dest, Src0);
}
// Declared only; defined out of line.
void _link_bp();
void _push_reg(RegNumT RegNum);
void _pop_reg(RegNumT RegNum);
void _mfence() { Context.insert<Insts::Mfence>(); }
/// Moves can be used to redefine registers, creating "partial kills" for
/// liveness. This marks MovInst as such a redefinition by calling
/// setDestRedefined() on it; when IsRedefinition is false nothing happens.
void _redefined(Inst *MovInst, bool IsRedefinition = true) {
  if (!IsRedefinition)
    return;
  MovInst->setDestRedefined();
}
/// Inserts an Insts::Mov from Src0 into Dest and returns the new instruction.
/// If Dest=nullptr is passed in, a new variable of Src0's type is first
/// obtained from makeReg() (honoring RegNum) and handed back to the caller
/// through the in/out Dest argument.
Insts::Mov *_mov(Variable *&Dest, Operand *Src0, RegNumT RegNum = RegNumT()) {
  if (!Dest) {
    Dest = makeReg(Src0->getType(), RegNum);
  }
  Insts::Mov *NewMov = Context.insert<Insts::Mov>(Dest, Src0);
  return NewMov;
}
// Declared only; defined out of line.
void _mov_sp(Operand *NewValue);
// Move-family helpers. _movp and _movzx return the inserted instruction; the
// others return nothing.
Insts::Movp *_movp(Variable *Dest, Operand *Src0) {
return Context.insert<Insts::Movp>(Dest, Src0);
}
void _movd(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Movd>(Dest, Src0);
}
void _movq(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Movq>(Dest, Src0);
}
// Note: _movss inserts Insts::MovssRegs and requires a Variable source.
void _movss(Variable *Dest, Variable *Src0) {
Context.insert<Insts::MovssRegs>(Dest, Src0);
}
void _movsx(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Movsx>(Dest, Src0);
}
Insts::Movzx *_movzx(Variable *Dest, Operand *Src0) {
return Context.insert<Insts::Movzx>(Dest, Src0);
}
// Min/max helpers (scalar "ss" and packed "ps" forms).
void _maxss(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Maxss>(Dest, Src0);
}
void _minss(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Minss>(Dest, Src0);
}
void _maxps(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Maxps>(Dest, Src0);
}
void _minps(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Minps>(Dest, Src0);
}
// Multiply helpers; _mul takes a Variable as its first source.
void _mul(Variable *Dest, Variable *Src0, Operand *Src1) {
Context.insert<Insts::Mul>(Dest, Src0, Src1);
}
void _mulps(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Mulps>(Dest, Src0);
}
void _mulss(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Mulss>(Dest, Src0);
}
void _neg(Variable *SrcDest) { Context.insert<Insts::Neg>(SrcDest); }
void _nop(SizeT Variant) { Context.insert<Insts::Nop>(Variant); }
// Bitwise-or helpers followed by the "p"-prefixed helper family. Each one
// forwards its arguments to Context.insert<> for the correspondingly named
// Insts:: class.
void _or(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Or>(Dest, Src0);
}
void _orps(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Orps>(Dest, Src0);
}
void _or_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
Context.insert<Insts::OrRMW>(DestSrc0, Src1);
}
void _padd(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Padd>(Dest, Src0);
}
void _padds(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Padds>(Dest, Src0);
}
void _paddus(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Paddus>(Dest, Src0);
}
void _pand(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Pand>(Dest, Src0);
}
void _pandn(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Pandn>(Dest, Src0);
}
void _pblendvb(Variable *Dest, Operand *Src0, Operand *Src1) {
Context.insert<Insts::Pblendvb>(Dest, Src0, Src1);
}
// ArithmeticTypeOverride defaults to IceType_void and is passed straight
// through to Insts::Pcmpeq.
void _pcmpeq(Variable *Dest, Operand *Src0,
Type ArithmeticTypeOverride = IceType_void) {
Context.insert<Insts::Pcmpeq>(Dest, Src0, ArithmeticTypeOverride);
}
void _pcmpgt(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Pcmpgt>(Dest, Src0);
}
void _pextr(Variable *Dest, Operand *Src0, Operand *Src1) {
Context.insert<Insts::Pextr>(Dest, Src0, Src1);
}
void _pinsr(Variable *Dest, Operand *Src0, Operand *Src1) {
Context.insert<Insts::Pinsr>(Dest, Src0, Src1);
}
void _pmull(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Pmull>(Dest, Src0);
}
void _pmulhw(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Pmulhw>(Dest, Src0);
}
void _pmulhuw(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Pmulhuw>(Dest, Src0);
}
void _pmaddwd(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Pmaddwd>(Dest, Src0);
}
void _pmuludq(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Pmuludq>(Dest, Src0);
}
void _pop(Variable *Dest) { Context.insert<Insts::Pop>(Dest); }
void _por(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Por>(Dest, Src0);
}
void _punpckl(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Punpckl>(Dest, Src0);
}
void _punpckh(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Punpckh>(Dest, Src0);
}
void _packss(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Packss>(Dest, Src0);
}
void _packus(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Packus>(Dest, Src0);
}
void _pshufb(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Pshufb>(Dest, Src0);
}
void _pshufd(Variable *Dest, Operand *Src0, Operand *Src1) {
Context.insert<Insts::Pshufd>(Dest, Src0, Src1);
}
void _psll(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Psll>(Dest, Src0);
}
void _psra(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Psra>(Dest, Src0);
}
void _psrl(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Psrl>(Dest, Src0);
}
void _psub(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Psub>(Dest, Src0);
}
void _psubs(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Psubs>(Dest, Src0);
}
void _psubus(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Psubus>(Dest, Src0);
}
void _push(Operand *Src0) { Context.insert<Insts::Push>(Src0); }
void _pxor(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Pxor>(Dest, Src0);
}
// Each helper below forwards its arguments to Context.insert<> for the
// correspondingly named Insts:: class.
// _ret may be called with no argument, in which case nullptr is forwarded.
void _ret(Variable *Src0 = nullptr) { Context.insert<Insts::Ret>(Src0); }
void _rol(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Rol>(Dest, Src0);
}
// NOTE(review): an earlier _round overload in this class takes `Operand *Imm`
// and also inserts Insts::Round; this overload takes `Constant *Imm`.
void _round(Variable *Dest, Operand *Src, Constant *Imm) {
Context.insert<Insts::Round>(Dest, Src, Imm);
}
void _sar(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Sar>(Dest, Src0);
}
void _sbb(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Sbb>(Dest, Src0);
}
void _sbb_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
Context.insert<Insts::SbbRMW>(DestSrc0, Src1);
}
void _setcc(Variable *Dest, BrCond Condition) {
Context.insert<Insts::Setcc>(Dest, Condition);
}
void _shl(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Shl>(Dest, Src0);
}
void _shld(Variable *Dest, Variable *Src0, Operand *Src1) {
Context.insert<Insts::Shld>(Dest, Src0, Src1);
}
void _shr(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Shr>(Dest, Src0);
}
void _shrd(Variable *Dest, Variable *Src0, Operand *Src1) {
Context.insert<Insts::Shrd>(Dest, Src0, Src1);
}
void _shufps(Variable *Dest, Operand *Src0, Operand *Src1) {
Context.insert<Insts::Shufps>(Dest, Src0, Src1);
}
void _movmsk(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Movmsk>(Dest, Src0);
}
void _sqrt(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Sqrt>(Dest, Src0);
}
// Store helpers: the value comes first, the memory operand second. _store
// accepts any X86Operand as Mem, while _storep/_storeq/_stored require an
// X86OperandMem; _storep additionally requires a Variable value.
void _store(Operand *Value, X86Operand *Mem) {
Context.insert<Insts::Store>(Value, Mem);
}
void _storep(Variable *Value, X86OperandMem *Mem) {
Context.insert<Insts::StoreP>(Value, Mem);
}
void _storeq(Operand *Value, X86OperandMem *Mem) {
Context.insert<Insts::StoreQ>(Value, Mem);
}
void _stored(Operand *Value, X86OperandMem *Mem) {
Context.insert<Insts::StoreD>(Value, Mem);
}
void _sub(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Sub>(Dest, Src0);
}
void _sub_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
Context.insert<Insts::SubRMW>(DestSrc0, Src1);
}
// Declared only; defined out of line.
void _sub_sp(Operand *Adjustment);
void _subps(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Subps>(Dest, Src0);
}
void _subss(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Subss>(Dest, Src0);
}
void _test(Operand *Src0, Operand *Src1) {
Context.insert<Insts::Test>(Src0, Src1);
}
void _ucomiss(Operand *Src0, Operand *Src1) {
Context.insert<Insts::Ucomiss>(Src0, Src1);
}
void _ud2() { Context.insert<Insts::UD2>(); }
// Declared only; defined out of line.
void _unlink_bp();
/// Inserts an Insts::Xadd and then models the modification of Src with an
/// InstFakeDef / _set_dest_redefined() / InstFakeUse sequence. The FakeDef's
/// source is Dest when it is a Variable; llvm::dyn_cast yields nullptr
/// otherwise. The statement order is significant.
void _xadd(Operand *Dest, Variable *Src, bool Locked) {
Context.insert<Insts::Xadd>(Dest, Src, Locked);
// The xadd exchanges Dest and Src (modifying Src). Model that update with
// a FakeDef followed by a FakeUse.
Context.insert<InstFakeDef>(Src, llvm::dyn_cast<Variable>(Dest));
_set_dest_redefined();
Context.insert<InstFakeUse>(Src);
}
/// Inserts an Insts::Xchg and then an InstFakeDef / _set_dest_redefined() /
/// InstFakeUse sequence on Src. The FakeDef's source is Dest when it is a
/// Variable; llvm::dyn_cast yields nullptr otherwise. Only Src receives the
/// FakeDef/FakeUse pair here. The statement order is significant.
void _xchg(Operand *Dest, Variable *Src) {
Context.insert<Insts::Xchg>(Dest, Src);
// The xchg modifies Dest and Src -- model that update with a
// FakeDef/FakeUse.
Context.insert<InstFakeDef>(Src, llvm::dyn_cast<Variable>(Dest));
_set_dest_redefined();
Context.insert<InstFakeUse>(Src);
}
// Exclusive-or helpers: register/operand form, the "ps" form, and the form
// that takes an X86OperandMem as both destination and first source.
void _xor(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Xor>(Dest, Src0);
}
void _xorps(Variable *Dest, Operand *Src0) {
Context.insert<Insts::Xorps>(Dest, Src0);
}
void _xor_rmw(X86OperandMem *DestSrc0, Operand *Src1) {
Context.insert<Insts::XorRMW>(DestSrc0, Src1);
}
/// Inserts an Insts::IacaStart marker, except when BuildDefs::minimal() is
/// true, in which case nothing is inserted.
void _iaca_start() {
  if (BuildDefs::minimal())
    return;
  Context.insert<Insts::IacaStart>();
}
/// Inserts an Insts::IacaEnd marker, except when BuildDefs::minimal() is
/// true, in which case nothing is inserted.
void _iaca_end() {
  if (BuildDefs::minimal())
    return;
  Context.insert<Insts::IacaEnd>();
}
/// This class helps wrap IACA markers around the code generated by the
/// current scope. It means you don't need to put an end before each return.
///
/// The constructor calls _iaca_start() on the given lowering object and the
/// destructor calls end(). end() may also be called manually; it emits the
/// end marker at most once because it clears the stored pointer afterwards.
/// The type is non-copyable.
class ScopedIacaMark {
  ScopedIacaMark(const ScopedIacaMark &) = delete;
  ScopedIacaMark &operator=(const ScopedIacaMark &) = delete;

public:
  /// \param Lowering the target lowering to emit markers through; it is
  /// dereferenced immediately, so it must not be null. The constructor is
  /// explicit so that a TargetX8664* never converts to a mark implicitly.
  explicit ScopedIacaMark(TargetX8664 *Lowering) : Lowering(Lowering) {
    Lowering->_iaca_start();
  }
  ~ScopedIacaMark() { end(); }
  /// Emits the end marker unless it has already been emitted.
  void end() {
    if (!Lowering)
      return;
    Lowering->_iaca_end();
    // Null the pointer so a later end() (e.g. from the destructor) is a no-op.
    Lowering = nullptr;
  }

private:
  TargetX8664 *Lowering;
};
// Declared only; defined out of line.
bool optimizeScalarMul(Variable *Dest, Operand *Src0, int32_t Src1);
void findRMW();
static uint32_t applyStackAlignment(uint32_t Value);
/// Backing flag for hasFramePointer() / setHasFramePointer().
bool IsEbpBasedFrame = false;
/// Stack alignment guaranteed by the ABI.
static constexpr uint32_t X86_STACK_ALIGNMENT_BYTES = 16;
/// Stack alignment required by the currently lowered function.
const uint32_t RequiredStackAlignment = X86_STACK_ALIGNMENT_BYTES;
// Frame-layout bookkeeping; all start at zero / false.
size_t SpillAreaSizeBytes = 0;
size_t FixedAllocaSizeBytes = 0;
size_t FixedAllocaAlignBytes = 0;
bool PrologEmitsFixedAllocas = false;
uint32_t MaxOutArgsSizeBytes = 0;
// Static register tables. The first two are indexed by register class in
// getRegistersForVariable() / getAllRegistersForVariable(); the third is
// indexed by register number in getAliasesForRegister().
static std::array<SmallBitVector, RCX86_NUM> TypeToRegisterSet;
static std::array<SmallBitVector, RCX86_NUM> TypeToRegisterSetUnfiltered;
static std::array<SmallBitVector, RegisterSet::Reg_NUM> RegisterAliases;
SmallBitVector RegsUsed;
/// One VarList per Ice type.
std::array<VarList, IceType_NUM> PhysicalRegisters;
private:
/// Lowers a shift whose source and destination are passed as separate low and
/// high halves; only the low half of the second operand (Src1Lo) is taken.
/// NOTE(review): Op presumably selects the kind of shift; confirm against the
/// definition.
void lowerShift64(InstArithmetic::OpKind Op, Operand *Src0Lo, Operand *Src0Hi,
Operand *Src1Lo, Variable *DestLo, Variable *DestHi);
/// Emit the code for a combined operation and consumer instruction, or set
/// the destination variable of the operation if Consumer == nullptr.
void lowerIcmpAndConsumer(const InstIcmp *Icmp, const Inst *Consumer);
void lowerFcmpAndConsumer(const InstFcmp *Fcmp, const Inst *Consumer);
void lowerArithAndConsumer(const InstArithmetic *Arith, const Inst *Consumer);
/// Emit a setcc instruction if Consumer == nullptr; otherwise emit a
/// specialized version of Consumer.
void setccOrConsumer(BrCond Condition, Variable *Dest, const Inst *Consumer);
/// Emit a mov [1|0] instruction if Consumer == nullptr; otherwise emit a
/// specialized version of Consumer.
void movOrConsumer(bool IcmpResult, Variable *Dest, const Inst *Consumer);
/// Emit the code for instructions with a vector type. There is one entry
/// point per instruction kind (icmp, fcmp, select); each takes the
/// instruction being lowered.
void lowerIcmpVector(const InstIcmp *Icmp);
void lowerFcmpVector(const InstFcmp *Fcmp);
void lowerSelectVector(const InstSelect *Instr);
/// Helpers for select lowering. Both take the destination, a branch condition,
/// and the two candidate source operands (SrcT / SrcF). NOTE(review):
/// presumably SrcT is chosen when Cond holds and SrcF otherwise; confirm
/// against the definitions.
void lowerSelectMove(Variable *Dest, BrCond Cond, Operand *SrcT,
Operand *SrcF);
void lowerSelectIntMove(Variable *Dest, BrCond Cond, Operand *SrcT,
Operand *SrcF);
/// Generic helper to move an arbitrary type from Src to Dest.
/// NOTE(review): IsRedefinition presumably marks the move as a redefinition of
/// Dest rather than its first definition; confirm against the definition.
void lowerMove(Variable *Dest, Operand *Src, bool IsRedefinition);
/// Optimizations for idiom recognition.
/// NOTE(review): the bool result presumably reports whether the fcmp/select
/// pair was recognized and lowered; confirm against the definition.
bool lowerOptimizeFcmpSelect(const InstFcmp *Fcmp, const InstSelect *Select);
/// Per-target BoolFolding state. NOTE(review): presumably the bookkeeping for
/// the bool-folding optimization described on BoolFoldingEntry; confirm.
BoolFolding FoldingInfo;
/// Helpers for lowering ShuffleVector
/// @{
/// Takes a single source operand and four element indexes.
Variable *lowerShuffleVector_AllFromSameSrc(Operand *Src, SizeT Index0,
SizeT Index1, SizeT Index2,
SizeT Index3);
/// NOTE(review): presumably a sentinel index meaning that the corresponding
/// element's value does not matter; confirm at the use sites.
static constexpr SizeT IGNORE_INDEX = 0x80000000u;
/// Takes two indexes paired with Src0 and two indexes paired with Src1.
Variable *lowerShuffleVector_TwoFromSameSrc(Operand *Src0, SizeT Index0,
SizeT Index1, Operand *Src1,
SizeT Index2, SizeT Index3);
/// NOTE(review): presumably the positions at which the two unified elements
/// end up in the result of lowerShuffleVector_UnifyFromDifferentSrcs();
/// confirm at the use sites.
static constexpr SizeT UNIFIED_INDEX_0 = 0;
static constexpr SizeT UNIFIED_INDEX_1 = 2;
/// Takes one index for each of two different source operands.
Variable *lowerShuffleVector_UnifyFromDifferentSrcs(Operand *Src0,
SizeT Index0,
Operand *Src1,
SizeT Index1);
/// NOTE(review): 0x80 has only the top bit of a byte set; presumably used as a
/// pshufb mask byte that clears the destination byte. Confirm at the use
/// sites.
static constexpr SizeT CLEAR_ALL_BITS = 0x80;
/// Starts at zero. NOTE(review): presumably counts the pshufb masks created so
/// far, e.g. so lowerShuffleVector_NewMaskName() can produce distinct names;
/// confirm against the definition.
SizeT PshufbMaskCount = 0;
GlobalString lowerShuffleVector_NewMaskName();
/// Takes the sixteen byte values Idx0..Idx15 of a pshufb mask and returns a
/// relocatable constant. NOTE(review): presumably the constant refers to the
/// emitted mask data; confirm against the definition.
ConstantRelocatable *lowerShuffleVector_CreatePshufbMask(
int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
int8_t Idx15);
/// Takes a destination, two source operands, and sixteen byte indexes
/// Idx0..Idx15.
void lowerShuffleVector_UsingPshufb(Variable *Dest, Operand *Src0,
Operand *Src1, int8_t Idx0, int8_t Idx1,
int8_t Idx2, int8_t Idx3, int8_t Idx4,
int8_t Idx5, int8_t Idx6, int8_t Idx7,
int8_t Idx8, int8_t Idx9, int8_t Idx10,
int8_t Idx11, int8_t Idx12, int8_t Idx13,
int8_t Idx14, int8_t Idx15);
/// @}
/// The following table summarizes the logic for lowering the fcmp
/// instruction. There is one table entry for each of the 16 conditions.
///
/// The first four columns describe the case when the operands are floating
/// point scalar values. A comment in lowerFcmp() describes the lowering
/// template. In the most general case, there is a compare followed by two
/// conditional branches, because some fcmp conditions don't map to a single
/// x86 conditional branch. However, in many cases it is possible to swap the
/// operands in the comparison and have a single conditional branch. Since
/// it's quite tedious to validate the table by hand, good execution tests are
/// helpful.
///
/// The last two columns describe the case when the operands are vectors of
/// floating point values. For most fcmp conditions, there is a clear mapping
/// to a single x86 cmpps instruction variant. Some fcmp conditions require
/// special code to handle and these are marked in the table with a
/// Cmpps_Invalid predicate.
/// @{
static const struct TableFcmpType {
// Scalar columns (the "first four columns" described above).
uint32_t Default;
bool SwapScalarOperands;
CondX86::BrCond C1, C2;
// Vector columns (the "last two columns" described above).
bool SwapVectorOperands;
CondX86::CmppsCond Predicate;
} TableFcmp[];
// Declared here without an initializer; the value is provided elsewhere.
// NOTE(review): presumably the number of entries in TableFcmp; confirm.
static const size_t TableFcmpSize;
/// @}
/// The following table summarizes the logic for lowering the icmp instruction
/// for i32 and narrower types. Each icmp condition has a clear mapping to an
/// x86 conditional branch instruction.
/// @{
static const struct TableIcmp32Type {
// Branch condition that the icmp condition maps to.
CondX86::BrCond Mapping;
} TableIcmp32[];
// Declared here without an initializer; the value is provided elsewhere.
// getIcmp32Mapping() asserts that a condition is below this value before it
// indexes TableIcmp32.
static const size_t TableIcmp32Size;
/// @}
/// The following table summarizes the logic for lowering the icmp instruction
/// for the i64 type. For Eq and Ne, two separate 32-bit comparisons and
/// conditional branches are needed. For the other conditions, three separate
/// conditional branches are needed.
/// @{
static const struct TableIcmp64Type {
// Up to three branch conditions per icmp condition, as described above.
CondX86::BrCond C1, C2, C3;
} TableIcmp64[];
// Declared here without an initializer; the value is provided elsewhere.
// NOTE(review): presumably the number of entries in TableIcmp64; confirm.
static const size_t TableIcmp64Size;
/// @}
/// Looks up the x86 branch condition recorded in TableIcmp32 for \p Cond.
/// \p Cond must be below TableIcmp32Size; this is checked with an assert.
static CondX86::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {
  assert(Cond < TableIcmp32Size);
  const TableIcmp32Type &Entry = TableIcmp32[Cond];
  return Entry.Mapping;
}
public:
/// Factory: builds a TargetX8664 for \p Func through makeUnique and hands it
/// back as a generic ::Ice::TargetLowering.
static std::unique_ptr<::Ice::TargetLowering> create(Cfg *Func) {
return makeUnique<TargetX8664>(Func);
}
/// Builds a new X8664::AssemblerX8664 through makeUnique and hands it back as
/// a generic ::Ice::Assembler.
std::unique_ptr<::Ice::Assembler> createAssembler() const override {
return makeUnique<X8664::AssemblerX8664>();
}
private:
// NOTE(review): ENABLE_MAKE_UNIQUE is defined outside this header; presumably
// it gives makeUnique access to the private constructor below. Confirm.
ENABLE_MAKE_UNIQUE;
/// Private constructor; create() is the public way to obtain an instance.
explicit TargetX8664(Cfg *Func);
};
/// x86-64 implementation of TargetDataLowering. It overrides the hooks for
/// globals, constants, and jump tables; their definitions live outside this
/// header. Instances are neither default-constructible nor copyable and are
/// obtained through create().
class TargetDataX8664 final : public TargetDataLowering {
  TargetDataX8664() = delete;
  TargetDataX8664(const TargetDataX8664 &) = delete;
  TargetDataX8664 &operator=(const TargetDataX8664 &) = delete;

public:
  ~TargetDataX8664() override = default;

  /// Factory: builds a TargetDataX8664 for \p Ctx through makeUnique and hands
  /// it back as a generic TargetDataLowering.
  static std::unique_ptr<TargetDataLowering> create(GlobalContext *Ctx) {
    return makeUnique<TargetDataX8664>(Ctx);
  }

  // TargetDataLowering overrides.
  void lowerGlobals(const VariableDeclarationList &Vars,
                    const std::string &SectionSuffix) override;
  void lowerConstants() override;
  void lowerJumpTables() override;

private:
  ENABLE_MAKE_UNIQUE;

  /// Forwards \p Ctx to the TargetDataLowering base; reachable only through
  /// create().
  explicit TargetDataX8664(GlobalContext *Ctx) : TargetDataLowering(Ctx) {}

  /// Static helper parameterized on a type \p T; defined outside this header.
  template <typename T> static void emitConstantPool(GlobalContext *Ctx);
};
class TargetHeaderX86 : public TargetHeaderLowering {
TargetHeaderX86() = delete;
TargetHeaderX86(const TargetHeaderX86 &) = delete;
TargetHeaderX86 &operator=(const TargetHeaderX86 &) = delete;
public:
~TargetHeaderX86() = default;
static std::unique_ptr<TargetHeaderLowering> create(GlobalContext *Ctx) {
return makeUnique<TargetHeaderX86>(Ctx);
}
private:
ENABLE_MAKE_UNIQUE;
explicit TargetHeaderX86(GlobalContext *Ctx) : TargetHeaderLowering(Ctx) {}
};
} // end of namespace X8664
} // end of namespace Ice
#endif // SUBZERO_SRC_ICETARGETLOWERINGX8664_H