|  | //===- subzero/src/IceTargetLoweringX8632.h - x86-32 lowering ---*- C++ -*-===// | 
|  | // | 
|  | //                        The Subzero Code Generator | 
|  | // | 
|  | // This file is distributed under the University of Illinois Open Source | 
|  | // License. See LICENSE.TXT for details. | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  | /// | 
|  | /// \file | 
|  | /// \brief Declares the TargetLoweringX8632 class, which implements the | 
|  | /// TargetLowering interface for the x86-32 architecture. | 
|  | /// | 
|  | //===----------------------------------------------------------------------===// | 
|  |  | 
|  | #ifndef SUBZERO_SRC_ICETARGETLOWERINGX8632_H | 
|  | #define SUBZERO_SRC_ICETARGETLOWERINGX8632_H | 
|  |  | 
|  | #include "IceAssemblerX8632.h" | 
|  | #include "IceDefs.h" | 
|  | #include "IceInst.h" | 
|  | #include "IceInstX8632.h" | 
|  | #include "IceRegistersX8632.h" | 
|  | #include "IceSwitchLowering.h" | 
|  | #include "IceTargetLoweringX86.h" | 
|  | #include "IceTargetLoweringX86RegClass.h" | 
|  | #include "IceUtils.h" | 
|  |  | 
|  | #include <array> | 
|  | #include <type_traits> | 
|  | #include <utility> | 
|  |  | 
|  | namespace Ice { | 
|  | namespace X8632 { | 
|  |  | 
|  | using namespace ::Ice::X86; | 
|  |  | 
|  | constexpr Type WordType = IceType_i32; | 
|  |  | 
|  | class BoolFoldingEntry { | 
|  | BoolFoldingEntry(const BoolFoldingEntry &) = delete; | 
|  |  | 
|  | public: | 
|  | BoolFoldingEntry() = default; | 
|  | explicit BoolFoldingEntry(Inst *I); | 
|  | BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default; | 
|  | /// Instr is the instruction producing the i1-type variable of interest. | 
|  | Inst *Instr = nullptr; | 
|  | /// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr). | 
|  | bool IsComplex = false; | 
|  | /// IsLiveOut is initialized conservatively to true, and is set to false when | 
|  | /// we encounter an instruction that ends Var's live range. We disable the | 
|  | /// folding optimization when Var is live beyond this basic block. Note that | 
|  | /// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will | 
|  | /// always be true and the folding optimization will never be performed. | 
|  | bool IsLiveOut = true; | 
|  | /// NumUses counts the number of times Var is used as a source operand in the | 
|  | /// basic block. If IsComplex is true and there is more than one use of Var, | 
|  | /// then the folding optimization is disabled for Var. | 
|  | uint32_t NumUses = 0; | 
|  | }; | 
|  |  | 
|  | class BoolFolding { | 
|  | public: | 
|  | enum BoolFoldingProducerKind { | 
|  | PK_None, | 
|  | // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative. | 
|  | PK_Icmp32, | 
|  | PK_Icmp64, | 
|  | PK_Fcmp, | 
|  | PK_Trunc, | 
|  | PK_Arith // A flag-setting arithmetic instruction. | 
|  | }; | 
|  |  | 
|  | /// Currently the actual enum values are not used (other than CK_None), but we | 
|  | /// define them anyway for symmetry with BoolFoldingProducerKind. | 
|  | enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext }; | 
|  |  | 
|  | private: | 
|  | BoolFolding(const BoolFolding &) = delete; | 
|  | BoolFolding &operator=(const BoolFolding &) = delete; | 
|  |  | 
|  | public: | 
|  | BoolFolding() = default; | 
|  | static BoolFoldingProducerKind getProducerKind(const Inst *Instr); | 
|  | static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr); | 
|  | static bool hasComplexLowering(const Inst *Instr); | 
|  | static bool isValidFolding(BoolFoldingProducerKind ProducerKind, | 
|  | BoolFoldingConsumerKind ConsumerKind); | 
|  | void init(CfgNode *Node); | 
|  | const Inst *getProducerFor(const Operand *Opnd) const; | 
|  | void dump(const Cfg *Func) const; | 
|  |  | 
|  | private: | 
|  | /// Returns true if Producers contains a valid entry for the given VarNum. | 
|  | bool containsValid(SizeT VarNum) const { | 
|  | auto Element = Producers.find(VarNum); | 
|  | return Element != Producers.end() && Element->second.Instr != nullptr; | 
|  | } | 
|  | void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; } | 
|  | void invalidateProducersOnStore(const Inst *Instr); | 
|  | /// Producers maps Variable::Number to a BoolFoldingEntry. | 
|  | CfgUnorderedMap<SizeT, BoolFoldingEntry> Producers; | 
|  | }; | 
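|  |  | 
|  | // Usage sketch (illustrative only; Node and CondOperand are hypothetical): a | 
|  | // target first scans the node, then asks whether an i1 operand has a foldable | 
|  | // producer before materializing it: | 
|  | // | 
|  | //   FoldingInfo.init(Node); | 
|  | //   ... | 
|  | //   if (const Inst *Producer = FoldingInfo.getProducerFor(CondOperand)) { | 
|  | //     // e.g. re-lower an icmp/fcmp producer directly into the conditional | 
|  | //     // branch, leaving its i1 result unmaterialized. | 
|  | //   } | 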
|  |  | 
|  | class TargetX8632 : public TargetX86 { | 
|  | TargetX8632() = delete; | 
|  | TargetX8632(const TargetX8632 &) = delete; | 
|  | TargetX8632 &operator=(const TargetX8632 &) = delete; | 
|  |  | 
|  | friend class BoolFolding; | 
|  |  | 
|  | public: | 
|  | using BrCond = CondX86::BrCond; | 
|  | using CmppsCond = CondX86::CmppsCond; | 
|  |  | 
|  | using SegmentRegisters = X86OperandMem::SegmentRegisters; | 
|  |  | 
|  | using InstX86Br = Insts::Br; | 
|  | using InstX86FakeRMW = Insts::FakeRMW; | 
|  | using InstX86Label = Insts::Label; | 
|  |  | 
|  | ~TargetX8632() override = default; | 
|  |  | 
|  | static void staticInit(GlobalContext *Ctx); | 
|  | static bool shouldBePooled(const Constant *C); | 
|  | static ::Ice::Type getPointerType(); | 
|  |  | 
|  | void translateOm1() override; | 
|  | void translateO2() override; | 
|  | void doLoadOpt(); | 
|  | bool doBranchOpt(Inst *I, const CfgNode *NextNode) override; | 
|  |  | 
|  | SizeT getNumRegisters() const override { return RegisterSet::Reg_NUM; } | 
|  |  | 
|  | Inst *createLoweredMove(Variable *Dest, Variable *SrcVar) override { | 
|  | if (isVectorType(Dest->getType())) { | 
|  | return Insts::Movp::create(Func, Dest, SrcVar); | 
|  | } | 
|  | return Insts::Mov::create(Func, Dest, SrcVar); | 
|  | } | 
|  |  | 
|  | Variable *getPhysicalRegister(RegNumT RegNum, | 
|  | Type Ty = IceType_void) override; | 
|  | const char *getRegName(RegNumT RegNum, Type Ty) const override; | 
|  | static const char *getRegClassName(RegClass C) { | 
|  | auto ClassNum = static_cast<RegClassX86>(C); | 
|  | assert(ClassNum < RCX86_NUM); | 
|  | switch (ClassNum) { | 
|  | default: | 
|  | assert(C < RC_Target); | 
|  | return regClassString(C); | 
|  | case RCX86_Is64To8: | 
|  | return "i64to8"; // 64-bit GPR truncable to i8 | 
|  | case RCX86_Is32To8: | 
|  | return "i32to8"; // 32-bit GPR truncable to i8 | 
|  | case RCX86_Is16To8: | 
|  | return "i16to8"; // 16-bit GPR truncable to i8 | 
|  | case RCX86_IsTrunc8Rcvr: | 
|  | return "i8from"; // 8-bit GPR truncable from wider GPRs | 
|  | case RCX86_IsAhRcvr: | 
|  | return "i8fromah"; // 8-bit GPR that ah can be assigned to | 
|  | } | 
|  | } | 
|  | SmallBitVector getRegisterSet(RegSetMask Include, | 
|  | RegSetMask Exclude) const override; | 
|  | const SmallBitVector & | 
|  | getRegistersForVariable(const Variable *Var) const override { | 
|  | RegClass RC = Var->getRegClass(); | 
|  | assert(static_cast<RegClassX86>(RC) < RCX86_NUM); | 
|  | return TypeToRegisterSet[RC]; | 
|  | } | 
|  |  | 
|  | const SmallBitVector & | 
|  | getAllRegistersForVariable(const Variable *Var) const override { | 
|  | RegClass RC = Var->getRegClass(); | 
|  | assert(static_cast<RegClassX86>(RC) < RCX86_NUM); | 
|  | return TypeToRegisterSetUnfiltered[RC]; | 
|  | } | 
|  |  | 
|  | const SmallBitVector &getAliasesForRegister(RegNumT Reg) const override { | 
|  | Reg.assertIsValid(); | 
|  | return RegisterAliases[Reg]; | 
|  | } | 
|  |  | 
|  | bool hasFramePointer() const override { return IsEbpBasedFrame; } | 
|  | void setHasFramePointer() override { IsEbpBasedFrame = true; } | 
|  | RegNumT getStackReg() const override { return RegX8632::Reg_esp; } | 
|  | RegNumT getFrameReg() const override { return RegX8632::Reg_ebp; } | 
|  | RegNumT getFrameOrStackReg() const override { | 
|  | // If the stack pointer needs to be aligned, then the frame pointer is | 
|  | // unaligned, so always use the stack pointer. | 
|  | if (needsStackPointerAlignment()) | 
|  | return getStackReg(); | 
|  | return IsEbpBasedFrame ? getFrameReg() : getStackReg(); | 
|  | } | 
|  | size_t typeWidthInBytesOnStack(Type Ty) const override { | 
|  | // Round up to the next multiple of WordType bytes. | 
|  | const uint32_t WordSizeInBytes = typeWidthInBytes(WordType); | 
|  | return Utils::applyAlignment(typeWidthInBytes(Ty), WordSizeInBytes); | 
|  | } | 
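|  | // (For example, with WordType == IceType_i32, typeWidthInBytesOnStack of an | 
|  | // i8 or i16 value is 4 bytes, and an i64 rounds to 8.) | 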
|  | uint32_t getStackAlignment() const override { | 
|  | return X86_STACK_ALIGNMENT_BYTES; | 
|  | } | 
|  | bool needsStackPointerAlignment() const override { | 
|  | // If the ABI's stack alignment is smaller than the vector size (16 bytes), | 
|  | // use the (realigned) stack pointer for addressing any stack variables. | 
|  | return X86_STACK_ALIGNMENT_BYTES < 16; | 
|  | } | 
|  | void reserveFixedAllocaArea(size_t Size, size_t Align) override { | 
|  | FixedAllocaSizeBytes = Size; | 
|  | assert(llvm::isPowerOf2_32(Align)); | 
|  | FixedAllocaAlignBytes = Align; | 
|  | PrologEmitsFixedAllocas = true; | 
|  | } | 
|  | /// Returns the (negative) offset from ebp where the fixed allocas start. | 
|  | int32_t getFrameFixedAllocaOffset() const override { | 
|  | return FixedAllocaSizeBytes - (SpillAreaSizeBytes - maxOutArgsSizeBytes()); | 
|  | } | 
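|  | // (E.g., with 32 bytes of fixed allocas, a 64-byte spill area, and 16 bytes of | 
|  | // out-args, getFrameFixedAllocaOffset() is 32 - (64 - 16) = -16.) | 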
|  | uint32_t maxOutArgsSizeBytes() const override { | 
|  | return MaxOutArgsSizeBytes; | 
|  | } | 
|  | virtual void updateMaxOutArgsSizeBytes(uint32_t Size) { | 
|  | MaxOutArgsSizeBytes = std::max(MaxOutArgsSizeBytes, Size); | 
|  | } | 
|  |  | 
|  | bool shouldSplitToVariable64On32(Type Ty) const override { | 
|  | return Ty == IceType_i64; | 
|  | } | 
|  |  | 
|  | SizeT getMinJumpTableSize() const override { return 4; } | 
|  |  | 
|  | void emitVariable(const Variable *Var) const override; | 
|  |  | 
|  | void emit(const ConstantInteger32 *C) const final; | 
|  | void emit(const ConstantInteger64 *C) const final; | 
|  | void emit(const ConstantFloat *C) const final; | 
|  | void emit(const ConstantDouble *C) const final; | 
|  | void emit(const ConstantUndef *C) const final; | 
|  | void emit(const ConstantRelocatable *C) const final; | 
|  |  | 
|  | void initNodeForLowering(CfgNode *Node) override; | 
|  |  | 
|  | Operand *loOperand(Operand *Operand); | 
|  | Operand *hiOperand(Operand *Operand); | 
|  |  | 
|  | void addProlog(CfgNode *Node) override; | 
|  | void finishArgumentLowering(Variable *Arg, Variable *FramePtr, | 
|  | size_t BasicFrameOffset, size_t StackAdjBytes, | 
|  | size_t &InArgsSizeBytes); | 
|  | void addEpilog(CfgNode *Node) override; | 
|  |  | 
|  | Operand *legalizeUndef(Operand *From, RegNumT RegNum = RegNumT()); | 
|  |  | 
|  | protected: | 
|  | void postLower() override; | 
|  |  | 
|  | void lowerAlloca(const InstAlloca *Instr) override; | 
|  | void lowerArguments() override; | 
|  | void lowerArithmetic(const InstArithmetic *Instr) override; | 
|  | void lowerAssign(const InstAssign *Instr) override; | 
|  | void lowerBr(const InstBr *Instr) override; | 
|  | void lowerBreakpoint(const InstBreakpoint *Instr) override; | 
|  | void lowerCall(const InstCall *Instr) override; | 
|  | void lowerCast(const InstCast *Instr) override; | 
|  | void lowerExtractElement(const InstExtractElement *Instr) override; | 
|  | void lowerFcmp(const InstFcmp *Instr) override; | 
|  | void lowerIcmp(const InstIcmp *Instr) override; | 
|  |  | 
|  | void lowerIntrinsic(const InstIntrinsic *Instr) override; | 
|  | void lowerInsertElement(const InstInsertElement *Instr) override; | 
|  | void lowerLoad(const InstLoad *Instr) override; | 
|  | void lowerPhi(const InstPhi *Instr) override; | 
|  | void lowerRet(const InstRet *Instr) override; | 
|  | void lowerSelect(const InstSelect *Instr) override; | 
|  | void lowerShuffleVector(const InstShuffleVector *Instr) override; | 
|  | void lowerStore(const InstStore *Instr) override; | 
|  | void lowerSwitch(const InstSwitch *Instr) override; | 
|  | void lowerUnreachable(const InstUnreachable *Instr) override; | 
|  | void lowerOther(const Inst *Instr) override; | 
|  | void lowerRMW(const InstX86FakeRMW *RMW); | 
|  | void prelowerPhis() override; | 
|  | uint32_t getCallStackArgumentsSizeBytes(const CfgVector<Type> &ArgTypes, | 
|  | Type ReturnType); | 
|  | uint32_t getCallStackArgumentsSizeBytes(const InstCall *Instr) override; | 
|  | void genTargetHelperCallFor(Inst *Instr) override; | 
|  |  | 
|  | /// OptAddr wraps all the possible operands that an x86 address might have. | 
|  | struct OptAddr { | 
|  | Variable *Base = nullptr; | 
|  | Variable *Index = nullptr; | 
|  | uint16_t Shift = 0; | 
|  | int32_t Offset = 0; | 
|  | ConstantRelocatable *Relocatable = nullptr; | 
|  | }; | 
|  |  | 
|  | /// Builds information for a canonical address expression: | 
|  | ///   <Relocatable + Offset>(Base, Index, Shift) | 
|  | X86OperandMem *computeAddressOpt(const Inst *Instr, Type MemType, | 
|  | Operand *Addr); | 
|  | void doAddressOptOther() override; | 
|  | void doAddressOptLoad() override; | 
|  | void doAddressOptStore() override; | 
|  | void doAddressOptLoadSubVector() override; | 
|  | void doAddressOptStoreSubVector() override; | 
|  | void doMockBoundsCheck(Operand *Opnd) override; | 
|  |  | 
|  | /// Naive lowering of cmpxchg. | 
|  | void lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr, Operand *Expected, | 
|  | Operand *Desired); | 
|  | /// Attempt a more optimized lowering of cmpxchg. Returns true if optimized. | 
|  | bool tryOptimizedCmpxchgCmpBr(Variable *DestPrev, Operand *Ptr, | 
|  | Operand *Expected, Operand *Desired); | 
|  | void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr, | 
|  | Operand *Val); | 
|  | void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal, | 
|  | Operand *SecondVal); | 
|  | /// Load from memory for a given type. | 
|  | void typedLoad(Type Ty, Variable *Dest, Variable *Base, Constant *Offset); | 
|  | /// Store to memory for a given type. | 
|  | void typedStore(Type Ty, Variable *Value, Variable *Base, Constant *Offset); | 
|  | /// Copy memory of given type from Src to Dest using OffsetAmt on both. | 
|  | void copyMemory(Type Ty, Variable *Dest, Variable *Src, int32_t OffsetAmt); | 
|  | /// Replace some calls to memcpy with inline instructions. | 
|  | void lowerMemcpy(Operand *Dest, Operand *Src, Operand *Count); | 
|  | /// Replace some calls to memmove with inline instructions. | 
|  | void lowerMemmove(Operand *Dest, Operand *Src, Operand *Count); | 
|  | /// Replace some calls to memset with inline instructions. | 
|  | void lowerMemset(Operand *Dest, Operand *Val, Operand *Count); | 
|  |  | 
|  | /// Lower an indirect jump, adding sandboxing when needed. | 
|  | void lowerIndirectJump(Variable *JumpTarget); | 
|  |  | 
|  | /// Check whether the comparison value is in [Min,Max]. The flags register will | 
|  | /// be set to: | 
|  | ///   - below or equal, if in range | 
|  | ///   - above, if not in range | 
|  | /// The index into the range is returned. | 
|  | Operand *lowerCmpRange(Operand *Comparison, uint64_t Min, uint64_t Max); | 
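|  | // Illustrative use (DefaultNode is hypothetical, and the usual CondX86::Br_a | 
|  | // condition name is assumed): a switch lowering might emit | 
|  | //   Operand *Idx = lowerCmpRange(Comparison, 10, 20); | 
|  | //   _br(CondX86::Br_a, DefaultNode); // taken when not in [10, 20] | 
|  | // and then use Idx to index a jump table or case cluster. | 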
|  | /// Lowering of a cluster of switch cases. If the case is not matched control | 
|  | /// will pass to the default label provided. If the default label is nullptr | 
|  | /// then control will fall through to the next instruction. DoneCmp should be | 
|  | /// true if the flags contain the result of a comparison with the Comparison. | 
|  | void lowerCaseCluster(const CaseCluster &Case, Operand *Src0, bool DoneCmp, | 
|  | CfgNode *DefaultLabel = nullptr); | 
|  |  | 
|  | using LowerBinOp = void (TargetX8632::*)(Variable *, Operand *); | 
|  | void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi, | 
|  | Variable *Dest, Operand *Ptr, Operand *Val); | 
|  |  | 
|  | void eliminateNextVectorSextInstruction(Variable *SignExtendedResult); | 
|  |  | 
|  | void emitStackProbe(size_t StackSizeBytes); | 
|  |  | 
|  | /// Emit just the call instruction (without argument or return variable | 
|  | /// processing), sandboxing if needed. | 
|  | Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg, | 
|  | size_t NumVariadicFpArgs = 0); | 
|  | /// Materialize the moves needed to return a value of the specified type. | 
|  | Variable *moveReturnValueToRegister(Operand *Value, Type ReturnType); | 
|  |  | 
|  | /// Emit a jump table to the constant pool. | 
|  | void emitJumpTable(const Cfg *Func, | 
|  | const InstJumpTable *JumpTable) const override; | 
|  |  | 
|  | /// Emit a fake use of esp to make sure esp stays alive for the entire | 
|  | /// function. Otherwise some esp adjustments get dead-code eliminated. | 
|  | void keepEspLiveAtExit() { | 
|  | Variable *esp = | 
|  | Func->getTarget()->getPhysicalRegister(getStackReg(), WordType); | 
|  | Context.insert<InstFakeUse>(esp); | 
|  | } | 
|  |  | 
|  | /// Operand legalization helpers. To deal with address mode constraints, the | 
|  | /// helpers will create a new Operand and emit instructions that guarantee | 
|  | /// that the Operand kind is one of those indicated by the LegalMask (a | 
|  | /// bitmask of allowed kinds). If the input Operand is known to already meet | 
|  | /// the constraints, it may be simply returned as the result, without creating | 
|  | /// any new instructions or operands. | 
|  | enum OperandLegalization { | 
|  | Legal_None = 0, | 
|  | Legal_Reg = 1 << 0, // physical register, not stack location | 
|  | Legal_Imm = 1 << 1, | 
|  | Legal_Mem = 1 << 2, // includes [eax+4*ecx] as well as [esp+12] | 
|  | Legal_Rematerializable = 1 << 3, | 
|  | Legal_AddrAbs = 1 << 4, // ConstantRelocatable doesn't have to add RebasePtr | 
|  | Legal_Default = ~(Legal_Rematerializable | Legal_AddrAbs) | 
|  | // TODO(stichnot): Figure out whether this default works for x86-64. | 
|  | }; | 
|  | using LegalMask = uint32_t; | 
|  | Operand *legalize(Operand *From, LegalMask Allowed = Legal_Default, | 
|  | RegNumT RegNum = RegNumT()); | 
|  | Variable *legalizeToReg(Operand *From, RegNumT RegNum = RegNumT()); | 
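|  | // Illustrative use (hypothetical operands; Reg_eax is assumed to follow the | 
|  | // RegX8632::Reg_esp naming used above): | 
|  | //   Operand *Src1 = legalize(Instr->getSrc(1), Legal_Reg | Legal_Imm); | 
|  | //   Variable *T = legalizeToReg(Src0, RegX8632::Reg_eax); | 
|  | // The first call guarantees a register or immediate; the second forces Src0 | 
|  | // into the physical register eax. | 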
|  | /// Legalize the first source operand for use in the cmp instruction. | 
|  | Operand *legalizeSrc0ForCmp(Operand *Src0, Operand *Src1); | 
|  | /// Turn a pointer operand into a memory operand that can be used by a real | 
|  | /// load/store operation. Legalizes the operand as well. This is a nop if the | 
|  | /// operand is already a legal memory operand. | 
|  | X86OperandMem *formMemoryOperand(Operand *Ptr, Type Ty, | 
|  | bool DoLegalize = true); | 
|  |  | 
|  | Variable *makeReg(Type Ty, RegNumT RegNum = RegNumT()); | 
|  | static Type stackSlotType(); | 
|  |  | 
|  | static constexpr uint32_t NoSizeLimit = 0; | 
|  | /// Returns the largest type which is equal to or smaller than Size bytes. The | 
|  | /// type is suitable for copying memory, i.e. a load and store will be a single | 
|  | /// instruction (for example x86 will get f64 not i64). | 
|  | static Type largestTypeInSize(uint32_t Size, uint32_t MaxSize = NoSizeLimit); | 
|  | /// Returns the smallest type which is equal to or larger than Size bytes. If | 
|  | /// one doesn't exist then the largest type smaller than Size bytes is | 
|  | /// returned. The type is suitable for memory copies as described at | 
|  | /// largestTypeInSize. | 
|  | static Type firstTypeThatFitsSize(uint32_t Size, | 
|  | uint32_t MaxSize = NoSizeLimit); | 
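|  | // (For example, assuming the usual i8/i16/i32/f64 progression of copyable | 
|  | // types: largestTypeInSize(11) is f64 (8 bytes), while firstTypeThatFitsSize(3) | 
|  | // is i32 (4 bytes).) | 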
|  |  | 
|  | Variable *copyToReg8(Operand *Src, RegNumT RegNum = RegNumT()); | 
|  | Variable *copyToReg(Operand *Src, RegNumT RegNum = RegNumT()); | 
|  |  | 
|  | /// Returns a register containing all zeros, without affecting the FLAGS | 
|  | /// register, using the best instruction for the type. | 
|  | Variable *makeZeroedRegister(Type Ty, RegNumT RegNum = RegNumT()); | 
|  |  | 
|  | /// \name Returns a vector in a register with the given constant entries. | 
|  | /// @{ | 
|  | Variable *makeVectorOfZeros(Type Ty, RegNumT RegNum = RegNumT()); | 
|  | Variable *makeVectorOfOnes(Type Ty, RegNumT RegNum = RegNumT()); | 
|  | Variable *makeVectorOfMinusOnes(Type Ty, RegNumT RegNum = RegNumT()); | 
|  | Variable *makeVectorOfHighOrderBits(Type Ty, RegNumT RegNum = RegNumT()); | 
|  | Variable *makeVectorOfFabsMask(Type Ty, RegNumT RegNum = RegNumT()); | 
|  | /// @} | 
|  |  | 
|  | /// Return a memory operand corresponding to a stack allocated Variable. | 
|  | X86OperandMem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot, | 
|  | uint32_t Offset = 0); | 
|  |  | 
|  | /// The following are helpers that insert lowered x86 instructions with | 
|  | /// minimal syntactic overhead, so that the lowering code can look as close to | 
|  | /// assembly as practical. | 
|  | void _adc(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Adc>(Dest, Src0); | 
|  | } | 
|  | void _adc_rmw(X86OperandMem *DestSrc0, Operand *Src1) { | 
|  | Context.insert<Insts::AdcRMW>(DestSrc0, Src1); | 
|  | } | 
|  | void _add(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Add>(Dest, Src0); | 
|  | } | 
|  | void _add_rmw(X86OperandMem *DestSrc0, Operand *Src1) { | 
|  | Context.insert<Insts::AddRMW>(DestSrc0, Src1); | 
|  | } | 
|  | void _addps(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Addps>(Dest, Src0); | 
|  | } | 
|  | void _addss(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Addss>(Dest, Src0); | 
|  | } | 
|  | void _add_sp(Operand *Adjustment); | 
|  | void _and(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::And>(Dest, Src0); | 
|  | } | 
|  | void _andnps(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Andnps>(Dest, Src0); | 
|  | } | 
|  | void _andps(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Andps>(Dest, Src0); | 
|  | } | 
|  | void _and_rmw(X86OperandMem *DestSrc0, Operand *Src1) { | 
|  | Context.insert<Insts::AndRMW>(DestSrc0, Src1); | 
|  | } | 
|  | void _blendvps(Variable *Dest, Operand *Src0, Operand *Src1) { | 
|  | Context.insert<Insts::Blendvps>(Dest, Src0, Src1); | 
|  | } | 
|  | void _br(BrCond Condition, CfgNode *TargetTrue, CfgNode *TargetFalse) { | 
|  | Context.insert<InstX86Br>(TargetTrue, TargetFalse, Condition, | 
|  | InstX86Br::Far); | 
|  | } | 
|  | void _br(CfgNode *Target) { | 
|  | Context.insert<InstX86Br>(Target, InstX86Br::Far); | 
|  | } | 
|  | void _br(BrCond Condition, CfgNode *Target) { | 
|  | Context.insert<InstX86Br>(Target, Condition, InstX86Br::Far); | 
|  | } | 
|  | void _br(BrCond Condition, InstX86Label *Label, | 
|  | InstX86Br::Mode Kind = InstX86Br::Near) { | 
|  | Context.insert<InstX86Br>(Label, Condition, Kind); | 
|  | } | 
|  | void _bsf(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Bsf>(Dest, Src0); | 
|  | } | 
|  | void _bsr(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Bsr>(Dest, Src0); | 
|  | } | 
|  | void _bswap(Variable *SrcDest) { Context.insert<Insts::Bswap>(SrcDest); } | 
|  | void _cbwdq(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Cbwdq>(Dest, Src0); | 
|  | } | 
|  | void _cmov(Variable *Dest, Operand *Src0, BrCond Condition) { | 
|  | Context.insert<Insts::Cmov>(Dest, Src0, Condition); | 
|  | } | 
|  | void _cmp(Operand *Src0, Operand *Src1) { | 
|  | Context.insert<Insts::Icmp>(Src0, Src1); | 
|  | } | 
|  | void _cmpps(Variable *Dest, Operand *Src0, CmppsCond Condition) { | 
|  | Context.insert<Insts::Cmpps>(Dest, Src0, Condition); | 
|  | } | 
|  | void _cmpxchg(Operand *DestOrAddr, Variable *Eax, Variable *Desired, | 
|  | bool Locked) { | 
|  | Context.insert<Insts::Cmpxchg>(DestOrAddr, Eax, Desired, Locked); | 
|  | // Mark eax as possibly modified by cmpxchg. | 
|  | Context.insert<InstFakeDef>(Eax, llvm::dyn_cast<Variable>(DestOrAddr)); | 
|  | _set_dest_redefined(); | 
|  | Context.insert<InstFakeUse>(Eax); | 
|  | } | 
|  | void _cmpxchg8b(X86OperandMem *Addr, Variable *Edx, Variable *Eax, | 
|  | Variable *Ecx, Variable *Ebx, bool Locked) { | 
|  | Context.insert<Insts::Cmpxchg8b>(Addr, Edx, Eax, Ecx, Ebx, Locked); | 
|  | // Mark edx and eax as possibly modified by cmpxchg8b. | 
|  | Context.insert<InstFakeDef>(Edx); | 
|  | _set_dest_redefined(); | 
|  | Context.insert<InstFakeUse>(Edx); | 
|  | Context.insert<InstFakeDef>(Eax); | 
|  | _set_dest_redefined(); | 
|  | Context.insert<InstFakeUse>(Eax); | 
|  | } | 
|  | void _cvt(Variable *Dest, Operand *Src0, Insts::Cvt::CvtVariant Variant) { | 
|  | Context.insert<Insts::Cvt>(Dest, Src0, Variant); | 
|  | } | 
|  | void _round(Variable *Dest, Operand *Src0, Operand *Imm) { | 
|  | Context.insert<Insts::Round>(Dest, Src0, Imm); | 
|  | } | 
|  | void _div(Variable *Dest, Operand *Src0, Operand *Src1) { | 
|  | Context.insert<Insts::Div>(Dest, Src0, Src1); | 
|  | } | 
|  | void _divps(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Divps>(Dest, Src0); | 
|  | } | 
|  | void _divss(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Divss>(Dest, Src0); | 
|  | } | 
|  | void _fld(Operand *Src0) { Context.insert<Insts::Fld>(Src0); } | 
|  | void _fstp(Variable *Dest) { Context.insert<Insts::Fstp>(Dest); } | 
|  | void _idiv(Variable *Dest, Operand *Src0, Operand *Src1) { | 
|  | Context.insert<Insts::Idiv>(Dest, Src0, Src1); | 
|  | } | 
|  | void _imul(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Imul>(Dest, Src0); | 
|  | } | 
|  | void _imul_imm(Variable *Dest, Operand *Src0, Constant *Imm) { | 
|  | Context.insert<Insts::ImulImm>(Dest, Src0, Imm); | 
|  | } | 
|  | void _insertps(Variable *Dest, Operand *Src0, Operand *Src1) { | 
|  | Context.insert<Insts::Insertps>(Dest, Src0, Src1); | 
|  | } | 
|  | void _int3() { Context.insert<Insts::Int3>(); } | 
|  | void _jmp(Operand *Target) { Context.insert<Insts::Jmp>(Target); } | 
|  | void _lea(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Lea>(Dest, Src0); | 
|  | } | 
|  | void _link_bp(); | 
|  | void _push_reg(RegNumT RegNum); | 
|  | void _pop_reg(RegNumT RegNum); | 
|  | void _mfence() { Context.insert<Insts::Mfence>(); } | 
|  | /// Moves can be used to redefine registers, creating "partial kills" for | 
|  | /// liveness.  Mark where moves are used in this way. | 
|  | void _redefined(Inst *MovInst, bool IsRedefinition = true) { | 
|  | if (IsRedefinition) | 
|  | MovInst->setDestRedefined(); | 
|  | } | 
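|  | // Example (hypothetical operands): if DestHi already carries a value and the | 
|  | // mov below only replaces part of a wider value being assembled, mark it: | 
|  | //   Insts::Mov *MovInstr = _mov(DestHi, Src0Hi); | 
|  | //   _redefined(MovInstr); // partial kill, not a brand-new definition | 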
|  | /// If Dest=nullptr is passed in, then a new variable is created, marked as | 
|  | /// infinite register allocation weight, and returned through the in/out Dest | 
|  | /// argument. | 
|  | Insts::Mov *_mov(Variable *&Dest, Operand *Src0, RegNumT RegNum = RegNumT()) { | 
|  | if (Dest == nullptr) | 
|  | Dest = makeReg(Src0->getType(), RegNum); | 
|  | return Context.insert<Insts::Mov>(Dest, Src0); | 
|  | } | 
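|  | // Typical idiom (hypothetical operands): passing a null Variable lets _mov | 
|  | // create an infinite-weight temporary of Src0's type: | 
|  | //   Variable *T = nullptr; | 
|  | //   _mov(T, Src0); // T is created here | 
|  | //   _add(T, Src1); | 
|  | //   _mov(Dest, T); | 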
|  | void _mov_sp(Operand *NewValue); | 
|  | Insts::Movp *_movp(Variable *Dest, Operand *Src0) { | 
|  | return Context.insert<Insts::Movp>(Dest, Src0); | 
|  | } | 
|  | void _movd(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Movd>(Dest, Src0); | 
|  | } | 
|  | void _movq(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Movq>(Dest, Src0); | 
|  | } | 
|  | void _movss(Variable *Dest, Variable *Src0) { | 
|  | Context.insert<Insts::MovssRegs>(Dest, Src0); | 
|  | } | 
|  | void _movsx(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Movsx>(Dest, Src0); | 
|  | } | 
|  | Insts::Movzx *_movzx(Variable *Dest, Operand *Src0) { | 
|  | return Context.insert<Insts::Movzx>(Dest, Src0); | 
|  | } | 
|  | void _maxss(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Maxss>(Dest, Src0); | 
|  | } | 
|  | void _minss(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Minss>(Dest, Src0); | 
|  | } | 
|  | void _maxps(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Maxps>(Dest, Src0); | 
|  | } | 
|  | void _minps(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Minps>(Dest, Src0); | 
|  | } | 
|  | void _mul(Variable *Dest, Variable *Src0, Operand *Src1) { | 
|  | Context.insert<Insts::Mul>(Dest, Src0, Src1); | 
|  | } | 
|  | void _mulps(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Mulps>(Dest, Src0); | 
|  | } | 
|  | void _mulss(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Mulss>(Dest, Src0); | 
|  | } | 
|  | void _neg(Variable *SrcDest) { Context.insert<Insts::Neg>(SrcDest); } | 
|  | void _nop(SizeT Variant) { Context.insert<Insts::Nop>(Variant); } | 
|  | void _or(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Or>(Dest, Src0); | 
|  | } | 
|  | void _orps(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Orps>(Dest, Src0); | 
|  | } | 
|  | void _or_rmw(X86OperandMem *DestSrc0, Operand *Src1) { | 
|  | Context.insert<Insts::OrRMW>(DestSrc0, Src1); | 
|  | } | 
|  | void _padd(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Padd>(Dest, Src0); | 
|  | } | 
|  | void _padds(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Padds>(Dest, Src0); | 
|  | } | 
|  | void _paddus(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Paddus>(Dest, Src0); | 
|  | } | 
|  | void _pand(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Pand>(Dest, Src0); | 
|  | } | 
|  | void _pandn(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Pandn>(Dest, Src0); | 
|  | } | 
|  | void _pblendvb(Variable *Dest, Operand *Src0, Operand *Src1) { | 
|  | Context.insert<Insts::Pblendvb>(Dest, Src0, Src1); | 
|  | } | 
|  | void _pcmpeq(Variable *Dest, Operand *Src0, | 
|  | Type ArithmeticTypeOverride = IceType_void) { | 
|  | Context.insert<Insts::Pcmpeq>(Dest, Src0, ArithmeticTypeOverride); | 
|  | } | 
|  | void _pcmpgt(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Pcmpgt>(Dest, Src0); | 
|  | } | 
|  | void _pextr(Variable *Dest, Operand *Src0, Operand *Src1) { | 
|  | Context.insert<Insts::Pextr>(Dest, Src0, Src1); | 
|  | } | 
|  | void _pinsr(Variable *Dest, Operand *Src0, Operand *Src1) { | 
|  | Context.insert<Insts::Pinsr>(Dest, Src0, Src1); | 
|  | } | 
|  | void _pmull(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Pmull>(Dest, Src0); | 
|  | } | 
|  | void _pmulhw(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Pmulhw>(Dest, Src0); | 
|  | } | 
|  | void _pmulhuw(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Pmulhuw>(Dest, Src0); | 
|  | } | 
|  | void _pmaddwd(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Pmaddwd>(Dest, Src0); | 
|  | } | 
|  | void _pmuludq(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Pmuludq>(Dest, Src0); | 
|  | } | 
|  | void _pop(Variable *Dest) { Context.insert<Insts::Pop>(Dest); } | 
|  | void _por(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Por>(Dest, Src0); | 
|  | } | 
|  | void _punpckl(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Punpckl>(Dest, Src0); | 
|  | } | 
|  | void _punpckh(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Punpckh>(Dest, Src0); | 
|  | } | 
|  | void _packss(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Packss>(Dest, Src0); | 
|  | } | 
|  | void _packus(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Packus>(Dest, Src0); | 
|  | } | 
|  | void _pshufb(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Pshufb>(Dest, Src0); | 
|  | } | 
|  | void _pshufd(Variable *Dest, Operand *Src0, Operand *Src1) { | 
|  | Context.insert<Insts::Pshufd>(Dest, Src0, Src1); | 
|  | } | 
|  | void _psll(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Psll>(Dest, Src0); | 
|  | } | 
|  | void _psra(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Psra>(Dest, Src0); | 
|  | } | 
|  | void _psrl(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Psrl>(Dest, Src0); | 
|  | } | 
|  | void _psub(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Psub>(Dest, Src0); | 
|  | } | 
|  | void _psubs(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Psubs>(Dest, Src0); | 
|  | } | 
|  | void _psubus(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Psubus>(Dest, Src0); | 
|  | } | 
|  | void _push(Operand *Src0) { Context.insert<Insts::Push>(Src0); } | 
|  | void _pxor(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Pxor>(Dest, Src0); | 
|  | } | 
|  | void _ret(Variable *Src0 = nullptr) { Context.insert<Insts::Ret>(Src0); } | 
|  | void _rol(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Rol>(Dest, Src0); | 
|  | } | 
|  | void _round(Variable *Dest, Operand *Src, Constant *Imm) { | 
|  | Context.insert<Insts::Round>(Dest, Src, Imm); | 
|  | } | 
|  | void _sar(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Sar>(Dest, Src0); | 
|  | } | 
|  | void _sbb(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Sbb>(Dest, Src0); | 
|  | } | 
|  | void _sbb_rmw(X86OperandMem *DestSrc0, Operand *Src1) { | 
|  | Context.insert<Insts::SbbRMW>(DestSrc0, Src1); | 
|  | } | 
|  | void _setcc(Variable *Dest, BrCond Condition) { | 
|  | Context.insert<Insts::Setcc>(Dest, Condition); | 
|  | } | 
|  | void _shl(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Shl>(Dest, Src0); | 
|  | } | 
|  | void _shld(Variable *Dest, Variable *Src0, Operand *Src1) { | 
|  | Context.insert<Insts::Shld>(Dest, Src0, Src1); | 
|  | } | 
|  | void _shr(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Shr>(Dest, Src0); | 
|  | } | 
|  | void _shrd(Variable *Dest, Variable *Src0, Operand *Src1) { | 
|  | Context.insert<Insts::Shrd>(Dest, Src0, Src1); | 
|  | } | 
|  | void _shufps(Variable *Dest, Operand *Src0, Operand *Src1) { | 
|  | Context.insert<Insts::Shufps>(Dest, Src0, Src1); | 
|  | } | 
|  | void _movmsk(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Movmsk>(Dest, Src0); | 
|  | } | 
|  | void _sqrt(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Sqrt>(Dest, Src0); | 
|  | } | 
|  | void _store(Operand *Value, X86Operand *Mem) { | 
|  | Context.insert<Insts::Store>(Value, Mem); | 
|  | } | 
|  | void _storep(Variable *Value, X86OperandMem *Mem) { | 
|  | Context.insert<Insts::StoreP>(Value, Mem); | 
|  | } | 
|  | void _storeq(Operand *Value, X86OperandMem *Mem) { | 
|  | Context.insert<Insts::StoreQ>(Value, Mem); | 
|  | } | 
|  | void _stored(Operand *Value, X86OperandMem *Mem) { | 
|  | Context.insert<Insts::StoreD>(Value, Mem); | 
|  | } | 
|  | void _sub(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Sub>(Dest, Src0); | 
|  | } | 
|  | void _sub_rmw(X86OperandMem *DestSrc0, Operand *Src1) { | 
|  | Context.insert<Insts::SubRMW>(DestSrc0, Src1); | 
|  | } | 
|  | void _sub_sp(Operand *Adjustment); | 
|  | void _subps(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Subps>(Dest, Src0); | 
|  | } | 
|  | void _subss(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Subss>(Dest, Src0); | 
|  | } | 
|  | void _test(Operand *Src0, Operand *Src1) { | 
|  | Context.insert<Insts::Test>(Src0, Src1); | 
|  | } | 
|  | void _ucomiss(Operand *Src0, Operand *Src1) { | 
|  | Context.insert<Insts::Ucomiss>(Src0, Src1); | 
|  | } | 
|  | void _ud2() { Context.insert<Insts::UD2>(); } | 
|  | void _unlink_bp(); | 
|  | void _xadd(Operand *Dest, Variable *Src, bool Locked) { | 
|  | Context.insert<Insts::Xadd>(Dest, Src, Locked); | 
|  | // The xadd exchanges Dest and Src (modifying Src). Model that update with | 
|  | // a FakeDef followed by a FakeUse. | 
|  | Context.insert<InstFakeDef>(Src, llvm::dyn_cast<Variable>(Dest)); | 
|  | _set_dest_redefined(); | 
|  | Context.insert<InstFakeUse>(Src); | 
|  | } | 
|  | void _xchg(Operand *Dest, Variable *Src) { | 
|  | Context.insert<Insts::Xchg>(Dest, Src); | 
|  | // The xchg modifies Dest and Src -- model that update with a | 
|  | // FakeDef/FakeUse. | 
|  | Context.insert<InstFakeDef>(Src, llvm::dyn_cast<Variable>(Dest)); | 
|  | _set_dest_redefined(); | 
|  | Context.insert<InstFakeUse>(Src); | 
|  | } | 
|  | void _xor(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Xor>(Dest, Src0); | 
|  | } | 
|  | void _xorps(Variable *Dest, Operand *Src0) { | 
|  | Context.insert<Insts::Xorps>(Dest, Src0); | 
|  | } | 
|  | void _xor_rmw(X86OperandMem *DestSrc0, Operand *Src1) { | 
|  | Context.insert<Insts::XorRMW>(DestSrc0, Src1); | 
|  | } | 
|  |  | 
|  | void _iaca_start() { | 
|  | if (!BuildDefs::minimal()) | 
|  | Context.insert<Insts::IacaStart>(); | 
|  | } | 
|  | void _iaca_end() { | 
|  | if (!BuildDefs::minimal()) | 
|  | Context.insert<Insts::IacaEnd>(); | 
|  | } | 
|  |  | 
|  | /// This class wraps IACA markers around the code generated by the current | 
|  | /// scope, so an explicit end marker is not needed before each return. | 
|  | class ScopedIacaMark { | 
|  | ScopedIacaMark(const ScopedIacaMark &) = delete; | 
|  | ScopedIacaMark &operator=(const ScopedIacaMark &) = delete; | 
|  |  | 
|  | public: | 
|  | ScopedIacaMark(TargetX8632 *Lowering) : Lowering(Lowering) { | 
|  | Lowering->_iaca_start(); | 
|  | } | 
|  | ~ScopedIacaMark() { end(); } | 
|  | void end() { | 
|  | if (!Lowering) | 
|  | return; | 
|  | Lowering->_iaca_end(); | 
|  | Lowering = nullptr; | 
|  | } | 
|  |  | 
|  | private: | 
|  | TargetX8632 *Lowering; | 
|  | }; | 
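|  |  | 
|  | // Usage sketch (lowerFoo is a hypothetical lowering routine): | 
|  | //   void TargetX8632::lowerFoo(const InstFoo *Instr) { | 
|  | //     ScopedIacaMark Mark(this); // emits the IACA start marker | 
|  | //     ... emit lowered instructions ... | 
|  | //   } // the destructor emits the end marker on every exit path | 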
|  |  | 
|  | bool optimizeScalarMul(Variable *Dest, Operand *Src0, int32_t Src1); | 
|  | void findRMW(); | 
|  |  | 
|  | static uint32_t applyStackAlignment(uint32_t Value); | 
|  |  | 
|  | bool IsEbpBasedFrame = false; | 
|  |  | 
|  | #if defined(_WIN32) | 
|  | /// 32-bit Windows only guarantees 4-byte stack alignment. | 
|  | static constexpr uint32_t X86_STACK_ALIGNMENT_BYTES = 4; | 
|  | #else | 
|  | /// Stack alignment guaranteed by the System V ABI. | 
|  | static constexpr uint32_t X86_STACK_ALIGNMENT_BYTES = 16; | 
|  | #endif | 
|  | /// Stack alignment required by the currently lowered function. | 
|  | size_t RequiredStackAlignment = X86_STACK_ALIGNMENT_BYTES; | 
|  | size_t SpillAreaSizeBytes = 0; | 
|  | size_t FixedAllocaSizeBytes = 0; | 
|  | size_t FixedAllocaAlignBytes = 0; | 
|  | bool PrologEmitsFixedAllocas = false; | 
|  | uint32_t MaxOutArgsSizeBytes = 0; | 
|  | static std::array<SmallBitVector, RCX86_NUM> TypeToRegisterSet; | 
|  | static std::array<SmallBitVector, RCX86_NUM> TypeToRegisterSetUnfiltered; | 
|  | static std::array<SmallBitVector, RegisterSet::Reg_NUM> RegisterAliases; | 
|  | SmallBitVector RegsUsed; | 
|  | std::array<VarList, IceType_NUM> PhysicalRegisters; | 
|  | /// RebasePtr is a Variable that holds the rebasing pointer (if any) for the | 
|  | /// current sandboxing type. | 
|  | Variable *RebasePtr = nullptr; | 
|  |  | 
|  | private: | 
|  | void lowerShift64(InstArithmetic::OpKind Op, Operand *Src0Lo, Operand *Src0Hi, | 
|  | Operand *Src1Lo, Variable *DestLo, Variable *DestHi); | 
|  |  | 
|  | /// Emit the code for a combined operation and consumer instruction, or set | 
|  | /// the destination variable of the operation if Consumer == nullptr. | 
|  | void lowerIcmpAndConsumer(const InstIcmp *Icmp, const Inst *Consumer); | 
|  | void lowerFcmpAndConsumer(const InstFcmp *Fcmp, const Inst *Consumer); | 
|  | void lowerArithAndConsumer(const InstArithmetic *Arith, const Inst *Consumer); | 
|  |  | 
|  | /// Emit a setcc instruction if Consumer == nullptr; otherwise emit a | 
|  | /// specialized version of Consumer. | 
|  | void setccOrConsumer(BrCond Condition, Variable *Dest, const Inst *Consumer); | 
|  |  | 
|  | /// Emit a mov [1|0] instruction if Consumer == nullptr; otherwise emit a | 
|  | /// specialized version of Consumer. | 
|  | void movOrConsumer(bool IcmpResult, Variable *Dest, const Inst *Consumer); | 
|  |  | 
|  | /// Emit the code for instructions with a vector type. | 
|  | void lowerIcmpVector(const InstIcmp *Icmp); | 
|  | void lowerFcmpVector(const InstFcmp *Icmp); | 
|  | void lowerSelectVector(const InstSelect *Instr); | 
|  |  | 
|  | /// Helpers for select lowering. | 
|  | void lowerSelectMove(Variable *Dest, BrCond Cond, Operand *SrcT, | 
|  | Operand *SrcF); | 
|  | void lowerSelectIntMove(Variable *Dest, BrCond Cond, Operand *SrcT, | 
|  | Operand *SrcF); | 
|  | /// Generic helper to move an arbitrary type from Src to Dest. | 
|  | void lowerMove(Variable *Dest, Operand *Src, bool IsRedefinition); | 
|  |  | 
|  | /// Optimizations for idiom recognition. | 
|  | bool lowerOptimizeFcmpSelect(const InstFcmp *Fcmp, const InstSelect *Select); | 
|  |  | 
|  | /// lowerIcmp64 handles 64-bit icmp lowering. | 
|  | void lowerIcmp64(const InstIcmp *Icmp, const Inst *Consumer); | 
|  |  | 
|  | BoolFolding FoldingInfo; | 
|  |  | 
|  | /// Helpers for lowering ShuffleVector | 
|  | /// @{ | 
|  | Variable *lowerShuffleVector_AllFromSameSrc(Operand *Src, SizeT Index0, | 
|  | SizeT Index1, SizeT Index2, | 
|  | SizeT Index3); | 
|  | static constexpr SizeT IGNORE_INDEX = 0x80000000u; | 
|  | Variable *lowerShuffleVector_TwoFromSameSrc(Operand *Src0, SizeT Index0, | 
|  | SizeT Index1, Operand *Src1, | 
|  | SizeT Index2, SizeT Index3); | 
|  | static constexpr SizeT UNIFIED_INDEX_0 = 0; | 
|  | static constexpr SizeT UNIFIED_INDEX_1 = 2; | 
|  | Variable *lowerShuffleVector_UnifyFromDifferentSrcs(Operand *Src0, | 
|  | SizeT Index0, | 
|  | Operand *Src1, | 
|  | SizeT Index1); | 
|  | static constexpr SizeT CLEAR_ALL_BITS = 0x80; | 
|  | SizeT PshufbMaskCount = 0; | 
|  | GlobalString lowerShuffleVector_NewMaskName(); | 
|  | ConstantRelocatable *lowerShuffleVector_CreatePshufbMask( | 
|  | int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4, | 
|  | int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9, | 
|  | int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14, | 
|  | int8_t Idx15); | 
|  | void lowerShuffleVector_UsingPshufb(Variable *Dest, Operand *Src0, | 
|  | Operand *Src1, int8_t Idx0, int8_t Idx1, | 
|  | int8_t Idx2, int8_t Idx3, int8_t Idx4, | 
|  | int8_t Idx5, int8_t Idx6, int8_t Idx7, | 
|  | int8_t Idx8, int8_t Idx9, int8_t Idx10, | 
|  | int8_t Idx11, int8_t Idx12, int8_t Idx13, | 
|  | int8_t Idx14, int8_t Idx15); | 
|  | /// @} | 
|  |  | 
|  | /// The following table summarizes the logic for lowering the fcmp | 
|  | /// instruction. There is one table entry for each of the 16 conditions. | 
|  | /// | 
|  | /// The first four columns describe the case when the operands are floating | 
|  | /// point scalar values. A comment in lowerFcmp() describes the lowering | 
|  | /// template. In the most general case, there is a compare followed by two | 
|  | /// conditional branches, because some fcmp conditions don't map to a single | 
|  | /// x86 conditional branch. However, in many cases it is possible to swap the | 
|  | /// operands in the comparison and have a single conditional branch. Since | 
|  | /// it's quite tedious to validate the table by hand, good execution tests are | 
|  | /// helpful. | 
|  | /// | 
|  | /// The last two columns describe the case when the operands are vectors of | 
|  | /// floating point values. For most fcmp conditions, there is a clear mapping | 
|  | /// to a single x86 cmpps instruction variant. Some fcmp conditions require | 
|  | /// special code to handle and these are marked in the table with a | 
|  | /// Cmpps_Invalid predicate. | 
|  | /// @{ | 
|  | static const struct TableFcmpType { | 
|  | uint32_t Default; | 
|  | bool SwapScalarOperands; | 
|  | CondX86::BrCond C1, C2; | 
|  | bool SwapVectorOperands; | 
|  | CondX86::CmppsCond Predicate; | 
|  | } TableFcmp[]; | 
|  | static const size_t TableFcmpSize; | 
|  | /// @} | 
|  |  | 
|  | /// The following table summarizes the logic for lowering the icmp instruction | 
|  | /// for i32 and narrower types. Each icmp condition has a clear mapping to an | 
|  | /// x86 conditional branch instruction. | 
|  | /// @{ | 
|  | static const struct TableIcmp32Type { | 
|  | CondX86::BrCond Mapping; | 
|  | } TableIcmp32[]; | 
|  | static const size_t TableIcmp32Size; | 
|  | /// @} | 
|  |  | 
|  | /// The following table summarizes the logic for lowering the icmp instruction | 
|  | /// for the i64 type. For Eq and Ne, two separate 32-bit comparisons and | 
|  | /// conditional branches are needed. For the other conditions, three separate | 
|  | /// conditional branches are needed. | 
|  | /// @{ | 
|  | static const struct TableIcmp64Type { | 
|  | CondX86::BrCond C1, C2, C3; | 
|  | } TableIcmp64[]; | 
|  | static const size_t TableIcmp64Size; | 
|  | /// @} | 
|  |  | 
|  | static CondX86::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) { | 
|  | assert(static_cast<size_t>(Cond) < TableIcmp32Size); | 
|  | return TableIcmp32[Cond].Mapping; | 
|  | } | 
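|  | // Illustrative use (hypothetical branch lowering; InstIcmp::getCondition() is | 
|  | // assumed): | 
|  | //   _br(getIcmp32Mapping(Icmp->getCondition()), TargetTrue, TargetFalse); | 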
|  |  | 
|  | public: | 
|  | static std::unique_ptr<::Ice::TargetLowering> create(Cfg *Func) { | 
|  | return makeUnique<TargetX8632>(Func); | 
|  | } | 
|  |  | 
|  | std::unique_ptr<::Ice::Assembler> createAssembler() const override { | 
|  | return makeUnique<X8632::AssemblerX8632>(); | 
|  | } | 
|  |  | 
|  | private: | 
|  | ENABLE_MAKE_UNIQUE; | 
|  |  | 
|  | explicit TargetX8632(Cfg *Func); | 
|  | }; | 
|  |  | 
|  | class TargetDataX8632 final : public TargetDataLowering { | 
|  | TargetDataX8632() = delete; | 
|  | TargetDataX8632(const TargetDataX8632 &) = delete; | 
|  | TargetDataX8632 &operator=(const TargetDataX8632 &) = delete; | 
|  |  | 
|  | public: | 
|  | ~TargetDataX8632() override = default; | 
|  |  | 
|  | static std::unique_ptr<TargetDataLowering> create(GlobalContext *Ctx) { | 
|  | return makeUnique<TargetDataX8632>(Ctx); | 
|  | } | 
|  |  | 
|  | void lowerGlobals(const VariableDeclarationList &Vars, | 
|  | const std::string &SectionSuffix) override; | 
|  | void lowerConstants() override; | 
|  | void lowerJumpTables() override; | 
|  |  | 
|  | private: | 
|  | ENABLE_MAKE_UNIQUE; | 
|  |  | 
|  | explicit TargetDataX8632(GlobalContext *Ctx) : TargetDataLowering(Ctx) {} | 
|  | template <typename T> static void emitConstantPool(GlobalContext *Ctx); | 
|  | }; | 
|  |  | 
|  | class TargetHeaderX86 : public TargetHeaderLowering { | 
|  | TargetHeaderX86() = delete; | 
|  | TargetHeaderX86(const TargetHeaderX86 &) = delete; | 
|  | TargetHeaderX86 &operator=(const TargetHeaderX86 &) = delete; | 
|  |  | 
|  | public: | 
|  | ~TargetHeaderX86() override = default; | 
|  |  | 
|  | static std::unique_ptr<TargetHeaderLowering> create(GlobalContext *Ctx) { | 
|  | return makeUnique<TargetHeaderX86>(Ctx); | 
|  | } | 
|  |  | 
|  | private: | 
|  | ENABLE_MAKE_UNIQUE; | 
|  |  | 
|  | explicit TargetHeaderX86(GlobalContext *Ctx) : TargetHeaderLowering(Ctx) {} | 
|  | }; | 
|  |  | 
|  | } // end of namespace X8632 | 
|  | } // end of namespace Ice | 
|  |  | 
|  | #endif // SUBZERO_SRC_ICETARGETLOWERINGX8632_H |