| //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| /// \file |
| /// This file implements the targeting of the MachineLegalizer class for |
| /// AMDGPU. |
| /// \todo This should be generated by TableGen. |
| //===----------------------------------------------------------------------===// |
| |
| #if defined(_MSC_VER) || defined(__MINGW32__) |
| // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI |
| // from the Visual C++ cmath / math.h headers: |
| // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 |
| #define _USE_MATH_DEFINES |
| #endif |
| |
| #include "AMDGPU.h" |
| #include "AMDGPULegalizerInfo.h" |
| #include "AMDGPUTargetMachine.h" |
| #include "SIMachineFunctionInfo.h" |
| #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" |
| #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" |
| #include "llvm/CodeGen/TargetOpcodes.h" |
| #include "llvm/CodeGen/ValueTypes.h" |
| #include "llvm/IR/DerivedTypes.h" |
| #include "llvm/IR/DiagnosticInfo.h" |
| #include "llvm/IR/Type.h" |
| #include "llvm/Support/Debug.h" |
| |
| #define DEBUG_TYPE "amdgpu-legalinfo" |
| |
| using namespace llvm; |
| using namespace LegalizeActions; |
| using namespace LegalizeMutations; |
| using namespace LegalityPredicates; |
| |
| |
| /// Build a predicate that accepts the type at \p TypeIdx when its total size |
| /// is at most \p MaxSize bits and the size of its scalar type is a multiple |
| /// of 32 bits. |
| static LegalityPredicate isMultiple32(unsigned TypeIdx, |
|                                       unsigned MaxSize = 1024) { |
|   return [=](const LegalityQuery &Query) { |
|     const LLT QueryTy = Query.Types[TypeIdx]; |
|     // Reject anything larger than the caller-supplied limit first. |
|     if (QueryTy.getSizeInBits() > MaxSize) |
|       return false; |
|     return QueryTy.getScalarType().getSizeInBits() % 32 == 0; |
|   }; |
| } |
| |
| /// Build a predicate that holds when the type at \p TypeIdx is exactly |
| /// \p Size bits wide. |
| static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { |
|   return [=](const LegalityQuery &Query) { |
|     const LLT QueryTy = Query.Types[TypeIdx]; |
|     return QueryTy.getSizeInBits() == Size; |
|   }; |
| } |
| |
| /// Build a predicate matching a vector at \p TypeIdx that has an odd number |
| /// of elements, elements narrower than 32 bits, and a total size that is not |
| /// a multiple of 32 bits. |
| static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { |
|   return [=](const LegalityQuery &Query) { |
|     const LLT VecTy = Query.Types[TypeIdx]; |
|     if (!VecTy.isVector()) |
|       return false; |
|     if (VecTy.getNumElements() % 2 == 0) |
|       return false; |
|     if (VecTy.getElementType().getSizeInBits() >= 32) |
|       return false; |
|     return VecTy.getSizeInBits() % 32 != 0; |
|   }; |
| } |
| |
| /// Build a predicate matching a vector at \p TypeIdx that has 16-bit elements |
| /// and more than two of them. |
| /// |
| /// Non-vector types never match. The vector check comes before the element |
| /// count is read so that a 16-bit scalar, which passes the element-size test, |
| /// does not reach LLT::getNumElements(). |
| static LegalityPredicate isWideVec16(unsigned TypeIdx) { |
|   return [=](const LegalityQuery &Query) { |
|     const LLT Ty = Query.Types[TypeIdx]; |
|     if (!Ty.isVector()) |
|       return false; |
|     const LLT EltTy = Ty.getScalarType(); |
|     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; |
|   }; |
| } |
| |
| /// Build a mutation that replaces the vector at \p TypeIdx with a vector of |
| /// the same element type holding one additional element. |
| static LegalizeMutation oneMoreElement(unsigned TypeIdx) { |
|   return [=](const LegalityQuery &Query) { |
|     const LLT VecTy = Query.Types[TypeIdx]; |
|     const LLT EltTy = VecTy.getElementType(); |
|     const LLT GrownTy = LLT::vector(VecTy.getNumElements() + 1, EltTy); |
|     return std::make_pair(TypeIdx, GrownTy); |
|   }; |
| } |
| |
| /// Build a mutation that reduces the element count of the vector at |
| /// \p TypeIdx. The total size is rounded up to a number of 64-bit pieces, and |
| /// the new element count is (NumElements + 1) / Pieces. The resulting type is |
| /// created with LLT::scalarOrVector using the original element type. |
| static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { |
|   return [=](const LegalityQuery &Query) { |
|     const LLT VecTy = Query.Types[TypeIdx]; |
|     const LLT EltTy = VecTy.getElementType(); |
|     // Number of 64-bit pieces needed to cover the whole type, rounding up. |
|     const unsigned NumPieces = (VecTy.getSizeInBits() + 63) / 64; |
|     const unsigned ReducedNumElts = (VecTy.getNumElements() + 1) / NumPieces; |
|     const LLT ReducedTy = LLT::scalarOrVector(ReducedNumElts, EltTy); |
|     return std::make_pair(TypeIdx, ReducedTy); |
|   }; |
| } |
| |
| // Grow the vector at TypeIdx by adding elements until its total size covers |
| // the next multiple of 32 bits. The element type is kept, and is asserted to |
| // be narrower than 32 bits. |
| static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { |
|   return [=](const LegalityQuery &Query) { |
|     const LLT VecTy = Query.Types[TypeIdx]; |
|     const LLT EltTy = VecTy.getElementType(); |
| |
|     const int TotalBits = VecTy.getSizeInBits(); |
|     const int EltBits = EltTy.getSizeInBits(); |
|     assert(EltBits < 32); |
| |
|     // Count of 32-bit words needed to hold the current size, rounding up. |
|     const int NumDwords = (TotalBits + 31) / 32; |
|     // Smallest element count whose combined size reaches NumDwords * 32 bits. |
|     const int PaddedNumElts = (32 * NumDwords + EltBits - 1) / EltBits; |
|     return std::make_pair(TypeIdx, LLT::vector(PaddedNumElts, EltTy)); |
|   }; |
| } |
| |
| /// Build a predicate matching a vector at \p TypeIdx whose total size is |
| /// strictly less than \p Size bits. |
| static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { |
|   return [=](const LegalityQuery &Query) { |
|     const LLT Ty = Query.Types[TypeIdx]; |
|     if (!Ty.isVector()) |
|       return false; |
|     return Ty.getSizeInBits() < Size; |
|   }; |
| } |
| |
| /// Build a predicate matching a vector at \p TypeIdx whose total size is |
| /// strictly greater than \p Size bits. |
| static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { |
|   return [=](const LegalityQuery &Query) { |
|     const LLT Ty = Query.Types[TypeIdx]; |
|     if (!Ty.isVector()) |
|       return false; |
|     return Ty.getSizeInBits() > Size; |
|   }; |
| } |
| |
| /// Build a predicate matching a vector at \p TypeIdx that has an odd number |
| /// of elements. |
| static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { |
|   return [=](const LegalityQuery &Query) { |
|     const LLT Ty = Query.Types[TypeIdx]; |
|     if (!Ty.isVector()) |
|       return false; |
|     return Ty.getNumElements() % 2 != 0; |
|   }; |
| } |
| |
| // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of |
| // v2s16. |
| // |
| // The size rule (a multiple of 32 bits, at most 1024 bits) is applied to every |
| // type, vectors included, so an oversized vector such as v64s32 is rejected. |
| // Vectors must additionally have 32, 64, 128 or 256-bit elements, or an even |
| // number of 16-bit elements. |
| static LegalityPredicate isRegisterType(unsigned TypeIdx) { |
|   return [=](const LegalityQuery &Query) { |
|     const LLT Ty = Query.Types[TypeIdx]; |
| |
|     // Shared size constraint. Every vector shape accepted below is already a |
|     // multiple of 32 bits, so for vectors this only adds the 1024-bit cap. |
|     const unsigned Size = Ty.getSizeInBits(); |
|     if (Size % 32 != 0 || Size > 1024) |
|       return false; |
| |
|     if (!Ty.isVector()) |
|       return true; |
| |
|     const int EltSize = Ty.getElementType().getSizeInBits(); |
|     return EltSize == 32 || EltSize == 64 || |
|            (EltSize == 16 && Ty.getNumElements() % 2 == 0) || |
|            EltSize == 128 || EltSize == 256; |
|   }; |
| } |
| |
| /// Build a predicate that holds when the type at \p TypeIdx is a vector whose |
| /// element type is exactly \p Type. |
| /// |
| /// Non-vector types never match. The vector check comes first so that |
| /// LLT::getElementType() is only called on a vector. |
| static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { |
|   return [=](const LegalityQuery &Query) { |
|     const LLT QueryTy = Query.Types[TypeIdx]; |
|     return QueryTy.isVector() && QueryTy.getElementType() == Type; |
|   }; |
| } |
| |
| /// Build a predicate matching a non-vector type at \p TypeIdx that is wider |
| /// than 32 bits while the first memory operand descriptor of the query |
| /// reports a size smaller than that type. |
| static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { |
|   return [=](const LegalityQuery &Query) { |
|     const LLT ValTy = Query.Types[TypeIdx]; |
|     if (ValTy.isVector()) |
|       return false; |
|     // The memory descriptor is only consulted for values wider than 32 bits. |
|     if (ValTy.getSizeInBits() <= 32) |
|       return false; |
|     return Query.MMODescrs[0].SizeInBits < ValTy.getSizeInBits(); |
|   }; |
| } |
| |
| AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, |
| const GCNTargetMachine &TM) |
| : ST(ST_) { |
| using namespace TargetOpcode; |
| |
| auto GetAddrSpacePtr = [&TM](unsigned AS) { |
| return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); |
| }; |
| |
| const LLT S1 = LLT::scalar(1); |
| const LLT S8 = LLT::scalar(8); |
| const LLT S16 = LLT::scalar(16); |
| const LLT S32 = LLT::scalar(32); |
| const LLT S64 = LLT::scalar(64); |
| const LLT S96 = LLT::scalar(96); |
| const LLT S128 = LLT::scalar(128); |
| const LLT S256 = LLT::scalar(256); |
| const LLT S1024 = LLT::scalar(1024); |
| |
| const LLT V2S16 = LLT::vector(2, 16); |
| const LLT V4S16 = LLT::vector(4, 16); |
| |
| const LLT V2S32 = LLT::vector(2, 32); |
| const LLT V3S32 = LLT::vector(3, 32); |
| const LLT V4S32 = LLT::vector(4, 32); |
| const LLT V5S32 = LLT::vector(5, 32); |
| const LLT V6S32 = LLT::vector(6, 32); |
| const LLT V7S32 = LLT::vector(7, 32); |
| const LLT V8S32 = LLT::vector(8, 32); |
| const LLT V9S32 = LLT::vector(9, 32); |
| const LLT V10S32 = LLT::vector(10, 32); |
| const LLT V11S32 = LLT::vector(11, 32); |
| const LLT V12S32 = LLT::vector(12, 32); |
| const LLT V13S32 = LLT::vector(13, 32); |
| const LLT V14S32 = LLT::vector(14, 32); |
| const LLT V15S32 = LLT::vector(15, 32); |
| const LLT V16S32 = LLT::vector(16, 32); |
| const LLT V32S32 = LLT::vector(32, 32); |
| |
| const LLT V2S64 = LLT::vector(2, 64); |
| const LLT V3S64 = LLT::vector(3, 64); |
| const LLT V4S64 = LLT::vector(4, 64); |
| const LLT V5S64 = LLT::vector(5, 64); |
| const LLT V6S64 = LLT::vector(6, 64); |
| const LLT V7S64 = LLT::vector(7, 64); |
| const LLT V8S64 = LLT::vector(8, 64); |
| const LLT V16S64 = LLT::vector(16, 64); |
| |
| std::initializer_list<LLT> AllS32Vectors = |
| {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, |
| V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; |
| std::initializer_list<LLT> AllS64Vectors = |
| {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; |
| |
| const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); |
| const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); |
| const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); |
| const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); |
| const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); |
| const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); |
| const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); |
| |
| const LLT CodePtr = FlatPtr; |
| |
| const std::initializer_list<LLT> AddrSpaces64 = { |
| GlobalPtr, ConstantPtr, FlatPtr |
| }; |
| |
| const std::initializer_list<LLT> AddrSpaces32 = { |
| LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr |
| }; |
| |
| const std::initializer_list<LLT> FPTypesBase = { |
| S32, S64 |
| }; |
| |
| const std::initializer_list<LLT> FPTypes16 = { |
| S32, S64, S16 |
| }; |
| |
| const std::initializer_list<LLT> FPTypesPK16 = { |
| S32, S64, S16, V2S16 |
| }; |
| |
| setAction({G_BRCOND, S1}, Legal); // VCC branches |
| setAction({G_BRCOND, S32}, Legal); // SCC branches |
| |
| // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more |
| // elements for v3s16 |
| getActionDefinitionsBuilder(G_PHI) |
| .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) |
| .legalFor(AllS32Vectors) |
| .legalFor(AllS64Vectors) |
| .legalFor(AddrSpaces64) |
| .legalFor(AddrSpaces32) |
| .clampScalar(0, S32, S256) |
| .widenScalarToNextPow2(0, 32) |
| .clampMaxNumElements(0, S32, 16) |
| .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
| .legalIf(isPointer(0)); |
| |
| if (ST.has16BitInsts()) { |
| getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) |
| .legalFor({S32, S16}) |
| .clampScalar(0, S16, S32) |
| .scalarize(0); |
| } else { |
| getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) |
| .legalFor({S32}) |
| .clampScalar(0, S32, S32) |
| .scalarize(0); |
| } |
| |
| // FIXME: Not really legal. Placeholder for custom lowering. |
| getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) |
| .legalFor({S32, S64}) |
| .clampScalar(0, S32, S64) |
| .widenScalarToNextPow2(0, 32) |
| .scalarize(0); |
| |
| getActionDefinitionsBuilder({G_UMULH, G_SMULH}) |
| .legalFor({S32}) |
| .clampScalar(0, S32, S32) |
| .scalarize(0); |
| |
| // Report legal for any types we can handle anywhere. For the cases only legal |
| // on the SALU, RegBankSelect will be able to re-legalize. |
| getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) |
| .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) |
| .clampScalar(0, S32, S64) |
| .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
| .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) |
| .widenScalarToNextPow2(0) |
| .scalarize(0); |
| |
| getActionDefinitionsBuilder({G_UADDO, G_USUBO, |
| G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) |
| .legalFor({{S32, S1}, {S32, S32}}) |
| .clampScalar(0, S32, S32) |
| .scalarize(0); // TODO: Implement. |
| |
| getActionDefinitionsBuilder({G_SADDO, G_SSUBO}) |
| .lower(); |
| |
| getActionDefinitionsBuilder(G_BITCAST) |
| // Don't worry about the size constraint. |
| .legalIf(all(isRegisterType(0), isRegisterType(1))) |
| // FIXME: Testing hack |
| .legalForCartesianProduct({S16, LLT::vector(2, 8), }); |
| |
| getActionDefinitionsBuilder(G_FCONSTANT) |
| .legalFor({S32, S64, S16}) |
| .clampScalar(0, S16, S64); |
| |
| getActionDefinitionsBuilder(G_IMPLICIT_DEF) |
| .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, |
| ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) |
| .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
| .clampScalarOrElt(0, S32, S1024) |
| .legalIf(isMultiple32(0)) |
| .widenScalarToNextPow2(0, 32) |
| .clampMaxNumElements(0, S32, 16); |
| |
| |
| // FIXME: i1 operands to intrinsics should always be legal, but other i1 |
| // values may not be legal. We need to figure out how to distinguish |
| // between these two scenarios. |
| getActionDefinitionsBuilder(G_CONSTANT) |
| .legalFor({S1, S32, S64, S16, GlobalPtr, |
| LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) |
| .clampScalar(0, S32, S64) |
| .widenScalarToNextPow2(0) |
| .legalIf(isPointer(0)); |
| |
| setAction({G_FRAME_INDEX, PrivatePtr}, Legal); |
| getActionDefinitionsBuilder(G_GLOBAL_VALUE) |
| .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); |
| |
| |
| auto &FPOpActions = getActionDefinitionsBuilder( |
| { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) |
| .legalFor({S32, S64}); |
| auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) |
| .customFor({S32, S64}); |
| auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) |
| .customFor({S32, S64}); |
| |
| if (ST.has16BitInsts()) { |
| if (ST.hasVOP3PInsts()) |
| FPOpActions.legalFor({S16, V2S16}); |
| else |
| FPOpActions.legalFor({S16}); |
| |
| TrigActions.customFor({S16}); |
| FDIVActions.customFor({S16}); |
| } |
| |
| auto &MinNumMaxNum = getActionDefinitionsBuilder({ |
| G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); |
| |
| if (ST.hasVOP3PInsts()) { |
| MinNumMaxNum.customFor(FPTypesPK16) |
| .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
| .clampMaxNumElements(0, S16, 2) |
| .clampScalar(0, S16, S64) |
| .scalarize(0); |
| } else if (ST.has16BitInsts()) { |
| MinNumMaxNum.customFor(FPTypes16) |
| .clampScalar(0, S16, S64) |
| .scalarize(0); |
| } else { |
| MinNumMaxNum.customFor(FPTypesBase) |
| .clampScalar(0, S32, S64) |
| .scalarize(0); |
| } |
| |
| if (ST.hasVOP3PInsts()) |
| FPOpActions.clampMaxNumElements(0, S16, 2); |
| |
| FPOpActions |
| .scalarize(0) |
| .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); |
| |
| TrigActions |
| .scalarize(0) |
| .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); |
| |
| FDIVActions |
| .scalarize(0) |
| .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); |
| |
| getActionDefinitionsBuilder({G_FNEG, G_FABS}) |
| .legalFor(FPTypesPK16) |
| .clampMaxNumElements(0, S16, 2) |
| .scalarize(0) |
| .clampScalar(0, S16, S64); |
| |
| // TODO: Implement |
| getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); |
| |
| if (ST.has16BitInsts()) { |
| getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) |
| .legalFor({S32, S64, S16}) |
| .scalarize(0) |
| .clampScalar(0, S16, S64); |
| } else { |
| getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) |
| .legalFor({S32, S64}) |
| .scalarize(0) |
| .clampScalar(0, S32, S64); |
| } |
| |
| getActionDefinitionsBuilder(G_FPTRUNC) |
| .legalFor({{S32, S64}, {S16, S32}}) |
| .scalarize(0); |
| |
| getActionDefinitionsBuilder(G_FPEXT) |
| .legalFor({{S64, S32}, {S32, S16}}) |
| .lowerFor({{S64, S16}}) // FIXME: Implement |
| .scalarize(0); |
| |
| // TODO: Verify V_BFI_B32 is generated from expanded bit ops. |
| getActionDefinitionsBuilder(G_FCOPYSIGN).lower(); |
| |
| getActionDefinitionsBuilder(G_FSUB) |
| // Use actual fsub instruction |
| .legalFor({S32}) |
| // Must use fadd + fneg |
| .lowerFor({S64, S16, V2S16}) |
| .scalarize(0) |
| .clampScalar(0, S32, S64); |
| |
| // Whether this is legal depends on the floating point mode for the function. |
| auto &FMad = getActionDefinitionsBuilder(G_FMAD); |
| if (ST.hasMadF16()) |
| FMad.customFor({S32, S16}); |
| else |
| FMad.customFor({S32}); |
| FMad.scalarize(0) |
| .lower(); |
| |
| getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) |
| .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, |
| {S32, S1}, {S64, S1}, {S16, S1}, |
| {S96, S32}, |
| // FIXME: Hack |
| {S64, LLT::scalar(33)}, |
| {S32, S8}, {S32, LLT::scalar(24)}}) |
| .scalarize(0) |
| .clampScalar(0, S32, S64); |
| |
| // TODO: Split s1->s64 during regbankselect for VALU. |
| auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) |
| .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) |
| .lowerFor({{S32, S64}}) |
| .lowerIf(typeIs(1, S1)) |
| .customFor({{S64, S64}}); |
| if (ST.has16BitInsts()) |
| IToFP.legalFor({{S16, S16}}); |
| IToFP.clampScalar(1, S32, S64) |
| .scalarize(0); |
| |
| auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) |
| .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}); |
| if (ST.has16BitInsts()) |
| FPToI.legalFor({{S16, S16}}); |
| else |
| FPToI.minScalar(1, S32); |
| |
| FPToI.minScalar(0, S32) |
| .scalarize(0); |
| |
| getActionDefinitionsBuilder(G_INTRINSIC_ROUND) |
| .scalarize(0) |
| .lower(); |
| |
| if (ST.has16BitInsts()) { |
| getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) |
| .legalFor({S16, S32, S64}) |
| .clampScalar(0, S16, S64) |
| .scalarize(0); |
| } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { |
| getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) |
| .legalFor({S32, S64}) |
| .clampScalar(0, S32, S64) |
| .scalarize(0); |
| } else { |
| getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) |
| .legalFor({S32}) |
| .customFor({S64}) |
| .clampScalar(0, S32, S64) |
| .scalarize(0); |
| } |
| |
| getActionDefinitionsBuilder(G_PTR_ADD) |
| .legalForCartesianProduct(AddrSpaces64, {S64}) |
| .legalForCartesianProduct(AddrSpaces32, {S32}) |
| .scalarize(0); |
| |
| getActionDefinitionsBuilder(G_PTR_MASK) |
| .scalarize(0) |
| .alwaysLegal(); |
| |
| setAction({G_BLOCK_ADDR, CodePtr}, Legal); |
| |
| auto &CmpBuilder = |
| getActionDefinitionsBuilder(G_ICMP) |
| // The compare output type differs based on the register bank of the output, |
| // so make both s1 and s32 legal. |
| // |
| // Scalar compares producing output in scc will be promoted to s32, as that |
| // is the allocatable register type that will be needed for the copy from |
| // scc. This will be promoted during RegBankSelect, and we assume something |
| // before that won't try to use s32 result types. |
| // |
| // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg |
| // bank. |
| .legalForCartesianProduct( |
| {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) |
| .legalForCartesianProduct( |
| {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); |
| if (ST.has16BitInsts()) { |
| CmpBuilder.legalFor({{S1, S16}}); |
| } |
| |
| CmpBuilder |
| .widenScalarToNextPow2(1) |
| .clampScalar(1, S32, S64) |
| .scalarize(0) |
| .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); |
| |
| getActionDefinitionsBuilder(G_FCMP) |
| .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) |
| .widenScalarToNextPow2(1) |
| .clampScalar(1, S32, S64) |
| .scalarize(0); |
| |
| // FIXME: fexp, flog2, flog10 need to be custom lowered. |
| getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2, |
| G_FLOG, G_FLOG2, G_FLOG10}) |
| .legalFor({S32}) |
| .scalarize(0); |
| |
| // The 64-bit versions produce 32-bit results, but only on the SALU. |
| getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, |
| G_CTTZ, G_CTTZ_ZERO_UNDEF, |
| G_CTPOP}) |
| .legalFor({{S32, S32}, {S32, S64}}) |
| .clampScalar(0, S32, S32) |
| .clampScalar(1, S32, S64) |
| .scalarize(0) |
| .widenScalarToNextPow2(0, 32) |
| .widenScalarToNextPow2(1, 32); |
| |
| // TODO: Expand for > s32 |
| getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) |
| .legalFor({S32}) |
| .clampScalar(0, S32, S32) |
| .scalarize(0); |
| |
| if (ST.has16BitInsts()) { |
| if (ST.hasVOP3PInsts()) { |
| getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) |
| .legalFor({S32, S16, V2S16}) |
| .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
| .clampMaxNumElements(0, S16, 2) |
| .clampScalar(0, S16, S32) |
| .widenScalarToNextPow2(0) |
| .scalarize(0); |
| } else { |
| getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) |
| .legalFor({S32, S16}) |
| .widenScalarToNextPow2(0) |
| .clampScalar(0, S16, S32) |
| .scalarize(0); |
| } |
| } else { |
| getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) |
| .legalFor({S32}) |
| .clampScalar(0, S32, S32) |
| .widenScalarToNextPow2(0) |
| .scalarize(0); |
| } |
| |
| auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { |
| return [=](const LegalityQuery &Query) { |
| return Query.Types[TypeIdx0].getSizeInBits() < |
| Query.Types[TypeIdx1].getSizeInBits(); |
| }; |
| }; |
| |
| auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { |
| return [=](const LegalityQuery &Query) { |
| return Query.Types[TypeIdx0].getSizeInBits() > |
| Query.Types[TypeIdx1].getSizeInBits(); |
| }; |
| }; |
| |
| getActionDefinitionsBuilder(G_INTTOPTR) |
| // List the common cases |
| .legalForCartesianProduct(AddrSpaces64, {S64}) |
| .legalForCartesianProduct(AddrSpaces32, {S32}) |
| .scalarize(0) |
| // Accept any address space as long as the size matches |
| .legalIf(sameSize(0, 1)) |
| .widenScalarIf(smallerThan(1, 0), |
| [](const LegalityQuery &Query) { |
| return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); |
| }) |
| .narrowScalarIf(greaterThan(1, 0), |
| [](const LegalityQuery &Query) { |
| return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); |
| }); |
| |
| getActionDefinitionsBuilder(G_PTRTOINT) |
| // List the common cases |
| .legalForCartesianProduct(AddrSpaces64, {S64}) |
| .legalForCartesianProduct(AddrSpaces32, {S32}) |
| .scalarize(0) |
| // Accept any address space as long as the size matches |
| .legalIf(sameSize(0, 1)) |
| .widenScalarIf(smallerThan(0, 1), |
| [](const LegalityQuery &Query) { |
| return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); |
| }) |
| .narrowScalarIf( |
| greaterThan(0, 1), |
| [](const LegalityQuery &Query) { |
| return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); |
| }); |
| |
| getActionDefinitionsBuilder(G_ADDRSPACE_CAST) |
| .scalarize(0) |
| .custom(); |
| |
| // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we |
| // handle some operations by just promoting the register during |
| // selection. There are also d16 loads on GFX9+ which preserve the high bits. |
| auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { |
| switch (AS) { |
| // FIXME: Private element size. |
| case AMDGPUAS::PRIVATE_ADDRESS: |
| return 32; |
| // FIXME: Check subtarget |
| case AMDGPUAS::LOCAL_ADDRESS: |
| return ST.useDS128() ? 128 : 64; |
| |
| // Treat constant and global as identical. SMRD loads are sometimes usable |
| // for global loads (ideally constant address space should be eliminated) |
| // depending on the context. Legality cannot be context dependent, but |
| // RegBankSelect can split the load as necessary depending on the pointer |
| // register bank/uniformity and if the memory is invariant or not written in |
| // a kernel. |
| case AMDGPUAS::CONSTANT_ADDRESS: |
| case AMDGPUAS::GLOBAL_ADDRESS: |
| return 512; |
| default: |
| return 128; |
| } |
| }; |
| |
| const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { |
| const LLT DstTy = Query.Types[0]; |
| |
| // Split vector extloads. |
| unsigned MemSize = Query.MMODescrs[0].SizeInBits; |
| unsigned Align = Query.MMODescrs[0].AlignInBits; |
| |
| if (MemSize < DstTy.getSizeInBits()) |
| MemSize = std::max(MemSize, Align); |
| |
| if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) |
| return true; |
| |
| const LLT PtrTy = Query.Types[1]; |
| unsigned AS = PtrTy.getAddressSpace(); |
| if (MemSize > maxSizeForAddrSpace(AS)) |
| return true; |
| |
| // Catch weird sized loads that don't evenly divide into the access sizes |
| // TODO: May be able to widen depending on alignment etc. |
| unsigned NumRegs = MemSize / 32; |
| if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) |
| return true; |
| |
| if (Align < MemSize) { |
| const SITargetLowering *TLI = ST.getTargetLowering(); |
| return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); |
| } |
| |
| return false; |
| }; |
| |
| unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; |
| unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; |
| unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; |
| |
| // TODO: Refine based on subtargets which support unaligned access or 128-bit |
| // LDS |
| // TODO: Unsupported flat for SI. |
| |
| for (unsigned Op : {G_LOAD, G_STORE}) { |
| const bool IsStore = Op == G_STORE; |
| |
| auto &Actions = getActionDefinitionsBuilder(Op); |
| // Whitelist the common cases. |
| // TODO: Pointer loads |
| // TODO: Wide constant loads |
| // TODO: Only CI+ has 3x loads |
| // TODO: Loads to s16 on gfx9 |
| Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, |
| {V2S32, GlobalPtr, 64, GlobalAlign32}, |
| {V3S32, GlobalPtr, 96, GlobalAlign32}, |
| {S96, GlobalPtr, 96, GlobalAlign32}, |
| {V4S32, GlobalPtr, 128, GlobalAlign32}, |
| {S128, GlobalPtr, 128, GlobalAlign32}, |
| {S64, GlobalPtr, 64, GlobalAlign32}, |
| {V2S64, GlobalPtr, 128, GlobalAlign32}, |
| {V2S16, GlobalPtr, 32, GlobalAlign32}, |
| {S32, GlobalPtr, 8, GlobalAlign8}, |
| {S32, GlobalPtr, 16, GlobalAlign16}, |
| |
| {S32, LocalPtr, 32, 32}, |
| {S64, LocalPtr, 64, 32}, |
| {V2S32, LocalPtr, 64, 32}, |
| {S32, LocalPtr, 8, 8}, |
| {S32, LocalPtr, 16, 16}, |
| {V2S16, LocalPtr, 32, 32}, |
| |
| {S32, PrivatePtr, 32, 32}, |
| {S32, PrivatePtr, 8, 8}, |
| {S32, PrivatePtr, 16, 16}, |
| {V2S16, PrivatePtr, 32, 32}, |
| |
| {S32, FlatPtr, 32, GlobalAlign32}, |
| {S32, FlatPtr, 16, GlobalAlign16}, |
| {S32, FlatPtr, 8, GlobalAlign8}, |
| {V2S16, FlatPtr, 32, GlobalAlign32}, |
| |
| {S32, ConstantPtr, 32, GlobalAlign32}, |
| {V2S32, ConstantPtr, 64, GlobalAlign32}, |
| {V3S32, ConstantPtr, 96, GlobalAlign32}, |
| {V4S32, ConstantPtr, 128, GlobalAlign32}, |
| {S64, ConstantPtr, 64, GlobalAlign32}, |
| {S128, ConstantPtr, 128, GlobalAlign32}, |
| {V2S32, ConstantPtr, 32, GlobalAlign32}}); |
| Actions |
| .customIf(typeIs(1, Constant32Ptr)) |
| .narrowScalarIf( |
| [=](const LegalityQuery &Query) -> bool { |
| return !Query.Types[0].isVector() && needToSplitLoad(Query); |
| }, |
| [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { |
| const LLT DstTy = Query.Types[0]; |
| const LLT PtrTy = Query.Types[1]; |
| |
| const unsigned DstSize = DstTy.getSizeInBits(); |
| unsigned MemSize = Query.MMODescrs[0].SizeInBits; |
| |
| // Split extloads. |
| if (DstSize > MemSize) |
| return std::make_pair(0, LLT::scalar(MemSize)); |
| |
| if (DstSize > 32 && (DstSize % 32 != 0)) { |
| // FIXME: Need a way to specify non-extload of larger size if |
| // suitably aligned. |
| return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); |
| } |
| |
| unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); |
| if (MemSize > MaxSize) |
| return std::make_pair(0, LLT::scalar(MaxSize)); |
| |
| unsigned Align = Query.MMODescrs[0].AlignInBits; |
| return std::make_pair(0, LLT::scalar(Align)); |
| }) |
| .fewerElementsIf( |
| [=](const LegalityQuery &Query) -> bool { |
| return Query.Types[0].isVector() && needToSplitLoad(Query); |
| }, |
| [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { |
| const LLT DstTy = Query.Types[0]; |
| const LLT PtrTy = Query.Types[1]; |
| |
| LLT EltTy = DstTy.getElementType(); |
| unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); |
| |
| // Split if it's too large for the address space. |
| if (Query.MMODescrs[0].SizeInBits > MaxSize) { |
| unsigned NumElts = DstTy.getNumElements(); |
| unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; |
| |
| // FIXME: Refine when odd breakdowns handled |
| // The scalars will need to be re-legalized. |
| if (NumPieces == 1 || NumPieces >= NumElts || |
| NumElts % NumPieces != 0) |
| return std::make_pair(0, EltTy); |
| |
| return std::make_pair(0, |
| LLT::vector(NumElts / NumPieces, EltTy)); |
| } |
| |
| // Need to split because of alignment. |
| unsigned Align = Query.MMODescrs[0].AlignInBits; |
| unsigned EltSize = EltTy.getSizeInBits(); |
| if (EltSize > Align && |
| (EltSize / Align < DstTy.getNumElements())) { |
| return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); |
| } |
| |
| // May need relegalization for the scalars. |
| return std::make_pair(0, EltTy); |
| }) |
| .minScalar(0, S32); |
| |
| if (IsStore) |
| Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); |
| |
| // TODO: Need a bitcast lower option? |
| Actions |
| .legalIf([=](const LegalityQuery &Query) { |
| const LLT Ty0 = Query.Types[0]; |
| unsigned Size = Ty0.getSizeInBits(); |
| unsigned MemSize = Query.MMODescrs[0].SizeInBits; |
| unsigned Align = Query.MMODescrs[0].AlignInBits; |
| |
| // FIXME: Widening store from alignment not valid. |
| if (MemSize < Size) |
| MemSize = std::max(MemSize, Align); |
| |
| // No extending vector loads. |
| if (Size > MemSize && Ty0.isVector()) |
| return false; |
| |
| switch (MemSize) { |
| case 8: |
| case 16: |
| return Size == 32; |
| case 32: |
| case 64: |
| case 128: |
| return true; |
| case 96: |
| return ST.hasDwordx3LoadStores(); |
| case 256: |
| case 512: |
| return true; |
| default: |
| return false; |
| } |
| }) |
| .widenScalarToNextPow2(0) |
| // TODO: v3s32->v4s32 with alignment |
| .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); |
| } |
| |
| auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) |
| .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, |
| {S32, GlobalPtr, 16, 2 * 8}, |
| {S32, LocalPtr, 8, 8}, |
| {S32, LocalPtr, 16, 16}, |
| {S32, PrivatePtr, 8, 8}, |
| {S32, PrivatePtr, 16, 16}, |
| {S32, ConstantPtr, 8, 8}, |
| {S32, ConstantPtr, 16, 2 * 8}}); |
| if (ST.hasFlatAddressSpace()) { |
| ExtLoads.legalForTypesWithMemDesc( |
| {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); |
| } |
| |
| ExtLoads.clampScalar(0, S32, S32) |
| .widenScalarToNextPow2(0) |
| .unsupportedIfMemSizeNotPow2() |
| .lower(); |
| |
| auto &Atomics = getActionDefinitionsBuilder( |
| {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, |
| G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, |
| G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, |
| G_ATOMICRMW_UMIN}) |
| .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, |
| {S64, GlobalPtr}, {S64, LocalPtr}}); |
| if (ST.hasFlatAddressSpace()) { |
| Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); |
| } |
| |
| getActionDefinitionsBuilder(G_ATOMICRMW_FADD) |
| .legalFor({{S32, LocalPtr}}); |
| |
| // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output |
| // demarshalling |
| getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) |
| .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, |
| {S32, FlatPtr}, {S64, FlatPtr}}) |
| .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, |
| {S32, RegionPtr}, {S64, RegionPtr}}); |
| |
| getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) |
| .lower(); |
| |
| // TODO: Pointer types, any 32-bit or 64-bit vector |
| |
| // Condition should be s32 for scalar, s1 for vector. |
| getActionDefinitionsBuilder(G_SELECT) |
| .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, |
| GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, |
| LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) |
| .clampScalar(0, S16, S64) |
| .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
| .fewerElementsIf(numElementsNotEven(0), scalarize(0)) |
| .scalarize(1) |
| .clampMaxNumElements(0, S32, 2) |
| .clampMaxNumElements(0, LocalPtr, 2) |
| .clampMaxNumElements(0, PrivatePtr, 2) |
| .scalarize(0) |
| .widenScalarToNextPow2(0) |
| .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); |
| |
| // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can |
| // be more flexible with the shift amount type. |
| auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) |
| .legalFor({{S32, S32}, {S64, S32}}); |
| if (ST.has16BitInsts()) { |
| if (ST.hasVOP3PInsts()) { |
| Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) |
| .clampMaxNumElements(0, S16, 2); |
| } else |
| Shifts.legalFor({{S16, S32}, {S16, S16}}); |
| |
| // TODO: Support 16-bit shift amounts |
| Shifts.clampScalar(1, S32, S32); |
| Shifts.clampScalar(0, S16, S64); |
| Shifts.widenScalarToNextPow2(0, 16); |
| } else { |
| // Make sure we legalize the shift amount type first, as the general |
| // expansion for the shifted type will produce much worse code if it hasn't |
| // been truncated already. |
| Shifts.clampScalar(1, S32, S32); |
| Shifts.clampScalar(0, S32, S64); |
| Shifts.widenScalarToNextPow2(0, 32); |
| } |
| Shifts.scalarize(0); |
| |
| for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { |
| unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; |
| unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; |
| unsigned IdxTypeIdx = 2; |
| |
| getActionDefinitionsBuilder(Op) |
| .customIf([=](const LegalityQuery &Query) { |
| const LLT EltTy = Query.Types[EltTypeIdx]; |
| const LLT VecTy = Query.Types[VecTypeIdx]; |
| const LLT IdxTy = Query.Types[IdxTypeIdx]; |
| return (EltTy.getSizeInBits() == 16 || |
| EltTy.getSizeInBits() % 32 == 0) && |
| VecTy.getSizeInBits() % 32 == 0 && |
| VecTy.getSizeInBits() <= 1024 && |
| IdxTy.getSizeInBits() == 32; |
| }) |
| .clampScalar(EltTypeIdx, S32, S64) |
| .clampScalar(VecTypeIdx, S32, S64) |
| .clampScalar(IdxTypeIdx, S32, S32); |
| } |
| |
| getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) |
| .unsupportedIf([=](const LegalityQuery &Query) { |
| const LLT &EltTy = Query.Types[1].getElementType(); |
| return Query.Types[0] != EltTy; |
| }); |
| |
| for (unsigned Op : {G_EXTRACT, G_INSERT}) { |
| unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; |
| unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; |
| |
| // FIXME: Doesn't handle extract of illegal sizes. |
| getActionDefinitionsBuilder(Op) |
| .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) |
| // FIXME: Multiples of 16 should not be legal. |
| .legalIf([=](const LegalityQuery &Query) { |
| const LLT BigTy = Query.Types[BigTyIdx]; |
| const LLT LitTy = Query.Types[LitTyIdx]; |
| return (BigTy.getSizeInBits() % 32 == 0) && |
| (LitTy.getSizeInBits() % 16 == 0); |
| }) |
| .widenScalarIf( |
| [=](const LegalityQuery &Query) { |
| const LLT BigTy = Query.Types[BigTyIdx]; |
| return (BigTy.getScalarSizeInBits() < 16); |
| }, |
| LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) |
| .widenScalarIf( |
| [=](const LegalityQuery &Query) { |
| const LLT LitTy = Query.Types[LitTyIdx]; |
| return (LitTy.getScalarSizeInBits() < 16); |
| }, |
| LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) |
| .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) |
| .widenScalarToNextPow2(BigTyIdx, 32); |
| |
| } |
| |
| auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) |
| .legalForCartesianProduct(AllS32Vectors, {S32}) |
| .legalForCartesianProduct(AllS64Vectors, {S64}) |
| .clampNumElements(0, V16S32, V32S32) |
| .clampNumElements(0, V2S64, V16S64) |
| .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); |
| |
| if (ST.hasScalarPackInsts()) |
| BuildVector.legalFor({V2S16, S32}); |
| |
| BuildVector |
| .minScalarSameAs(1, 0) |
| .legalIf(isRegisterType(0)) |
| .minScalarOrElt(0, S32); |
| |
| if (ST.hasScalarPackInsts()) { |
| getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) |
| .legalFor({V2S16, S32}) |
| .lower(); |
| } else { |
| getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) |
| .lower(); |
| } |
| |
| getActionDefinitionsBuilder(G_CONCAT_VECTORS) |
| .legalIf(isRegisterType(0)); |
| |
| // TODO: Don't fully scalarize v2s16 pieces |
| getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); |
| |
| // Merge/Unmerge |
| for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { |
| unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; |
| unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; |
| |
| auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { |
| const LLT &Ty = Query.Types[TypeIdx]; |
| if (Ty.isVector()) { |
| const LLT &EltTy = Ty.getElementType(); |
| if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) |
| return true; |
| if (!isPowerOf2_32(EltTy.getSizeInBits())) |
| return true; |
| } |
| return false; |
| }; |
| |
| auto &Builder = getActionDefinitionsBuilder(Op) |
| .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) |
| // Clamp the little scalar to s8-s256 and make it a power of 2. It's not |
| // worth considering the multiples of 64 since 2*192 and 2*384 are not |
| // valid. |
| .clampScalar(LitTyIdx, S16, S256) |
| .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) |
| .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) |
| .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), |
| elementTypeIs(1, S16)), |
| changeTo(1, V2S16)) |
| // Break up vectors with weird elements into scalars |
| .fewerElementsIf( |
| [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, |
| scalarize(0)) |
| .fewerElementsIf( |
| [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, |
| scalarize(1)) |
| .clampScalar(BigTyIdx, S32, S1024) |
| .lowerFor({{S16, V2S16}}); |
| |
| if (Op == G_MERGE_VALUES) { |
| Builder.widenScalarIf( |
| // TODO: Use 16-bit shifts if legal for 8-bit values? |
| [=](const LegalityQuery &Query) { |
| const LLT Ty = Query.Types[LitTyIdx]; |
| return Ty.getSizeInBits() < 32; |
| }, |
| changeTo(LitTyIdx, S32)); |
| } |
| |
| Builder.widenScalarIf( |
| [=](const LegalityQuery &Query) { |
| const LLT Ty = Query.Types[BigTyIdx]; |
| return !isPowerOf2_32(Ty.getSizeInBits()) && |
| Ty.getSizeInBits() % 16 != 0; |
| }, |
| [=](const LegalityQuery &Query) { |
| // Pick the next power of 2, or a multiple of 64 over 128. |
| // Whichever is smaller. |
| const LLT &Ty = Query.Types[BigTyIdx]; |
| unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); |
| if (NewSizeInBits >= 256) { |
| unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); |
| if (RoundedTo < NewSizeInBits) |
| NewSizeInBits = RoundedTo; |
| } |
| return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); |
| }) |
| .legalIf([=](const LegalityQuery &Query) { |
| const LLT &BigTy = Query.Types[BigTyIdx]; |
| const LLT &LitTy = Query.Types[LitTyIdx]; |
| |
| if (BigTy.isVector() && BigTy.getSizeInBits() < 32) |
| return false; |
| if (LitTy.isVector() && LitTy.getSizeInBits() < 32) |
| return false; |
| |
| return BigTy.getSizeInBits() % 16 == 0 && |
| LitTy.getSizeInBits() % 16 == 0 && |
| BigTy.getSizeInBits() <= 1024; |
| }) |
| // Any vectors left are the wrong size. Scalarize them. |
| .scalarize(0) |
| .scalarize(1); |
| } |
| |
| getActionDefinitionsBuilder(G_SEXT_INREG).lower(); |
| |
| getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower(); |
| |
| getActionDefinitionsBuilder(G_READCYCLECOUNTER) |
| .legalFor({S64}); |
| |
| getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, |
| G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, |
| G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) |
| .unsupported(); |
| |
| computeTables(); |
| verify(*ST.getInstrInfo()); |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, |
| MachineRegisterInfo &MRI, |
| MachineIRBuilder &B, |
| GISelChangeObserver &Observer) const { |
| switch (MI.getOpcode()) { |
| case TargetOpcode::G_ADDRSPACE_CAST: |
| return legalizeAddrSpaceCast(MI, MRI, B); |
| case TargetOpcode::G_FRINT: |
| return legalizeFrint(MI, MRI, B); |
| case TargetOpcode::G_FCEIL: |
| return legalizeFceil(MI, MRI, B); |
| case TargetOpcode::G_INTRINSIC_TRUNC: |
| return legalizeIntrinsicTrunc(MI, MRI, B); |
| case TargetOpcode::G_SITOFP: |
| return legalizeITOFP(MI, MRI, B, true); |
| case TargetOpcode::G_UITOFP: |
| return legalizeITOFP(MI, MRI, B, false); |
| case TargetOpcode::G_FMINNUM: |
| case TargetOpcode::G_FMAXNUM: |
| case TargetOpcode::G_FMINNUM_IEEE: |
| case TargetOpcode::G_FMAXNUM_IEEE: |
| return legalizeMinNumMaxNum(MI, MRI, B); |
| case TargetOpcode::G_EXTRACT_VECTOR_ELT: |
| return legalizeExtractVectorElt(MI, MRI, B); |
| case TargetOpcode::G_INSERT_VECTOR_ELT: |
| return legalizeInsertVectorElt(MI, MRI, B); |
| case TargetOpcode::G_FSIN: |
| case TargetOpcode::G_FCOS: |
| return legalizeSinCos(MI, MRI, B); |
| case TargetOpcode::G_GLOBAL_VALUE: |
| return legalizeGlobalValue(MI, MRI, B); |
| case TargetOpcode::G_LOAD: |
| return legalizeLoad(MI, MRI, B, Observer); |
| case TargetOpcode::G_FMAD: |
| return legalizeFMad(MI, MRI, B); |
| case TargetOpcode::G_FDIV: |
| return legalizeFDIV(MI, MRI, B); |
| case TargetOpcode::G_ATOMIC_CMPXCHG: |
| return legalizeAtomicCmpXChg(MI, MRI, B); |
| default: |
| return false; |
| } |
| |
| llvm_unreachable("expected switch to return"); |
| } |
| |
| Register AMDGPULegalizerInfo::getSegmentAperture( |
| unsigned AS, |
| MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| MachineFunction &MF = B.getMF(); |
| const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| const LLT S32 = LLT::scalar(32); |
| |
| assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); |
| |
| if (ST.hasApertureRegs()) { |
| // FIXME: Use inline constants (src_{shared, private}_base) instead of |
| // getreg. |
| unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? |
| AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : |
| AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; |
| unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? |
| AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : |
| AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; |
| unsigned Encoding = |
| AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | |
| Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | |
| WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; |
| |
| Register ApertureReg = MRI.createGenericVirtualRegister(S32); |
| Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
| |
| B.buildInstr(AMDGPU::S_GETREG_B32) |
| .addDef(GetReg) |
| .addImm(Encoding); |
| MRI.setType(GetReg, S32); |
| |
| auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); |
| B.buildInstr(TargetOpcode::G_SHL) |
| .addDef(ApertureReg) |
| .addUse(GetReg) |
| .addUse(ShiftAmt.getReg(0)); |
| |
| return ApertureReg; |
| } |
| |
| Register QueuePtr = MRI.createGenericVirtualRegister( |
| LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); |
| |
| const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) |
| return Register(); |
| |
| // Offset into amd_queue_t for group_segment_aperture_base_hi / |
| // private_segment_aperture_base_hi. |
| uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; |
| |
| // TODO: can we be smarter about machine pointer info? |
| MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
| MachineMemOperand *MMO = MF.getMachineMemOperand( |
| PtrInfo, |
| MachineMemOperand::MOLoad | |
| MachineMemOperand::MODereferenceable | |
| MachineMemOperand::MOInvariant, |
| 4, |
| MinAlign(64, StructOffset)); |
| |
| Register LoadResult = MRI.createGenericVirtualRegister(S32); |
| Register LoadAddr; |
| |
| B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); |
| B.buildLoad(LoadResult, LoadAddr, *MMO); |
| return LoadResult; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| MachineFunction &MF = B.getMF(); |
| |
| B.setInstr(MI); |
| |
| const LLT S32 = LLT::scalar(32); |
| Register Dst = MI.getOperand(0).getReg(); |
| Register Src = MI.getOperand(1).getReg(); |
| |
| LLT DstTy = MRI.getType(Dst); |
| LLT SrcTy = MRI.getType(Src); |
| unsigned DestAS = DstTy.getAddressSpace(); |
| unsigned SrcAS = SrcTy.getAddressSpace(); |
| |
| // TODO: Avoid reloading from the queue ptr for each cast, or at least each |
| // vector element. |
| assert(!DstTy.isVector()); |
| |
| const AMDGPUTargetMachine &TM |
| = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); |
| |
| const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { |
| MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); |
| return true; |
| } |
| |
| if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { |
| // Truncate. |
| B.buildExtract(Dst, Src, 0); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { |
| const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
| uint32_t AddrHiVal = Info->get32BitAddressHighBits(); |
| |
| // FIXME: This is a bit ugly due to creating a merge of 2 pointers to |
| // another. Merge operands are required to be the same type, but creating an |
| // extra ptrtoint would be kind of pointless. |
| auto HighAddr = B.buildConstant( |
| LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); |
| B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { |
| assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || |
| DestAS == AMDGPUAS::PRIVATE_ADDRESS); |
| unsigned NullVal = TM.getNullPointerValue(DestAS); |
| |
| auto SegmentNull = B.buildConstant(DstTy, NullVal); |
| auto FlatNull = B.buildConstant(SrcTy, 0); |
| |
| Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); |
| |
| // Extract low 32-bits of the pointer. |
| B.buildExtract(PtrLo32, Src, 0); |
| |
| Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); |
| B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); |
| B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) |
| return false; |
| |
| if (!ST.hasFlatAddressSpace()) |
| return false; |
| |
| auto SegmentNull = |
| B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); |
| auto FlatNull = |
| B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); |
| |
| Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); |
| if (!ApertureReg.isValid()) |
| return false; |
| |
| Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); |
| B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); |
| |
| Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); |
| |
| // Coerce the type of the low half of the result so we can use merge_values. |
| Register SrcAsInt = MRI.createGenericVirtualRegister(S32); |
| B.buildInstr(TargetOpcode::G_PTRTOINT) |
| .addDef(SrcAsInt) |
| .addUse(Src); |
| |
| // TODO: Should we allow mismatched types but matching sizes in merges to |
| // avoid the ptrtoint? |
| B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); |
| B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeFrint( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| B.setInstr(MI); |
| |
| Register Src = MI.getOperand(1).getReg(); |
| LLT Ty = MRI.getType(Src); |
| assert(Ty.isScalar() && Ty.getSizeInBits() == 64); |
| |
| APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); |
| APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); |
| |
| auto C1 = B.buildFConstant(Ty, C1Val); |
| auto CopySign = B.buildFCopysign(Ty, C1, Src); |
| |
| // TODO: Should this propagate fast-math-flags? |
| auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); |
| auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); |
| |
| auto C2 = B.buildFConstant(Ty, C2Val); |
| auto Fabs = B.buildFAbs(Ty, Src); |
| |
| auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); |
| B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeFceil( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| B.setInstr(MI); |
| |
| const LLT S1 = LLT::scalar(1); |
| const LLT S64 = LLT::scalar(64); |
| |
| Register Src = MI.getOperand(1).getReg(); |
| assert(MRI.getType(Src) == S64); |
| |
| // result = trunc(src) |
| // if (src > 0.0 && src != result) |
| // result += 1.0 |
| |
| auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); |
| |
| const auto Zero = B.buildFConstant(S64, 0.0); |
| const auto One = B.buildFConstant(S64, 1.0); |
| auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); |
| auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); |
| auto And = B.buildAnd(S1, Lt0, NeTrunc); |
| auto Add = B.buildSelect(S64, And, One, Zero); |
| |
| // TODO: Should this propagate fast-math-flags? |
| B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); |
| return true; |
| } |
| |
| static MachineInstrBuilder extractF64Exponent(unsigned Hi, |
| MachineIRBuilder &B) { |
| const unsigned FractBits = 52; |
| const unsigned ExpBits = 11; |
| LLT S32 = LLT::scalar(32); |
| |
| auto Const0 = B.buildConstant(S32, FractBits - 32); |
| auto Const1 = B.buildConstant(S32, ExpBits); |
| |
| auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) |
| .addUse(Const0.getReg(0)) |
| .addUse(Const1.getReg(0)); |
| |
| return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| B.setInstr(MI); |
| |
| const LLT S1 = LLT::scalar(1); |
| const LLT S32 = LLT::scalar(32); |
| const LLT S64 = LLT::scalar(64); |
| |
| Register Src = MI.getOperand(1).getReg(); |
| assert(MRI.getType(Src) == S64); |
| |
| // TODO: Should this use extract since the low half is unused? |
| auto Unmerge = B.buildUnmerge({S32, S32}, Src); |
| Register Hi = Unmerge.getReg(1); |
| |
| // Extract the upper half, since this is where we will find the sign and |
| // exponent. |
| auto Exp = extractF64Exponent(Hi, B); |
| |
| const unsigned FractBits = 52; |
| |
| // Extract the sign bit. |
| const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); |
| auto SignBit = B.buildAnd(S32, Hi, SignBitMask); |
| |
| const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); |
| |
| const auto Zero32 = B.buildConstant(S32, 0); |
| |
| // Extend back to 64-bits. |
| auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); |
| |
| auto Shr = B.buildAShr(S64, FractMask, Exp); |
| auto Not = B.buildNot(S64, Shr); |
| auto Tmp0 = B.buildAnd(S64, Src, Not); |
| auto FiftyOne = B.buildConstant(S32, FractBits - 1); |
| |
| auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); |
| auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); |
| |
| auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); |
| B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeITOFP( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B, bool Signed) const { |
| B.setInstr(MI); |
| |
| Register Dst = MI.getOperand(0).getReg(); |
| Register Src = MI.getOperand(1).getReg(); |
| |
| const LLT S64 = LLT::scalar(64); |
| const LLT S32 = LLT::scalar(32); |
| |
| assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); |
| |
| auto Unmerge = B.buildUnmerge({S32, S32}, Src); |
| |
| auto CvtHi = Signed ? |
| B.buildSITOFP(S64, Unmerge.getReg(1)) : |
| B.buildUITOFP(S64, Unmerge.getReg(1)); |
| |
| auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); |
| |
| auto ThirtyTwo = B.buildConstant(S32, 32); |
| auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) |
| .addUse(CvtHi.getReg(0)) |
| .addUse(ThirtyTwo.getReg(0)); |
| |
| // TODO: Should this propagate fast-math-flags? |
| B.buildFAdd(Dst, LdExp, CvtLo); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| MachineFunction &MF = B.getMF(); |
| const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| |
| const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || |
| MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; |
| |
| // With ieee_mode disabled, the instructions have the correct behavior |
| // already for G_FMINNUM/G_FMAXNUM |
| if (!MFI->getMode().IEEE) |
| return !IsIEEEOp; |
| |
| if (IsIEEEOp) |
| return true; |
| |
| MachineIRBuilder HelperBuilder(MI); |
| GISelObserverWrapper DummyObserver; |
| LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); |
| HelperBuilder.setInstr(MI); |
| return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeExtractVectorElt( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| // TODO: Should move some of this into LegalizerHelper. |
| |
| // TODO: Promote dynamic indexing of s16 to s32 |
| // TODO: Dynamic s64 indexing is only legal for SGPR. |
| Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); |
| if (!IdxVal) // Dynamic case will be selected to register indexing. |
| return true; |
| |
| Register Dst = MI.getOperand(0).getReg(); |
| Register Vec = MI.getOperand(1).getReg(); |
| |
| LLT VecTy = MRI.getType(Vec); |
| LLT EltTy = VecTy.getElementType(); |
| assert(EltTy == MRI.getType(Dst)); |
| |
| B.setInstr(MI); |
| |
| if (IdxVal.getValue() < VecTy.getNumElements()) |
| B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); |
| else |
| B.buildUndef(Dst); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeInsertVectorElt( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| // TODO: Should move some of this into LegalizerHelper. |
| |
| // TODO: Promote dynamic indexing of s16 to s32 |
| // TODO: Dynamic s64 indexing is only legal for SGPR. |
| Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); |
| if (!IdxVal) // Dynamic case will be selected to register indexing. |
| return true; |
| |
| Register Dst = MI.getOperand(0).getReg(); |
| Register Vec = MI.getOperand(1).getReg(); |
| Register Ins = MI.getOperand(2).getReg(); |
| |
| LLT VecTy = MRI.getType(Vec); |
| LLT EltTy = VecTy.getElementType(); |
| assert(EltTy == MRI.getType(Ins)); |
| |
| B.setInstr(MI); |
| |
| if (IdxVal.getValue() < VecTy.getNumElements()) |
| B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); |
| else |
| B.buildUndef(Dst); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeSinCos( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| B.setInstr(MI); |
| |
| Register DstReg = MI.getOperand(0).getReg(); |
| Register SrcReg = MI.getOperand(1).getReg(); |
| LLT Ty = MRI.getType(DstReg); |
| unsigned Flags = MI.getFlags(); |
| |
| Register TrigVal; |
| auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); |
| if (ST.hasTrigReducedRange()) { |
| auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); |
| TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) |
| .addUse(MulVal.getReg(0)) |
| .setMIFlags(Flags).getReg(0); |
| } else |
| TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); |
| |
| Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? |
| Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; |
| B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) |
| .addUse(TrigVal) |
| .setMIFlags(Flags); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( |
| Register DstReg, LLT PtrTy, |
| MachineIRBuilder &B, const GlobalValue *GV, |
| unsigned Offset, unsigned GAFlags) const { |
| // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered |
| // to the following code sequence: |
| // |
| // For constant address space: |
| // s_getpc_b64 s[0:1] |
| // s_add_u32 s0, s0, $symbol |
| // s_addc_u32 s1, s1, 0 |
| // |
| // s_getpc_b64 returns the address of the s_add_u32 instruction and then |
| // a fixup or relocation is emitted to replace $symbol with a literal |
| // constant, which is a pc-relative offset from the encoding of the $symbol |
| // operand to the global variable. |
| // |
| // For global address space: |
| // s_getpc_b64 s[0:1] |
| // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo |
| // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi |
| // |
| // s_getpc_b64 returns the address of the s_add_u32 instruction and then |
| // fixups or relocations are emitted to replace $symbol@*@lo and |
| // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, |
| // which is a 64-bit pc-relative offset from the encoding of the $symbol |
| // operand to the global variable. |
| // |
| // What we want here is an offset from the value returned by s_getpc |
| // (which is the address of the s_add_u32 instruction) to the global |
| // variable, but since the encoding of $symbol starts 4 bytes after the start |
| // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too |
| // small. This requires us to add 4 to the global variable offset in order to |
| // compute the correct address. |
| |
| LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); |
| |
| Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : |
| B.getMRI()->createGenericVirtualRegister(ConstPtrTy); |
| |
| MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) |
| .addDef(PCReg); |
| |
| MIB.addGlobalAddress(GV, Offset + 4, GAFlags); |
| if (GAFlags == SIInstrInfo::MO_NONE) |
| MIB.addImm(0); |
| else |
| MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); |
| |
| B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); |
| |
| if (PtrTy.getSizeInBits() == 32) |
| B.buildExtract(DstReg, PCReg, 0); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeGlobalValue( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| Register DstReg = MI.getOperand(0).getReg(); |
| LLT Ty = MRI.getType(DstReg); |
| unsigned AS = Ty.getAddressSpace(); |
| |
| const GlobalValue *GV = MI.getOperand(1).getGlobal(); |
| MachineFunction &MF = B.getMF(); |
| SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| B.setInstr(MI); |
| |
| if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { |
| if (!MFI->isEntryFunction()) { |
| const Function &Fn = MF.getFunction(); |
| DiagnosticInfoUnsupported BadLDSDecl( |
| Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); |
| Fn.getContext().diagnose(BadLDSDecl); |
| } |
| |
| // TODO: We could emit code to handle the initialization somewhere. |
| if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { |
| B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| const Function &Fn = MF.getFunction(); |
| DiagnosticInfoUnsupported BadInit( |
| Fn, "unsupported initializer for address space", MI.getDebugLoc()); |
| Fn.getContext().diagnose(BadInit); |
| return true; |
| } |
| |
| const SITargetLowering *TLI = ST.getTargetLowering(); |
| |
| if (TLI->shouldEmitFixup(GV)) { |
| buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| if (TLI->shouldEmitPCReloc(GV)) { |
| buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); |
| Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); |
| |
| MachineMemOperand *GOTMMO = MF.getMachineMemOperand( |
| MachinePointerInfo::getGOT(MF), |
| MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | |
| MachineMemOperand::MOInvariant, |
| 8 /*Size*/, 8 /*Align*/); |
| |
| buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); |
| |
| if (Ty.getSizeInBits() == 32) { |
| // Truncate if this is a 32-bit constant adrdess. |
| auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); |
| B.buildExtract(DstReg, Load, 0); |
| } else |
| B.buildLoad(DstReg, GOTAddr, *GOTMMO); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeLoad( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B, GISelChangeObserver &Observer) const { |
| B.setInstr(MI); |
| LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); |
| auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); |
| Observer.changingInstr(MI); |
| MI.getOperand(1).setReg(Cast.getReg(0)); |
| Observer.changedInstr(MI); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeFMad( |
| MachineInstr &MI, MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| LLT Ty = MRI.getType(MI.getOperand(0).getReg()); |
| assert(Ty.isScalar()); |
| |
| MachineFunction &MF = B.getMF(); |
| const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| |
| // TODO: Always legal with future ftz flag. |
| if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals) |
| return true; |
| if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals) |
| return true; |
| |
| |
| MachineIRBuilder HelperBuilder(MI); |
| GISelObserverWrapper DummyObserver; |
| LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); |
| HelperBuilder.setMBB(*MI.getParent()); |
| return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( |
| MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { |
| Register DstReg = MI.getOperand(0).getReg(); |
| Register PtrReg = MI.getOperand(1).getReg(); |
| Register CmpVal = MI.getOperand(2).getReg(); |
| Register NewVal = MI.getOperand(3).getReg(); |
| |
| assert(SITargetLowering::isFlatGlobalAddrSpace( |
| MRI.getType(PtrReg).getAddressSpace()) && |
| "this should not have been custom lowered"); |
| |
| LLT ValTy = MRI.getType(CmpVal); |
| LLT VecTy = LLT::vector(2, ValTy); |
| |
| B.setInstr(MI); |
| Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); |
| |
| B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) |
| .addDef(DstReg) |
| .addUse(PtrReg) |
| .addUse(PackedVal) |
| .setMemRefs(MI.memoperands()); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| // Return the use branch instruction, otherwise null if the usage is invalid. |
| static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, |
| MachineRegisterInfo &MRI, |
| MachineInstr *&Br) { |
| Register CondDef = MI.getOperand(0).getReg(); |
| if (!MRI.hasOneNonDBGUse(CondDef)) |
| return nullptr; |
| |
| MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); |
| if (UseMI.getParent() != MI.getParent() || |
| UseMI.getOpcode() != AMDGPU::G_BRCOND) |
| return nullptr; |
| |
| // Make sure the cond br is followed by a G_BR |
| MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); |
| if (Next != MI.getParent()->end()) { |
| if (Next->getOpcode() != AMDGPU::G_BR) |
| return nullptr; |
| Br = &*Next; |
| } |
| |
| return &UseMI; |
| } |
| |
| Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, |
| Register Reg, LLT Ty) const { |
| Register LiveIn = MRI.getLiveInVirtReg(Reg); |
| if (LiveIn) |
| return LiveIn; |
| |
| Register NewReg = MRI.createGenericVirtualRegister(Ty); |
| MRI.addLiveIn(Reg, NewReg); |
| return NewReg; |
| } |
| |
| bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, |
| const ArgDescriptor *Arg) const { |
| if (!Arg->isRegister() || !Arg->getRegister().isValid()) |
| return false; // TODO: Handle these |
| |
| assert(Arg->getRegister().isPhysical()); |
| |
| MachineRegisterInfo &MRI = *B.getMRI(); |
| |
| LLT Ty = MRI.getType(DstReg); |
| Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); |
| |
| if (Arg->isMasked()) { |
| // TODO: Should we try to emit this once in the entry block? |
| const LLT S32 = LLT::scalar(32); |
| const unsigned Mask = Arg->getMask(); |
| const unsigned Shift = countTrailingZeros<unsigned>(Mask); |
| |
| Register AndMaskSrc = LiveIn; |
| |
| if (Shift != 0) { |
| auto ShiftAmt = B.buildConstant(S32, Shift); |
| AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); |
| } |
| |
| B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); |
| } else |
| B.buildCopy(DstReg, LiveIn); |
| |
| // Insert the argument copy if it doens't already exist. |
| // FIXME: It seems EmitLiveInCopies isn't called anywhere? |
| if (!MRI.getVRegDef(LiveIn)) { |
| // FIXME: Should have scoped insert pt |
| MachineBasicBlock &OrigInsBB = B.getMBB(); |
| auto OrigInsPt = B.getInsertPt(); |
| |
| MachineBasicBlock &EntryMBB = B.getMF().front(); |
| EntryMBB.addLiveIn(Arg->getRegister()); |
| B.setInsertPt(EntryMBB, EntryMBB.begin()); |
| B.buildCopy(LiveIn, Arg->getRegister()); |
| |
| B.setInsertPt(OrigInsBB, OrigInsPt); |
| } |
| |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( |
| MachineInstr &MI, |
| MachineRegisterInfo &MRI, |
| MachineIRBuilder &B, |
| AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { |
| B.setInstr(MI); |
| |
| const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); |
| |
| const ArgDescriptor *Arg; |
| const TargetRegisterClass *RC; |
| std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); |
| if (!Arg) { |
| LLVM_DEBUG(dbgs() << "Required arg register missing\n"); |
| return false; |
| } |
| |
| if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, |
| MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| B.setInstr(MI); |
| Register Dst = MI.getOperand(0).getReg(); |
| LLT DstTy = MRI.getType(Dst); |
| LLT S16 = LLT::scalar(16); |
| LLT S32 = LLT::scalar(32); |
| LLT S64 = LLT::scalar(64); |
| |
| if (legalizeFastUnsafeFDIV(MI, MRI, B)) |
| return true; |
| |
| if (DstTy == S16) |
| return legalizeFDIV16(MI, MRI, B); |
| if (DstTy == S32) |
| return legalizeFDIV32(MI, MRI, B); |
| if (DstTy == S64) |
| return legalizeFDIV64(MI, MRI, B); |
| |
| return false; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, |
| MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| Register Res = MI.getOperand(0).getReg(); |
| Register LHS = MI.getOperand(1).getReg(); |
| Register RHS = MI.getOperand(2).getReg(); |
| |
| uint16_t Flags = MI.getFlags(); |
| |
| LLT ResTy = MRI.getType(Res); |
| LLT S32 = LLT::scalar(32); |
| LLT S64 = LLT::scalar(64); |
| |
| const MachineFunction &MF = B.getMF(); |
| bool Unsafe = |
| MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); |
| |
| if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) |
| return false; |
| |
| if (!Unsafe && ResTy == S32 && |
| MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals) |
| return false; |
| |
| if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { |
| // 1 / x -> RCP(x) |
| if (CLHS->isExactlyValue(1.0)) { |
| B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) |
| .addUse(RHS) |
| .setMIFlags(Flags); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| // -1 / x -> RCP( FNEG(x) ) |
| if (CLHS->isExactlyValue(-1.0)) { |
| auto FNeg = B.buildFNeg(ResTy, RHS, Flags); |
| B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) |
| .addUse(FNeg.getReg(0)) |
| .setMIFlags(Flags); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| } |
| |
| // x / y -> x * (1.0 / y) |
| if (Unsafe) { |
| auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) |
| .addUse(RHS) |
| .setMIFlags(Flags); |
| B.buildFMul(Res, LHS, RCP, Flags); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, |
| MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| B.setInstr(MI); |
| Register Res = MI.getOperand(0).getReg(); |
| Register LHS = MI.getOperand(1).getReg(); |
| Register RHS = MI.getOperand(2).getReg(); |
| |
| uint16_t Flags = MI.getFlags(); |
| |
| LLT S16 = LLT::scalar(16); |
| LLT S32 = LLT::scalar(32); |
| |
| auto LHSExt = B.buildFPExt(S32, LHS, Flags); |
| auto RHSExt = B.buildFPExt(S32, RHS, Flags); |
| |
| auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) |
| .addUse(RHSExt.getReg(0)) |
| .setMIFlags(Flags); |
| |
| auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); |
| auto RDst = B.buildFPTrunc(S16, QUOT, Flags); |
| |
| B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) |
| .addUse(RDst.getReg(0)) |
| .addUse(RHS) |
| .addUse(LHS) |
| .setMIFlags(Flags); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions |
| // to enable denorm mode. When 'Enable' is false, disable denorm mode. |
| static void toggleSPDenormMode(bool Enable, |
| MachineIRBuilder &B, |
| const GCNSubtarget &ST, |
| AMDGPU::SIModeRegisterDefaults Mode) { |
| // Set SP denorm mode to this value. |
| unsigned SPDenormMode = |
| Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; |
| |
| if (ST.hasDenormModeInst()) { |
| // Preserve default FP64FP16 denorm mode while updating FP32 mode. |
| unsigned DPDenormModeDefault = Mode.FP64FP16Denormals |
| ? FP_DENORM_FLUSH_NONE |
| : FP_DENORM_FLUSH_IN_FLUSH_OUT; |
| |
| unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); |
| B.buildInstr(AMDGPU::S_DENORM_MODE) |
| .addImm(NewDenormModeValue); |
| |
| } else { |
| // Select FP32 bit field in mode register. |
| unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | |
| (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | |
| (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); |
| |
| B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) |
| .addImm(SPDenormMode) |
| .addImm(SPDenormModeBitField); |
| } |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, |
| MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| B.setInstr(MI); |
| Register Res = MI.getOperand(0).getReg(); |
| Register LHS = MI.getOperand(1).getReg(); |
| Register RHS = MI.getOperand(2).getReg(); |
| const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); |
| AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); |
| |
| uint16_t Flags = MI.getFlags(); |
| |
| LLT S32 = LLT::scalar(32); |
| LLT S1 = LLT::scalar(1); |
| |
| auto One = B.buildFConstant(S32, 1.0f); |
| |
| auto DenominatorScaled = |
| B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) |
| .addUse(RHS) |
| .addUse(LHS) |
| .addImm(1) |
| .setMIFlags(Flags); |
| auto NumeratorScaled = |
| B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) |
| .addUse(LHS) |
| .addUse(RHS) |
| .addImm(0) |
| .setMIFlags(Flags); |
| |
| auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) |
| .addUse(DenominatorScaled.getReg(0)) |
| .setMIFlags(Flags); |
| auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); |
| |
| // FIXME: Doesn't correctly model the FP mode switch, and the FP operations |
| // aren't modeled as reading it. |
| if (!Mode.FP32Denormals) |
| toggleSPDenormMode(true, B, ST, Mode); |
| |
| auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); |
| auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); |
| auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); |
| auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); |
| auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); |
| auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); |
| |
| if (!Mode.FP32Denormals) |
| toggleSPDenormMode(false, B, ST, Mode); |
| |
| auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) |
| .addUse(Fma4.getReg(0)) |
| .addUse(Fma1.getReg(0)) |
| .addUse(Fma3.getReg(0)) |
| .addUse(NumeratorScaled.getReg(1)) |
| .setMIFlags(Flags); |
| |
| B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) |
| .addUse(Fmas.getReg(0)) |
| .addUse(RHS) |
| .addUse(LHS) |
| .setMIFlags(Flags); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, |
| MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| B.setInstr(MI); |
| Register Res = MI.getOperand(0).getReg(); |
| Register LHS = MI.getOperand(1).getReg(); |
| Register RHS = MI.getOperand(2).getReg(); |
| |
| uint16_t Flags = MI.getFlags(); |
| |
| LLT S64 = LLT::scalar(64); |
| LLT S1 = LLT::scalar(1); |
| |
| auto One = B.buildFConstant(S64, 1.0); |
| |
| auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) |
| .addUse(LHS) |
| .addUse(RHS) |
| .addImm(1) |
| .setMIFlags(Flags); |
| |
| auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); |
| |
| auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) |
| .addUse(DivScale0.getReg(0)) |
| .setMIFlags(Flags); |
| |
| auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); |
| auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); |
| auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); |
| |
| auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) |
| .addUse(LHS) |
| .addUse(RHS) |
| .addImm(0) |
| .setMIFlags(Flags); |
| |
| auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); |
| auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags); |
| auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); |
| |
| Register Scale; |
| if (!ST.hasUsableDivScaleConditionOutput()) { |
| // Workaround a hardware bug on SI where the condition output from div_scale |
| // is not usable. |
| |
| Scale = MRI.createGenericVirtualRegister(S1); |
| |
| LLT S32 = LLT::scalar(32); |
| |
| auto NumUnmerge = B.buildUnmerge(S32, LHS); |
| auto DenUnmerge = B.buildUnmerge(S32, RHS); |
| auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); |
| auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); |
| |
| auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), |
| Scale1Unmerge.getReg(1)); |
| auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), |
| Scale0Unmerge.getReg(1)); |
| B.buildXor(Scale, CmpNum, CmpDen); |
| } else { |
| Scale = DivScale1.getReg(1); |
| } |
| |
| auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) |
| .addUse(Fma4.getReg(0)) |
| .addUse(Fma3.getReg(0)) |
| .addUse(Mul.getReg(0)) |
| .addUse(Scale) |
| .setMIFlags(Flags); |
| |
| B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) |
| .addUse(Fmas.getReg(0)) |
| .addUse(RHS) |
| .addUse(LHS) |
| .setMIFlags(Flags); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, |
| MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| B.setInstr(MI); |
| Register Res = MI.getOperand(0).getReg(); |
| Register LHS = MI.getOperand(2).getReg(); |
| Register RHS = MI.getOperand(3).getReg(); |
| uint16_t Flags = MI.getFlags(); |
| |
| LLT S32 = LLT::scalar(32); |
| LLT S1 = LLT::scalar(1); |
| |
| auto Abs = B.buildFAbs(S32, RHS, Flags); |
| const APFloat C0Val(1.0f); |
| |
| auto C0 = B.buildConstant(S32, 0x6f800000); |
| auto C1 = B.buildConstant(S32, 0x2f800000); |
| auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); |
| |
| auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); |
| auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); |
| |
| auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); |
| |
| auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) |
| .addUse(Mul0.getReg(0)) |
| .setMIFlags(Flags); |
| |
| auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); |
| |
| B.buildFMul(Res, Sel, Mul1, Flags); |
| |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, |
| MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); |
| if (!MFI->isEntryFunction()) { |
| return legalizePreloadedArgIntrin(MI, MRI, B, |
| AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); |
| } |
| |
| B.setInstr(MI); |
| |
| uint64_t Offset = |
| ST.getTargetLowering()->getImplicitParameterOffset( |
| B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); |
| Register DstReg = MI.getOperand(0).getReg(); |
| LLT DstTy = MRI.getType(DstReg); |
| LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); |
| |
| const ArgDescriptor *Arg; |
| const TargetRegisterClass *RC; |
| std::tie(Arg, RC) |
| = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); |
| if (!Arg) |
| return false; |
| |
| Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); |
| if (!loadInputValue(KernargPtrReg, B, Arg)) |
| return false; |
| |
| B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, |
| MachineRegisterInfo &MRI, |
| MachineIRBuilder &B, |
| unsigned AddrSpace) const { |
| B.setInstr(MI); |
| Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); |
| auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); |
| B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); |
| MI.eraseFromParent(); |
| return true; |
| } |
| |
| /// Handle register layout difference for f16 images for some subtargets. |
| Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, |
| MachineRegisterInfo &MRI, |
| Register Reg) const { |
| if (!ST.hasUnpackedD16VMem()) |
| return Reg; |
| |
| const LLT S16 = LLT::scalar(16); |
| const LLT S32 = LLT::scalar(32); |
| LLT StoreVT = MRI.getType(Reg); |
| assert(StoreVT.isVector() && StoreVT.getElementType() == S16); |
| |
| auto Unmerge = B.buildUnmerge(S16, Reg); |
| |
| SmallVector<Register, 4> WideRegs; |
| for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) |
| WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); |
| |
| int NumElts = StoreVT.getNumElements(); |
| |
| return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI, |
| MachineRegisterInfo &MRI, |
| MachineIRBuilder &B, |
| bool IsFormat) const { |
| // TODO: Reject f16 format on targets where unsupported. |
| Register VData = MI.getOperand(1).getReg(); |
| LLT Ty = MRI.getType(VData); |
| |
| B.setInstr(MI); |
| |
| const LLT S32 = LLT::scalar(32); |
| const LLT S16 = LLT::scalar(16); |
| |
| // Fixup illegal register types for i8 stores. |
| if (Ty == LLT::scalar(8) || Ty == S16) { |
| Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); |
| MI.getOperand(1).setReg(AnyExt); |
| return true; |
| } |
| |
| if (Ty.isVector()) { |
| if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { |
| if (IsFormat) |
| MI.getOperand(1).setReg(handleD16VData(B, MRI, VData)); |
| return true; |
| } |
| |
| return Ty.getElementType() == S32 && Ty.getNumElements() <= 4; |
| } |
| |
| return Ty == S32; |
| } |
| |
| bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, |
| MachineRegisterInfo &MRI, |
| MachineIRBuilder &B) const { |
| // Replace the use G_BRCOND with the exec manipulate and branch pseudos. |
| auto IntrID = MI.getIntrinsicID(); |
| switch (IntrID) { |
| case Intrinsic::amdgcn_if: |
| case Intrinsic::amdgcn_else: { |
| MachineInstr *Br = nullptr; |
| if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { |
| const SIRegisterInfo *TRI |
| = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); |
| |
| B.setInstr(*BrCond); |
| Register Def = MI.getOperand(1).getReg(); |
| Register Use = MI.getOperand(3).getReg(); |
| |
| MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); |
| if (Br) |
| BrTarget = Br->getOperand(0).getMBB(); |
| |
| if (IntrID == Intrinsic::amdgcn_if) { |
| B.buildInstr(AMDGPU::SI_IF) |
| .addDef(Def) |
| .addUse(Use) |
| .addMBB(BrTarget); |
| } else { |
| B.buildInstr(AMDGPU::SI_ELSE) |
| .addDef(Def) |
| .addUse(Use) |
| .addMBB(BrTarget) |
| .addImm(0); |
| } |
| |
| if (Br) |
| Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); |
| |
| MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); |
| MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); |
| MI.eraseFromParent(); |
| BrCond->eraseFromParent(); |
| return true; |
| } |
| |
| return false; |
| } |
| case Intrinsic::amdgcn_loop: { |
| MachineInstr *Br = nullptr; |
| if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { |
| const SIRegisterInfo *TRI |
| = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); |
| |
| B.setInstr(*BrCond); |
| |
| // FIXME: Need to adjust branch targets based on unconditional branch. |
| Register Reg = MI.getOperand(2).getReg(); |
| B.buildInstr(AMDGPU::SI_LOOP) |
| .addUse(Reg) |
| .addMBB(BrCond->getOperand(1).getMBB()); |
| MI.eraseFromParent(); |
| BrCond->eraseFromParent(); |
| MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); |
| return true; |
| } |
| |
| return false; |
| } |
| case Intrinsic::amdgcn_kernarg_segment_ptr: |
| return legalizePreloadedArgIntrin( |
| MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); |
| case Intrinsic::amdgcn_implicitarg_ptr: |
| return legalizeImplicitArgPtr(MI, MRI, B); |
| case Intrinsic::amdgcn_workitem_id_x: |
| return legalizePreloadedArgIntrin(MI, MRI, B, |
| AMDGPUFunctionArgInfo::WORKITEM_ID_X); |
| case Intrinsic::amdgcn_workitem_id_y: |
| return legalizePreloadedArgIntrin(MI, MRI, B, |
| AMDGPUFunctionArgInfo::WORKITEM_ID_Y); |
| case Intrinsic::amdgcn_workitem_id_z: |
| return legalizePreloadedArgIntrin(MI, MRI, B, |
| AMDGPUFunctionArgInfo::WORKITEM_ID_Z); |
| case Intrinsic::amdgcn_workgroup_id_x: |
| return legalizePreloadedArgIntrin(MI, MRI, B, |
| AMDGPUFunctionArgInfo::WORKGROUP_ID_X); |
| case Intrinsic::amdgcn_workgroup_id_y: |
| return legalizePreloadedArgIntrin(MI, MRI, B, |
| AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); |
| case Intrinsic::amdgcn_workgroup_id_z: |
| return legalizePreloadedArgIntrin(MI, MRI, B, |
| AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); |
| case Intrinsic::amdgcn_dispatch_ptr: |
| return legalizePreloadedArgIntrin(MI, MRI, B, |
| AMDGPUFunctionArgInfo::DISPATCH_PTR); |
| case Intrinsic::amdgcn_queue_ptr: |
| return legalizePreloadedArgIntrin(MI, MRI, B, |
| AMDGPUFunctionArgInfo::QUEUE_PTR); |
| case Intrinsic::amdgcn_implicit_buffer_ptr: |
| return legalizePreloadedArgIntrin( |
| MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); |
| case Intrinsic::amdgcn_dispatch_id: |
| return legalizePreloadedArgIntrin(MI, MRI, B, |
| AMDGPUFunctionArgInfo::DISPATCH_ID); |
| case Intrinsic::amdgcn_fdiv_fast: |
| return legalizeFDIVFastIntrin(MI, MRI, B); |
| case Intrinsic::amdgcn_is_shared: |
| return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); |
| case Intrinsic::amdgcn_is_private: |
| return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); |
| case Intrinsic::amdgcn_wavefrontsize: { |
| B.setInstr(MI); |
| B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); |
| MI.eraseFromParent(); |
| return true; |
| } |
| case Intrinsic::amdgcn_raw_buffer_store: |
| return legalizeRawBufferStore(MI, MRI, B, false); |
| case Intrinsic::amdgcn_raw_buffer_store_format: |
| return legalizeRawBufferStore(MI, MRI, B, true); |
| default: |
| return true; |
| } |
| |
| return true; |
| } |