| //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file defines the interfaces that X86 uses to lower LLVM code into a |
| // selection DAG. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "X86ISelLowering.h" |
| #include "Utils/X86ShuffleDecode.h" |
| #include "X86CallingConv.h" |
| #include "X86FrameLowering.h" |
| #include "X86InstrBuilder.h" |
| #include "X86IntrinsicsInfo.h" |
| #include "X86MachineFunctionInfo.h" |
| #include "X86TargetMachine.h" |
| #include "X86TargetObjectFile.h" |
| #include "llvm/ADT/SmallBitVector.h" |
| #include "llvm/ADT/SmallSet.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/ADT/StringExtras.h" |
| #include "llvm/ADT/StringSwitch.h" |
| #include "llvm/Analysis/BlockFrequencyInfo.h" |
| #include "llvm/Analysis/EHPersonalities.h" |
| #include "llvm/Analysis/ProfileSummaryInfo.h" |
| #include "llvm/CodeGen/IntrinsicLowering.h" |
| #include "llvm/CodeGen/MachineFrameInfo.h" |
| #include "llvm/CodeGen/MachineFunction.h" |
| #include "llvm/CodeGen/MachineInstrBuilder.h" |
| #include "llvm/CodeGen/MachineJumpTableInfo.h" |
| #include "llvm/CodeGen/MachineModuleInfo.h" |
| #include "llvm/CodeGen/MachineRegisterInfo.h" |
| #include "llvm/CodeGen/TargetLowering.h" |
| #include "llvm/CodeGen/WinEHFuncInfo.h" |
| #include "llvm/IR/CallSite.h" |
| #include "llvm/IR/CallingConv.h" |
| #include "llvm/IR/Constants.h" |
| #include "llvm/IR/DerivedTypes.h" |
| #include "llvm/IR/DiagnosticInfo.h" |
| #include "llvm/IR/Function.h" |
| #include "llvm/IR/GlobalAlias.h" |
| #include "llvm/IR/GlobalVariable.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/Intrinsics.h" |
| #include "llvm/MC/MCAsmInfo.h" |
| #include "llvm/MC/MCContext.h" |
| #include "llvm/MC/MCExpr.h" |
| #include "llvm/MC/MCSymbol.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/ErrorHandling.h" |
| #include "llvm/Support/KnownBits.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "llvm/Target/TargetOptions.h" |
| #include <algorithm> |
| #include <bitset> |
| #include <cctype> |
| #include <numeric> |
| using namespace llvm; |
| |
| #define DEBUG_TYPE "x86-isel" |
| |
// Statistic counter tracking the "Number of tail calls" (incremented where
// calls are lowered as tail calls elsewhere in this file).
STATISTIC(NumTailCalls, "Number of tail calls");
| |
| static cl::opt<int> ExperimentalPrefLoopAlignment( |
| "x86-experimental-pref-loop-alignment", cl::init(4), |
| cl::desc( |
| "Sets the preferable loop alignment for experiments (as log2 bytes)" |
| "(the last x86-experimental-pref-loop-alignment bits" |
| " of the loop header PC will be 0)."), |
| cl::Hidden); |
| |
| // Added in 10.0. |
| static cl::opt<bool> EnableOldKNLABI( |
| "x86-enable-old-knl-abi", cl::init(false), |
| cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of " |
| "one ZMM register on AVX512F, but not AVX512BW targets."), |
| cl::Hidden); |
| |
// Controls replacing 'mul x, Const' with cheaper SHIFT/LEA/etc. sequences.
// Enabled by default.
static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);
| |
// Experimental: select unordered atomic loads/stores as plain
// LoadSDNode/StoreSDNode instead of AtomicSDNode. Off by default.
static cl::opt<bool> ExperimentalUnorderedISEL(
    "x86-experimental-unordered-atomic-isel", cl::init(false),
    cl::desc("Use LoadSDNode and StoreSDNode instead of "
             "AtomicSDNode for unordered atomic loads and "
             "stores respectively."),
    cl::Hidden);
| |
| /// Call this when the user attempts to do something unsupported, like |
| /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike |
| /// report_fatal_error, so calling code should attempt to recover without |
| /// crashing. |
| static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl, |
| const char *Msg) { |
| MachineFunction &MF = DAG.getMachineFunction(); |
| DAG.getContext()->diagnose( |
| DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc())); |
| } |
| |
| X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, |
| const X86Subtarget &STI) |
| : TargetLowering(TM), Subtarget(STI) { |
| bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); |
| X86ScalarSSEf64 = Subtarget.hasSSE2(); |
| X86ScalarSSEf32 = Subtarget.hasSSE1(); |
| MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); |
| |
| // Set up the TargetLowering object. |
| |
| // X86 is weird. It always uses i8 for shift amounts and setcc results. |
| setBooleanContents(ZeroOrOneBooleanContent); |
| // X86-SSE is even stranger. It uses -1 or 0 for vector masks. |
| setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); |
| |
| // For 64-bit, since we have so many registers, use the ILP scheduler. |
| // For 32-bit, use the register pressure specific scheduling. |
| // For Atom, always use ILP scheduling. |
| if (Subtarget.isAtom()) |
| setSchedulingPreference(Sched::ILP); |
| else if (Subtarget.is64Bit()) |
| setSchedulingPreference(Sched::ILP); |
| else |
| setSchedulingPreference(Sched::RegPressure); |
| const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
| setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); |
| |
| // Bypass expensive divides and use cheaper ones. |
| if (TM.getOptLevel() >= CodeGenOpt::Default) { |
| if (Subtarget.hasSlowDivide32()) |
| addBypassSlowDiv(32, 8); |
| if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit()) |
| addBypassSlowDiv(64, 32); |
| } |
| |
| if (Subtarget.isTargetWindowsMSVC() || |
| Subtarget.isTargetWindowsItanium()) { |
| // Setup Windows compiler runtime calls. |
| setLibcallName(RTLIB::SDIV_I64, "_alldiv"); |
| setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); |
| setLibcallName(RTLIB::SREM_I64, "_allrem"); |
| setLibcallName(RTLIB::UREM_I64, "_aullrem"); |
| setLibcallName(RTLIB::MUL_I64, "_allmul"); |
| setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); |
| setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); |
| setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); |
| setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); |
| setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); |
| } |
| |
| if (Subtarget.getTargetTriple().isOSMSVCRT()) { |
| // MSVCRT doesn't have powi; fall back to pow |
| setLibcallName(RTLIB::POWI_F32, nullptr); |
| setLibcallName(RTLIB::POWI_F64, nullptr); |
| } |
| |
// If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size
// to 32 bits so the AtomicExpandPass will expand it so we don't need
// cmpxchg8b.
// FIXME: Should we be limiting the atomic size on other configs? Default is
// 1024.
| if (!Subtarget.hasCmpxchg8b()) |
| setMaxAtomicSizeInBitsSupported(32); |
| |
| // Set up the register classes. |
| addRegisterClass(MVT::i8, &X86::GR8RegClass); |
| addRegisterClass(MVT::i16, &X86::GR16RegClass); |
| addRegisterClass(MVT::i32, &X86::GR32RegClass); |
| if (Subtarget.is64Bit()) |
| addRegisterClass(MVT::i64, &X86::GR64RegClass); |
| |
| for (MVT VT : MVT::integer_valuetypes()) |
| setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); |
| |
| // We don't accept any truncstore of integer registers. |
| setTruncStoreAction(MVT::i64, MVT::i32, Expand); |
| setTruncStoreAction(MVT::i64, MVT::i16, Expand); |
| setTruncStoreAction(MVT::i64, MVT::i8 , Expand); |
| setTruncStoreAction(MVT::i32, MVT::i16, Expand); |
| setTruncStoreAction(MVT::i32, MVT::i8 , Expand); |
| setTruncStoreAction(MVT::i16, MVT::i8, Expand); |
| |
| setTruncStoreAction(MVT::f64, MVT::f32, Expand); |
| |
| // SETOEQ and SETUNE require checking two conditions. |
| setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); |
| setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); |
| setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); |
| setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); |
| |
| // Integer absolute. |
| if (Subtarget.hasCMov()) { |
| setOperationAction(ISD::ABS , MVT::i16 , Custom); |
| setOperationAction(ISD::ABS , MVT::i32 , Custom); |
| } |
| setOperationAction(ISD::ABS , MVT::i64 , Custom); |
| |
| // Funnel shifts. |
| for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { |
| setOperationAction(ShiftOp , MVT::i16 , Custom); |
| setOperationAction(ShiftOp , MVT::i32 , Custom); |
| if (Subtarget.is64Bit()) |
| setOperationAction(ShiftOp , MVT::i64 , Custom); |
| } |
| |
| if (!Subtarget.useSoftFloat()) { |
| // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this |
| // operation. |
| setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote); |
| setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote); |
| // We have an algorithm for SSE2, and we turn this into a 64-bit |
| // FILD or VCVTUSI2SS/SD for other targets. |
| setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); |
| // We have an algorithm for SSE2->double, and we turn this into a |
| // 64-bit FILD followed by conditional FADD for other targets. |
| setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); |
| |
| // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have |
| // this operation. |
| setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote); |
| // SSE has no i16 to fp conversion, only i32. We promote in the handler |
| // to allow f80 to use i16 and f64 to use i16 with sse1 only |
| setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom); |
| // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not |
| setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); |
| // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 |
| // are Legal, f80 is custom lowered. |
| setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); |
| |
| // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have |
| // this operation. |
| setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); |
| // FIXME: This doesn't generate invalid exception when it should. PR44019. |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote); |
| setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom); |
| setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); |
| // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 |
| // are Legal, f80 is custom lowered. |
| setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); |
| |
| // Handle FP_TO_UINT by promoting the destination to a larger signed |
| // conversion. |
| setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); |
| // FIXME: This doesn't generate invalid exception when it should. PR44019. |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote); |
| setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); |
| // FIXME: This doesn't generate invalid exception when it should. PR44019. |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote); |
| setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); |
| setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); |
| } |
| |
| // Handle address space casts between mixed sized pointers. |
| setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); |
| setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); |
| |
| // TODO: when we have SSE, these could be more efficient, by using movd/movq. |
| if (!X86ScalarSSEf64) { |
| setOperationAction(ISD::BITCAST , MVT::f32 , Expand); |
| setOperationAction(ISD::BITCAST , MVT::i32 , Expand); |
| if (Subtarget.is64Bit()) { |
| setOperationAction(ISD::BITCAST , MVT::f64 , Expand); |
| // Without SSE, i64->f64 goes through memory. |
| setOperationAction(ISD::BITCAST , MVT::i64 , Expand); |
| } |
| } else if (!Subtarget.is64Bit()) |
| setOperationAction(ISD::BITCAST , MVT::i64 , Custom); |
| |
| // Scalar integer divide and remainder are lowered to use operations that |
| // produce two results, to match the available instructions. This exposes |
| // the two-result form to trivial CSE, which is able to combine x/y and x%y |
| // into a single instruction. |
| // |
| // Scalar integer multiply-high is also lowered to use two-result |
| // operations, to match the available instructions. However, plain multiply |
| // (low) operations are left as Legal, as there are single-result |
| // instructions for this in x86. Using the two-result multiply instructions |
| // when both high and low results are needed must be arranged by dagcombine. |
| for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { |
| setOperationAction(ISD::MULHS, VT, Expand); |
| setOperationAction(ISD::MULHU, VT, Expand); |
| setOperationAction(ISD::SDIV, VT, Expand); |
| setOperationAction(ISD::UDIV, VT, Expand); |
| setOperationAction(ISD::SREM, VT, Expand); |
| setOperationAction(ISD::UREM, VT, Expand); |
| } |
| |
| setOperationAction(ISD::BR_JT , MVT::Other, Expand); |
| setOperationAction(ISD::BRCOND , MVT::Other, Custom); |
| for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128, |
| MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { |
| setOperationAction(ISD::BR_CC, VT, Expand); |
| setOperationAction(ISD::SELECT_CC, VT, Expand); |
| } |
| if (Subtarget.is64Bit()) |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); |
| |
| setOperationAction(ISD::FREM , MVT::f32 , Expand); |
| setOperationAction(ISD::FREM , MVT::f64 , Expand); |
| setOperationAction(ISD::FREM , MVT::f80 , Expand); |
| setOperationAction(ISD::FREM , MVT::f128 , Expand); |
| setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); |
| |
| // Promote the i8 variants and force them on up to i32 which has a shorter |
| // encoding. |
| setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32); |
| setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32); |
| if (!Subtarget.hasBMI()) { |
| setOperationAction(ISD::CTTZ , MVT::i16 , Custom); |
| setOperationAction(ISD::CTTZ , MVT::i32 , Custom); |
| setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal); |
| setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal); |
| if (Subtarget.is64Bit()) { |
| setOperationAction(ISD::CTTZ , MVT::i64 , Custom); |
| setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal); |
| } |
| } |
| |
| if (Subtarget.hasLZCNT()) { |
| // When promoting the i8 variants, force them to i32 for a shorter |
| // encoding. |
| setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32); |
| setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); |
| } else { |
| setOperationAction(ISD::CTLZ , MVT::i8 , Custom); |
| setOperationAction(ISD::CTLZ , MVT::i16 , Custom); |
| setOperationAction(ISD::CTLZ , MVT::i32 , Custom); |
| setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); |
| setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); |
| setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); |
| if (Subtarget.is64Bit()) { |
| setOperationAction(ISD::CTLZ , MVT::i64 , Custom); |
| setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); |
| } |
| } |
| |
| // Special handling for half-precision floating point conversions. |
| // If we don't have F16C support, then lower half float conversions |
| // into library calls. |
| if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) { |
| setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); |
| setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); |
| } |
| |
| // There's never any support for operations beyond MVT::f32. |
| setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); |
| setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); |
| setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand); |
| setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); |
| setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); |
| setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand); |
| |
| setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); |
| setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); |
| setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); |
| setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand); |
| setTruncStoreAction(MVT::f32, MVT::f16, Expand); |
| setTruncStoreAction(MVT::f64, MVT::f16, Expand); |
| setTruncStoreAction(MVT::f80, MVT::f16, Expand); |
| setTruncStoreAction(MVT::f128, MVT::f16, Expand); |
| |
| if (Subtarget.hasPOPCNT()) { |
| setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32); |
| } else { |
| setOperationAction(ISD::CTPOP , MVT::i8 , Expand); |
| setOperationAction(ISD::CTPOP , MVT::i16 , Expand); |
| setOperationAction(ISD::CTPOP , MVT::i32 , Expand); |
| if (Subtarget.is64Bit()) |
| setOperationAction(ISD::CTPOP , MVT::i64 , Expand); |
| else |
| setOperationAction(ISD::CTPOP , MVT::i64 , Custom); |
| } |
| |
| setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); |
| |
| if (!Subtarget.hasMOVBE()) |
| setOperationAction(ISD::BSWAP , MVT::i16 , Expand); |
| |
| // X86 wants to expand cmov itself. |
| for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { |
| setOperationAction(ISD::SELECT, VT, Custom); |
| setOperationAction(ISD::SETCC, VT, Custom); |
| setOperationAction(ISD::STRICT_FSETCC, VT, Custom); |
| setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); |
| } |
| for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { |
| if (VT == MVT::i64 && !Subtarget.is64Bit()) |
| continue; |
| setOperationAction(ISD::SELECT, VT, Custom); |
| setOperationAction(ISD::SETCC, VT, Custom); |
| } |
| |
| // Custom action for SELECT MMX and expand action for SELECT_CC MMX |
| setOperationAction(ISD::SELECT, MVT::x86mmx, Custom); |
| setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); |
| |
| setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); |
| // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since |
| // LLVM/Clang supports zero-cost DWARF and SEH exception handling. |
| setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); |
| setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); |
| setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); |
| if (TM.Options.ExceptionModel == ExceptionHandling::SjLj) |
| setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); |
| |
| // Darwin ABI issue. |
| for (auto VT : { MVT::i32, MVT::i64 }) { |
| if (VT == MVT::i64 && !Subtarget.is64Bit()) |
| continue; |
| setOperationAction(ISD::ConstantPool , VT, Custom); |
| setOperationAction(ISD::JumpTable , VT, Custom); |
| setOperationAction(ISD::GlobalAddress , VT, Custom); |
| setOperationAction(ISD::GlobalTLSAddress, VT, Custom); |
| setOperationAction(ISD::ExternalSymbol , VT, Custom); |
| setOperationAction(ISD::BlockAddress , VT, Custom); |
| } |
| |
| // 64-bit shl, sra, srl (iff 32-bit x86) |
| for (auto VT : { MVT::i32, MVT::i64 }) { |
| if (VT == MVT::i64 && !Subtarget.is64Bit()) |
| continue; |
| setOperationAction(ISD::SHL_PARTS, VT, Custom); |
| setOperationAction(ISD::SRA_PARTS, VT, Custom); |
| setOperationAction(ISD::SRL_PARTS, VT, Custom); |
| } |
| |
| if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow()) |
| setOperationAction(ISD::PREFETCH , MVT::Other, Legal); |
| |
| setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); |
| |
| // Expand certain atomics |
| for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { |
| setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); |
| setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); |
| setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom); |
| setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom); |
| setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom); |
| setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom); |
| setOperationAction(ISD::ATOMIC_STORE, VT, Custom); |
| } |
| |
| if (!Subtarget.is64Bit()) |
| setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); |
| |
| if (Subtarget.hasCmpxchg16b()) { |
| setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); |
| } |
| |
| // FIXME - use subtarget debug flags |
| if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() && |
| !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() && |
| TM.Options.ExceptionModel != ExceptionHandling::SjLj) { |
| setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); |
| } |
| |
| setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); |
| setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); |
| |
| setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); |
| setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); |
| |
| setOperationAction(ISD::TRAP, MVT::Other, Legal); |
| setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); |
| |
| // VASTART needs to be custom lowered to use the VarArgsFrameIndex |
| setOperationAction(ISD::VASTART , MVT::Other, Custom); |
| setOperationAction(ISD::VAEND , MVT::Other, Expand); |
| bool Is64Bit = Subtarget.is64Bit(); |
| setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand); |
| setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand); |
| |
| setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); |
| setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); |
| |
| setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); |
| |
| // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. |
| setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); |
| setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); |
| |
| if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) { |
| // f32 and f64 use SSE. |
| // Set up the FP register classes. |
| addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass |
| : &X86::FR32RegClass); |
| addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass |
| : &X86::FR64RegClass); |
| |
| // Disable f32->f64 extload as we can only generate this in one instruction |
// under optsize. So it's easier to pattern match (fpext (load)) for that
| // case instead of needing to emit 2 instructions for extload in the |
| // non-optsize case. |
| setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); |
| |
| for (auto VT : { MVT::f32, MVT::f64 }) { |
| // Use ANDPD to simulate FABS. |
| setOperationAction(ISD::FABS, VT, Custom); |
| |
| // Use XORP to simulate FNEG. |
| setOperationAction(ISD::FNEG, VT, Custom); |
| |
| // Use ANDPD and ORPD to simulate FCOPYSIGN. |
| setOperationAction(ISD::FCOPYSIGN, VT, Custom); |
| |
| // These might be better off as horizontal vector ops. |
| setOperationAction(ISD::FADD, VT, Custom); |
| setOperationAction(ISD::FSUB, VT, Custom); |
| |
| // We don't support sin/cos/fmod |
| setOperationAction(ISD::FSIN , VT, Expand); |
| setOperationAction(ISD::FCOS , VT, Expand); |
| setOperationAction(ISD::FSINCOS, VT, Expand); |
| } |
| |
| // Lower this to MOVMSK plus an AND. |
| setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); |
| setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); |
| |
| } else if (!useSoftFloat() && X86ScalarSSEf32 && (UseX87 || Is64Bit)) { |
| // Use SSE for f32, x87 for f64. |
| // Set up the FP register classes. |
| addRegisterClass(MVT::f32, &X86::FR32RegClass); |
| if (UseX87) |
| addRegisterClass(MVT::f64, &X86::RFP64RegClass); |
| |
| // Use ANDPS to simulate FABS. |
| setOperationAction(ISD::FABS , MVT::f32, Custom); |
| |
| // Use XORP to simulate FNEG. |
| setOperationAction(ISD::FNEG , MVT::f32, Custom); |
| |
| if (UseX87) |
| setOperationAction(ISD::UNDEF, MVT::f64, Expand); |
| |
| // Use ANDPS and ORPS to simulate FCOPYSIGN. |
| if (UseX87) |
| setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); |
| setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); |
| |
| // We don't support sin/cos/fmod |
| setOperationAction(ISD::FSIN , MVT::f32, Expand); |
| setOperationAction(ISD::FCOS , MVT::f32, Expand); |
| setOperationAction(ISD::FSINCOS, MVT::f32, Expand); |
| |
| if (UseX87) { |
| // Always expand sin/cos functions even though x87 has an instruction. |
| setOperationAction(ISD::FSIN, MVT::f64, Expand); |
| setOperationAction(ISD::FCOS, MVT::f64, Expand); |
| setOperationAction(ISD::FSINCOS, MVT::f64, Expand); |
| } |
| } else if (UseX87) { |
| // f32 and f64 in x87. |
| // Set up the FP register classes. |
| addRegisterClass(MVT::f64, &X86::RFP64RegClass); |
| addRegisterClass(MVT::f32, &X86::RFP32RegClass); |
| |
| for (auto VT : { MVT::f32, MVT::f64 }) { |
| setOperationAction(ISD::UNDEF, VT, Expand); |
| setOperationAction(ISD::FCOPYSIGN, VT, Expand); |
| |
| // Always expand sin/cos functions even though x87 has an instruction. |
| setOperationAction(ISD::FSIN , VT, Expand); |
| setOperationAction(ISD::FCOS , VT, Expand); |
| setOperationAction(ISD::FSINCOS, VT, Expand); |
| } |
| } |
| |
| // Expand FP32 immediates into loads from the stack, save special cases. |
| if (isTypeLegal(MVT::f32)) { |
| if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) { |
| addLegalFPImmediate(APFloat(+0.0f)); // FLD0 |
| addLegalFPImmediate(APFloat(+1.0f)); // FLD1 |
| addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS |
| addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS |
| } else // SSE immediates. |
| addLegalFPImmediate(APFloat(+0.0f)); // xorps |
| } |
| // Expand FP64 immediates into loads from the stack, save special cases. |
| if (isTypeLegal(MVT::f64)) { |
| if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) { |
| addLegalFPImmediate(APFloat(+0.0)); // FLD0 |
| addLegalFPImmediate(APFloat(+1.0)); // FLD1 |
| addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS |
| addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS |
| } else // SSE immediates. |
| addLegalFPImmediate(APFloat(+0.0)); // xorpd |
| } |
| // Handle constrained floating-point operations of scalar. |
| setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); |
| setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); |
| setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal); |
| setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal); |
| setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal); |
| setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); |
| setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); |
| setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); |
| setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); |
| setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); |
| setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal); |
| setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); |
| setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal); |
| |
| // We don't support FMA. |
| setOperationAction(ISD::FMA, MVT::f64, Expand); |
| setOperationAction(ISD::FMA, MVT::f32, Expand); |
| |
| // f80 always uses X87. |
| if (UseX87) { |
| addRegisterClass(MVT::f80, &X86::RFP80RegClass); |
| setOperationAction(ISD::UNDEF, MVT::f80, Expand); |
| setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); |
| { |
| APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended()); |
| addLegalFPImmediate(TmpFlt); // FLD0 |
| TmpFlt.changeSign(); |
| addLegalFPImmediate(TmpFlt); // FLD0/FCHS |
| |
| bool ignored; |
| APFloat TmpFlt2(+1.0); |
| TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, |
| &ignored); |
| addLegalFPImmediate(TmpFlt2); // FLD1 |
| TmpFlt2.changeSign(); |
| addLegalFPImmediate(TmpFlt2); // FLD1/FCHS |
| } |
| |
| // Always expand sin/cos functions even though x87 has an instruction. |
| setOperationAction(ISD::FSIN , MVT::f80, Expand); |
| setOperationAction(ISD::FCOS , MVT::f80, Expand); |
| setOperationAction(ISD::FSINCOS, MVT::f80, Expand); |
| |
| setOperationAction(ISD::FFLOOR, MVT::f80, Expand); |
| setOperationAction(ISD::FCEIL, MVT::f80, Expand); |
| setOperationAction(ISD::FTRUNC, MVT::f80, Expand); |
| setOperationAction(ISD::FRINT, MVT::f80, Expand); |
| setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); |
| setOperationAction(ISD::FMA, MVT::f80, Expand); |
| setOperationAction(ISD::LROUND, MVT::f80, Expand); |
| setOperationAction(ISD::LLROUND, MVT::f80, Expand); |
| setOperationAction(ISD::LRINT, MVT::f80, Expand); |
| setOperationAction(ISD::LLRINT, MVT::f80, Expand); |
| |
| // Handle constrained floating-point operations of scalar. |
| setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal); |
| setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal); |
| setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); |
| setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); |
| setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); |
| setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal); |
| // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten |
| // as Custom. |
| setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal); |
| } |
| |
| // f128 uses xmm registers, but most operations require libcalls. |
| if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) { |
| addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass |
| : &X86::VR128RegClass); |
| |
| addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps |
| |
| setOperationAction(ISD::FADD, MVT::f128, LibCall); |
| setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall); |
| setOperationAction(ISD::FSUB, MVT::f128, LibCall); |
| setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall); |
| setOperationAction(ISD::FDIV, MVT::f128, LibCall); |
| setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall); |
| setOperationAction(ISD::FMUL, MVT::f128, LibCall); |
| setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall); |
| setOperationAction(ISD::FMA, MVT::f128, LibCall); |
| setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall); |
| |
| setOperationAction(ISD::FABS, MVT::f128, Custom); |
| setOperationAction(ISD::FNEG, MVT::f128, Custom); |
| setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); |
| |
| setOperationAction(ISD::FSIN, MVT::f128, LibCall); |
| setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall); |
| setOperationAction(ISD::FCOS, MVT::f128, LibCall); |
| setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall); |
| setOperationAction(ISD::FSINCOS, MVT::f128, LibCall); |
| // No STRICT_FSINCOS |
| setOperationAction(ISD::FSQRT, MVT::f128, LibCall); |
| setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall); |
| |
| setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); |
| setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom); |
| // We need to custom handle any FP_ROUND with an f128 input, but |
| // LegalizeDAG uses the result type to know when to run a custom handler. |
| // So we have to list all legal floating point result types here. |
| if (isTypeLegal(MVT::f32)) { |
| setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); |
| setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); |
| } |
| if (isTypeLegal(MVT::f64)) { |
| setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); |
| setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); |
| } |
| if (isTypeLegal(MVT::f80)) { |
| setOperationAction(ISD::FP_ROUND, MVT::f80, Custom); |
| setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom); |
| } |
| |
| setOperationAction(ISD::SETCC, MVT::f128, Custom); |
| |
| setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand); |
| setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand); |
| setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand); |
| setTruncStoreAction(MVT::f128, MVT::f32, Expand); |
| setTruncStoreAction(MVT::f128, MVT::f64, Expand); |
| setTruncStoreAction(MVT::f128, MVT::f80, Expand); |
| } |
| |
| // Always use a library call for pow. |
| setOperationAction(ISD::FPOW , MVT::f32 , Expand); |
| setOperationAction(ISD::FPOW , MVT::f64 , Expand); |
| setOperationAction(ISD::FPOW , MVT::f80 , Expand); |
| setOperationAction(ISD::FPOW , MVT::f128 , Expand); |
| |
| setOperationAction(ISD::FLOG, MVT::f80, Expand); |
| setOperationAction(ISD::FLOG2, MVT::f80, Expand); |
| setOperationAction(ISD::FLOG10, MVT::f80, Expand); |
| setOperationAction(ISD::FEXP, MVT::f80, Expand); |
| setOperationAction(ISD::FEXP2, MVT::f80, Expand); |
| setOperationAction(ISD::FMINNUM, MVT::f80, Expand); |
| setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); |
| |
| // Some FP actions are always expanded for vector types. |
| for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, |
| MVT::v2f64, MVT::v4f64, MVT::v8f64 }) { |
| setOperationAction(ISD::FSIN, VT, Expand); |
| setOperationAction(ISD::FSINCOS, VT, Expand); |
| setOperationAction(ISD::FCOS, VT, Expand); |
| setOperationAction(ISD::FREM, VT, Expand); |
| setOperationAction(ISD::FCOPYSIGN, VT, Expand); |
| setOperationAction(ISD::FPOW, VT, Expand); |
| setOperationAction(ISD::FLOG, VT, Expand); |
| setOperationAction(ISD::FLOG2, VT, Expand); |
| setOperationAction(ISD::FLOG10, VT, Expand); |
| setOperationAction(ISD::FEXP, VT, Expand); |
| setOperationAction(ISD::FEXP2, VT, Expand); |
| } |
| |
| // First set operation action for all vector types to either promote |
| // (for widening) or expand (for scalarization). Then we will selectively |
| // turn on ones that can be effectively codegen'd. |
| for (MVT VT : MVT::fixedlen_vector_valuetypes()) { |
| setOperationAction(ISD::SDIV, VT, Expand); |
| setOperationAction(ISD::UDIV, VT, Expand); |
| setOperationAction(ISD::SREM, VT, Expand); |
| setOperationAction(ISD::UREM, VT, Expand); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); |
| setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); |
| setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); |
| setOperationAction(ISD::FMA, VT, Expand); |
| setOperationAction(ISD::FFLOOR, VT, Expand); |
| setOperationAction(ISD::FCEIL, VT, Expand); |
| setOperationAction(ISD::FTRUNC, VT, Expand); |
| setOperationAction(ISD::FRINT, VT, Expand); |
| setOperationAction(ISD::FNEARBYINT, VT, Expand); |
| setOperationAction(ISD::SMUL_LOHI, VT, Expand); |
| setOperationAction(ISD::MULHS, VT, Expand); |
| setOperationAction(ISD::UMUL_LOHI, VT, Expand); |
| setOperationAction(ISD::MULHU, VT, Expand); |
| setOperationAction(ISD::SDIVREM, VT, Expand); |
| setOperationAction(ISD::UDIVREM, VT, Expand); |
| setOperationAction(ISD::CTPOP, VT, Expand); |
| setOperationAction(ISD::CTTZ, VT, Expand); |
| setOperationAction(ISD::CTLZ, VT, Expand); |
| setOperationAction(ISD::ROTL, VT, Expand); |
| setOperationAction(ISD::ROTR, VT, Expand); |
| setOperationAction(ISD::BSWAP, VT, Expand); |
| setOperationAction(ISD::SETCC, VT, Expand); |
| setOperationAction(ISD::FP_TO_UINT, VT, Expand); |
| setOperationAction(ISD::FP_TO_SINT, VT, Expand); |
| setOperationAction(ISD::UINT_TO_FP, VT, Expand); |
| setOperationAction(ISD::SINT_TO_FP, VT, Expand); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); |
| setOperationAction(ISD::TRUNCATE, VT, Expand); |
| setOperationAction(ISD::SIGN_EXTEND, VT, Expand); |
| setOperationAction(ISD::ZERO_EXTEND, VT, Expand); |
| setOperationAction(ISD::ANY_EXTEND, VT, Expand); |
| setOperationAction(ISD::SELECT_CC, VT, Expand); |
| for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { |
| setTruncStoreAction(InnerVT, VT, Expand); |
| |
| setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); |
| setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); |
| |
| // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like |
| // types, we have to deal with them whether we ask for Expansion or not. |
| // Setting Expand causes its own optimisation problems though, so leave |
| // them legal. |
| if (VT.getVectorElementType() == MVT::i1) |
| setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); |
| |
| // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are |
| // split/scalarized right now. |
| if (VT.getVectorElementType() == MVT::f16) |
| setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); |
| } |
| } |
| |
| // FIXME: In order to prevent SSE instructions being expanded to MMX ones |
| // with -msoft-float, disable use of MMX as well. |
| if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) { |
| addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); |
| // No operations on x86mmx supported, everything uses intrinsics. |
| } |
| |
| if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) { |
| addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass |
| : &X86::VR128RegClass); |
| |
| setOperationAction(ISD::FNEG, MVT::v4f32, Custom); |
| setOperationAction(ISD::FABS, MVT::v4f32, Custom); |
| setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom); |
| setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); |
| setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); |
| setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); |
| setOperationAction(ISD::SELECT, MVT::v4f32, Custom); |
| |
| setOperationAction(ISD::LOAD, MVT::v2f32, Custom); |
| setOperationAction(ISD::STORE, MVT::v2f32, Custom); |
| |
| setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal); |
| setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal); |
| setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal); |
| setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal); |
| setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal); |
| } |
| |
| if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { |
| addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass |
| : &X86::VR128RegClass); |
| |
| // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM |
| // registers cannot be used even for integer operations. |
| addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass |
| : &X86::VR128RegClass); |
| addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass |
| : &X86::VR128RegClass); |
| addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass |
| : &X86::VR128RegClass); |
| addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass |
| : &X86::VR128RegClass); |
| |
| for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8, |
| MVT::v2i16, MVT::v4i16, MVT::v2i32 }) { |
| setOperationAction(ISD::SDIV, VT, Custom); |
| setOperationAction(ISD::SREM, VT, Custom); |
| setOperationAction(ISD::UDIV, VT, Custom); |
| setOperationAction(ISD::UREM, VT, Custom); |
| } |
| |
| setOperationAction(ISD::MUL, MVT::v2i8, Custom); |
| setOperationAction(ISD::MUL, MVT::v4i8, Custom); |
| setOperationAction(ISD::MUL, MVT::v8i8, Custom); |
| |
| setOperationAction(ISD::MUL, MVT::v16i8, Custom); |
| setOperationAction(ISD::MUL, MVT::v4i32, Custom); |
| setOperationAction(ISD::MUL, MVT::v2i64, Custom); |
| setOperationAction(ISD::MULHU, MVT::v4i32, Custom); |
| setOperationAction(ISD::MULHS, MVT::v4i32, Custom); |
| setOperationAction(ISD::MULHU, MVT::v16i8, Custom); |
| setOperationAction(ISD::MULHS, MVT::v16i8, Custom); |
| setOperationAction(ISD::MULHU, MVT::v8i16, Legal); |
| setOperationAction(ISD::MULHS, MVT::v8i16, Legal); |
| setOperationAction(ISD::MUL, MVT::v8i16, Legal); |
| setOperationAction(ISD::FNEG, MVT::v2f64, Custom); |
| setOperationAction(ISD::FABS, MVT::v2f64, Custom); |
| setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); |
| |
| for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { |
| setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom); |
| setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom); |
| setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom); |
| setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom); |
| } |
| |
| setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal); |
| setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal); |
| setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal); |
| setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal); |
| setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal); |
| setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal); |
| setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal); |
| setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal); |
| setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom); |
| setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom); |
| setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); |
| setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom); |
| |
| setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); |
| |
| for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { |
| setOperationAction(ISD::SETCC, VT, Custom); |
| setOperationAction(ISD::STRICT_FSETCC, VT, Custom); |
| setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); |
| setOperationAction(ISD::CTPOP, VT, Custom); |
| setOperationAction(ISD::ABS, VT, Custom); |
| |
| // The condition codes aren't legal in SSE/AVX and under AVX512 we use |
| // setcc all the way to isel and prefer SETGT in some isel patterns. |
| setCondCodeAction(ISD::SETLT, VT, Custom); |
| setCondCodeAction(ISD::SETLE, VT, Custom); |
| } |
| |
| for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { |
| setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); |
| setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
| setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); |
| setOperationAction(ISD::VSELECT, VT, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
| } |
| |
| for (auto VT : { MVT::v2f64, MVT::v2i64 }) { |
| setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
| setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); |
| setOperationAction(ISD::VSELECT, VT, Custom); |
| |
| if (VT == MVT::v2i64 && !Subtarget.is64Bit()) |
| continue; |
| |
| setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
| } |
| |
| // Custom lower v2i64 and v2f64 selects. |
| setOperationAction(ISD::SELECT, MVT::v2f64, Custom); |
| setOperationAction(ISD::SELECT, MVT::v2i64, Custom); |
| setOperationAction(ISD::SELECT, MVT::v4i32, Custom); |
| setOperationAction(ISD::SELECT, MVT::v8i16, Custom); |
| setOperationAction(ISD::SELECT, MVT::v16i8, Custom); |
| |
| setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); |
| setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom); |
| |
| // Custom legalize these to avoid over promotion or custom promotion. |
| for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) { |
| setOperationAction(ISD::FP_TO_SINT, VT, Custom); |
| setOperationAction(ISD::FP_TO_UINT, VT, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); |
| } |
| |
| setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal); |
| setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom); |
| |
| setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom); |
| |
| setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom); |
| |
| // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. |
| setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom); |
| |
| setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); |
| setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom); |
| setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); |
| setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom); |
| |
| // We want to legalize this to an f64 load rather than an i64 load on |
| // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for |
| // store. |
| setOperationAction(ISD::LOAD, MVT::v2i32, Custom); |
| setOperationAction(ISD::LOAD, MVT::v4i16, Custom); |
| setOperationAction(ISD::LOAD, MVT::v8i8, Custom); |
| setOperationAction(ISD::STORE, MVT::v2i32, Custom); |
| setOperationAction(ISD::STORE, MVT::v4i16, Custom); |
| setOperationAction(ISD::STORE, MVT::v8i8, Custom); |
| |
| setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); |
| setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); |
| setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); |
| if (!Subtarget.hasAVX512()) |
| setOperationAction(ISD::BITCAST, MVT::v16i1, Custom); |
| |
| setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); |
| setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); |
| setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); |
| |
| setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); |
| |
| setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); |
| setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); |
| setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); |
| setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); |
| setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); |
| setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); |
| |
| // In the customized shift lowering, the legal v4i32/v2i64 cases |
| // in AVX2 will be recognized. |
| for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { |
| setOperationAction(ISD::SRL, VT, Custom); |
| setOperationAction(ISD::SHL, VT, Custom); |
| setOperationAction(ISD::SRA, VT, Custom); |
| } |
| |
| setOperationAction(ISD::ROTL, MVT::v4i32, Custom); |
| setOperationAction(ISD::ROTL, MVT::v8i16, Custom); |
| |
| // With AVX512, expanding (and promoting the shifts) is better. |
| if (!Subtarget.hasAVX512()) |
| setOperationAction(ISD::ROTL, MVT::v16i8, Custom); |
| |
| setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); |
| setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal); |
| setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal); |
| setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal); |
| setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal); |
| } |
| |
| if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { |
| setOperationAction(ISD::ABS, MVT::v16i8, Legal); |
| setOperationAction(ISD::ABS, MVT::v8i16, Legal); |
| setOperationAction(ISD::ABS, MVT::v4i32, Legal); |
| setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom); |
| setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); |
| setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); |
| setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); |
| setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); |
| |
| // These might be better off as horizontal vector ops. |
| setOperationAction(ISD::ADD, MVT::i16, Custom); |
| setOperationAction(ISD::ADD, MVT::i32, Custom); |
| setOperationAction(ISD::SUB, MVT::i16, Custom); |
| setOperationAction(ISD::SUB, MVT::i32, Custom); |
| } |
| |
| if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { |
| for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { |
| setOperationAction(ISD::FFLOOR, RoundedTy, Legal); |
| setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal); |
| setOperationAction(ISD::FCEIL, RoundedTy, Legal); |
| setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal); |
| setOperationAction(ISD::FTRUNC, RoundedTy, Legal); |
| setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal); |
| setOperationAction(ISD::FRINT, RoundedTy, Legal); |
| setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal); |
| setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); |
| setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal); |
| } |
| |
| setOperationAction(ISD::SMAX, MVT::v16i8, Legal); |
| setOperationAction(ISD::SMAX, MVT::v4i32, Legal); |
| setOperationAction(ISD::UMAX, MVT::v8i16, Legal); |
| setOperationAction(ISD::UMAX, MVT::v4i32, Legal); |
| setOperationAction(ISD::SMIN, MVT::v16i8, Legal); |
| setOperationAction(ISD::SMIN, MVT::v4i32, Legal); |
| setOperationAction(ISD::UMIN, MVT::v8i16, Legal); |
| setOperationAction(ISD::UMIN, MVT::v4i32, Legal); |
| |
| // FIXME: Do we need to handle scalar-to-vector here? |
| setOperationAction(ISD::MUL, MVT::v4i32, Legal); |
| |
| // We directly match byte blends in the backend as they match the VSELECT |
| // condition form. |
| setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); |
| |
| // SSE41 brings specific instructions for doing vector sign extend even in |
| // cases where we don't have SRA. |
| for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { |
| setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal); |
| setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); |
| } |
| |
| // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X |
| for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { |
| setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); |
| setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); |
| setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); |
| setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); |
| setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); |
| setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal); |
| } |
| |
    // i8 vectors are custom because the source register and source memory
    // operand types are not the same width.
| setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); |
| |
| if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) { |
      // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we
      // can do the pre and post work in the vector domain.
| setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom); |
| // We need to mark SINT_TO_FP as Custom even though we want to expand it |
| // so that DAG combine doesn't try to turn it into uint_to_fp. |
| setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom); |
| } |
| } |
| |
| if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { |
| for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, |
| MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) |
| setOperationAction(ISD::ROTL, VT, Custom); |
| |
| // XOP can efficiently perform BITREVERSE with VPPERM. |
| for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) |
| setOperationAction(ISD::BITREVERSE, VT, Custom); |
| |
| for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, |
| MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) |
| setOperationAction(ISD::BITREVERSE, VT, Custom); |
| } |
| |
| if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) { |
| bool HasInt256 = Subtarget.hasInt256(); |
| |
| addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass |
| : &X86::VR256RegClass); |
| addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass |
| : &X86::VR256RegClass); |
| addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass |
| : &X86::VR256RegClass); |
| addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass |
| : &X86::VR256RegClass); |
| addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass |
| : &X86::VR256RegClass); |
| addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass |
| : &X86::VR256RegClass); |
| |
| for (auto VT : { MVT::v8f32, MVT::v4f64 }) { |
| setOperationAction(ISD::FFLOOR, VT, Legal); |
| setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); |
| setOperationAction(ISD::FCEIL, VT, Legal); |
| setOperationAction(ISD::STRICT_FCEIL, VT, Legal); |
| setOperationAction(ISD::FTRUNC, VT, Legal); |
| setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); |
| setOperationAction(ISD::FRINT, VT, Legal); |
| setOperationAction(ISD::STRICT_FRINT, VT, Legal); |
| setOperationAction(ISD::FNEARBYINT, VT, Legal); |
| setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); |
| setOperationAction(ISD::FNEG, VT, Custom); |
| setOperationAction(ISD::FABS, VT, Custom); |
| setOperationAction(ISD::FCOPYSIGN, VT, Custom); |
| } |
| |
| // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted |
| // even though v8i16 is a legal type. |
| setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); |
| setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); |
| setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32); |
| setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32); |
| setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal); |
| |
| setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal); |
| |
| setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal); |
| setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal); |
| setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal); |
| setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal); |
| setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal); |
| setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal); |
| setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal); |
| setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal); |
| setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal); |
| setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal); |
| setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal); |
| setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal); |
| |
| if (!Subtarget.hasAVX512()) |
| setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); |
| |
| // In the customized shift lowering, the legal v8i32/v4i64 cases |
| // in AVX2 will be recognized. |
| for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { |
| setOperationAction(ISD::SRL, VT, Custom); |
| setOperationAction(ISD::SHL, VT, Custom); |
| setOperationAction(ISD::SRA, VT, Custom); |
| } |
| |
| // These types need custom splitting if their input is a 128-bit vector. |
| setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); |
| setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); |
| setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); |
| setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); |
| |
| setOperationAction(ISD::ROTL, MVT::v8i32, Custom); |
| setOperationAction(ISD::ROTL, MVT::v16i16, Custom); |
| |
    // With BWI, expanding (and promoting the shifts) is better.
| if (!Subtarget.hasBWI()) |
| setOperationAction(ISD::ROTL, MVT::v32i8, Custom); |
| |
| setOperationAction(ISD::SELECT, MVT::v4f64, Custom); |
| setOperationAction(ISD::SELECT, MVT::v4i64, Custom); |
| setOperationAction(ISD::SELECT, MVT::v8i32, Custom); |
| setOperationAction(ISD::SELECT, MVT::v16i16, Custom); |
| setOperationAction(ISD::SELECT, MVT::v32i8, Custom); |
| setOperationAction(ISD::SELECT, MVT::v8f32, Custom); |
| |
| for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { |
| setOperationAction(ISD::SIGN_EXTEND, VT, Custom); |
| setOperationAction(ISD::ZERO_EXTEND, VT, Custom); |
| setOperationAction(ISD::ANY_EXTEND, VT, Custom); |
| } |
| |
| setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); |
| setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); |
| setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); |
| setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom); |
| |
| for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { |
| setOperationAction(ISD::SETCC, VT, Custom); |
| setOperationAction(ISD::STRICT_FSETCC, VT, Custom); |
| setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); |
| setOperationAction(ISD::CTPOP, VT, Custom); |
| setOperationAction(ISD::CTLZ, VT, Custom); |
| |
| // The condition codes aren't legal in SSE/AVX and under AVX512 we use |
| // setcc all the way to isel and prefer SETGT in some isel patterns. |
| setCondCodeAction(ISD::SETLT, VT, Custom); |
| setCondCodeAction(ISD::SETLE, VT, Custom); |
| } |
| |
| if (Subtarget.hasAnyFMA()) { |
| for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, |
| MVT::v2f64, MVT::v4f64 }) { |
| setOperationAction(ISD::FMA, VT, Legal); |
| setOperationAction(ISD::STRICT_FMA, VT, Legal); |
| } |
| } |
| |
| for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { |
| setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom); |
| setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom); |
| } |
| |
| setOperationAction(ISD::MUL, MVT::v4i64, Custom); |
| setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom); |
| setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom); |
| setOperationAction(ISD::MUL, MVT::v32i8, Custom); |
| |
| setOperationAction(ISD::MULHU, MVT::v8i32, Custom); |
| setOperationAction(ISD::MULHS, MVT::v8i32, Custom); |
| setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom); |
| setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom); |
| setOperationAction(ISD::MULHU, MVT::v32i8, Custom); |
| setOperationAction(ISD::MULHS, MVT::v32i8, Custom); |
| |
| setOperationAction(ISD::ABS, MVT::v4i64, Custom); |
| setOperationAction(ISD::SMAX, MVT::v4i64, Custom); |
| setOperationAction(ISD::UMAX, MVT::v4i64, Custom); |
| setOperationAction(ISD::SMIN, MVT::v4i64, Custom); |
| setOperationAction(ISD::UMIN, MVT::v4i64, Custom); |
| |
| setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); |
| setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); |
| setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); |
| setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); |
| setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); |
| setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); |
| setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); |
| setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); |
| |
| for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { |
| setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom); |
| setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom); |
| setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom); |
| setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom); |
| setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom); |
| } |
| |
| for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) { |
| setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); |
| setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); |
| } |
| |
| if (HasInt256) { |
| // The custom lowering for UINT_TO_FP for v8i32 becomes interesting |
| // when we have a 256bit-wide blend with immediate. |
| setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom); |
| |
| // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X |
| for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { |
| setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal); |
| setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal); |
| setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal); |
| setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal); |
| setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal); |
| setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal); |
| } |
| } |
| |
| for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, |
| MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { |
| setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::MSTORE, VT, Legal); |
| } |
| |
| // Extract subvector is special because the value type |
| // (result) is 128-bit but the source is 256-bit wide. |
| for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, |
| MVT::v4f32, MVT::v2f64 }) { |
| setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); |
| } |
| |
| // Custom lower several nodes for 256-bit types. |
| for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, |
| MVT::v8f32, MVT::v4f64 }) { |
| setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
| setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); |
| setOperationAction(ISD::VSELECT, VT, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); |
| setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); |
| setOperationAction(ISD::STORE, VT, Custom); |
| } |
| |
| if (HasInt256) { |
| setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); |
| |
| // Custom legalize 2x32 to get a little better code. |
| setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); |
| setOperationAction(ISD::MGATHER, MVT::v2i32, Custom); |
| |
| for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, |
| MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) |
| setOperationAction(ISD::MGATHER, VT, Custom); |
| } |
| } |
| |
| // This block controls legalization of the mask vector sizes that are |
| // available with AVX512. 512-bit vectors are in a separate block controlled |
| // by useAVX512Regs. |
| if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { |
| addRegisterClass(MVT::v1i1, &X86::VK1RegClass); |
| addRegisterClass(MVT::v2i1, &X86::VK2RegClass); |
| addRegisterClass(MVT::v4i1, &X86::VK4RegClass); |
| addRegisterClass(MVT::v8i1, &X86::VK8RegClass); |
| addRegisterClass(MVT::v16i1, &X86::VK16RegClass); |
| |
| setOperationAction(ISD::SELECT, MVT::v1i1, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); |
| setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); |
| |
| setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); |
| setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); |
| setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); |
| setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); |
| setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32); |
| setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32); |
| setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32); |
| setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32); |
| setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); |
| setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom); |
| |
| // There is no byte sized k-register load or store without AVX512DQ. |
| if (!Subtarget.hasDQI()) { |
| setOperationAction(ISD::LOAD, MVT::v1i1, Custom); |
| setOperationAction(ISD::LOAD, MVT::v2i1, Custom); |
| setOperationAction(ISD::LOAD, MVT::v4i1, Custom); |
| setOperationAction(ISD::LOAD, MVT::v8i1, Custom); |
| |
| setOperationAction(ISD::STORE, MVT::v1i1, Custom); |
| setOperationAction(ISD::STORE, MVT::v2i1, Custom); |
| setOperationAction(ISD::STORE, MVT::v4i1, Custom); |
| setOperationAction(ISD::STORE, MVT::v8i1, Custom); |
| } |
| |
| // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors. |
| for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { |
| setOperationAction(ISD::SIGN_EXTEND, VT, Custom); |
| setOperationAction(ISD::ZERO_EXTEND, VT, Custom); |
| setOperationAction(ISD::ANY_EXTEND, VT, Custom); |
| } |
| |
| for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { |
| setOperationAction(ISD::ADD, VT, Custom); |
| setOperationAction(ISD::SUB, VT, Custom); |
| setOperationAction(ISD::MUL, VT, Custom); |
| setOperationAction(ISD::SETCC, VT, Custom); |
| setOperationAction(ISD::STRICT_FSETCC, VT, Custom); |
| setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); |
| setOperationAction(ISD::SELECT, VT, Custom); |
| setOperationAction(ISD::TRUNCATE, VT, Custom); |
| setOperationAction(ISD::UADDSAT, VT, Custom); |
| setOperationAction(ISD::SADDSAT, VT, Custom); |
| setOperationAction(ISD::USUBSAT, VT, Custom); |
| setOperationAction(ISD::SSUBSAT, VT, Custom); |
| |
| setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
| setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); |
| setOperationAction(ISD::VSELECT, VT, Expand); |
| } |
| |
| for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) |
| setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); |
| } |
| |
| // This block controls legalization for 512-bit operations with 32/64 bit |
| // elements. 512-bits can be disabled based on prefer-vector-width and |
| // required-vector-width function attributes. |
| if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) { |
| addRegisterClass(MVT::v16i32, &X86::VR512RegClass); |
| addRegisterClass(MVT::v16f32, &X86::VR512RegClass); |
| addRegisterClass(MVT::v8i64, &X86::VR512RegClass); |
| addRegisterClass(MVT::v8f64, &X86::VR512RegClass); |
| |
| for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { |
| setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); |
| setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); |
| setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal); |
| setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); |
| setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); |
| } |
| |
| for (MVT VT : { MVT::v16f32, MVT::v8f64 }) { |
| setOperationAction(ISD::FNEG, VT, Custom); |
| setOperationAction(ISD::FABS, VT, Custom); |
| setOperationAction(ISD::FMA, VT, Legal); |
| setOperationAction(ISD::STRICT_FMA, VT, Legal); |
| setOperationAction(ISD::FCOPYSIGN, VT, Custom); |
| } |
| |
| for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) { |
| setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32); |
| setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32); |
| setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32); |
| setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32); |
| } |
| setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); |
| setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal); |
| setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal); |
| |
| setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal); |
| setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal); |
| setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal); |
| setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal); |
| setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal); |
| setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal); |
| setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal); |
| setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal); |
| setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal); |
| setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal); |
| setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal); |
| setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal); |
| |
| setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); |
| setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); |
| setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); |
| setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); |
| setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); |
| |
| // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE |
| // to 512-bit rather than use the AVX2 instructions so that we can use |
| // k-masks. |
| if (!Subtarget.hasVLX()) { |
| for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, |
| MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { |
| setOperationAction(ISD::MLOAD, VT, Custom); |
| setOperationAction(ISD::MSTORE, VT, Custom); |
| } |
| } |
| |
| setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); |
| setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); |
| setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); |
| setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); |
| setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); |
| setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); |
| setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); |
| setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); |
| |
| // Need to custom widen this if we don't have AVX512BW. |
| setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); |
| setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); |
| setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); |
| |
| for (auto VT : { MVT::v16f32, MVT::v8f64 }) { |
| setOperationAction(ISD::FFLOOR, VT, Legal); |
| setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); |
| setOperationAction(ISD::FCEIL, VT, Legal); |
| setOperationAction(ISD::STRICT_FCEIL, VT, Legal); |
| setOperationAction(ISD::FTRUNC, VT, Legal); |
| setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); |
| setOperationAction(ISD::FRINT, VT, Legal); |
| setOperationAction(ISD::STRICT_FRINT, VT, Legal); |
| setOperationAction(ISD::FNEARBYINT, VT, Legal); |
| setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); |
| |
| setOperationAction(ISD::SELECT, VT, Custom); |
| } |
| |
| // Without BWI we need to use custom lowering to handle MVT::v64i8 input. |
| for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) { |
| setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); |
| setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); |
| } |
| |
| setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); |
| setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); |
| setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); |
| setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); |
| |
| setOperationAction(ISD::MUL, MVT::v8i64, Custom); |
| setOperationAction(ISD::MUL, MVT::v16i32, Legal); |
| |
| setOperationAction(ISD::MULHU, MVT::v16i32, Custom); |
| setOperationAction(ISD::MULHS, MVT::v16i32, Custom); |
| |
| for (auto VT : { MVT::v16i32, MVT::v8i64 }) { |
| setOperationAction(ISD::SMAX, VT, Legal); |
| setOperationAction(ISD::UMAX, VT, Legal); |
| setOperationAction(ISD::SMIN, VT, Legal); |
| setOperationAction(ISD::UMIN, VT, Legal); |
| setOperationAction(ISD::ABS, VT, Legal); |
| setOperationAction(ISD::SRL, VT, Custom); |
| setOperationAction(ISD::SHL, VT, Custom); |
| setOperationAction(ISD::SRA, VT, Custom); |
| setOperationAction(ISD::CTPOP, VT, Custom); |
| setOperationAction(ISD::ROTL, VT, Custom); |
| setOperationAction(ISD::ROTR, VT, Custom); |
| setOperationAction(ISD::SETCC, VT, Custom); |
| setOperationAction(ISD::STRICT_FSETCC, VT, Custom); |
| setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); |
| setOperationAction(ISD::SELECT, VT, Custom); |
| |
| // The condition codes aren't legal in SSE/AVX and under AVX512 we use |
| // setcc all the way to isel and prefer SETGT in some isel patterns. |
| setCondCodeAction(ISD::SETLT, VT, Custom); |
| setCondCodeAction(ISD::SETLE, VT, Custom); |
| } |
| |
| if (Subtarget.hasDQI()) { |
| setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal); |
| setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); |
| setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal); |
| |
| setOperationAction(ISD::MUL, MVT::v8i64, Legal); |
| } |
| |
| if (Subtarget.hasCDI()) { |
| // NonVLX sub-targets extend 128/256 vectors to use the 512 version. |
| for (auto VT : { MVT::v16i32, MVT::v8i64} ) { |
| setOperationAction(ISD::CTLZ, VT, Legal); |
| } |
| } // Subtarget.hasCDI() |
| |
| if (Subtarget.hasVPOPCNTDQ()) { |
| for (auto VT : { MVT::v16i32, MVT::v8i64 }) |
| setOperationAction(ISD::CTPOP, VT, Legal); |
| } |
| |
| // Extract subvector is special because the value type |
| // (result) is 256-bit but the source is 512-bit wide. |
| // 128-bit was made Legal under AVX1. |
| for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, |
| MVT::v8f32, MVT::v4f64 }) |
| setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); |
| |
| for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { |
| setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
| setOperationAction(ISD::VSELECT, VT, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); |
| setOperationAction(ISD::MLOAD, VT, Legal); |
| setOperationAction(ISD::MSTORE, VT, Legal); |
| setOperationAction(ISD::MGATHER, VT, Custom); |
| setOperationAction(ISD::MSCATTER, VT, Custom); |
| } |
| if (!Subtarget.hasBWI()) { |
| // Need to custom split v32i16/v64i8 bitcasts. |
| setOperationAction(ISD::BITCAST, MVT::v32i16, Custom); |
| setOperationAction(ISD::BITCAST, MVT::v64i8, Custom); |
| |
| // Better to split these into two 256-bit ops. |
| setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom); |
| setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom); |
| } |
| |
| if (Subtarget.hasVBMI2()) { |
| for (auto VT : { MVT::v16i32, MVT::v8i64 }) { |
| setOperationAction(ISD::FSHL, VT, Custom); |
| setOperationAction(ISD::FSHR, VT, Custom); |
| } |
| } |
| }// has AVX-512 |
| |
| // This block controls legalization for operations that don't have |
| // pre-AVX512 equivalents. Without VLX we use 512-bit operations for |
| // narrower widths. |
| if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { |
| // These operations are handled on non-VLX by artificially widening in |
| // isel patterns. |
| |
| setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, |
| Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, |
| Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, |
| Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, |
| Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, |
| Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, |
| Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, |
| Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, |
| Subtarget.hasVLX() ? Legal : Custom); |
| |
| for (auto VT : { MVT::v2i64, MVT::v4i64 }) { |
| setOperationAction(ISD::SMAX, VT, Legal); |
| setOperationAction(ISD::UMAX, VT, Legal); |
| setOperationAction(ISD::SMIN, VT, Legal); |
| setOperationAction(ISD::UMIN, VT, Legal); |
| setOperationAction(ISD::ABS, VT, Legal); |
| } |
| |
| for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { |
| setOperationAction(ISD::ROTL, VT, Custom); |
| setOperationAction(ISD::ROTR, VT, Custom); |
| } |
| |
| // Custom legalize 2x32 to get a little better code. |
| setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom); |
| setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom); |
| |
| for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, |
| MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) |
| setOperationAction(ISD::MSCATTER, VT, Custom); |
| |
| if (Subtarget.hasDQI()) { |
| for (auto VT : { MVT::v2i64, MVT::v4i64 }) { |
| setOperationAction(ISD::SINT_TO_FP, VT, |
| Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::UINT_TO_FP, VT, |
| Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::STRICT_SINT_TO_FP, VT, |
| Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::STRICT_UINT_TO_FP, VT, |
| Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::FP_TO_SINT, VT, |
| Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::FP_TO_UINT, VT, |
| Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, VT, |
| Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, VT, |
| Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::MUL, VT, Legal); |
| } |
| } |
| |
| if (Subtarget.hasCDI()) { |
| for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { |
| setOperationAction(ISD::CTLZ, VT, Legal); |
| } |
| } // Subtarget.hasCDI() |
| |
| if (Subtarget.hasVPOPCNTDQ()) { |
| for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) |
| setOperationAction(ISD::CTPOP, VT, Legal); |
| } |
| } |
| |
| // This block control legalization of v32i1/v64i1 which are available with |
| // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with |
| // useBWIRegs. |
| if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { |
| addRegisterClass(MVT::v32i1, &X86::VK32RegClass); |
| addRegisterClass(MVT::v64i1, &X86::VK64RegClass); |
| |
| for (auto VT : { MVT::v32i1, MVT::v64i1 }) { |
| setOperationAction(ISD::ADD, VT, Custom); |
| setOperationAction(ISD::SUB, VT, Custom); |
| setOperationAction(ISD::MUL, VT, Custom); |
| setOperationAction(ISD::VSELECT, VT, Expand); |
| setOperationAction(ISD::UADDSAT, VT, Custom); |
| setOperationAction(ISD::SADDSAT, VT, Custom); |
| setOperationAction(ISD::USUBSAT, VT, Custom); |
| setOperationAction(ISD::SSUBSAT, VT, Custom); |
| |
| setOperationAction(ISD::TRUNCATE, VT, Custom); |
| setOperationAction(ISD::SETCC, VT, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); |
| setOperationAction(ISD::SELECT, VT, Custom); |
| setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
| setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); |
| } |
| |
| setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); |
| setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); |
| for (auto VT : { MVT::v16i1, MVT::v32i1 }) |
| setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); |
| |
| // Extends from v32i1 masks to 256-bit vectors. |
| setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); |
| setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); |
| setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom); |
| } |
| |
| // This block controls legalization for v32i16 and v64i8. 512-bits can be |
| // disabled based on prefer-vector-width and required-vector-width function |
| // attributes. |
| if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) { |
| addRegisterClass(MVT::v32i16, &X86::VR512RegClass); |
| addRegisterClass(MVT::v64i8, &X86::VR512RegClass); |
| |
| // Extends from v64i1 masks to 512-bit vectors. |
| setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); |
| setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); |
| setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); |
| |
| setOperationAction(ISD::MUL, MVT::v32i16, Legal); |
| setOperationAction(ISD::MUL, MVT::v64i8, Custom); |
| setOperationAction(ISD::MULHS, MVT::v32i16, Legal); |
| setOperationAction(ISD::MULHU, MVT::v32i16, Legal); |
| setOperationAction(ISD::MULHS, MVT::v64i8, Custom); |
| setOperationAction(ISD::MULHU, MVT::v64i8, Custom); |
| setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); |
| setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); |
| setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal); |
| setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); |
| setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); |
| setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom); |
| setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom); |
| setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); |
| setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); |
| setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); |
| setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); |
| setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); |
| setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); |
| setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); |
| setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); |
| |
| setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); |
| setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); |
| |
| setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); |
| |
| for (auto VT : { MVT::v64i8, MVT::v32i16 }) { |
| setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
| setOperationAction(ISD::VSELECT, VT, Custom); |
| setOperationAction(ISD::ABS, VT, Legal); |
| setOperationAction(ISD::SRL, VT, Custom); |
| setOperationAction(ISD::SHL, VT, Custom); |
| setOperationAction(ISD::SRA, VT, Custom); |
| setOperationAction(ISD::MLOAD, VT, Legal); |
| setOperationAction(ISD::MSTORE, VT, Legal); |
| setOperationAction(ISD::CTPOP, VT, Custom); |
| setOperationAction(ISD::CTLZ, VT, Custom); |
| setOperationAction(ISD::SMAX, VT, Legal); |
| setOperationAction(ISD::UMAX, VT, Legal); |
| setOperationAction(ISD::SMIN, VT, Legal); |
| setOperationAction(ISD::UMIN, VT, Legal); |
| setOperationAction(ISD::SETCC, VT, Custom); |
| setOperationAction(ISD::UADDSAT, VT, Legal); |
| setOperationAction(ISD::SADDSAT, VT, Legal); |
| setOperationAction(ISD::USUBSAT, VT, Legal); |
| setOperationAction(ISD::SSUBSAT, VT, Legal); |
| setOperationAction(ISD::SELECT, VT, Custom); |
| |
| // The condition codes aren't legal in SSE/AVX and under AVX512 we use |
| // setcc all the way to isel and prefer SETGT in some isel patterns. |
| setCondCodeAction(ISD::SETLT, VT, Custom); |
| setCondCodeAction(ISD::SETLE, VT, Custom); |
| } |
| |
| for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { |
| setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); |
| } |
| |
| if (Subtarget.hasBITALG()) { |
| for (auto VT : { MVT::v64i8, MVT::v32i16 }) |
| setOperationAction(ISD::CTPOP, VT, Legal); |
| } |
| |
| if (Subtarget.hasVBMI2()) { |
| setOperationAction(ISD::FSHL, MVT::v32i16, Custom); |
| setOperationAction(ISD::FSHR, MVT::v32i16, Custom); |
| } |
| } |
| |
| if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { |
| for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) { |
| setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); |
| setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom); |
| } |
| |
| // These operations are handled on non-VLX by artificially widening in |
| // isel patterns. |
| // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? |
| |
| if (Subtarget.hasBITALG()) { |
| for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 }) |
| setOperationAction(ISD::CTPOP, VT, Legal); |
| } |
| } |
| |
| if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { |
| setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); |
| setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); |
| setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); |
| setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); |
| setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); |
| |
| setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); |
| setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); |
| setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); |
| setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); |
| setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); |
| |
| if (Subtarget.hasDQI()) { |
| // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. |
| // v2f32 UINT_TO_FP is already custom under SSE2. |
| assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && |
| isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && |
| "Unexpected operation action!"); |
| // v2i64 FP_TO_S/UINT(v2f32) custom conversion. |
| setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); |
| setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom); |
| setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom); |
| } |
| |
| if (Subtarget.hasBWI()) { |
| setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); |
| setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); |
| } |
| |
| if (Subtarget.hasVBMI2()) { |
| // TODO: Make these legal even without VLX? |
| for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64, |
| MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { |
| setOperationAction(ISD::FSHL, VT, Custom); |
| setOperationAction(ISD::FSHR, VT, Custom); |
| } |
| } |
| |
| setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom); |
| setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom); |
| setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); |
| } |
| |
| // We want to custom lower some of our intrinsics. |
| setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); |
| setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); |
| setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); |
| if (!Subtarget.is64Bit()) { |
| setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); |
| } |
| |
| // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't |
| // handle type legalization for these operations here. |
| // |
| // FIXME: We really should do custom legalization for addition and |
| // subtraction on x86-32 once PR3203 is fixed. We really can't do much better |
| // than generic legalization for 64-bit multiplication-with-overflow, though. |
| for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { |
| if (VT == MVT::i64 && !Subtarget.is64Bit()) |
| continue; |
| // Add/Sub/Mul with overflow operations are custom lowered. |
| setOperationAction(ISD::SADDO, VT, Custom); |
| setOperationAction(ISD::UADDO, VT, Custom); |
| setOperationAction(ISD::SSUBO, VT, Custom); |
| setOperationAction(ISD::USUBO, VT, Custom); |
| setOperationAction(ISD::SMULO, VT, Custom); |
| setOperationAction(ISD::UMULO, VT, Custom); |
| |
| // Support carry in as value rather than glue. |
| setOperationAction(ISD::ADDCARRY, VT, Custom); |
| setOperationAction(ISD::SUBCARRY, VT, Custom); |
| setOperationAction(ISD::SETCCCARRY, VT, Custom); |
| } |
| |
| if (!Subtarget.is64Bit()) { |
| // These libcalls are not available in 32-bit. |
| setLibcallName(RTLIB::SHL_I128, nullptr); |
| setLibcallName(RTLIB::SRL_I128, nullptr); |
| setLibcallName(RTLIB::SRA_I128, nullptr); |
| setLibcallName(RTLIB::MUL_I128, nullptr); |
| } |
| |
| // Combine sin / cos into _sincos_stret if it is available. |
| if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && |
| getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { |
| setOperationAction(ISD::FSINCOS, MVT::f64, Custom); |
| setOperationAction(ISD::FSINCOS, MVT::f32, Custom); |
| } |
| |
| if (Subtarget.isTargetWin64()) { |
| setOperationAction(ISD::SDIV, MVT::i128, Custom); |
| setOperationAction(ISD::UDIV, MVT::i128, Custom); |
| setOperationAction(ISD::SREM, MVT::i128, Custom); |
| setOperationAction(ISD::UREM, MVT::i128, Custom); |
| setOperationAction(ISD::SDIVREM, MVT::i128, Custom); |
| setOperationAction(ISD::UDIVREM, MVT::i128, Custom); |
| } |
| |
| // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` |
| // is. We should promote the value to 64-bits to solve this. |
| // This is what the CRT headers do - `fmodf` is an inline header |
| // function casting to f64 and calling `fmod`. |
| if (Subtarget.is32Bit() && |
| (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) |
| for (ISD::NodeType Op : |
| {ISD::FCEIL, ISD::STRICT_FCEIL, |
| ISD::FCOS, ISD::STRICT_FCOS, |
| ISD::FEXP, ISD::STRICT_FEXP, |
| ISD::FFLOOR, ISD::STRICT_FFLOOR, |
| ISD::FREM, ISD::STRICT_FREM, |
| ISD::FLOG, ISD::STRICT_FLOG, |
| ISD::FLOG10, ISD::STRICT_FLOG10, |
| ISD::FPOW, ISD::STRICT_FPOW, |
| ISD::FSIN, ISD::STRICT_FSIN}) |
| if (isOperationExpand(Op, MVT::f32)) |
| setOperationAction(Op, MVT::f32, Promote); |
| |
| // We have target-specific dag combine patterns for the following nodes: |
| setTargetDAGCombine(ISD::VECTOR_SHUFFLE); |
| setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); |
| setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); |
| setTargetDAGCombine(ISD::CONCAT_VECTORS); |
| setTargetDAGCombine(ISD::INSERT_SUBVECTOR); |
| setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR); |
| setTargetDAGCombine(ISD::BITCAST); |
| setTargetDAGCombine(ISD::VSELECT); |
| setTargetDAGCombine(ISD::SELECT); |
| setTargetDAGCombine(ISD::SHL); |
| setTargetDAGCombine(ISD::SRA); |
| setTargetDAGCombine(ISD::SRL); |
| setTargetDAGCombine(ISD::OR); |
| setTargetDAGCombine(ISD::AND); |
| setTargetDAGCombine(ISD::ADD); |
| setTargetDAGCombine(ISD::FADD); |
| setTargetDAGCombine(ISD::FSUB); |
| setTargetDAGCombine(ISD::FNEG); |
| setTargetDAGCombine(ISD::FMA); |
| setTargetDAGCombine(ISD::FMINNUM); |
| setTargetDAGCombine(ISD::FMAXNUM); |
| setTargetDAGCombine(ISD::SUB); |
| setTargetDAGCombine(ISD::LOAD); |
| setTargetDAGCombine(ISD::MLOAD); |
| setTargetDAGCombine(ISD::STORE); |
| setTargetDAGCombine(ISD::MSTORE); |
| setTargetDAGCombine(ISD::TRUNCATE); |
| setTargetDAGCombine(ISD::ZERO_EXTEND); |
| setTargetDAGCombine(ISD::ANY_EXTEND); |
| setTargetDAGCombine(ISD::SIGN_EXTEND); |
| setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); |
| setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG); |
| setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG); |
| setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG); |
| setTargetDAGCombine(ISD::SINT_TO_FP); |
| setTargetDAGCombine(ISD::UINT_TO_FP); |
| setTargetDAGCombine(ISD::STRICT_SINT_TO_FP); |
| setTargetDAGCombine(ISD::STRICT_UINT_TO_FP); |
| setTargetDAGCombine(ISD::SETCC); |
| setTargetDAGCombine(ISD::MUL); |
| setTargetDAGCombine(ISD::XOR); |
| setTargetDAGCombine(ISD::MSCATTER); |
| setTargetDAGCombine(ISD::MGATHER); |
| |
| computeRegisterProperties(Subtarget.getRegisterInfo()); |
| |
| MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores |
| MaxStoresPerMemsetOptSize = 8; |
| MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores |
| MaxStoresPerMemcpyOptSize = 4; |
| MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores |
| MaxStoresPerMemmoveOptSize = 4; |
| |
| // TODO: These control memcmp expansion in CGP and could be raised higher, but |
| // that needs to benchmarked and balanced with the potential use of vector |
| // load/store types (PR33329, PR33914). |
| MaxLoadsPerMemcmp = 2; |
| MaxLoadsPerMemcmpOptSize = 2; |
| |
| // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). |
| setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment)); |
| |
| // An out-of-order CPU can speculatively execute past a predictable branch, |
| // but a conditional move could be stalled by an expensive earlier operation. |
| PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder(); |
| EnableExtLdPromotion = true; |
| setPrefFunctionAlignment(Align(16)); |
| |
| verifyIntrinsicTables(); |
| |
| // Default to having -disable-strictnode-mutation on |
| IsStrictFPEnabled = true; |
| } |
| |
| // This has so far only been implemented for 64-bit MachO. |
| bool X86TargetLowering::useLoadStackGuardNode() const { |
| return Subtarget.isTargetMachO() && Subtarget.is64Bit(); |
| } |
| |
| bool X86TargetLowering::useStackGuardXorFP() const { |
| // Currently only MSVC CRTs XOR the frame pointer into the stack guard value. |
| return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO(); |
| } |
| |
| SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, |
| const SDLoc &DL) const { |
| EVT PtrTy = getPointerTy(DAG.getDataLayout()); |
| unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP; |
| MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val); |
| return SDValue(Node, 0); |
| } |
| |
| TargetLoweringBase::LegalizeTypeAction |
| X86TargetLowering::getPreferredVectorAction(MVT VT) const { |
| if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) |
| return TypeSplitVector; |
| |
| if (VT.getVectorNumElements() != 1 && |
| VT.getVectorElementType() != MVT::i1) |
| return TypeWidenVector; |
| |
| return TargetLoweringBase::getPreferredVectorAction(VT); |
| } |
| |
| MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, |
| CallingConv::ID CC, |
| EVT VT) const { |
| // v32i1 vectors should be promoted to v32i8 to match avx2. |
| if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) |
| return MVT::v32i8; |
| // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. |
| if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && |
| Subtarget.hasAVX512() && |
| (!isPowerOf2_32(VT.getVectorNumElements()) || |
| (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || |
| (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) |
| return MVT::i8; |
| // Split v64i1 vectors if we don't have v64i8 available. |
| if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && |
| CC != CallingConv::X86_RegCall) |
| return MVT::v32i1; |
| // FIXME: Should we just make these types legal and custom split operations? |
| if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI && |
| Subtarget.useAVX512Regs() && !Subtarget.hasBWI()) |
| return MVT::v16i32; |
| return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); |
| } |
| |
// Return how many registers of the type chosen by
// getRegisterTypeForCallingConv are needed to pass/return VT under CC.
// The conditions here mirror getRegisterTypeForCallingConv exactly, case for
// case — keep the two functions in sync.
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                          CallingConv::ID CC,
                                                          EVT VT) const {
  // v32i1 vectors should be promoted to v32i8 to match avx2.
  if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
    return 1;
  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  // Each i1 element becomes one i8 register.
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      Subtarget.hasAVX512() &&
      (!isPowerOf2_32(VT.getVectorNumElements()) ||
       (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
       (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
    return VT.getVectorNumElements();
  // Split v64i1 vectors if we don't have v64i8 available.
  // The value is carried in two v32i1 halves.
  if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
      CC != CallingConv::X86_RegCall)
    return 2;
  // FIXME: Should we just make these types legal and custom split operations?
  // v32i16/v64i8 are bitcast into a single v16i32 register on KNL.
  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
      Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
    return 1;
  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
| |
// Decompose VT into NumIntermediates values of IntermediateVT, each carried
// in a register of RegisterVT, for argument/return lowering under CC.
// Returns the number of registers used. The special cases here must agree
// with getRegisterTypeForCallingConv / getNumRegistersForCallingConv above.
unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  // Each i1 element is widened into its own i8 register.
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      Subtarget.hasAVX512() &&
      (!isPowerOf2_32(VT.getVectorNumElements()) ||
       (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
       (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) {
    RegisterVT = MVT::i8;
    IntermediateVT = MVT::i1;
    NumIntermediates = VT.getVectorNumElements();
    return NumIntermediates;
  }

  // Split v64i1 vectors if we don't have v64i8 available.
  // The mask travels as two v32i1 register halves.
  if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
      CC != CallingConv::X86_RegCall) {
    RegisterVT = MVT::v32i1;
    IntermediateVT = MVT::v32i1;
    NumIntermediates = 2;
    return 2;
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
                                              NumIntermediates, RegisterVT);
}
| |
// Return the value type produced by a SETCC for operands of type VT.
// Scalars compare to i8 (the width of a SETcc result). With AVX512, vector
// compares may produce vXi1 mask registers; otherwise the result is a vector
// of integers the same width as the compared elements.
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext& Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (Subtarget.hasAVX512()) {
    const unsigned NumElts = VT.getVectorNumElements();

    // Figure out what this type will be legalized to.
    // Iterates because a type can legalize in several steps (e.g. promote
    // then split) before reaching a legal type.
    EVT LegalVT = VT;
    while (getTypeAction(Context, LegalVT) != TypeLegal)
      LegalVT = getTypeToTransformTo(Context, LegalVT);

    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
    if (LegalVT.getSimpleVT().is512BitVector())
      return EVT::getVectorVT(Context, MVT::i1, NumElts);

    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
      // If we legalized to less than a 512-bit vector, then we will use a vXi1
      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
      // vXi16/vXi8.
      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
        return EVT::getVectorVT(Context, MVT::i1, NumElts);
    }
  }

  // Fallback: element-wise integer result of the same width as the inputs.
  return VT.changeVectorElementTypeToInteger();
}
| |
| /// Helper for getByValTypeAlignment to determine |
| /// the desired ByVal argument alignment. |
| static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { |
| if (MaxAlign == 16) |
| return; |
| if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { |
| if (VTy->getBitWidth() == 128) |
| MaxAlign = 16; |
| } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { |
| unsigned EltAlign = 0; |
| getMaxByValAlign(ATy->getElementType(), EltAlign); |
| if (EltAlign > MaxAlign) |
| MaxAlign = EltAlign; |
| } else if (StructType *STy = dyn_cast<StructType>(Ty)) { |
| for (auto *EltTy : STy->elements()) { |
| unsigned EltAlign = 0; |
| getMaxByValAlign(EltTy, EltAlign); |
| if (EltAlign > MaxAlign) |
| MaxAlign = EltAlign; |
| if (MaxAlign == 16) |
| break; |
| } |
| } |
| } |
| |
| /// Return the desired alignment for ByVal aggregate |
| /// function arguments in the caller parameter area. For X86, aggregates |
| /// that contain SSE vectors are placed at 16-byte boundaries while the rest |
| /// are at 4-byte boundaries. |
| unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, |
| const DataLayout &DL) const { |
| if (Subtarget.is64Bit()) { |
| // Max of 8 and alignment of type. |
| unsigned TyAlign = DL.getABITypeAlignment(Ty); |
| if (TyAlign > 8) |
| return TyAlign; |
| return 8; |
| } |
| |
| unsigned Align = 4; |
| if (Subtarget.hasSSE1()) |
| getMaxByValAlign(Ty, Align); |
| return Align; |
| } |
| |
| /// Returns the target specific optimal type for load |
| /// and store operations as a result of memset, memcpy, and memmove |
| /// lowering. If DstAlign is zero that means it's safe to destination |
| /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it |
| /// means there isn't a need to check it against alignment requirement, |
| /// probably because the source does not need to be loaded. If 'IsMemset' is |
| /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that |
| /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy |
| /// source is constant so it does not need to be loaded. |
| /// It returns EVT::Other if the type should be determined using generic |
| /// target-independent logic. |
| /// For vector ops we check that the overall size isn't larger than our |
| /// preferred vector width. |
EVT X86TargetLowering::getOptimalMemOpType(
    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
    bool ZeroMemset, bool MemcpyStrSrc,
    const AttributeList &FuncAttributes) const {
  // NoImplicitFloat functions must not use vector/FP registers at all, so
  // every vector/f64 choice below is gated on the attribute being absent.
  if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
    // Use vector ops only when the access is large enough and either
    // unaligned 16-byte accesses are fast or both sides are known (or free)
    // to be 16-byte aligned.
    if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
                       ((DstAlign == 0 || DstAlign >= 16) &&
                        (SrcAlign == 0 || SrcAlign >= 16)))) {
      // Widest first: try 512-bit, then 256-bit, then 128-bit vectors.
      // FIXME: Check if unaligned 64-byte accesses are slow.
      if (Size >= 64 && Subtarget.hasAVX512() &&
          (Subtarget.getPreferVectorWidth() >= 512)) {
        return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
      }
      // FIXME: Check if unaligned 32-byte accesses are slow.
      if (Size >= 32 && Subtarget.hasAVX() &&
          (Subtarget.getPreferVectorWidth() >= 256)) {
        // Although this isn't a well-supported type for AVX1, we'll let
        // legalization and shuffle lowering produce the optimal codegen. If we
        // choose an optimal type with a vector element larger than a byte,
        // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      // If we have SSE1 registers we should be able to use them.
      // (x87 or 64-bit is required so f32/f64 temporaries remain legal.)
      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
          (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v4f32;
    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}
| |
| bool X86TargetLowering::isSafeMemOpType(MVT VT) const { |
| if (VT == MVT::f32) |
| return X86ScalarSSEf32; |
| else if (VT == MVT::f64) |
| return X86ScalarSSEf64; |
| return true; |
| } |
| |
// Returns whether a misaligned access of type VT is legal, and optionally
// reports via *Fast whether it is also fast on this subtarget.
bool X86TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
    bool *Fast) const {
  if (Fast) {
    switch (VT.getSizeInBits()) {
    default:
      // 8-byte and under are always assumed to be fast.
      *Fast = true;
      break;
    case 128:
      *Fast = !Subtarget.isUnalignedMem16Slow();
      break;
    case 256:
      *Fast = !Subtarget.isUnalignedMem32Slow();
      break;
    // TODO: What about AVX-512 (512-bit) accesses?
    }
  }
  // NonTemporal vector memory ops must be aligned.
  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    // NT loads can only be vector aligned, so if its less aligned than the
    // minimum vector size (which we can split the vector down to), we might as
    // well use a regular unaligned vector load.
    // We don't have any NT loads pre-SSE41.
    // i.e. allow the misaligned op (return true) only when we'd fall back to
    // a regular load anyway; NT stores are never allowed misaligned.
    if (!!(Flags & MachineMemOperand::MOLoad))
      return (Align < 16 || !Subtarget.hasSSE41());
    return false;
  }
  // Misaligned accesses of any size are always allowed.
  return true;
}
| |
| /// Return the entry encoding for a jump table in the |
| /// current function. The returned value is a member of the |
| /// MachineJumpTableInfo::JTEntryKind enum. |
| unsigned X86TargetLowering::getJumpTableEncoding() const { |
| // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF |
| // symbol. |
| if (isPositionIndependent() && Subtarget.isPICStyleGOT()) |
| return MachineJumpTableInfo::EK_Custom32; |
| |
| // Otherwise, use the normal jump table encoding heuristics. |
| return TargetLowering::getJumpTableEncoding(); |
| } |
| |
| bool X86TargetLowering::useSoftFloat() const { |
| return Subtarget.useSoftFloat(); |
| } |
| |
// For X86-32 C/stdcall libcalls, mark leading integer arguments InReg so
// compiler-generated library calls honor the module's -mregparm setting and
// match the convention used by the rest of the code.
void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {

  // Only relabel X86-32 for C / Stdcall CCs.
  if (Subtarget.is64Bit())
    return;
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;
  // Number of registers available for parameter passing (module-level
  // "RegisterParams" metadata, i.e. -mregparm).
  unsigned ParamRegs = 0;
  if (auto *M = MF->getFunction().getParent())
    ParamRegs = M->getNumberRegisterParameters();

  // Mark the first N int arguments as having reg
  // Each int/pointer argument consumes one register (two if it is wider than
  // 4 bytes); stop at the first argument that no longer fits.
  for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
    Type *T = Args[Idx].Ty;
    if (T->isIntOrPtrTy())
      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
        unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
          numRegs = 2;
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Args[Idx].IsInReg = true;
      }
  }
}
| |
| const MCExpr * |
| X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, |
| const MachineBasicBlock *MBB, |
| unsigned uid,MCContext &Ctx) const{ |
| assert(isPositionIndependent() && Subtarget.isPICStyleGOT()); |
| // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF |
| // entries. |
| return MCSymbolRefExpr::create(MBB->getSymbol(), |
| MCSymbolRefExpr::VK_GOTOFF, Ctx); |
| } |
| |
| /// Returns relocation base for the given PIC jumptable. |
| SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, |
| SelectionDAG &DAG) const { |
| if (!Subtarget.is64Bit()) |
| // This doesn't have SDLoc associated with it, but is not really the |
| // same as a Register. |
| return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), |
| getPointerTy(DAG.getDataLayout())); |
| return Table; |
| } |
| |
| /// This returns the relocation base for the given PIC jumptable, |
| /// the same as getPICJumpTableRelocBase, but as an MCExpr. |
| const MCExpr *X86TargetLowering:: |
| getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, |
| MCContext &Ctx) const { |
| // X86-64 uses RIP relative addressing based on the jump table label. |
| if (Subtarget.isPICStyleRIPRel()) |
| return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); |
| |
| // Otherwise, the reference is relative to the PIC base. |
| return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); |
| } |
| |
| std::pair<const TargetRegisterClass *, uint8_t> |
| X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, |
| MVT VT) const { |
| const TargetRegisterClass *RRC = nullptr; |
| uint8_t Cost = 1; |
| switch (VT.SimpleTy) { |
| default: |
| return TargetLowering::findRepresentativeClass(TRI, VT); |
| case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: |
| RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; |
| break; |
| case MVT::x86mmx: |
| RRC = &X86::VR64RegClass; |
| break; |
| case MVT::f32: case MVT::f64: |
| case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: |
| case MVT::v4f32: case MVT::v2f64: |
| case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: |
| case MVT::v8f32: case MVT::v4f64: |
| case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64: |
| case MVT::v16f32: case MVT::v8f64: |
| RRC = &X86::VR128XRegClass; |
| break; |
| } |
| return std::make_pair(RRC, Cost); |
| } |
| |
| unsigned X86TargetLowering::getAddressSpace() const { |
| if (Subtarget.is64Bit()) |
| return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257; |
| return 256; |
| } |
| |
| static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { |
| return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || |
| (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17)); |
| } |
| |
| static Constant* SegmentOffset(IRBuilder<> &IRB, |
| unsigned Offset, unsigned AddressSpace) { |
| return ConstantExpr::getIntToPtr( |
| ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), |
| Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); |
| } |
| |
| Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { |
| // glibc, bionic, and Fuchsia have a special slot for the stack guard in |
| // tcbhead_t; use it instead of the usual global variable (see |
| // sysdeps/{i386,x86_64}/nptl/tls.h) |
| if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) { |
| if (Subtarget.isTargetFuchsia()) { |
| // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value. |
| return SegmentOffset(IRB, 0x10, getAddressSpace()); |
| } else { |
| // %fs:0x28, unless we're using a Kernel code model, in which case |
| // it's %gs:0x28. gs:0x14 on i386. |
| unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; |
| return SegmentOffset(IRB, Offset, getAddressSpace()); |
| } |
| } |
| |
| return TargetLowering::getIRStackGuard(IRB); |
| } |
| |
| void X86TargetLowering::insertSSPDeclarations(Module &M) const { |
| // MSVC CRT provides functionalities for stack protection. |
| if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || |
| Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { |
| // MSVC CRT has a global variable holding security cookie. |
| M.getOrInsertGlobal("__security_cookie", |
| Type::getInt8PtrTy(M.getContext())); |
| |
| // MSVC CRT has a function to validate security cookie. |
| FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( |
| "__security_check_cookie", Type::getVoidTy(M.getContext()), |
| Type::getInt8PtrTy(M.getContext())); |
| if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) { |
| F->setCallingConv(CallingConv::X86_FastCall); |
| F->addAttribute(1, Attribute::AttrKind::InReg); |
| } |
| return; |
| } |
| // glibc, bionic, and Fuchsia have a special slot for the stack guard. |
| if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) |
| return; |
| TargetLowering::insertSSPDeclarations(M); |
| } |
| |
| Value *X86TargetLowering::getSDagStackGuard(const Module &M) const { |
| // MSVC CRT has a global variable holding security cookie. |
| if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || |
| Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { |
| return M.getGlobalVariable("__security_cookie"); |
| } |
| return TargetLowering::getSDagStackGuard(M); |
| } |
| |
| Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { |
| // MSVC CRT has a function to validate security cookie. |
| if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || |
| Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { |
| return M.getFunction("__security_check_cookie"); |
| } |
| return TargetLowering::getSSPStackGuardCheck(M); |
| } |
| |
| Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { |
| if (Subtarget.getTargetTriple().isOSContiki()) |
| return getDefaultSafeStackPointerLocation(IRB, false); |
| |
| // Android provides a fixed TLS slot for the SafeStack pointer. See the |
| // definition of TLS_SLOT_SAFESTACK in |
| // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h |
| if (Subtarget.isTargetAndroid()) { |
| // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: |
| // %gs:0x24 on i386 |
| unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24; |
| return SegmentOffset(IRB, Offset, getAddressSpace()); |
| } |
| |
| // Fuchsia is similar. |
| if (Subtarget.isTargetFuchsia()) { |
| // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value. |
| return SegmentOffset(IRB, 0x18, getAddressSpace()); |
| } |
| |
| return TargetLowering::getSafeStackPointerLocation(IRB); |
| } |
| |
| bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, |
| unsigned DestAS) const { |
| assert(SrcAS != DestAS && "Expected different address spaces!"); |
| |
| const TargetMachine &TM = getTargetMachine(); |
| if (TM.getPointerSize(SrcAS) != TM.getPointerSize(DestAS)) |
| return false; |
| |
| return SrcAS < 256 && DestAS < 256; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Return Value Calling Convention Implementation |
| //===----------------------------------------------------------------------===// |
| |
| bool X86TargetLowering::CanLowerReturn( |
| CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, |
| const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { |
| SmallVector<CCValAssign, 16> RVLocs; |
| CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); |
| return CCInfo.CheckReturn(Outs, RetCC_X86); |
| } |
| |
| const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { |
| static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; |
| return ScratchRegs; |
| } |
| |
/// Lowers masks values (v*i1) to the local register values
/// \param ValArg  the mask value (v1i1/v8i1/v16i1/v32i1/v64i1).
/// \param ValLoc  the scalar register type the ABI assigned to it.
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &Dl, SelectionDAG &DAG) {
  EVT ValVT = ValArg.getValueType();

  // A single-element mask is just its element, extracted as a scalar.
  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
                       DAG.getIntPtrConstant(0, Dl));

  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two stage lowering might be required
    // bitcast:   v8i1 -> i8 / v16i1 -> i16
    // anyextend: i8 -> i32 / i16 -> i32
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
    return ValToCopy;
  }

  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
      (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One stage lowering is required
    // bitcast: v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(ValLoc, ValArg);
  }

  // Any remaining combination just widens the value into the location type.
  return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
}
| |
| /// Breaks v64i1 value into two registers and adds the new node to the DAG |
| static void Passv64i1ArgInRegs( |
| const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg, |
| SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, CCValAssign &VA, |
| CCValAssign &NextVA, const X86Subtarget &Subtarget) { |
| assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); |
| assert(Subtarget.is32Bit() && "Expecting 32 bit target"); |
| assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value"); |
| assert(VA.isRegLoc() && NextVA.isRegLoc() && |
| "The value should reside in two registers"); |
| |
| // Before splitting the value we cast it to i64 |
| Arg = DAG.getBitcast(MVT::i64, Arg); |
| |
| // Splitting the value into two i32 types |
| SDValue Lo, Hi; |
| Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg, |
| DAG.getConstant(0, Dl, MVT::i32)); |
| Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg, |
| DAG.getConstant(1, Dl, MVT::i32)); |
| |
| // Attach the two i32 types into corresponding registers |
| RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo)); |
| RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi)); |
| } |
| |
// Lower an IR 'ret' into the X86 RET_FLAG/IRET node, copying return values
// into their convention-assigned registers (glued together so they survive
// scheduling) and handling the x86-specific special cases: FP-stack returns,
// MMX in XMM, split v64i1 masks, and the implicit sret pointer return.
SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // In some cases we need to disable registers from the default CSR list.
  // For example, when they are used for argument passing.
  bool ShouldDisableCalleeSavedRegister =
      CallConv == CallingConv::X86_RegCall ||
      MF.getFunction().hasFnAttribute("no_caller_saved_registers");

  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
    report_fatal_error("X86 interrupts may not return any value");

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  SDValue Flag;
  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
                   MVT::i32));

  // Copy the result values into the output registers.
  // Note: I indexes RVLocs (and may advance an extra slot for split v64i1),
  // while OutsIndex indexes OutVals.
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++OutsIndex) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");

    // Add the register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

    SDValue ValToCopy = OutVals[OutsIndex];
    EVT ValVT = ValToCopy.getValueType();

    // Promote values to the appropriate types.
    if (VA.getLocInfo() == CCValAssign::SExt)
      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::ZExt)
      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::AExt) {
      // vXi1 masks need the dedicated mask-to-register lowering.
      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
      else
        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
    }
    else if (VA.getLocInfo() == CCValAssign::BCvt)
      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

    assert(VA.getLocInfo() != CCValAssign::FPExt &&
           "Unexpected FP-extend for return value.");

    // Report an error if we have attempted to return a value via an XMM
    // register and SSE was disabled.
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(VA.getLocReg()) &&
               ValVT == MVT::f64) {
      // When returning a double via an XMM register, report an error if SSE2 is
      // not enabled.
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
    // the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::FP0 ||
        VA.getLocReg() == X86::FP1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget.is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget.hasSSE2())
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
        }
      }
    }

    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");

      // Consumes RVLocs[I+1] as well (the second half's register).
      Passv64i1ArgInRegs(dl, DAG, ValToCopy, RegsToPass, VA, RVLocs[++I],
                         Subtarget);

      assert(2 == RegsToPass.size() &&
             "Expecting two registers after Pass64BitArgInRegs");

      // Add the second register to the CalleeSaveDisableRegs list.
      if (ShouldDisableCalleeSavedRegister)
        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
    } else {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
    }

    // Add nodes to the DAG and add the values into the RetOps list
    // The copies are glued (via Flag) so they stay adjacent to the RET.
    for (auto &Reg : RegsToPass) {
      Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
    }
  }

  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  //
  // Checking Function.hasStructRetAttr() here is insufficient because the IR
  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
  // false, then an sret argument may be implicitly inserted in the SelDAG. In
  // either case FuncInfo->setSRetReturnReg() will have been called.
  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
    // When we have both sret and another return value, we should use the
    // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals to Chain.

    // For the case of sret and another return value, we have
    //   Chain_0 at the function entry
    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
    // If we use Chain_1 in getCopyFromReg, we will have
    //   Val = getCopyFromReg(Chain_1)
    //   Chain_2 = getCopyToReg(Chain_1, Val) from below

    // getCopyToReg(Chain_0) will be glued together with
    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
    //    Data dependency from Unit B to Unit A due to usage of Val in
    //      getCopyToReg(Chain_1, Val)
    //    Chain dependency from Unit A to Unit B

    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    unsigned RetValReg
        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
          X86::RAX : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX/EAX now acts like a return value.
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

    // Add the returned register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
  }

  // Functions returning via CSRs-via-copy also list those registers as
  // implicit return operands.
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (X86::GR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain;  // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  // Interrupt handlers return with IRET instead of RET.
  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
  if (CallConv == CallingConv::X86_INTR)
    opcode = X86ISD::IRET;
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}
| |
// Return true if result 0 of N feeds (only) the function's return, in which
// case a call producing N may be tail-called. On success, Chain is updated to
// the chain that should be used for the tail call.
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  // The node must have exactly one result with exactly one use.
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    // FP_EXTEND feeding the return (x87 returns are f80) is also acceptable.
    return false;

  // Every use of the copy must be the RET node itself, returning exactly this
  // one value (operands: chain, bytes-to-pop, reg, [glue]).
  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != X86ISD::RET_FLAG)
      return false;
    // If we are returning more than one value, we can definitely
    // not make a tail call see PR19530
    if (UI->getNumOperands() > 4)
      return false;
    if (UI->getNumOperands() == 4 &&
        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}
| |
| EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, |
| ISD::NodeType ExtendKind) const { |
| MVT ReturnMVT = MVT::i32; |
| |
| bool Darwin = Subtarget.getTargetTriple().isOSDarwin(); |
| if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) { |
| // The ABI does not require i1, i8 or i16 to be extended. |
| // |
| // On Darwin, there is code in the wild relying on Clang's old behaviour of |
| // always extending i8/i16 return values, so keep doing that for now. |
| // (PR26665). |
| ReturnMVT = MVT::i8; |
| } |
| |
| EVT MinVT = getRegisterType(Context, ReturnMVT); |
| return VT.bitsLT(MinVT) ? MinVT : VT; |
| } |
| |
/// Reads two 32 bit registers and creates a 64 bit mask value.
/// Used on 32-bit AVX512BW targets where a v64i1 argument arrives split
/// across two GPRs.
/// \param VA The current 32 bit value that need to be assigned.
/// \param NextVA The next 32 bit value that need to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag Represents SDvalue in the parent DAG node for
///                        glue purposes. In the case the DAG is already using
///                        physical register instead of virtual, we should glue
///                        our new SDValue to InFlag SDvalue.
/// \return a new SDvalue of size 64bit.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
                                SDValue &Root, SelectionDAG &DAG,
                                const SDLoc &Dl, const X86Subtarget &Subtarget,
                                SDValue *InFlag = nullptr) {
  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(VA.getValVT() == MVT::v64i1 &&
         "Expecting first location of 64 bit width type");
  assert(NextVA.getValVT() == VA.getValVT() &&
         "The locations should have the same type");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The values should reside in two registers");

  SDValue Lo, Hi;
  SDValue ArgValueLo, ArgValueHi;

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetRegisterClass *RC = &X86::GR32RegClass;

  // Read a 32 bit value from the registers.
  if (nullptr == InFlag) {
    // When no physical register is present,
    // create an intermediate virtual register.
    unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
    ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
  } else {
    // When a physical register is available read the value from it and glue
    // the reads together. The glue is threaded through both copies and handed
    // back to the caller via *InFlag.
    ArgValueLo =
      DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueLo.getValue(2);
    ArgValueHi =
      DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueHi.getValue(2);
  }

  // Convert the i32 type into v32i1 type.
  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);

  // Convert the i32 type into v32i1 type.
  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);

  // Concatenate the two values together.
  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}
| |
| /// The function will lower a register of various sizes (8/16/32/64) |
| /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1) |
| /// \returns a DAG node contains the operand after lowering to mask type. |
| static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, |
| const EVT &ValLoc, const SDLoc &Dl, |
| SelectionDAG &DAG) { |
| SDValue ValReturned = ValArg; |
| |
| if (ValVT == MVT::v1i1) |
| return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned); |
| |
| if (ValVT == MVT::v64i1) { |
| // In 32 bit machine, this case is handled by getv64i1Argument |
| assert(ValLoc == MVT::i64 && "Expecting only i64 locations"); |
| // In 64 bit machine, There is no need to truncate the value only bitcast |
| } else { |
| MVT maskLen; |
| switch (ValVT.getSimpleVT().SimpleTy) { |
| case MVT::v8i1: |
| maskLen = MVT::i8; |
| break; |
| case MVT::v16i1: |
| maskLen = MVT::i16; |
| break; |
| case MVT::v32i1: |
| maskLen = MVT::i32; |
| break; |
| default: |
| llvm_unreachable("Expecting a vector of i1 types"); |
| } |
| |
| ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned); |
| } |
| return DAG.getBitcast(ValVT, ValReturned); |
| } |
| |
/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue X86TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    uint32_t *RegMask) const {

  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  // I walks RVLocs; a custom v64i1 split consumes two locations per result
  // (see ++I below).
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++InsIndex) {
    CCValAssign &VA = RVLocs[I];
    EVT CopyVT = VA.getLocVT();

    // In some calling conventions we need to remove the used registers
    // from the register mask.
    if (RegMask) {
      // Clear the mask bit for the register and every one of its
      // sub-registers.
      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
           SubRegs.isValid(); ++SubRegs)
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
    }

    // Report an error if there was an attempt to return FP values via XMM
    // registers.
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      // After the diagnostic, redirect to an x87 stack register so lowering
      // can continue without tripping asserts further down.
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(VA.getLocReg()) &&
               CopyVT == MVT::f64) {
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    bool RoundAfterCopy = false;
    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      if (!Subtarget.hasX87())
        report_fatal_error("X87 register return with X87 disabled");
      CopyVT = MVT::f80;
      RoundAfterCopy = (CopyVT != VA.getLocVT());
    }

    SDValue Val;
    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");
      // Note: this also consumes RVLocs[I+1] for the second half.
      Val =
          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
    } else {
      // CopyFromReg produces (value, chain, glue); result #1 is the chain,
      // and its siblings #0/#2 give the copied value and the glue output.
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
                  .getValue(1);
      Val = Chain.getValue(0);
      InFlag = Chain.getValue(2);
    }

    if (RoundAfterCopy)
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1, dl));

    if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
      if (VA.getValVT().isVector() &&
          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
      } else
        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
    }

    if (VA.getLocInfo() == CCValAssign::BCvt)
      Val = DAG.getBitcast(VA.getValVT(), Val);

    InVals.push_back(Val);
  }

  return Chain;
}
| |
| //===----------------------------------------------------------------------===// |
| // C & StdCall & Fast Calling Convention implementation |
| //===----------------------------------------------------------------------===// |
// The StdCall calling convention is the standard for many Windows API
// routines and similar interfaces. It differs from the C calling convention
// only slightly: the callee cleans up the stack rather than the caller, and
// symbols are decorated (e.g. with an argument-byte-count suffix). It does
// not support any vector arguments. For info on the fast calling convention
// see the Fast Calling Convention (tail call) implementation,
// LowerX86_32FastCCCallTo.
| |
/// Classification of how a function or call handles a struct returned by
/// value (sret), as determined by callIsStructReturn / argsAreStructReturn.
enum StructReturnType {
  NotStructReturn,  // No sret argument present.
  RegStructReturn,  // sret pointer passed in a register (inreg, or MCU target).
  StackStructReturn // sret pointer passed on the stack.
};
| static StructReturnType |
| callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) { |
| if (Outs.empty()) |
| return NotStructReturn; |
| |
| const ISD::ArgFlagsTy &Flags = Outs[0].Flags; |
| if (!Flags.isSRet()) |
| return NotStructReturn; |
| if (Flags.isInReg() || IsMCU) |
| return RegStructReturn; |
| return StackStructReturn; |
| } |
| |
| /// Determines whether a function uses struct return semantics. |
| static StructReturnType |
| argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) { |
| if (Ins.empty()) |
| return NotStructReturn; |
| |
| const ISD::ArgFlagsTy &Flags = Ins[0].Flags; |
| if (!Flags.isSRet()) |
| return NotStructReturn; |
| if (Flags.isInReg() || IsMCU) |
| return RegStructReturn; |
| return StackStructReturn; |
| } |
| |
| /// Make a copy of an aggregate at address specified by "Src" to address |
| /// "Dst" with size and alignment information specified by the specific |
| /// parameter attribute. The copy will be passed as a byval function parameter. |
| static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, |
| SDValue Chain, ISD::ArgFlagsTy Flags, |
| SelectionDAG &DAG, const SDLoc &dl) { |
| SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); |
| |
| return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), |
| /*isVolatile*/false, /*AlwaysInline=*/true, |
| /*isTailCall*/false, |
| MachinePointerInfo(), MachinePointerInfo()); |
| } |
| |
| /// Return true if the calling convention is one that we can guarantee TCO for. |
| static bool canGuaranteeTCO(CallingConv::ID CC) { |
| return (CC == CallingConv::Fast || CC == CallingConv::GHC || |
| CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || |
| CC == CallingConv::HHVM || CC == CallingConv::Tail); |
| } |
| |
| /// Return true if we might ever do TCO for calls with this calling convention. |
| static bool mayTailCallThisCC(CallingConv::ID CC) { |
| switch (CC) { |
| // C calling conventions: |
| case CallingConv::C: |
| case CallingConv::Win64: |
| case CallingConv::X86_64_SysV: |
| // Callee pop conventions: |
| case CallingConv::X86_ThisCall: |
| case CallingConv::X86_StdCall: |
| case CallingConv::X86_VectorCall: |
| case CallingConv::X86_FastCall: |
| // Swift: |
| case CallingConv::Swift: |
| return true; |
| default: |
| return canGuaranteeTCO(CC); |
| } |
| } |
| |
| /// Return true if the function is being made into a tailcall target by |
| /// changing its ABI. |
| static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { |
| return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail; |
| } |
| |
| bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { |
| if (!CI->isTailCall()) |
| return false; |
| |
| ImmutableCallSite CS(CI); |
| CallingConv::ID CalleeCC = CS.getCallingConv(); |
| if (!mayTailCallThisCC(CalleeCC)) |
| return false; |
| |
| return true; |
| } |
| |
/// Lower a formal argument that the calling convention assigned to a stack
/// slot: create (or, for copy elision, reuse) a fixed stack object and
/// return either its address (byval) or a load from it.
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    const SDLoc &dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo &MFI, unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = shouldGuaranteeTCO(
      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;
  MVT PtrVT = getPointerTy(DAG.getDataLayout());

  // If value is passed by pointer we have address passed instead of the value
  // itself. No need to extend if the mask value and location share the same
  // absolute size.
  bool ExtendedInMem =
      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can be
  // changed with more analysis.
  // In case of tail call optimization mark all arguments mutable. Since they
  // could be overwritten by lowering of arguments in case of a tail call.
  if (Flags.isByVal()) {
    unsigned Bytes = Flags.getByValSize();
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.

    // FIXME: For now, all byval parameter objects are marked as aliasing. This
    // can be improved with deeper analysis.
    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
                                   /*isAliased=*/true);
    // byval arguments are lowered to the slot's address, not a load.
    return DAG.getFrameIndex(FI, PtrVT);
  }

  // This is an argument in memory. We might be able to perform copy elision.
  // If the argument is passed directly in memory without any extension, then we
  // can perform copy elision. Large vector types, for example, may be passed
  // indirectly by pointer.
  if (Flags.isCopyElisionCandidate() &&
      VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) {
    EVT ArgVT = Ins[i].ArgVT;
    SDValue PartAddr;
    if (Ins[i].PartOffset == 0) {
      // If this is a one-part value or the first part of a multi-part value,
      // create a stack object for the entire argument value type and return a
      // load from our portion of it. This assumes that if the first part of an
      // argument is in memory, the rest will also be in memory.
      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
                                     /*IsImmutable=*/false);
      PartAddr = DAG.getFrameIndex(FI, PtrVT);
      return DAG.getLoad(
          ValVT, dl, Chain, PartAddr,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
    } else {
      // This is not the first piece of an argument in memory. See if there is
      // already a fixed stack object including this offset. If so, assume it
      // was created by the PartOffset == 0 branch above and create a load from
      // the appropriate offset into it.
      int64_t PartBegin = VA.getLocMemOffset();
      int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
      // Linear scan over the fixed objects looking for one that fully
      // contains [PartBegin, PartEnd).
      int FI = MFI.getObjectIndexBegin();
      for (; MFI.isFixedObjectIndex(FI); ++FI) {
        int64_t ObjBegin = MFI.getObjectOffset(FI);
        int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
        if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
          break;
      }
      if (MFI.isFixedObjectIndex(FI)) {
        SDValue Addr =
            DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
                        DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
        return DAG.getLoad(
            ValVT, dl, Chain, Addr,
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
                                              Ins[i].PartOffset));
      }
      // Fall through: no containing object found; create a fresh slot below.
    }
  }

  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
                                 VA.getLocMemOffset(), isImmutable);

  // Set SExt or ZExt flag.
  if (VA.getLocInfo() == CCValAssign::ZExt) {
    MFI.setObjectZExt(FI, true);
  } else if (VA.getLocInfo() == CCValAssign::SExt) {
    MFI.setObjectSExt(FI, true);
  }

  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  SDValue Val = DAG.getLoad(
      ValVT, dl, Chain, FIN,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  // An i1-mask value widened for its memory location must be put back into
  // its value type: re-vectorize it, or truncate the scalar.
  return ExtendedInMem
             ? (VA.getValVT().isVector()
                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
             : Val;
}
| |
| // FIXME: Get this from tablegen. |
| static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv, |
| const X86Subtarget &Subtarget) { |
| assert(Subtarget.is64Bit()); |
| |
| if (Subtarget.isCallingConvWin64(CallConv)) { |
| static const MCPhysReg GPR64ArgRegsWin64[] = { |
| X86::RCX, X86::RDX, X86::R8, X86::R9 |
| }; |
| return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); |
| } |
| |
| static const MCPhysReg GPR64ArgRegs64Bit[] = { |
| X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 |
| }; |
| return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); |
| } |
| |
| // FIXME: Get this from tablegen. |
| static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, |
| CallingConv::ID CallConv, |
| const X86Subtarget &Subtarget) { |
| assert(Subtarget.is64Bit()); |
| if (Subtarget.isCallingConvWin64(CallConv)) { |
| // The XMM registers which might contain var arg parameters are shadowed |
| // in their paired GPR. So we only need to save the GPR to their home |
| // slots. |
| // TODO: __vectorcall will change this. |
| return None; |
| } |
| |
| const Function &F = MF.getFunction(); |
| bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); |
| bool isSoftFloat = Subtarget.useSoftFloat(); |
| assert(!(isSoftFloat && NoImplicitFloatOps) && |
| "SSE register cannot be used when SSE is disabled!"); |
| if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1()) |
| // Kernel mode asks for SSE to be disabled, so there are no XMM argument |
| // registers. |
| return None; |
| |
| static const MCPhysReg XMMArgRegs64Bit[] = { |
| X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, |
| X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 |
| }; |
| return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); |
| } |
| |
| #ifndef NDEBUG |
| static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) { |
| return std::is_sorted(ArgLocs.begin(), ArgLocs.end(), |
| [](const CCValAssign &A, const CCValAssign &B) -> bool { |
| return A.getValNo() < B.getValNo(); |
| }); |
| } |
| #endif |
| |
/// Lower the incoming (formal) arguments described by Ins: emit
/// CopyFromReg/load nodes into InVals, spill registers for varargs, and
/// record per-function ABI bookkeeping (sret register, bytes popped on
/// return, vararg frame indexes, forwarded musttail registers).
SDValue X86TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

  const Function &F = MF.getFunction();
  // NOTE(review): Cygwin/MinGW force a frame pointer in 'main' -- presumably
  // for the runtime's startup stack handling; confirm against
  // X86FrameLowering's use of ForceFramePointer.
  if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
      F.getName() == "main")
    FuncInfo->setForceFramePointer(true);

  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool Is64Bit = Subtarget.is64Bit();
  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);

  assert(
      !(isVarArg && canGuaranteeTCO(CallConv)) &&
      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);

  CCInfo.AnalyzeArguments(Ins, CC_X86);

  // In vectorcall calling convention a second pass is required for the HVA
  // types.
  if (CallingConv::X86_VectorCall == CallConv) {
    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
  }

  // The next loop assumes that the locations are in the same order of the
  // input arguments.
  assert(isSortedByValueNo(ArgLocs) &&
         "Argument Location list must be sorted before lowering");

  SDValue ArgValue;
  // I walks ArgLocs, InsIndex walks Ins; a custom v64i1 split consumes two
  // locations for a single Ins entry (see ++I below).
  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++InsIndex) {
    assert(InsIndex < Ins.size() && "Invalid Ins index");
    CCValAssign &VA = ArgLocs[I];

    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();
      if (VA.needsCustom()) {
        assert(
            VA.getValVT() == MVT::v64i1 &&
            "Currently the only custom case is when we split v64i1 to 2 regs");

        // v64i1 values, in regcall calling convention, that are
        // compiled to 32 bit arch, are split up into two registers.
        ArgValue =
            getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
      } else {
        // Pick the register class matching the location type.
        const TargetRegisterClass *RC;
        if (RegVT == MVT::i8)
          RC = &X86::GR8RegClass;
        else if (RegVT == MVT::i16)
          RC = &X86::GR16RegClass;
        else if (RegVT == MVT::i32)
          RC = &X86::GR32RegClass;
        else if (Is64Bit && RegVT == MVT::i64)
          RC = &X86::GR64RegClass;
        else if (RegVT == MVT::f32)
          RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
        else if (RegVT == MVT::f64)
          RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
        else if (RegVT == MVT::f80)
          RC = &X86::RFP80RegClass;
        else if (RegVT == MVT::f128)
          RC = &X86::VR128RegClass;
        else if (RegVT.is512BitVector())
          RC = &X86::VR512RegClass;
        else if (RegVT.is256BitVector())
          RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
        else if (RegVT.is128BitVector())
          RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
        else if (RegVT == MVT::x86mmx)
          RC = &X86::VR64RegClass;
        else if (RegVT == MVT::v1i1)
          RC = &X86::VK1RegClass;
        else if (RegVT == MVT::v8i1)
          RC = &X86::VK8RegClass;
        else if (RegVT == MVT::v16i1)
          RC = &X86::VK16RegClass;
        else if (RegVT == MVT::v32i1)
          RC = &X86::VK32RegClass;
        else if (RegVT == MVT::v64i1)
          RC = &X86::VK64RegClass;
        else
          llvm_unreachable("Unknown argument type!");

        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
      }

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits. Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::BCvt)
        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);

      if (VA.isExtInLoc()) {
        // Handle MMX values passed in XMM regs.
        if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
        else if (VA.getValVT().isVector() &&
                 VA.getValVT().getScalarType() == MVT::i1 &&
                 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
                  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
          ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
        } else
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
      }
    } else {
      assert(VA.isMemLoc());
      ArgValue =
          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
    }

    // If value is passed via pointer - do a load.
    if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
      ArgValue =
          DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());

    InVals.push_back(ArgValue);
  }

  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
    // Swift calling convention does not require we copy the sret argument
    // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
    if (CallConv == CallingConv::Swift)
      continue;

    // All x86 ABIs require that for returning structs by value we copy the
    // sret argument into %rax/%eax (depending on ABI) for the return. Save
    // the argument into a virtual register so that we can access it from the
    // return points.
    if (Ins[I].Flags.isSRet()) {
      unsigned Reg = FuncInfo->getSRetReturnReg();
      if (!Reg) {
        MVT PtrTy = getPointerTy(DAG.getDataLayout());
        Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
        FuncInfo->setSRetReturnReg(Reg);
      }
      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
      // Only one sret argument is possible; stop looking.
      break;
    }
  }

  unsigned StackSize = CCInfo.getNextStackOffset();
  // Align stack specially for tail calls.
  if (shouldGuaranteeTCO(CallConv,
                         MF.getTarget().Options.GuaranteedTailCallOpt))
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start. We
  // can skip this if there are no va_start calls.
  if (MFI.hasVAStart() &&
      (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
                   CallConv != CallingConv::X86_ThisCall))) {
    FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
  }

  // Figure out if XMM registers are in use.
  assert(!(Subtarget.useSoftFloat() &&
           F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
         "SSE register cannot be used when SSE is disabled!");

  // 64-bit calling conventions support varargs and register parameters, so we
  // have to do extra work to spill them in the prologue.
  if (Is64Bit && isVarArg && MFI.hasVAStart()) {
    // Find the first unallocated argument registers.
    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
    ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
    unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
           "SSE register cannot be used when SSE is disabled!");

    // Gather all the live in physical registers.
    SmallVector<SDValue, 6> LiveGPRs;
    SmallVector<SDValue, 8> LiveXMMRegs;
    SDValue ALVal;
    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
      unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
      LiveGPRs.push_back(
          DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
    }
    if (!ArgXMMs.empty()) {
      // AL carries the vector-register count for SysV varargs (see the
      // musttail forwarding of AL below); make it live-in so the XMM spill
      // node can consume it.
      unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
      ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
      for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
        unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
        LiveXMMRegs.push_back(
            DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
      }
    }

    if (IsWin64) {
      // Get to the caller-allocated home save location. Add 8 to account
      // for the return address.
      int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
      FuncInfo->setRegSaveFrameIndex(
          MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
      // Fixup to set vararg frame on shadow area (4 x i64).
      if (NumIntRegs < 4)
        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
    } else {
      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so
      // they may be loaded by dereferencing the result of va_next.
      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
      FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
    }

    // Store the integer parameter registers.
    SmallVector<SDValue, 8> MemOps;
    SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                      getPointerTy(DAG.getDataLayout()));
    unsigned Offset = FuncInfo->getVarArgsGPOffset();
    for (SDValue Val : LiveGPRs) {
      SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                                RSFIN, DAG.getIntPtrConstant(Offset, dl));
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       MachinePointerInfo::getFixedStack(
                           DAG.getMachineFunction(),
                           FuncInfo->getRegSaveFrameIndex(), Offset));
      MemOps.push_back(Store);
      Offset += 8;
    }

    if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
      // Now store the XMM (fp + vector) parameter registers.
      SmallVector<SDValue, 12> SaveXMMOps;
      SaveXMMOps.push_back(Chain);
      SaveXMMOps.push_back(ALVal);
      SaveXMMOps.push_back(DAG.getIntPtrConstant(
          FuncInfo->getRegSaveFrameIndex(), dl));
      SaveXMMOps.push_back(DAG.getIntPtrConstant(
          FuncInfo->getVarArgsFPOffset(), dl));
      SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
                        LiveXMMRegs.end());
      MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                   MVT::Other, SaveXMMOps));
    }

    if (!MemOps.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  }

  if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
    // Find the largest legal vector type.
    MVT VecVT = MVT::Other;
    // FIXME: Only some x86_32 calling conventions support AVX512.
    if (Subtarget.useAVX512Regs() &&
        (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
                     CallConv == CallingConv::Intel_OCL_BI)))
      VecVT = MVT::v16f32;
    else if (Subtarget.hasAVX())
      VecVT = MVT::v8f32;
    else if (Subtarget.hasSSE2())
      VecVT = MVT::v4f32;

    // We forward some GPRs and some vector types.
    SmallVector<MVT, 2> RegParmTypes;
    MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
    RegParmTypes.push_back(IntVT);
    if (VecVT != MVT::Other)
      RegParmTypes.push_back(VecVT);

    // Compute the set of forwarded registers. The rest are scratch.
    SmallVectorImpl<ForwardedRegister> &Forwards =
        FuncInfo->getForwardedMustTailRegParms();
    CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);

    // Forward AL for SysV x86_64 targets, since it is used for varargs.
    if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) {
      unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
      Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
    }

    // Copy all forwards from physical to virtual registers.
    for (ForwardedRegister &FR : Forwards) {
      // FIXME: Can we use a less constrained schedule?
      SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT);
      FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT));
      Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal);
    }
  }

  // Some CCs need callee pop.
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
    // X86 interrupts must pop the error code (and the alignment padding) if
    // present.
    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
  } else {
    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
        !Subtarget.getTargetTriple().isOSMSVCRT() &&
        argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
      FuncInfo->setBytesToPopOnReturn(4);
  }

  if (!Is64Bit) {
    // RegSaveFrameIndex is X86-64 only.
    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
    if (CallConv == CallingConv::X86_FastCall ||
        CallConv == CallingConv::X86_ThisCall)
      // fastcc functions can't have varargs.
      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
  }

  FuncInfo->setArgumentStackSize(StackSize);

  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
    EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
    if (Personality == EHPersonality::CoreCLR) {
      assert(Is64Bit);
      // TODO: Add a mechanism to frame lowering that will allow us to indicate
      // that we'd prefer this slot be allocated towards the bottom of the frame
      // (i.e. near the stack pointer after allocating the frame). Every
      // funclet needs a copy of this slot in its (mostly empty) frame, and the
      // offset from the bottom of this and each funclet's frame must be the
      // same, so the size of funclets' (mostly empty) frames is dictated by
      // how far this slot is from the bottom (since they allocate just enough
      // space to accommodate holding this slot at the correct offset).
      int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
      EHInfo->PSPSymFrameIdx = PSPSymFI;
    }
  }

  // regcall and no_caller_saved_registers functions drop the callee-saved
  // property from each of their live-in argument registers.
  if (CallConv == CallingConv::X86_RegCall ||
      F.hasFnAttribute("no_caller_saved_registers")) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
      MRI.disableCalleeSavedRegister(Pair.first);
  }

  return Chain;
}
| |
| SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, |
| SDValue Arg, const SDLoc &dl, |
| SelectionDAG &DAG, |
| const CCValAssign &VA, |
| ISD::ArgFlagsTy Flags) const { |
| unsigned LocMemOffset = VA.getLocMemOffset(); |
| SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); |
| PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), |
| StackPtr, PtrOff); |
| if (Flags.isByVal()) |
| return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); |
| |
| return DAG.getStore( |
| Chain, dl, Arg, PtrOff, |
| MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); |
| } |
| |
/// Emit a load of return address if tail call
/// optimization is performed and it is required.
/// \param [out] OutRetAddr On return, holds the loaded return-address value.
/// \return The chain result of the emitted load.
SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
  // NOTE(review): IsTailCall, Is64Bit and FPDiff are unused here; the load
  // is emitted unconditionally and callers are expected to gate the call
  // itself -- confirm at the call sites.
  // Adjust the Return address stack slot.
  EVT VT = getPointerTy(DAG.getDataLayout());
  OutRetAddr = getReturnAddressFrameIndex(DAG);

  // Load the "old" Return address.
  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
  // Result #1 of the load node is its output chain.
  return SDValue(OutRetAddr.getNode(), 1);
}
| |
| /// Emit a store of the return address if tail call |
| /// optimization is performed and it is required (FPDiff!=0). |
| static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, |
| SDValue Chain, SDValue RetAddrFrIdx, |
| EVT PtrVT, unsigned SlotSize, |
| int FPDiff, const SDLoc &dl) { |
| // Store the return address to the appropriate stack slot. |
| if (!FPDiff) return Chain; |
| // Calculate the new stack slot for the return address. |
| int NewReturnAddrFI = |
| MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, |
| false); |
| SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); |
| Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, |
| MachinePointerInfo::getFixedStack( |
| DAG.getMachineFunction(), NewReturnAddrFI)); |
| return Chain; |
| } |
| |
| /// Returns a vector_shuffle mask for an movs{s|d}, movd |
| /// operation of specified width. |
| static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, |
| SDValue V2) { |
| unsigned NumElems = VT.getVectorNumElements(); |
| SmallVector<int, 8> Mask; |
| Mask.push_back(NumElems); |
| for (unsigned i = 1; i != NumElems; ++i) |
| Mask.push_back(i); |
| return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); |
| } |
| |
/// Lower an outgoing call: classify the operands per the calling convention,
/// emit the argument copies/stores, build the CALL (or TC_RETURN) node, and
/// lower the returned values into \p InVals.
SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  // Unpack the call-lowering state prepared by SelectionDAGBuilder.
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  CallingConv::ID CallConv = CLI.CallConv;
  bool &isTailCall = CLI.IsTailCall;
  bool isVarArg = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  bool Is64Bit = Subtarget.is64Bit();
  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
  StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
  bool IsSibcall = false;
  bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
                        CallConv == CallingConv::Tail;
  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
  const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
  const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
  // "no_caller_saved_registers" may appear on either the call site or the
  // called function.
  bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
                 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
  const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
  bool HasNoCfCheck =
      (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
  const Module *M = MF.getMMI().getModule();
  Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");

  MachineFunction::CallSiteInfo CSInfo;

  if (CallConv == CallingConv::X86_INTR)
    report_fatal_error("X86 interrupts may not be called directly");

  if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
    // If we are using a GOT, disable tail calls to external symbols with
    // default visibility. Tail calling such a symbol requires using a GOT
    // relocation, which forces early binding of the symbol. This breaks code
    // that requires lazy function symbol resolution. Using musttail or
    // GuaranteedTailCallOpt will override this.
    GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
    if (!G || (!G->getGlobal()->hasLocalLinkage() &&
               G->getGlobal()->hasDefaultVisibility()))
      isTailCall = false;
  }

  bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
  if (IsMustTail) {
    // Force this to be a tail call. The verifier rules are enough to ensure
    // that we can lower this successfully without moving the return address
    // around.
    isTailCall = true;
  } else if (isTailCall) {
    // Check if it's really possible to do a tail call.
    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                    isVarArg, SR != NotStructReturn,
                    MF.getFunction().hasStructRetAttr(), CLI.RetTy,
                    Outs, OutVals, Ins, DAG);

    // Sibcalls are automatically detected tailcalls which do not require
    // ABI changes.
    if (!IsGuaranteeTCO && isTailCall)
      IsSibcall = true;

    if (isTailCall)
      ++NumTailCalls;
  }

  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
         "Var args not supported with calling convention fastcc, ghc or hipe");

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);

  CCInfo.AnalyzeArguments(Outs, CC_X86);

  // In vectorcall calling convention a second pass is required for the HVA
  // types.
  if (CallingConv::X86_VectorCall == CallConv) {
    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
  }

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
  if (IsSibcall)
    // This is a sibcall. The memory operands are available in caller's
    // own caller's stack.
    NumBytes = 0;
  else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

  int FPDiff = 0;
  if (isTailCall && !IsSibcall && !IsMustTail) {
    // Lower arguments at fp - stackoffset + fpdiff.
    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();

    FPDiff = NumBytesCallerPushed - NumBytes;

    // Set the delta of movement of the returnaddr stackslot.
    // But only set if delta is greater than previous delta.
    if (FPDiff < X86Info->getTCReturnAddrDelta())
      X86Info->setTCReturnAddrDelta(FPDiff);
  }

  unsigned NumBytesToPush = NumBytes;
  unsigned NumBytesToPop = NumBytes;

  // If we have an inalloca argument, all stack space has already been allocated
  // for us and be right at the top of the stack. We don't support multiple
  // arguments passed in memory when using inalloca.
  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
    NumBytesToPush = 0;
    if (!ArgLocs.back().isMemLoc())
      report_fatal_error("cannot use inalloca attribute on a register "
                         "parameter");
    if (ArgLocs.back().getLocMemOffset() != 0)
      report_fatal_error("any parameter with the inalloca attribute must be "
                         "the only memory argument");
  }

  if (!IsSibcall && !IsMustTail)
    Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
                                 NumBytes - NumBytesToPush, dl);

  SDValue RetAddrFrIdx;
  // Load return address for tail calls.
  if (isTailCall && FPDiff)
    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
                                    Is64Bit, FPDiff, dl);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // The next loop assumes that the locations are in the same order of the
  // input arguments.
  assert(isSortedByValueNo(ArgLocs) &&
         "Argument Location list must be sorted before lowering");

  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization arguments are handled later.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++OutIndex) {
    assert(OutIndex < Outs.size() && "Invalid Out index");
    // Skip inalloca arguments, they have already been written.
    ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
    if (Flags.isInAlloca())
      continue;

    CCValAssign &VA = ArgLocs[I];
    EVT RegVT = VA.getLocVT();
    SDValue Arg = OutVals[OutIndex];
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::AExt:
      if (Arg.getValueType().isVector() &&
          Arg.getValueType().getVectorElementType() == MVT::i1)
        Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
      else if (RegVT.is128BitVector()) {
        // Special case: passing MMX values in XMM registers.
        Arg = DAG.getBitcast(MVT::i64, Arg);
        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
      } else
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getBitcast(RegVT, Arg);
      break;
    case CCValAssign::Indirect: {
      if (isByVal) {
        // Memcpy the argument to a temporary stack slot to prevent
        // the caller from seeing any modifications the callee may make
        // as guaranteed by the `byval` attribute.
        int FrameIdx = MF.getFrameInfo().CreateStackObject(
            Flags.getByValSize(), std::max(16, (int)Flags.getByValAlign()),
            false);
        SDValue StackSlot =
            DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
        Chain =
            CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
        // From now on treat this as a regular pointer
        Arg = StackSlot;
        isByVal = false;
      } else {
        // Store the argument.
        SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
        int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
        Chain = DAG.getStore(
            Chain, dl, Arg, SpillSlot,
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
        Arg = SpillSlot;
      }
      break;
    }
    }

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");
      // Split v64i1 value into two registers
      Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
    } else if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      const TargetOptions &Options = DAG.getTarget().Options;
      if (Options.EnableDebugEntryValues)
        CSInfo.emplace_back(VA.getLocReg(), I);
      if (isVarArg && IsWin64) {
        // Win64 ABI requires argument XMM reg to be copied to the corresponding
        // shadow reg if callee is a varargs function.
        unsigned ShadowReg = 0;
        switch (VA.getLocReg()) {
        case X86::XMM0: ShadowReg = X86::RCX; break;
        case X86::XMM1: ShadowReg = X86::RDX; break;
        case X86::XMM2: ShadowReg = X86::R8; break;
        case X86::XMM3: ShadowReg = X86::R9; break;
        }
        if (ShadowReg)
          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
      }
    } else if (!IsSibcall && (!isTailCall || isByVal)) {
      assert(VA.isMemLoc());
      if (!StackPtr.getNode())
        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
                                      getPointerTy(DAG.getDataLayout()));
      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  if (Subtarget.isPICStyleGOT()) {
    // ELF / PIC requires GOT in the EBX register before function calls via PLT
    // GOT pointer.
    if (!isTailCall) {
      RegsToPass.push_back(std::make_pair(
          unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                                          getPointerTy(DAG.getDataLayout()))));
    } else {
      // If we are tail calling and generating PIC/GOT style code load the
      // address of the callee into ECX. The value in ecx is used as target of
      // the tail jump. This is done to circumvent the ebx/callee-saved problem
      // for tail calls on PIC/GOT architectures. Normally we would just put the
      // address of GOT into ebx and then call target@PLT. But for tail calls
      // ebx would be restored (since ebx is callee saved) before jumping to the
      // target@PLT.

      // Note: The actual moving to ECX is done further down.
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
      if (G && !G->getGlobal()->hasLocalLinkage() &&
          G->getGlobal()->hasDefaultVisibility())
        Callee = LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }

  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
    // From AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.

    // Count the number of XMM registers allocated.
    static const MCPhysReg XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
    assert((Subtarget.hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");

    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
                                        DAG.getConstant(NumXMMRegs, dl,
                                                        MVT::i8)));
  }

  if (isVarArg && IsMustTail) {
    // Forward the registers the caller saved for a musttail vararg thunk.
    const auto &Forwards = X86Info->getForwardedMustTailRegParms();
    for (const auto &F : Forwards) {
      SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
      RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
    }
  }

  // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
  // don't need this because the eligibility check rejects calls that require
  // shuffling arguments passed in memory.
  if (!IsSibcall && isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.
    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI = 0;
    for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
         ++I, ++OutsIndex) {
      CCValAssign &VA = ArgLocs[I];

      if (VA.isRegLoc()) {
        if (VA.needsCustom()) {
          assert((CallConv == CallingConv::X86_RegCall) &&
                 "Expecting custom case only in regcall calling convention");
          // This means that we are in special case where one argument was
          // passed through two register locations - Skip the next location
          ++I;
        }

        continue;
      }

      assert(VA.isMemLoc());
      SDValue Arg = OutVals[OutsIndex];
      ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
      // Skip inalloca arguments. They don't require any work.
      if (Flags.isInAlloca())
        continue;
      // Create frame index.
      int32_t Offset = VA.getLocMemOffset()+FPDiff;
      uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
      FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
      FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));

      if (Flags.isByVal()) {
        // Copy relative to framepointer.
        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
        if (!StackPtr.getNode())
          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
                                        getPointerTy(DAG.getDataLayout()));
        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                             StackPtr, Source);

        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                         ArgChain,
                                                         Flags, DAG, dl));
      } else {
        // Store relative to framepointer.
        MemOpChains2.push_back(DAG.getStore(
            ArgChain, dl, Arg, FIN,
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
                                     getPointerTy(DAG.getDataLayout()),
                                     RegInfo->getSlotSize(), FPDiff, dl);
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
    // In the 64-bit large code model, we have to make all calls
    // through a register, since the call instruction's 32-bit
    // pc-relative offset may not be large enough to hold the whole
    // address.
  } else if (Callee->getOpcode() == ISD::GlobalAddress ||
             Callee->getOpcode() == ISD::ExternalSymbol) {
    // Lower direct calls to global addresses and external symbols. Setting
    // ForCall to true here has the effect of removing WrapperRIP when possible
    // to allow direct calls to be selected without first materializing the
    // address into a register.
    Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
  } else if (Subtarget.isTarget64BitILP32() &&
             Callee->getValueType(0) == MVT::i32) {
    // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
    Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
  }

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SmallVector<SDValue, 8> Ops;

  // For regular (non-sibcall, non-musttail) tail calls, close the call frame
  // sequence before emitting TC_RETURN.
  if (!IsSibcall && isTailCall && !IsMustTail) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
                               DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
    InFlag = Chain.getValue(1);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (isTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
  // set X86_INTR calling convention because it has the same CSR mask
  // (same preserved registers).
  const uint32_t *Mask = RegInfo->getCallPreservedMask(
      MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

  // If this is an invoke in a 32-bit function using a funclet-based
  // personality, assume the function clobbers all registers. If an exception
  // is thrown, the runtime will not restore CSRs.
  // FIXME: Model this more precisely so that we can register allocate across
  // the normal edge and spill and fill across the exceptional edge.
  if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
    const Function &CallerFn = MF.getFunction();
    EHPersonality Pers =
        CallerFn.hasPersonalityFn()
            ? classifyEHPersonality(CallerFn.getPersonalityFn())
            : EHPersonality::Unknown;
    if (isFuncletEHPersonality(Pers))
      Mask = RegInfo->getNoPreservedMask();
  }

  // Define a new register mask from the existing mask.
  uint32_t *RegMask = nullptr;

  // In some calling conventions we need to remove the used physical registers
  // from the reg mask.
  if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
    const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

    // Allocate a new Reg Mask and copy Mask.
    RegMask = MF.allocateRegMask();
    unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
    memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);

    // Make sure all sub registers of the argument registers are reset
    // in the RegMask.
    for (auto const &RegPair : RegsToPass)
      for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
           SubRegs.isValid(); ++SubRegs)
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));

    // Create the RegMask Operand according to our updated mask.
    Ops.push_back(DAG.getRegisterMask(RegMask));
  } else {
    // Create the RegMask Operand according to the static mask.
    Ops.push_back(DAG.getRegisterMask(Mask));
  }

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (isTailCall) {
    // We used to do:
    //// If this is the first return lowered for this function, add the regs
    //// to the liveout set for the function.
    // This isn't right, although it's probably harmless on x86; liveouts
    // should be computed from returns not tail calls. Consider a void
    // function making a tail call to a function returning int.
    MF.getFrameInfo().setHasTailCall();
    SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
    DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
    return Ret;
  }

  if (HasNoCfCheck && IsCFProtectionSupported) {
    // NT_CALL suppresses the ENDBR requirement at the target (nocf_check).
    Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
  } else {
    Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
  }
  InFlag = Chain.getValue(1);
  DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));

  // Save heapallocsite metadata.
  if (CLI.CS)
    if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite"))
      DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);

  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPop;
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       DAG.getTarget().Options.GuaranteedTailCallOpt))
    NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
  else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
           !Subtarget.getTargetTriple().isOSMSVCRT() &&
           SR == StackStructReturn)
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
    NumBytesForCalleeToPop = 4;
  else
    NumBytesForCalleeToPop = 0;  // Callee pops nothing.

  if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
    // No need to reset the stack after the call if the call doesn't return. To
    // make the MI verifier happy, we'll pretend the callee does it for us.
    NumBytesForCalleeToPop = NumBytes;
  }

  // Returns a flag for retval copy to use.
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
                               DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
                                                     true),
                               InFlag, dl);
    InFlag = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
                         InVals, RegMask);
}
| |
| //===----------------------------------------------------------------------===// |
| // Fast Calling Convention (tail call) implementation |
| //===----------------------------------------------------------------------===// |
| |
// Like the StdCall calling convention (the callee cleans up the arguments),
// except that ECX is reserved for storing the address of the tail-called
// function. Only 2 registers are free for argument passing (inreg). Tail call
// optimization is performed provided:
| // * tailcallopt is enabled |
| // * caller/callee are fastcc |
| // On X86_64 architecture with GOT-style position independent code only local |
| // (within module) calls are supported at the moment. |
| // To keep the stack aligned according to platform abi the function |
| // GetAlignedArgumentStackSize ensures that argument delta is always multiples |
| // of stack alignment. (Dynamic linkers need this - darwin's dyld for example) |
| // If a tail called function callee has more arguments than the caller the |
| // caller needs to make sure that there is room to move the RETADDR to. This is |
| // achieved by reserving an area the size of the argument delta right after the |
| // original RETADDR, but before the saved framepointer or the spilled registers |
| // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) |
| // stack layout: |
| // arg1 |
| // arg2 |
| // RETADDR |
| // [ new RETADDR |
| // move area ] |
| // (possible EBP) |
| // ESI |
| // EDI |
| // local1 .. |
| |
| /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align |
| /// requirement. |
| unsigned |
| X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize, |
| SelectionDAG &DAG) const { |
| const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment()); |
| const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize(); |
| assert(StackSize % SlotSize == 0 && |
| "StackSize must be a multiple of SlotSize"); |
| return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize; |
| } |
| |
/// Return true if the given stack call argument is already available in the
/// same position (relatively) of the caller's incoming argument stack, i.e.
/// a sibcall can reuse the incoming slot instead of storing the value again.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                         const X86InstrInfo *TII, const CCValAssign &VA) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;

  for (;;) {
    // Look through nodes that don't alter the bits of the incoming value.
    unsigned Op = Arg.getOpcode();
    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
      Arg = Arg.getOperand(0);
      continue;
    }
    // A truncate of an AssertZext back down to the asserted type is likewise
    // bit-preserving for the bits being compared.
    if (Op == ISD::TRUNCATE) {
      const SDValue &TruncInput = Arg.getOperand(0);
      if (TruncInput.getOpcode() == ISD::AssertZext &&
          cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
              Arg.getValueType()) {
        Arg = TruncInput.getOperand(0);
        continue;
      }
    }
    break;
  }

  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    // The argument came through a vreg; recover the frame index from the
    // instruction that defined it.
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!Register::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      // Non-byval: the vreg must have been loaded straight from a stack slot.
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      // Byval: the vreg holds the object's address; expect an LEA of a frame
      // index.
      unsigned Opcode = Def->getOpcode();
      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
           Opcode == X86::LEA64_32r) &&
          Def->getOperand(1).isFI()) {
        FI = Def->getOperand(1).getIndex();
        Bytes = Flags.getByValSize();
      } else
        return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
    // Byval passed directly as a frame index.
    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
    FI = FINode->getIndex();
    Bytes = Flags.getByValSize();
  } else
    return false;

  assert(FI != INT_MAX);
  // Only fixed objects correspond to incoming argument slots.
  if (!MFI.isFixedObjectIndex(FI))
    return false;

  // The incoming slot must be at exactly the offset the outgoing argument
  // needs.
  if (Offset != MFI.getObjectOffset(FI))
    return false;

  // If this is not byval, check that the argument stack object is immutable.
  // inalloca and argument copy elision can create mutable argument stack
  // objects. Byval objects can be mutated, but a byval call intends to pass the
  // mutated memory.
  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
    return false;

  if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
    // If the argument location is wider than the argument type, check that any
    // extension flags match.
    if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
        Flags.isSExt() != MFI.isObjectSExt(FI)) {
      return false;
    }
  }

  // Finally, the object must be exactly the size of the value being passed.
  return Bytes == MFI.getObjectSize(FI);
}
| |
| /// Check whether the call is eligible for tail call optimization. Targets |
| /// that want to do tail call optimization should implement this function. |
| bool X86TargetLowering::IsEligibleForTailCallOptimization( |
| SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, |
| bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, |
| const SmallVectorImpl<ISD::OutputArg> &Outs, |
| const SmallVectorImpl<SDValue> &OutVals, |
| const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { |
| if (!mayTailCallThisCC(CalleeCC)) |
| return false; |
| |
| // If -tailcallopt is specified, make fastcc functions tail-callable. |
| MachineFunction &MF = DAG.getMachineFunction(); |
| const Function &CallerF = MF.getFunction(); |
| |
| // If the function return type is x86_fp80 and the callee return type is not, |
| // then the FP_EXTEND of the call result is not a nop. It's not safe to |
| // perform a tailcall optimization here. |
| if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) |
| return false; |
| |
| CallingConv::ID CallerCC = CallerF.getCallingConv(); |
| bool CCMatch = CallerCC == CalleeCC; |
| bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); |
| bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); |
| bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt || |
| CalleeCC == CallingConv::Tail; |
| |
| // Win64 functions have extra shadow space for argument homing. Don't do the |
| // sibcall if the caller and callee have mismatched expectations for this |
| // space. |
| if (IsCalleeWin64 != IsCallerWin64) |
| return false; |
| |
| if (IsGuaranteeTCO) { |
| if (canGuaranteeTCO(CalleeCC) && CCMatch) |
| return true; |
| return false; |
| } |
| |
| // Look for obvious safe cases to perform tail call optimization that do not |
| // require ABI changes. This is what gcc calls sibcall. |
| |
| // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to |
| // emit a special epilogue. |
| const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
| if (RegInfo->needsStackRealignment(MF)) |
| return false; |
| |
| // Also avoid sibcall optimization if either caller or callee uses struct |
| // return semantics. |
| if (isCalleeStructRet || isCallerStructRet) |
| return false; |
| |
| // Do not sibcall optimize vararg calls unless all arguments are passed via |
| // registers. |
| LLVMContext &C = *DAG.getContext(); |
| if (isVarArg && !Outs.empty()) { |
| // Optimizing for varargs on Win64 is unlikely to be safe without |
| // additional testing. |
| if (IsCalleeWin64 || IsCallerWin64) |
| return false; |
| |
| SmallVector<CCValAssign, 16> ArgLocs; |
| CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); |
| |
| CCInfo.AnalyzeCallOperands(Outs, CC_X86); |
| for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) |
| if (!ArgLocs[i].isRegLoc()) |
| return false; |
| } |
| |
| // If the call result is in ST0 / ST1, it needs to be popped off the x87 |
| // stack. Therefore, if it's not used by the call it is not safe to optimize |
| // this into a sibcall. |
| bool Unused = false; |
| for (unsigned i = 0, e = Ins.size(); i != e; ++i) { |
| if (!Ins[i].Used) { |
| Unused = true; |
| break; |
| } |
| } |
| if (Unused) { |
| SmallVector<CCValAssign, 16> RVLocs; |
| CCState CCInfo(CalleeCC, false, MF, RVLocs, C); |
| CCInfo.AnalyzeCallResult(Ins, RetCC_X86); |
| for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { |
| CCValAssign &VA = RVLocs[i]; |
| if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) |
| return false; |
| } |
| } |
| |
| // Check that the call results are passed in the same way. |
| if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, |
| RetCC_X86, RetCC_X86)) |
| return false; |
| // The callee has to preserve all registers the caller needs to preserve. |
| const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); |
| const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); |
| if (!CCMatch) { |
| const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); |
| if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) |
| return false; |
| } |
| |
| unsigned StackArgsSize = 0; |
| |
| // If the callee takes no arguments then go on to check the results of the |
| // call. |
| if (!Outs.empty()) { |
| // Check if stack adjustment is needed. For now, do not do this if any |
| // argument is passed on the stack. |
| SmallVector<CCValAssign, 16> ArgLocs; |
| CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); |
| |
| // Allocate shadow area for Win64 |
| if (IsCalleeWin64) |
| CCInfo.AllocateStack(32, 8); |
| |
| CCInfo.AnalyzeCallOperands(Outs, CC_X86); |
| StackArgsSize = CCInfo.getNextStackOffset(); |
| |
| if (CCInfo.getNextStackOffset()) { |
| // Check if the arguments are already laid out in the right way as |
| // the caller's fixed stack objects. |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| const MachineRegisterInfo *MRI = &MF.getRegInfo(); |
| const X86InstrInfo *TII = Subtarget.getInstrInfo(); |
| for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { |
| CCValAssign &VA = ArgLocs[i]; |
| SDValue Arg = OutVals[i]; |
| ISD::ArgFlagsTy Flags = Outs[i].Flags; |
| if (VA.getLocInfo() == CCValAssign::Indirect) |
| return false; |
| if (!VA.isRegLoc()) { |
| if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, |
| MFI, MRI, TII, VA)) |
| return false; |
| } |
| } |
| } |
| |
| bool PositionIndependent = isPositionIndependent(); |
| // If the tailcall address may be in a register, then make sure it's |
| // possible to register allocate for it. In 32-bit, the call address can |
| // only target EAX, EDX, or ECX since the tail call must be scheduled after |
| // callee-saved registers are restored. These happen to be the same |
| // registers used to pass 'inreg' arguments so watch out for those. |
| if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) && |
| !isa<ExternalSymbolSDNode>(Callee)) || |
| PositionIndependent)) { |
| unsigned NumInRegs = 0; |
| // In PIC we need an extra register to formulate the address computation |
| // for the callee. |
| unsigned MaxInRegs = PositionIndependent ? 2 : 3; |
| |
| for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { |
| CCValAssign &VA = ArgLocs[i]; |
| if (!VA.isRegLoc()) |
| continue; |
| Register Reg = VA.getLocReg(); |
| switch (Reg) { |
| default: break; |
| case X86::EAX: case X86::EDX: case X86::ECX: |
| if (++NumInRegs == MaxInRegs) |
| return false; |
| break; |
| } |
| } |
| } |
| |
| const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) |
| return false; |
| } |
| |
| bool CalleeWillPop = |
| X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, |
| MF.getTarget().Options.GuaranteedTailCallOpt); |
| |
| if (unsigned BytesToPop = |
| MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) { |
| // If we have bytes to pop, the callee must pop them. |
| bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; |
| if (!CalleePopMatches) |
| return false; |
| } else if (CalleeWillPop && StackArgsSize > 0) { |
| // If we don't have bytes to pop, make sure the callee doesn't pop any. |
| return false; |
| } |
| |
| return true; |
| } |
| |
/// Create an X86-specific FastISel instance so the fast instruction selector
/// can be used for this function's code generation.
FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  return X86::createFastISel(funcInfo, libInfo);
}
| |
| //===----------------------------------------------------------------------===// |
| // Other Lowering Hooks |
| //===----------------------------------------------------------------------===// |
| |
| static bool MayFoldLoad(SDValue Op) { |
| return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); |
| } |
| |
| static bool MayFoldIntoStore(SDValue Op) { |
| return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); |
| } |
| |
| static bool MayFoldIntoZeroExtend(SDValue Op) { |
| if (Op.hasOneUse()) { |
| unsigned Opcode = Op.getNode()->use_begin()->getOpcode(); |
| return (ISD::ZERO_EXTEND == Opcode); |
| } |
| return false; |
| } |
| |
/// Return true if Opcode is an X86ISD node that rearranges vector lanes
/// (i.e. a target-specific shuffle) whose mask can be decoded.
static bool isTargetShuffle(unsigned Opcode) {
  switch(Opcode) {
  default: return false;
  case X86ISD::BLENDI:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::SHUFP:
  case X86ISD::INSERTPS:
  case X86ISD::EXTRQI:
  case X86ISD::INSERTQI:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::MOVLHPS:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::UNPCKL:
  case X86ISD::UNPCKH:
  case X86ISD::VBROADCAST:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::SHUF128:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMI:
  case X86ISD::VPPERM:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VZEXT_MOVL:
    return true;
  }
}
| |
/// Return true if Opcode is a shuffle whose mask comes from a variable
/// operand (register/memory) rather than an immediate, including 'faux'
/// shuffles formed from logic ops.
static bool isTargetShuffleVariableMask(unsigned Opcode) {
  switch (Opcode) {
  default: return false;
  // Target Shuffles.
  case X86ISD::PSHUFB:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERMIL2:
  case X86ISD::VPPERM:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
    return true;
  // 'Faux' Target Shuffles.
  case ISD::OR:
  case ISD::AND:
  case X86ISD::ANDNP:
    return true;
  }
}
| |
/// Return a frame index addressing the return-address slot, creating (and
/// caching in X86MachineFunctionInfo) a fixed frame object for it on first
/// use.
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  int ReturnAddrIndex = FuncInfo->getRAIndex();

  // Index 0 means "not created yet"; lazily create and cache the slot.
  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    unsigned SlotSize = RegInfo->getSlotSize();
    ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
                                                          -(int64_t)SlotSize,
                                                          false);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
}
| |
| bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, |
| bool hasSymbolicDisplacement) { |
| // Offset should fit into 32 bit immediate field. |
| if (!isInt<32>(Offset)) |
| return false; |
| |
| // If we don't have a symbolic displacement - we don't have any extra |
| // restrictions. |
| if (!hasSymbolicDisplacement) |
| return true; |
| |
| // FIXME: Some tweaks might be needed for medium code model. |
| if (M != CodeModel::Small && M != CodeModel::Kernel) |
| return false; |
| |
| // For small code model we assume that latest object is 16MB before end of 31 |
| // bits boundary. We may also accept pretty large negative constants knowing |
| // that all objects are in the positive half of address space. |
| if (M == CodeModel::Small && Offset < 16*1024*1024) |
| return true; |
| |
| // For kernel code model we know that all object resist in the negative half |
| // of 32bits address space. We may not accept negative offsets, since they may |
| // be just off and we may accept pretty large positive ones. |
| if (M == CodeModel::Kernel && Offset >= 0) |
| return true; |
| |
| return false; |
| } |
| |
| /// Determines whether the callee is required to pop its own arguments. |
| /// Callee pop is necessary to support tail calls. |
| bool X86::isCalleePop(CallingConv::ID CallingConv, |
| bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { |
| // If GuaranteeTCO is true, we force some calls to be callee pop so that we |
| // can guarantee TCO. |
| if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) |
| return true; |
| |
| switch (CallingConv) { |
| default: |
| return false; |
| case CallingConv::X86_StdCall: |
| case CallingConv::X86_FastCall: |
| case CallingConv::X86_ThisCall: |
| case CallingConv::X86_VectorCall: |
| return !is64Bit; |
| } |
| } |
| |
/// Return true if the condition is a signed integer comparison operation
/// (G/GE/L/LE); unsigned and equality conditions return false.
static bool isX86CCSigned(unsigned X86CC) {
  switch (X86CC) {
  default:
    llvm_unreachable("Invalid integer condition!");
  case X86::COND_E:
  case X86::COND_NE:
  case X86::COND_B:
  case X86::COND_A:
  case X86::COND_BE:
  case X86::COND_AE:
    return false;
  case X86::COND_G:
  case X86::COND_GE:
  case X86::COND_L:
  case X86::COND_LE:
    return true;
  }
}
| |
/// Map an integer ISD::CondCode to the corresponding X86 condition code
/// (signed conditions map to G/GE/L/LE, unsigned to A/AE/B/BE).
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
  switch (SetCCOpcode) {
  default: llvm_unreachable("Invalid integer condition!");
  case ISD::SETEQ:  return X86::COND_E;
  case ISD::SETGT:  return X86::COND_G;
  case ISD::SETGE:  return X86::COND_GE;
  case ISD::SETLT:  return X86::COND_L;
  case ISD::SETLE:  return X86::COND_LE;
  case ISD::SETNE:  return X86::COND_NE;
  case ISD::SETULT: return X86::COND_B;
  case ISD::SETUGT: return X86::COND_A;
  case ISD::SETULE: return X86::COND_BE;
  case ISD::SETUGE: return X86::COND_AE;
  }
}
| |
/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make. LHS/RHS may be updated (swapped or RHS replaced by a
/// constant) to match the chosen condition code.
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
                                    bool isFP, SDValue &LHS, SDValue &RHS,
                                    SelectionDAG &DAG) {
  if (!isFP) {
    // Integer compares against special constants can use sign-flag tests.
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> X == 0, jump !sign.
        RHS = DAG.getConstant(0, DL, RHS.getValueType());
        return X86::COND_NS;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> X == 0, jump on sign.
        return X86::COND_S;
      }
      if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
        // X >= 0   -> X == 0, jump on !sign.
        return X86::COND_NS;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, DL, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    return TranslateIntegerX86CC(SetCCOpcode);
  }

  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
      !ISD::isNON_EXTLoad(RHS.getNode())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  // These conditions are handled by swapping the operands and using the
  // inverse (see the "flipped" cases in the table below).
  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  //  ZF  PF  CF   op
  //   0 | 0 | 0 | X > Y
  //   0 | 0 | 1 | X < Y
  //   1 | 0 | 0 | X == Y
  //   1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  // These have no single-condition-code equivalent.
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}
| |
/// Is there a floating point cmov for the specific X86 condition code?
/// Current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}
| |
| |
/// Describe the memory access performed by X86 memory-touching intrinsics
/// (truncating stores, gathers, scatters) so the DAG builder can attach a
/// correct MachineMemOperand. Returns false for intrinsics with no memory
/// behavior known here.
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {

  // Only intrinsics registered as having a chain are candidates.
  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
  if (!IntrData)
    return false;

  Info.flags = MachineMemOperand::MONone;
  Info.offset = 0;

  switch (IntrData->Type) {
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    // Truncating vector store: memVT is the vector of narrowed elements.
    Info.opc = ISD::INTRINSIC_VOID;
    Info.ptrVal = I.getArgOperand(0);
    MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
    MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
      ScalarVT = MVT::i8;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
      ScalarVT = MVT::i16;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
      ScalarVT = MVT::i32;

    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
    Info.align = Align::None();
    Info.flags |= MachineMemOperand::MOStore;
    break;
  }
  case GATHER:
  case GATHER_AVX2: {
    // Gather load: no single base pointer; width limited by the narrower of
    // the data and index vectors.
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    MVT DataVT = MVT::getVT(I.getType());
    MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
    unsigned NumElts = std::min(DataVT.getVectorNumElements(),
                                IndexVT.getVectorNumElements());
    Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
    Info.align = Align::None();
    Info.flags |= MachineMemOperand::MOLoad;
    break;
  }
  case SCATTER: {
    // Scatter store: mirror of the gather case.
    Info.opc = ISD::INTRINSIC_VOID;
    Info.ptrVal = nullptr;
    MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
    MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
    unsigned NumElts = std::min(DataVT.getVectorNumElements(),
                                IndexVT.getVectorNumElements());
    Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
    Info.align = Align::None();
    Info.flags |= MachineMemOperand::MOStore;
    break;
  }
  default:
    return false;
  }

  return true;
}
| |
| /// Returns true if the target can instruction select the |
| /// specified FP immediate natively. If false, the legalizer will |
| /// materialize the FP immediate as a load from a constant pool. |
| bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, |
| bool ForCodeSize) const { |
| for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { |
| if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) |
| return true; |
| } |
| return false; |
| } |
| |
/// Decide whether a wide load may be narrowed to NewVT. Keeps GOTTPOFF
/// loads intact for TLS relaxation, and keeps AVX loads whole when all of
/// their uses are extract+store patterns that can be store-folded.
bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                              ISD::LoadExtType ExtTy,
                                              EVT NewVT) const {
  assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");

  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
  // relocation target a movq or addq instruction: don't let the load shrink.
  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
  if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
    if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;

  // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
  // those uses are extracted directly into a store, then the extract + store
  // can be store-folded. Therefore, it's probably not worth splitting the load.
  EVT VT = Load->getValueType(0);
  if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
    for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
      // Skip uses of the chain value. Result 0 of the node is the load value.
      if (UI.getUse().getResNo() != 0)
        continue;

      // If this use is not an extract + store, it's probably worth splitting.
      if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
          UI->use_begin()->getOpcode() != ISD::STORE)
        return true;
    }
    // All non-chain uses are extract + store.
    return false;
  }

  return true;
}
| |
| /// Returns true if it is beneficial to convert a load of a constant |
| /// to just the constant itself. |
| bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, |
| Type *Ty) const { |
| assert(Ty->isIntegerTy()); |
| |
| unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
| if (BitSize == 0 || BitSize > 64) |
| return false; |
| return true; |
| } |
| |
| bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const { |
| // If we are using XMM registers in the ABI and the condition of the select is |
| // a floating-point compare and we have blendv or conditional move, then it is |
| // cheaper to select instead of doing a cross-register move and creating a |
| // load that depends on the compare result. |
| bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128; |
| return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX(); |
| } |
| |
| bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const { |
| // TODO: It might be a win to ease or lift this restriction, but the generic |
| // folds in DAGCombiner conflict with vector folds for an AVX512 target. |
| if (VT.isVector() && Subtarget.hasAVX512()) |
| return false; |
| |
| return true; |
| } |
| |
/// Return true if a multiply by the splat constant C should be decomposed
/// into cheaper shift+add/sub sequences instead of using a vector multiply.
bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
                                               SDValue C) const {
  // TODO: We handle scalars using custom code, but generic combining could make
  // that unnecessary.
  APInt MulC;
  if (!ISD::isConstantSplatVector(C.getNode(), MulC))
    return false;

  // Find the type this will be legalized too. Otherwise we might prematurely
  // convert this to shl+add/sub and then still have to type legalize those ops.
  // Another choice would be to defer the decision for illegal types until
  // after type legalization. But constant splat vectors of i64 can't make it
  // through type legalization on 32-bit targets so we would need to special
  // case vXi64.
  while (getTypeAction(Context, VT) != TypeLegal)
    VT = getTypeToTransformTo(Context, VT);

  // If vector multiply is legal, assume that's faster than shl + add/sub.
  // TODO: Multiply is a complex op with higher latency and lower throughput in
  // most implementations, so this check could be loosened based on type
  // and/or a CPU attribute.
  if (isOperationLegal(ISD::MUL, VT))
    return false;

  // shl+add, shl+sub, shl+add+neg
  return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
         (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
}
| |
/// Return true if extracting a ResVT subvector from SrcVT at element Index
/// is cheap (i.e. maps onto a subregister extraction or a supported mask-op).
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  // Mask vectors support all subregister combinations and operations that
  // extract half of vector.
  if (ResVT.getVectorElementType() == MVT::i1)
    return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
                          (Index == ResVT.getVectorNumElements()));

  // Otherwise the extract must be aligned to a subvector boundary.
  return (Index % ResVT.getVectorNumElements()) == 0;
}
| |
| bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const { |
| unsigned Opc = VecOp.getOpcode(); |
| |
| // Assume target opcodes can't be scalarized. |
| // TODO - do we have any exceptions? |
| if (Opc >= ISD::BUILTIN_OP_END) |
| return false; |
| |
| // If the vector op is not supported, try to convert to scalar. |
| EVT VecVT = VecOp.getValueType(); |
| if (!isOperationLegalOrCustomOrPromote(Opc, VecVT)) |
| return true; |
| |
| // If the vector op is supported, but the scalar op is not, the transform may |
| // not be worthwhile. |
| EVT ScalarVT = VecVT.getScalarType(); |
| return isOperationLegalOrCustomOrPromote(Opc, ScalarVT); |
| } |
| |
| bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const { |
| // TODO: Allow vectors? |
| if (VT.isVector()) |
| return false; |
| return VT.isSimple() || !isOperationExpand(Opcode, VT); |
| } |
| |
/// cttz is cheap to speculate only when TZCNT provides a branch-free lowering.
bool X86TargetLowering::isCheapToSpeculateCttz() const {
  // Speculate cttz only if we can directly use TZCNT.
  return Subtarget.hasBMI();
}
| |
/// ctlz is cheap to speculate only when LZCNT provides a branch-free lowering.
bool X86TargetLowering::isCheapToSpeculateCtlz() const {
  // Speculate ctlz only if we can directly use LZCNT.
  return Subtarget.hasLZCNT();
}
| |
/// Return true if folding a bitcast into a load of LoadVT (producing
/// BitcastVT) is profitable; mask-register cases need AVX512/DQI support.
bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
                                                const SelectionDAG &DAG,
                                                const MachineMemOperand &MMO) const {
  // Scalar-to-vXi1 bitcasts need AVX512 mask registers to be worthwhile.
  if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
      BitcastVT.getVectorElementType() == MVT::i1)
    return false;

  // i8 -> v8i1 needs KMOVB, which requires DQI.
  if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
    return false;

  // If both types are legal vectors, it's always ok to convert them.
  if (LoadVT.isVector() && BitcastVT.isVector() &&
      isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
    return true;

  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}
| |
| bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, |
| const SelectionDAG &DAG) const { |
| // Do not merge to float value size (128 bytes) if no implicit |
| // float attribute is set. |
| bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute( |
| Attribute::NoImplicitFloat); |
| |
| if (NoFloat) { |
| unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32; |
| return (MemVT.getSizeInBits() <= MaxIntSize); |
| } |
| // Make sure we don't merge greater than our preferred vector |
| // width. |
| if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth()) |
| return false; |
| return true; |
| } |
| |
/// ctlz is "fast" only on subtargets with a fast LZCNT implementation.
bool X86TargetLowering::isCtlzFast() const {
  return Subtarget.hasFastLZCNT();
}
| |
/// (and X, mask) followed by a compare-with-zero folds into TEST on x86, so
/// this is always beneficial.
bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  return true;
}
| |
| bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { |
| EVT VT = Y.getValueType(); |
| |
| if (VT.isVector()) |
| return false; |
| |
| if (!Subtarget.hasBMI()) |
| return false; |
| |
| // There are only 32-bit and 64-bit forms for 'andn'. |
| if (VT != MVT::i32 && VT != MVT::i64) |
| return false; |
| |
| return !isa<ConstantSDNode>(Y); |
| } |
| |
| bool X86TargetLowering::hasAndNot(SDValue Y) const { |
| EVT VT = Y.getValueType(); |
| |
| if (!VT.isVector()) |
| return hasAndNotCompare(Y); |
| |
| // Vector. |
| |
| if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128) |
| return false; |
| |
| if (VT == MVT::v4i32) |
| return true; |
| |
| return Subtarget.hasSSE2(); |
| } |
| |
/// x86 has the BT instruction for scalar integer bit tests.
bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
  return X.getValueType().isScalarInteger(); // 'bt'
}
| |
/// Decide whether ((X << C) op Y) & M -> ((X op (Y >> C)) & (M >> C)) style
/// hoisting is profitable on x86; vectors need splat shifts or AVX2.
bool X86TargetLowering::
    shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
        SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
        SelectionDAG &DAG) const {
  // Does baseline recommend not to perform the fold by default?
  if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
          X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
    return false;
  // For scalars this transform is always beneficial.
  if (X.getValueType().isScalarInteger())
    return true;
  // If all the shift amounts are identical, then transform is beneficial even
  // with rudimentary SSE2 shifts.
  if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
    return true;
  // If we have AVX2 with it's powerful shift operations, then it's also good.
  if (Subtarget.hasAVX2())
    return true;
  // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
  return NewShiftOpcode == ISD::SHL;
}
| |
/// Decide whether (shl (srl X, C), C) / (srl (shl X, C), C) should be folded
/// to an AND mask; only done on subtargets with fast shift-mask sequences.
bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
    const SDNode *N, CombineLevel Level) const {
  assert(((N->getOpcode() == ISD::SHL &&
           N->getOperand(0).getOpcode() == ISD::SRL) ||
          (N->getOpcode() == ISD::SRL &&
           N->getOperand(0).getOpcode() == ISD::SHL)) &&
         "Expected shift-shift mask");
  EVT VT = N->getValueType(0);
  if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
      (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
    // Only fold if the shift values are equal - so it folds to AND.
    // TODO - we should fold if either is a non-uniform vector but we don't do
    // the fold for non-splats yet.
    return N->getOperand(1) == N->getOperand(0).getOperand(1);
  }
  return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
}
| |
| bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { |
| EVT VT = Y.getValueType(); |
| |
| // For vectors, we don't have a preference, but we probably want a mask. |
| if (VT.isVector()) |
| return false; |
| |
| // 64-bit shifts on 32-bit targets produce really bad bloated code. |
| if (VT == MVT::i64 && !Subtarget.is64Bit()) |
| return false; |
| |
| return true; |
| } |
| |
| bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG, |
| SDNode *N) const { |
| if (DAG.getMachineFunction().getFunction().hasMinSize() && |
| !Subtarget.isOSWindows()) |
| return false; |
| return true; |
| } |
| |
/// Prefer splatting a variable-index insert-element over a load/spill cycle
/// whenever the vector type is legal.
bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
  // Any legal vector type can be splatted more efficiently than
  // loading/spilling from memory.
  return isTypeLegal(VT);
}
| |
/// Return a type suitable for a fast NumBits-wide equality compare: a legal
/// integer type if one exists, else a byte vector usable with (V)PMOVMSKB.
/// Returns INVALID_SIMPLE_VALUE_TYPE when no fast option is available.
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
  MVT VT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(VT))
    return VT;

  // PMOVMSKB can handle this.
  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
    return MVT::v16i8;

  // VPMOVMSKB can handle this.
  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
    return MVT::v32i8;

  // TODO: Allow 64-bit type for 32-bit target.
  // TODO: 512-bit types should be allowed, but make sure that those
  // cases are handled in combineVectorSizedSetCCEquality().

  return MVT::INVALID_SIMPLE_VALUE_TYPE;
}
| |
| /// Val is the undef sentinel value or equal to the specified value. |
| static bool isUndefOrEqual(int Val, int CmpVal) { |
| return ((Val == SM_SentinelUndef) || (Val == CmpVal)); |
| } |
| |
| /// Val is either the undef or zero sentinel value. |
| static bool isUndefOrZero(int Val) { |
| return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero)); |
| } |
| |
| /// Return true if every element in Mask, beginning from position Pos and ending |
| /// in Pos+Size is the undef sentinel value. |
| static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { |
| return llvm::all_of(Mask.slice(Pos, Size), |
| [](int M) { return M == SM_SentinelUndef; }); |
| } |
| |
| /// Return true if the mask creates a vector whose lower half is undefined. |
| static bool isUndefLowerHalf(ArrayRef<int> Mask) { |
| unsigned NumElts = Mask.size(); |
| return isUndefInRange(Mask, 0, NumElts / 2); |
| } |
| |
| /// Return true if the mask creates a vector whose upper half is undefined. |
| static bool isUndefUpperHalf(ArrayRef<int> Mask) { |
| unsigned NumElts = Mask.size(); |
| return isUndefInRange(Mask, NumElts / 2, NumElts / 2); |
| } |
| |
/// Return true if Val lies in the half-open interval [Low, Hi).
static bool isInRange(int Val, int Low, int Hi) {
  return Low <= Val && Val < Hi;
}
| |
/// Return true if the value of any element in Mask falls within the
/// half-open range [Low, Hi).
static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
  return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
}
| |
| /// Return true if Val is undef or if its value falls within the |
| /// specified range (L, H]. |
| static bool isUndefOrInRange(int Val, int Low, int Hi) { |
| return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi); |
| } |
| |
/// Return true if every element in Mask is undef or if its value
/// falls within the half-open range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  return llvm::all_of(
      Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
}
| |
| /// Return true if Val is undef, zero or if its value falls within the |
| /// specified range (L, H]. |
| static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) { |
| return isUndefOrZero(Val) || isInRange(Val, Low, Hi); |
| } |
| |
/// Return true if every element in Mask is undef, zero or if its value
/// falls within the half-open range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  return llvm::all_of(
      Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
}
| |
| /// Return true if every element in Mask, beginning |
| /// from position Pos and ending in Pos + Size, falls within the specified |
| /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef. |
| static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos, |
| unsigned Size, int Low, int Step = 1) { |
| for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step) |
| if (!isUndefOrEqual(Mask[i], Low)) |
| return false; |
| return true; |
| } |
| |
| /// Return true if every element in Mask, beginning |
| /// from position Pos and ending in Pos+Size, falls within the specified |
| /// sequential range (Low, Low+Size], or is undef or is zero. |
| static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, |
| unsigned Size, int Low, |
| int Step = 1) { |
| for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step) |
| if (!isUndefOrZero(Mask[i]) && Mask[i] != Low) |
| return false; |
| return true; |
| } |
| |
| /// Return true if every element in Mask, beginning |
| /// from position Pos and ending in Pos+Size is undef or is zero. |
| static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, |
| unsigned Size) { |
| return llvm::all_of(Mask.slice(Pos, Size), |
| [](int M) { return isUndefOrZero(M); }); |
| } |
| |
/// Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    SmallVectorImpl<int> &WidenedMask) {
  WidenedMask.assign(Mask.size() / 2, 0);
  // Walk the mask in adjacent pairs; each pair must merge into one wide lane.
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    int M0 = Mask[i];
    int M1 = Mask[i + 1];

    // If both elements are undef, its trivial.
    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
      WidenedMask[i / 2] = SM_SentinelUndef;
      continue;
    }

    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
      WidenedMask[i / 2] = M1 / 2;
      continue;
    }
    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // When zeroing, we need to spread the zeroing across both lanes to widen.
    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
        WidenedMask[i / 2] = SM_SentinelZero;
        continue;
      }
      // One lane is zero and the other references a real element: cannot
      // express this with a single wide lane.
      return false;
    }

    // Finally check if the two mask values are adjacent and aligned with
    // a pair.
    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // Otherwise we can't safely widen the elements used in this shuffle.
    return false;
  }
  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");

  return true;
}
| |
| static bool canWidenShuffleElements(ArrayRef<int> Mask, |
| const APInt &Zeroable, |
| bool V2IsZero, |
| SmallVectorImpl<int> &WidenedMask) { |
| // Create an alternative mask with info about zeroable elements. |
| // Here we do not set undef elements as zeroable. |
| SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end()); |
| if (V2IsZero) { |
| assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!"); |
| for (int i = 0, Size = Mask.size(); i != Size; ++i) |
| if (Mask[i] != SM_SentinelUndef && Zeroable[i]) |
| ZeroableMask[i] = SM_SentinelZero; |
| } |
| return canWidenShuffleElements(ZeroableMask, WidenedMask); |
| } |
| |
| static bool canWidenShuffleElements(ArrayRef<int> Mask) { |
| SmallVector<int, 32> WidenedMask; |
| return canWidenShuffleElements(Mask, WidenedMask); |
| } |
| |
| /// Returns true if Elt is a constant zero or a floating point constant +0.0. |
| bool X86::isZeroNode(SDValue Elt) { |
| return isNullConstant(Elt) || isNullFPConstant(Elt); |
| } |
| |
// Build a vector of constants of type VT from the Values array.
// Use an UNDEF node if MaskElt == -1 (only honored when IsMask is true).
// Split 64-bit constants in the 32-bit mode.
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
                              const SDLoc &dl, bool IsMask = false) {

  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  // If i64 isn't a legal type (32-bit mode), build the vector with twice as
  // many i32 elements and bitcast back to VT at the end.
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsUndef = Values[i] < 0 && IsMask;
    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
                     DAG.getConstant(Values[i], dl, EltVT);
    Ops.push_back(OpNode);
    if (Split)
      // High 32 bits of each split element are emitted as zero.
      // NOTE(review): this assumes each value fits in the low 32 bits (true
      // for shuffle-mask indices); a negative non-mask value would lose its
      // sign bits here -- confirm callers only pass small non-negative values.
      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
                    DAG.getConstant(0, dl, EltVT));
  }
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  if (Split)
    ConstsNode = DAG.getBitcast(VT, ConstsNode);
  return ConstsNode;
}
| |
/// Build a constant vector of type VT from raw APInt element bits, with
/// per-element undefs given by the Undefs bitmask. 64-bit integer elements
/// are split into i32 pairs when i64 isn't a legal type (32-bit mode).
static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");
  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
    // Undef element: one UNDEF op (two when splitting 64-bit into i32 pairs).
    if (Undefs[i]) {
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }
    const APInt &V = Bits[i];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
    if (Split) {
      // Emit the low then the high 32-bit half of the 64-bit constant.
      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
    } else if (EltVT == MVT::f32) {
      // Reinterpret the raw bits as a single-precision float constant.
      APFloat FV(APFloat::IEEEsingle(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else if (EltVT == MVT::f64) {
      // Reinterpret the raw bits as a double-precision float constant.
      APFloat FV(APFloat::IEEEdouble(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else {
      Ops.push_back(DAG.getConstant(V, dl, EltVT));
    }
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
}
| |
/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
          VT.getVectorElementType() == MVT::i1) &&
         "Unexpected vector type");

  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
  // type. This ensures they get CSE'd. But if the integer type is not
  // available, use a floating-point +0.0 instead.
  SDValue Vec;
  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
    // SSE1 only: no 128-bit integer vectors, so build a <4 x float> zero.
    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
  } else if (VT.isFloatingPoint()) {
    Vec = DAG.getConstantFP(+0.0, dl, VT);
  } else if (VT.getVectorElementType() == MVT::i1) {
    // AVX-512 mask vector: build the zero directly in the i1 vector type.
    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
           "Unexpected vector type");
    Vec = DAG.getConstant(0, dl, VT);
  } else {
    // Canonical case: zero as <N x i32>, bitcast to VT below for CSE.
    unsigned Num32BitElts = VT.getSizeInBits() / 32;
    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
  }
  return DAG.getBitcast(VT, Vec);
}
| |
/// Extract a vectorWidth-bit chunk from Vec, starting at the chunk that
/// contains element IdxVal (IdxVal is rounded down to a chunk boundary).
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
                                const SDLoc &dl, unsigned vectorWidth) {
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  // Result type keeps the element type and shrinks the element count by the
  // ratio of the source width to the requested chunk width.
  unsigned Factor = VT.getSizeInBits()/vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(ResultVT, dl,
                              Vec->ops().slice(IdxVal, ElemsPerChunk));

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
| |
| /// Generate a DAG to grab 128-bits from a vector > 128 bits. This |
| /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 |
| /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 |
| /// instructions or a simple subregister reference. Idx is an index in the |
| /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes |
| /// lowering EXTRACT_VECTOR_ELT operations easier. |
| static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, |
| SelectionDAG &DAG, const SDLoc &dl) { |
| assert((Vec.getValueType().is256BitVector() || |
| Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); |
| return extractSubVector(Vec, IdxVal, DAG, dl, 128); |
| } |
| |
| /// Generate a DAG to grab 256-bits from a 512-bit vector. |
| static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, |
| SelectionDAG &DAG, const SDLoc &dl) { |
| assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); |
| return extractSubVector(Vec, IdxVal, DAG, dl, 256); |
| } |
| |
/// Insert Vec (a vectorWidth-bit subvector) into Result at the chunk
/// containing element IdxVal (IdxVal is rounded down to a chunk boundary).
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                               SelectionDAG &DAG, const SDLoc &dl,
                               unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting UNDEF is Result
  if (Vec.isUndef())
    return Result;
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
| |
| /// Generate a DAG to put 128-bits into a vector > 128 bits. This |
| /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or |
| /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a |
| /// simple superregister reference. Idx is an index in the 128 bits |
| /// we want. It need not be aligned to a 128-bit boundary. That makes |
| /// lowering INSERT_VECTOR_ELT operations easier. |
| static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, |
| SelectionDAG &DAG, const SDLoc &dl) { |
| assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); |
| return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128); |
| } |
| |
| /// Widen a vector to a larger size with the same scalar type, with the new |
| /// elements either zero or undef. |
| static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, |
| const X86Subtarget &Subtarget, SelectionDAG &DAG, |
| const SDLoc &dl) { |
| assert(Vec.getValueSizeInBits() < VT.getSizeInBits() && |
| Vec.getValueType().getScalarType() == VT.getScalarType() && |
| "Unsupported vector widening type"); |
| SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl) |
| : DAG.getUNDEF(VT); |
| return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec, |
| DAG.getIntPtrConstant(0, dl)); |
| } |
| |
| /// Widen a vector to a larger size with the same scalar type, with the new |
| /// elements either zero or undef. |
| static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements, |
| const X86Subtarget &Subtarget, SelectionDAG &DAG, |
| const SDLoc &dl, unsigned WideSizeInBits) { |
| assert(Vec.getValueSizeInBits() < WideSizeInBits && |
| (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 && |
| "Unsupported vector widening type"); |
| unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits(); |
| MVT SVT = Vec.getSimpleValueType().getScalarType(); |
| MVT VT = MVT::getVectorVT(SVT, WideNumElts); |
| return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl); |
| } |
| |
// Helper function to collect subvector ops that are concated together,
// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
// The subvectors in Ops are guaranteed to be the same type.
// Returns false (leaving Ops empty) if N doesn't match either pattern.
static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
  assert(Ops.empty() && "Expected an empty ops vector");

  if (N->getOpcode() == ISD::CONCAT_VECTORS) {
    Ops.append(N->op_begin(), N->op_end());
    return true;
  }

  if (N->getOpcode() == ISD::INSERT_SUBVECTOR &&
      isa<ConstantSDNode>(N->getOperand(2))) {
    SDValue Src = N->getOperand(0);
    SDValue Sub = N->getOperand(1);
    const APInt &Idx = N->getConstantOperandAPInt(2);
    EVT VT = Src.getValueType();
    EVT SubVT = Sub.getValueType();

    // TODO - Handle more general insert_subvector chains.
    // Match the 2x-subvector pattern: Sub inserted into the upper half of a
    // vector whose lower half was itself filled by an insert at index 0.
    if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
        Idx == (VT.getVectorNumElements() / 2) &&
        Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
        Src.getOperand(1).getValueType() == SubVT &&
        isNullConstant(Src.getOperand(2))) {
      Ops.push_back(Src.getOperand(1));
      Ops.push_back(Sub);
      return true;
    }
  }

  return false;
}
| |
// Helper for splitting operands of an operation to legal target size and
// apply a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
// The argument Builder is a function that will be applied on each split part:
// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
template <typename F>
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
                         const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
                         F Builder, bool CheckBWI = true) {
  assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
  // Pick the widest natively-supported register size for this subtarget and
  // derive how many pieces VT must be split into.
  unsigned NumSubs = 1;
  if ((CheckBWI && Subtarget.useBWIRegs()) ||
      (!CheckBWI && Subtarget.useAVX512Regs())) {
    if (VT.getSizeInBits() > 512) {
      NumSubs = VT.getSizeInBits() / 512;
      assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
    }
  } else if (Subtarget.hasAVX2()) {
    if (VT.getSizeInBits() > 256) {
      NumSubs = VT.getSizeInBits() / 256;
      assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
    }
  } else {
    if (VT.getSizeInBits() > 128) {
      NumSubs = VT.getSizeInBits() / 128;
      assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
    }
  }

  // No splitting needed - apply the builder to the operands directly.
  if (NumSubs == 1)
    return Builder(DAG, DL, Ops);

  // Build each piece from the matching subvectors of every operand, then
  // concatenate the per-piece results back into VT.
  SmallVector<SDValue, 4> Subs;
  for (unsigned i = 0; i != NumSubs; ++i) {
    SmallVector<SDValue, 2> SubOps;
    for (SDValue Op : Ops) {
      EVT OpVT = Op.getValueType();
      unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
      unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
      SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
    }
    Subs.push_back(Builder(DAG, DL, SubOps));
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
}
| |
/// Insert i1-subvector to i1-vector. Lowers INSERT_SUBVECTOR on AVX-512 mask
/// vectors via KSHIFT/AND/OR sequences, widening to a natively supported
/// mask width first when needed. Returns SDValue() for non-constant indices.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {

  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue SubVec = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);

  // Only constant insertion indices are handled here.
  if (!isa<ConstantSDNode>(Idx))
    return SDValue();

  // Inserting undef is a nop. We can just return the original vector.
  if (SubVec.isUndef())
    return Vec;

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
    return Op;

  MVT OpVT = Op.getSimpleValueType();
  unsigned NumElems = OpVT.getVectorNumElements();

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);

  // Extend to natively supported kshift.
  MVT WideOpVT = OpVT;
  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
    WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;

  // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
  // if necessary.
  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
    // May need to promote to a legal type.
    Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                     DAG.getConstant(0, dl, WideOpVT),
                     SubVec, Idx);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  MVT SubVecVT = SubVec.getSimpleValueType();
  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();

  // For i1 vectors getSizeInBits() equals the element count, so the second
  // conjunct checks that IdxVal is a multiple of the subvector length.
  assert(IdxVal + SubVecNumElems <= NumElems &&
         IdxVal % SubVecVT.getSizeInBits() == 0 &&
         "Unexpected index value in INSERT_SUBVECTOR");

  SDValue Undef = DAG.getUNDEF(WideOpVT);

  if (IdxVal == 0) {
    // Zero lower bits of the Vec
    SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
                      ZeroIdx);
    // Shift right then left to clear the low SubVecNumElems bits.
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
    // Merge them together, SubVec should be zero extended.
    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                         DAG.getConstant(0, dl, WideOpVT),
                         SubVec, ZeroIdx);
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                       Undef, SubVec, ZeroIdx);

  if (Vec.isUndef()) {
    assert(IdxVal != 0 && "Unexpected index");
    // Shift the subvector into position; the remaining bits are undef anyway.
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(IdxVal, dl, MVT::i8));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
  }

  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
    assert(IdxVal != 0 && "Unexpected index");
    // Shift all the way left, then right, so the subvector lands at IdxVal
    // with zeros everywhere else.
    NumElems = WideOpVT.getVectorNumElements();
    unsigned ShiftLeft = NumElems - SubVecNumElems;
    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
    if (ShiftRight != 0)
      SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                           DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
  }

  // Simple case when we put subvector in the upper part
  if (IdxVal + SubVecNumElems == NumElems) {
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(IdxVal, dl, MVT::i8));
    if (SubVecNumElems * 2 == NumElems) {
      // Special case, use legal zero extending insert_subvector. This allows
      // isel to optimize when bits are known zero.
      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                        DAG.getConstant(0, dl, WideOpVT),
                        Vec, ZeroIdx);
    } else {
      // Otherwise use explicit shifts to zero the bits.
      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                        Undef, Vec, ZeroIdx);
      NumElems = WideOpVT.getVectorNumElements();
      SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
      Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
      Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    }
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  // Inserting into the middle is more complicated.

  NumElems = WideOpVT.getVectorNumElements();

  // Widen the vector if needed.
  Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);

  unsigned ShiftLeft = NumElems - SubVecNumElems;
  unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;

  // Do an optimization for the most frequently used types.
  if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
    // Clear the inserted lanes of Vec with an AND mask, position SubVec with
    // a left/right kshift pair, then OR the two halves together.
    APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
    Mask0.flipAllBits();
    SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
    SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
    Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
    SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);

    // Reduce to original width if needed.
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  // Clear the upper bits of the subvector and move it to its insert position.
  SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                       DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
  SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                       DAG.getTargetConstant(ShiftRight, dl, MVT::i8));

  // Isolate the bits below the insertion point.
  unsigned LowShift = NumElems - IdxVal;
  SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
                            DAG.getTargetConstant(LowShift, dl, MVT::i8));
  Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
                    DAG.getTargetConstant(LowShift, dl, MVT::i8));

  // Isolate the bits after the last inserted bit.
  unsigned HighShift = IdxVal + SubVecNumElems;
  SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
                             DAG.getTargetConstant(HighShift, dl, MVT::i8));
  High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
                     DAG.getTargetConstant(HighShift, dl, MVT::i8));

  // Now OR all 3 pieces together.
  Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
  SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);

  // Reduce to original width if needed.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
| |
| static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, |
| const SDLoc &dl) { |
| assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch"); |
| EVT SubVT = V1.getValueType(); |
| EVT SubSVT = SubVT.getScalarType(); |
| unsigned SubNumElts = SubVT.getVectorNumElements(); |
| unsigned SubVectorWidth = SubVT.getSizeInBits(); |
| EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts); |
| SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth); |
| return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth); |
| } |
| |
| /// Returns a vector of specified type with all bits set. |
| /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>. |
| /// Then bitcast to their original type, ensuring they get CSE'd. |
| static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { |
| assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && |
| "Expected a 128/256/512-bit vector type"); |
| |
| APInt Ones = APInt::getAllOnesValue(32); |
| unsigned NumElts = VT.getSizeInBits() / 32; |
| SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts)); |
| return DAG.getBitcast(VT, Vec); |
| } |
| |
| // Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode. |
| static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) { |
| switch (Opcode) { |
| case ISD::ANY_EXTEND: |
| case ISD::ANY_EXTEND_VECTOR_INREG: |
| return ISD::ANY_EXTEND_VECTOR_INREG; |
| case ISD::ZERO_EXTEND: |
| case ISD::ZERO_EXTEND_VECTOR_INREG: |
| return ISD::ZERO_EXTEND_VECTOR_INREG; |
| case ISD::SIGN_EXTEND: |
| case ISD::SIGN_EXTEND_VECTOR_INREG: |
| return ISD::SIGN_EXTEND_VECTOR_INREG; |
| } |
| llvm_unreachable("Unknown opcode"); |
| } |
| |
/// Create an extension (any/sign/zero) of In to vector type VT, switching to
/// the *_EXTEND_VECTOR_INREG form when only the lower input elements are used.
static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
                              SDValue In, SelectionDAG &DAG) {
  EVT InVT = In.getValueType();
  assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
  assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
          ISD::ZERO_EXTEND == Opcode) &&
         "Unknown extension opcode");

  // For 256-bit vectors, we only need the lower (128-bit) input half.
  // For 512-bit vectors, we only need the lower input half or quarter.
  if (InVT.getSizeInBits() > 128) {
    assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
           "Expected VTs to be the same size!");
    // Scale is how many times wider each output element is than each input
    // element; only 1/Scale of the input lanes (at least 128 bits) are used.
    unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
    In = extractSubVector(In, 0, DAG, DL,
                          std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
    InVT = In.getValueType();
  }

  // If the element counts differ, only the lower input elements participate,
  // which is what the *_VECTOR_INREG opcodes express.
  if (VT.getVectorNumElements() != InVT.getVectorNumElements())
    Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);

  return DAG.getNode(Opcode, DL, VT, In);
}
| |
// Match (xor X, -1) -> X.
// Match extract_subvector(xor X, -1) -> extract_subvector(X).
// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
// Returns the un-negated value if V is a (possibly wrapped) logical NOT,
// otherwise returns an empty SDValue.
static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
  V = peekThroughBitcasts(V);
  // Direct NOT: xor against an all-ones build vector.
  if (V.getOpcode() == ISD::XOR &&
      ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
    return V.getOperand(0);
  // Look through extract_subvector: recurse into the source, then re-extract
  // the same subvector from the un-negated value.
  if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
    if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
      Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
                         Not, V.getOperand(1));
    }
  }
  // Look through concatenations: every concatenated piece must itself be a
  // NOT, otherwise the whole match fails.
  SmallVector<SDValue, 2> CatOps;
  if (collectConcatOps(V.getNode(), CatOps)) {
    for (SDValue &CatOp : CatOps) {
      SDValue NotCat = IsNOT(CatOp, DAG);
      if (!NotCat) return SDValue();
      CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
    }
    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
  }
  return SDValue();
}
| |
| /// Returns a vector_shuffle node for an unpackl operation. |
| static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT, |
| SDValue V1, SDValue V2) { |
| SmallVector<int, 8> Mask; |
| createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false); |
| return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); |
| } |
| |
| /// Returns a vector_shuffle node for an unpackh operation. |
| static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT, |
| SDValue V1, SDValue V2) { |
| SmallVector<int, 8> Mask; |
| createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false); |
| return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); |
| } |
| |
| /// Return a vector_shuffle of the specified vector of zero or undef vector. |
| /// This produces a shuffle where the low element of V2 is swizzled into the |
| /// zero/undef vector, landing at element Idx. |
| /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). |
| static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, |
| bool IsZero, |
| const X86Subtarget &Subtarget, |
| SelectionDAG &DAG) { |
| MVT VT = V2.getSimpleValueType(); |
| SDValue V1 = IsZero |
| ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); |
| int NumElems = VT.getVectorNumElements(); |
| SmallVector<int, 16> MaskVec(NumElems); |
| for (int i = 0; i != NumElems; ++i) |
| // If this is the insertion idx, put the low elt of V2 here. |
| MaskVec[i] = (i == Idx) ? NumElems : i; |
| return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec); |
| } |
| |
| static const Constant *getTargetConstantFromNode(LoadSDNode *Load) { |
| if (!Load || !ISD::isNormalLoad(Load)) |
| return nullptr; |
| |
| SDValue Ptr = Load->getBasePtr(); |
| if (Ptr->getOpcode() == X86ISD::Wrapper || |
| Ptr->getOpcode() == X86ISD::WrapperRIP) |
| Ptr = Ptr->getOperand(0); |
| |
| auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr); |
| if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0) |
| return nullptr; |
| |
| return CNode->getConstVal(); |
| } |
| |
| static const Constant *getTargetConstantFromNode(SDValue Op) { |
| Op = peekThroughBitcasts(Op); |
| return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)); |
| } |
| |
// Returns the constant-pool Constant that LD loads from (see the static
// getTargetConstantFromNode helper above), or null if LD isn't a plain load
// of a constant-pool entry.
const Constant *
X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
  assert(LD && "Unexpected null LoadSDNode");
  return getTargetConstantFromNode(LD);
}
| |
| // Extract raw constant bits from constant pools. |
| static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, |
| APInt &UndefElts, |
| SmallVectorImpl<APInt> &EltBits, |
| bool AllowWholeUndefs = true, |
| bool AllowPartialUndefs = true) { |
| assert(EltBits.empty() && "Expected an empty EltBits vector"); |
| |
| Op = peekThroughBitcasts(Op); |
| |
| EVT VT = Op.getValueType(); |
| unsigned SizeInBits = VT.getSizeInBits(); |
| assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!"); |
| unsigned NumElts = SizeInBits / EltSizeInBits; |
| |
| // Bitcast a source array of element bits to the target size. |
| auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) { |
| unsigned NumSrcElts = UndefSrcElts.getBitWidth(); |
| unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth(); |
| assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits && |
| "Constant bit sizes don't match"); |
| |
| // Don't split if we don't allow undef bits. |
| bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs; |
| if (UndefSrcElts.getBoolValue() && !AllowUndefs) |
| return false; |
| |
| // If we're already the right size, don't bother bitcasting. |
| if (NumSrcElts == NumElts) { |
| UndefElts = UndefSrcElts; |
| EltBits.assign(SrcEltBits.begin(), SrcEltBits.end()); |
| return true; |
| } |
| |
| // Extract all the undef/constant element data and pack into single bitsets. |
| APInt UndefBits(SizeInBits, 0); |
| APInt MaskBits(SizeInBits, 0); |
| |
| for (unsigned i = 0; i != NumSrcElts; ++i) { |
| unsigned BitOffset = i * SrcEltSizeInBits; |
| if (UndefSrcElts[i]) |
| UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits); |
| MaskBits.insertBits(SrcEltBits[i], BitOffset); |
| } |
| |
| // Split the undef/constant single bitset data into the target elements. |
| UndefElts = APInt(NumElts, 0); |
| EltBits.resize(NumElts, APInt(EltSizeInBits, 0)); |
| |
| for (unsigned i = 0; i != NumElts; ++i) { |
| unsigned BitOffset = i * EltSizeInBits; |
| APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset); |
| |
| // Only treat an element as UNDEF if all bits are UNDEF. |
| if (UndefEltBits.isAllOnesValue()) { |
| if (!AllowWholeUndefs) |
| return false; |
| UndefElts.setBit(i); |
| continue; |
| } |
| |
| // If only some bits are UNDEF then treat them as zero (or bail if not |
| // supported). |
| if (UndefEltBits.getBoolValue() && !AllowPartialUndefs) |
| return false; |
| |
| EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset); |
| } |
| return true; |
| }; |
| |
| // Collect constant bits and insert into mask/undef bit masks. |
| auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs, |
| unsigned UndefBitIndex) { |
| if (!Cst) |
| return false; |
| if (isa<UndefValue>(Cst)) { |
| Undefs.setBit(UndefBitIndex); |
| return true; |
| } |
| if (auto *CInt = dyn_cast<ConstantInt>(Cst)) { |
| Mask = CInt->getValue(); |
| return true; |
| } |
| if (auto *CFP = dyn_cast<ConstantFP>(Cst)) { |
| Mask = CFP->getValueAPF().bitcastToAPInt(); |
| return true; |
| } |
| return false; |
| }; |
| |
| // Handle UNDEFs. |
| if (Op.isUndef()) { |
| APInt UndefSrcElts = APInt::getAllOnesValue(NumElts); |
| SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0)); |
| return CastBitData(UndefSrcElts, SrcEltBits); |
| } |
| |
| // Extract scalar constant bits. |
| if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) { |
| APInt UndefSrcElts = APInt::getNullValue(1); |
| SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue()); |
| return CastBitData(UndefSrcElts, SrcEltBits); |
| } |
| if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) { |
| APInt UndefSrcElts = APInt::getNullValue(1); |
| APInt RawBits = Cst->getValueAPF().bitcastToAPInt(); |
| SmallVector<APInt, 64> SrcEltBits(1, RawBits); |
| return CastBitData(UndefSrcElts, SrcEltBits); |
| } |
| |
| // Extract constant bits from build vector. |
| if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { |
| unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); |
| unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; |
| |
| APInt UndefSrcElts(NumSrcElts, 0); |
| SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); |
| for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { |
| const SDValue &Src = Op.getOperand(i); |
| if (Src.isUndef()) { |
| UndefSrcElts.setBit(i); |
| continue; |
| } |
| auto *Cst = cast<ConstantSDNode>(Src); |
| SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits); |
| } |
| return CastBitData(UndefSrcElts, SrcEltBits); |
| } |
| if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) { |
| unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); |
| unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; |
| |
| APInt UndefSrcElts(NumSrcElts, 0); |
| SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); |
| for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { |
| const SDValue &Src = Op.getOperand(i); |
| if (Src.isUndef()) { |
| UndefSrcElts.setBit(i); |
| continue; |
| } |
| auto *Cst = cast<ConstantFPSDNode>(Src); |
| APInt RawBits = Cst->getValueAPF().bitcastToAPInt(); |
| SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits); |
| } |
| return CastBitData(UndefSrcElts, SrcEltBits); |
| } |
| |
| // Extract constant bits from constant pool vector. |
| if (auto *Cst = getTargetConstantFromNode(Op)) { |
| Type *CstTy = Cst->getType(); |
| unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits(); |
| if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0) |
| return false; |
| |
| unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits(); |
| unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; |
| |
| APInt UndefSrcElts(NumSrcElts, 0); |
| SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); |
| for (unsigned i = 0; i != NumSrcElts; ++i) |
| if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i], |
| UndefSrcElts, i)) |
| return false; |
| |
| return CastBitData(UndefSrcElts, SrcEltBits); |
| } |
| |
| // Extract constant bits from a broadcasted constant pool scalar. |
| if (Op.getOpcode() == X86ISD::VBROADCAST && |
| EltSizeInBits <= VT.getScalarSizeInBits()) { |
| if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) { |
| unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits(); |
| unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; |
| |
| APInt UndefSrcElts(NumSrcElts, 0); |
| SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); |
| if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) { |
| if (UndefSrcElts[0]) |
| UndefSrcElts.setBits(0, NumSrcElts); |
| SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); |
| return CastBitData(UndefSrcElts, SrcEltBits); |
| } |
| } |
| } |
| |
| if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD && |
| EltSizeInBits <= VT.getScalarSizeInBits()) { |
| auto *MemIntr = cast<MemIntrinsicSDNode>(Op); |
| if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits()) |
| return false; |
| |
| SDValue Ptr = MemIntr->getBasePtr(); |
| if (Ptr->getOpcode() == X86ISD::Wrapper || |
| Ptr->getOpcode() == X86ISD::WrapperRIP) |
| Ptr = Ptr->getOperand(0); |
| |
| auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr); |
| if (!CNode || CNode->isMachineConstantPoolEntry() || |
| CNode->getOffset() != 0) |
| return false; |
| |
| if (const Constant *C = CNode->getConstVal()) { |
| unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits(); |
| unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; |
| |
| APInt UndefSrcElts(NumSrcElts, 0); |
| SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); |
| if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) { |
| if (UndefSrcElts[0]) |
| UndefSrcElts.setBits(0, NumSrcElts); |
| SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); |
| return CastBitData(UndefSrcElts, SrcEltBits); |
| } |
| } |
| } |
| |
| // Extract constant bits from a subvector broadcast. |
| if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) { |
| SmallVector<APInt, 16> SubEltBits; |
| if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, |
| UndefElts, SubEltBits, AllowWholeUndefs, |
| AllowPartialUndefs)) { |
| UndefElts = APInt::getSplat(NumElts, UndefElts); |
| while (EltBits.size() < NumElts) |
| EltBits.append(SubEltBits.begin(), SubEltBits.end()); |
| return true; |
| } |
| } |
| |
| // Extract a rematerialized scalar constant insertion. |
| if (Op.getOpcode() == X86ISD::VZEXT_MOVL && |
| Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && |
| isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) { |
| unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); |
| unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; |
| |
| APInt UndefSrcElts(NumSrcElts, 0); |
| SmallVector<APInt, 64> SrcEltBits; |
| auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0)); |
| SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits)); |
| SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0)); |
| return CastBitData(UndefSrcElts, SrcEltBits); |
| } |
| |
| // Insert constant bits from a base and sub vector sources. |
| if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && |
| isa<ConstantSDNode>(Op.getOperand(2))) { |
| // TODO - support insert_subvector through bitcasts. |
| if (EltSizeInBits != VT.getScalarSizeInBits()) |
| return false; |
| |
| APInt UndefSubElts; |
| SmallVector<APInt, 32> EltSubBits; |
| if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits, |
| UndefSubElts, EltSubBits, |
| AllowWholeUndefs, AllowPartialUndefs) && |
| getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, |
| UndefElts, EltBits, AllowWholeUndefs, |
| AllowPartialUndefs)) { |
| unsigned BaseIdx = Op.getConstantOperandVal(2); |
| UndefElts.insertBits(UndefSubElts, BaseIdx); |
| for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i) |
| EltBits[BaseIdx + i] = EltSubBits[i]; |
| return true; |
| } |
| } |
| |
| // Extract constant bits from a subvector's source. |
| if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && |
| isa<ConstantSDNode>(Op.getOperand(1))) { |
| // TODO - support extract_subvector through bitcasts. |
| if (EltSizeInBits != VT.getScalarSizeInBits()) |
| return false; |
| |
| if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, |
| UndefElts, EltBits, AllowWholeUndefs, |
| AllowPartialUndefs)) { |
| EVT SrcVT = Op.getOperand(0).getValueType(); |
| unsigned NumSrcElts = SrcVT.getVectorNumElements(); |
| unsigned NumSubElts = VT.getVectorNumElements(); |
| unsigned BaseIdx = Op.getConstantOperandVal(1); |
| UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx); |
| if ((BaseIdx + NumSubElts) != NumSrcElts) |
| EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end()); |
| if (BaseIdx != 0) |
| EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx); |
| return true; |
| } |
| } |
| |
| // Extract constant bits from shuffle node sources. |
| if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) { |
| // TODO - support shuffle through bitcasts. |
| if (EltSizeInBits != VT.getScalarSizeInBits()) |
| return false; |
| |
| ArrayRef<int> Mask = SVN->getMask(); |
| if ((!AllowWholeUndefs || !AllowPartialUndefs) && |
| llvm::any_of(Mask, [](int M) { return M < 0; })) |
| return false; |
| |
| APInt UndefElts0, UndefElts1; |
| SmallVector<APInt, 32> EltBits0, EltBits1; |
| if (isAnyInRange(Mask, 0, NumElts) && |
| !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, |
| UndefElts0, EltBits0, AllowWholeUndefs, |
| AllowPartialUndefs)) |
| return false; |
| if (isAnyInRange(Mask, NumElts, 2 * NumElts) && |
| !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits, |
| UndefElts1, EltBits1, AllowWholeUndefs, |
| AllowPartialUndefs)) |
| return false; |
| |
| UndefElts = APInt::getNullValue(NumElts); |
| for (int i = 0; i != (int)NumElts; ++i) { |
| int M = Mask[i]; |
| if (M < 0) { |
| UndefElts.setBit(i); |
| EltBits.push_back(APInt::getNullValue(EltSizeInBits)); |
| } else if (M < (int)NumElts) { |
| if (UndefElts0[M]) |
| UndefElts.setBit(i); |
| EltBits.push_back(EltBits0[M]); |
| } else { |
| if (UndefElts1[M - NumElts]) |
| UndefElts.setBit(i); |
| EltBits.push_back(EltBits1[M - NumElts]); |
| } |
| } |
| return true; |
| } |
| |
| return false; |
| } |
| |
| namespace llvm { |
| namespace X86 { |
| bool isConstantSplat(SDValue Op, APInt &SplatVal) { |
| APInt UndefElts; |
| SmallVector<APInt, 16> EltBits; |
| if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(), |
| UndefElts, EltBits, true, false)) { |
| int SplatIndex = -1; |
| for (int i = 0, e = EltBits.size(); i != e; ++i) { |
| if (UndefElts[i]) |
| continue; |
| if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) { |
| SplatIndex = -1; |
| break; |
| } |
| SplatIndex = i; |
| } |
| if (0 <= SplatIndex) { |
| SplatVal = EltBits[SplatIndex]; |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| } // namespace X86 |
| } // namespace llvm |
| |
| static bool getTargetShuffleMaskIndices(SDValue MaskNode, |
| unsigned MaskEltSizeInBits, |
| SmallVectorImpl<uint64_t> &RawMask, |
| APInt &UndefElts) { |
| // Extract the raw target constant bits. |
| SmallVector<APInt, 64> EltBits; |
| if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts, |
| EltBits, /* AllowWholeUndefs */ true, |
| /* AllowPartialUndefs */ false)) |
| return false; |
| |
| // Insert the extracted elements into the mask. |
| for (APInt Elt : EltBits) |
| RawMask.push_back(Elt.getZExtValue()); |
| |
| return true; |
| } |
| |
| /// Create a shuffle mask that matches the PACKSS/PACKUS truncation. |
| /// Note: This ignores saturation, so inputs must be checked first. |
| static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, |
| bool Unary) { |
| assert(Mask.empty() && "Expected an empty shuffle mask vector"); |
| unsigned NumElts = VT.getVectorNumElements(); |
| unsigned NumLanes = VT.getSizeInBits() / 128; |
| unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits(); |
| unsigned Offset = Unary ? 0 : NumElts; |
| |
| for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { |
| for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) |
| Mask.push_back(Elt + (Lane * NumEltsPerLane)); |
| for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) |
| Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset); |
| } |
| } |
| |
| // Split the demanded elts of a PACKSS/PACKUS node between its operands. |
| static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, |
| APInt &DemandedLHS, APInt &DemandedRHS) { |
| int NumLanes = VT.getSizeInBits() / 128; |
| int NumElts = DemandedElts.getBitWidth(); |
| int NumInnerElts = NumElts / 2; |
| int NumEltsPerLane = NumElts / NumLanes; |
| int NumInnerEltsPerLane = NumInnerElts / NumLanes; |
| |
| DemandedLHS = APInt::getNullValue(NumInnerElts); |
| DemandedRHS = APInt::getNullValue(NumInnerElts); |
| |
| // Map DemandedElts to the packed operands. |
| for (int Lane = 0; Lane != NumLanes; ++Lane) { |
| for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) { |
| int OuterIdx = (Lane * NumEltsPerLane) + Elt; |
| int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt; |
| if (DemandedElts[OuterIdx]) |
| DemandedLHS.setBit(InnerIdx); |
| if (DemandedElts[OuterIdx + NumInnerEltsPerLane]) |
| DemandedRHS.setBit(InnerIdx); |
| } |
| } |
| } |
| |
| // Split the demanded elts of a HADD/HSUB node between its operands. |
| static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, |
| APInt &DemandedLHS, APInt &DemandedRHS) { |
| int NumLanes = VT.getSizeInBits() / 128; |
| int NumElts = DemandedElts.getBitWidth(); |
| int NumEltsPerLane = NumElts / NumLanes; |
| int HalfEltsPerLane = NumEltsPerLane / 2; |
| |
| DemandedLHS = APInt::getNullValue(NumElts); |
| DemandedRHS = APInt::getNullValue(NumElts); |
| |
| // Map DemandedElts to the horizontal operands. |
| for (int Idx = 0; Idx != NumElts; ++Idx) { |
| if (!DemandedElts[Idx]) |
| continue; |
| int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane; |
| int LocalIdx = Idx % NumEltsPerLane; |
| if (LocalIdx < HalfEltsPerLane) { |
| DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0); |
| DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1); |
| } else { |
| LocalIdx -= HalfEltsPerLane; |
| DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0); |
| DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1); |
| } |
| } |
| } |
| |
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
/// Sets \p IsUnary to true if only one source is used. Note that this will set
/// IsUnary for shuffles which use a single input multiple times, and in those
/// cases it will adjust the mask to only have indices within that single input.
/// It is an error to call this with non-empty Mask/Ops vectors.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
                                 SmallVectorImpl<SDValue> &Ops,
                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  unsigned NumElems = VT.getVectorNumElements();
  unsigned MaskEltSize = VT.getScalarSizeInBits();
  // Scratch space for shuffles whose mask is a variable (constant pool)
  // operand rather than an immediate (PSHUFB, VPERMV, etc.).
  SmallVector<uint64_t, 32> RawMask;
  APInt RawUndefs;
  // Immediate control operand for the fixed-immediate shuffle cases.
  SDValue ImmN;

  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

  IsUnary = false;
  // Set when a binary shuffle reads the same node through both operands; the
  // mask is remapped to reference only the first input after the switch.
  bool IsFakeUnary = false;
  switch (N->getOpcode()) {
  case X86ISD::BLENDI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::SHUFP:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodeSHUFPMask(NumElems, MaskEltSize,
                    cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::INSERTPS:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::EXTRQI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    // Only decodable with constant bit length/index operands; otherwise Mask
    // stays empty and the decode fails below.
    if (isa<ConstantSDNode>(N->getOperand(1)) &&
        isa<ConstantSDNode>(N->getOperand(2))) {
      int BitLen = N->getConstantOperandVal(1);
      int BitIdx = N->getConstantOperandVal(2);
      DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
      IsUnary = true;
    }
    break;
  case X86ISD::INSERTQI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    // Only decodable with constant bit length/index operands; otherwise Mask
    // stays empty and the decode fails below.
    if (isa<ConstantSDNode>(N->getOperand(2)) &&
        isa<ConstantSDNode>(N->getOperand(3))) {
      int BitLen = N->getConstantOperandVal(2);
      int BitIdx = N->getConstantOperandVal(3);
      DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
      IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    }
    break;
  case X86ISD::UNPCKH:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKL:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVHLPS:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeMOVHLPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVLHPS:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeMOVLHPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::PALIGNR:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                      Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    // Operands are deliberately pushed in reverse (1, 0) to line up with the
    // operand order the decoded mask indices reference.
    Ops.push_back(N->getOperand(1));
    Ops.push_back(N->getOperand(0));
    break;
  case X86ISD::VSHLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                     Mask);
    IsUnary = true;
    break;
  case X86ISD::VSRLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                     Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFD:
  case X86ISD::VPERMILPI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSHUFMask(NumElems, MaskEltSize,
                    cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFHW:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                      Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFLW:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                      Mask);
    IsUnary = true;
    break;
  case X86ISD::VZEXT_MOVL:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    DecodeZeroMoveLowMask(NumElems, Mask);
    IsUnary = true;
    break;
  case X86ISD::VBROADCAST: {
    SDValue N0 = N->getOperand(0);
    // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
    // add the pre-extracted value to the Ops vector.
    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        N0.getOperand(0).getValueType() == VT &&
        N0.getConstantOperandVal(1) == 0)
      Ops.push_back(N0.getOperand(0));

    // We only decode broadcasts of same-sized vectors, unless the broadcast
    // came from an extract from the original width. If we found one, we
    // pushed it to the Ops vector above.
    if (N0.getValueType() == VT || !Ops.empty()) {
      DecodeVectorBroadcast(NumElems, Mask);
      IsUnary = true;
      break;
    }
    return false;
  }
  case X86ISD::VPERMILPV: {
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
                                    RawUndefs)) {
      DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::PSHUFB: {
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    // PSHUFB masks are always byte-sized indices.
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
      DecodePSHUFBMask(RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
    break;
  case X86ISD::VPERM2X128:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                         Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::SHUF128:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize,
                              cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVSLDUP:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    DecodeMOVSLDUPMask(NumElems, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSHDUP:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    DecodeMOVSHDUPMask(NumElems, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVDDUP:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    DecodeMOVDDUPMask(NumElems, Mask);
    IsUnary = true;
    break;
  case X86ISD::VPERMIL2: {
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    // Operand 2 is the variable selector, operand 3 the immediate control.
    SDValue MaskNode = N->getOperand(2);
    SDValue CtrlNode = N->getOperand(3);
    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
      unsigned CtrlImm = CtrlOp->getZExtValue();
      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
                                      RawUndefs)) {
        DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
                            Mask);
        break;
      }
    }
    return false;
  }
  case X86ISD::VPPERM: {
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    SDValue MaskNode = N->getOperand(2);
    // VPPERM selector elements are always bytes.
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
      DecodeVPPERMMask(RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV: {
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = true;
    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
    Ops.push_back(N->getOperand(1));
    SDValue MaskNode = N->getOperand(0);
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
                                    RawUndefs)) {
      DecodeVPERMVMask(RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV3: {
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
    // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
    Ops.push_back(N->getOperand(0));
    Ops.push_back(N->getOperand(2));
    SDValue MaskNode = N->getOperand(1);
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
                                    RawUndefs)) {
      DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  default: llvm_unreachable("unknown target shuffle node");
  }

  // Empty mask indicates the decode failed.
  if (Mask.empty())
    return false;

  // Check if we're getting a shuffle mask with zero'd elements.
  if (!AllowSentinelZero)
    if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
      return false;

  // If we have a fake unary shuffle, the shuffle mask is spread across two
  // inputs that are actually the same node. Re-map the mask to always point
  // into the first input.
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= Mask.size();

  // If we didn't already add operands in the opcode-specific code, default to
  // adding 1 or 2 operands starting at 0.
  if (Ops.empty()) {
    Ops.push_back(N->getOperand(0));
    if (!IsUnary || IsFakeUnary)
      Ops.push_back(N->getOperand(1));
  }

  return true;
}
| |
| /// Compute whether each element of a shuffle is zeroable. |
| /// |
| /// A "zeroable" vector shuffle element is one which can be lowered to zero. |
| /// Either it is an undef element in the shuffle mask, the element of the input |
| /// referenced is undef, or the element of the input referenced is known to be |
| /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle |
| /// as many lanes with this technique as possible to simplify the remaining |
| /// shuffle. |
| static void computeZeroableShuffleElements(ArrayRef<int> Mask, |
| SDValue V1, SDValue V2, |
| APInt &KnownUndef, APInt &KnownZero) { |
| int Size = Mask.size(); |
| KnownUndef = KnownZero = APInt::getNullValue(Size); |
| |
| V1 = peekThroughBitcasts(V1); |
| V2 = peekThroughBitcasts(V2); |
| |
| bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); |
| bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); |
| |
| int VectorSizeInBits = V1.getValueSizeInBits(); |
| int ScalarSizeInBits = VectorSizeInBits / Size; |
| assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size"); |
| |
| for (int i = 0; i < Size; ++i) { |
| int M = Mask[i]; |
| // Handle the easy cases. |
| if (M < 0) { |
| KnownUndef.setBit(i); |
| continue; |
| } |
| if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { |
| KnownZero.setBit(i); |
| continue; |
| } |
| |
| // Determine shuffle input and normalize the mask. |
| SDValue V = M < Size ? V1 : V2; |
| M %= Size; |
| |
| // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. |
| if (V.getOpcode() != ISD::BUILD_VECTOR) |
| continue; |
| |
| // If the BUILD_VECTOR has fewer elements then the bitcasted portion of |
| // the (larger) source element must be UNDEF/ZERO. |
| if ((Size % V.getNumOperands()) == 0) { |
| int Scale = Size / V->getNumOperands(); |
| SDValue Op = V.getOperand(M / Scale); |
| if (Op.isUndef()) |
| KnownUndef.setBit(i); |
| if (X86::isZeroNode(Op)) |
| KnownZero.setBit(i); |
| else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { |
| APInt Val = Cst->getAPIntValue(); |
| Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits); |
| if (Val == 0) |
| KnownZero.setBit(i); |
| } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) { |
| APInt Val = Cst->getValueAPF().bitcastToAPInt(); |
| Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits); |
| if (Val == 0) |
| KnownZero.setBit(i); |
| } |
| continue; |
| } |
| |
| // If the BUILD_VECTOR has more elements then all the (smaller) source |
| // elements must be UNDEF or ZERO. |
| if ((V.getNumOperands() % Size) == 0) { |
| int Scale = V->getNumOperands() / Size; |
| bool AllUndef = true; |
| bool AllZero = true; |
| for (int j = 0; j < Scale; ++j) { |
| SDValue Op = V.getOperand((M * Scale) + j); |
| AllUndef &= Op.isUndef(); |
| AllZero &= X86::isZeroNode(Op); |
| } |
| if (AllUndef) |
| KnownUndef.setBit(i); |
| if (AllZero) |
| KnownZero.setBit(i); |
| continue; |
| } |
| } |
| } |
| |
/// Decode a target shuffle mask and inputs and see if any values are
/// known to be undef or zero from their inputs.
/// Returns true if the target shuffle mask was decoded.
/// \p KnownUndef / \p KnownZero are bitmasks over the shuffle's result
/// elements.
/// FIXME: Merge this with computeZeroableShuffleElements?
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
                                         SmallVectorImpl<SDValue> &Ops,
                                         APInt &KnownUndef, APInt &KnownZero) {
  bool IsUnary;
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  // Decode with AllowSentinelZero=true; sentinels are handled in the loop.
  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
    return false;

  int Size = Mask.size();
  SDValue V1 = Ops[0];
  // Unary shuffles reference only the first operand through both halves of
  // the mask range.
  SDValue V2 = IsUnary ? V1 : Ops[1];
  KnownUndef = KnownZero = APInt::getNullValue(Size);

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  assert((VT.getSizeInBits() % Size) == 0 &&
         "Illegal split of shuffle value type");
  unsigned EltSizeInBits = VT.getSizeInBits() / Size;

  // Extract known constant input data.
  APInt UndefSrcElts[2];
  SmallVector<APInt, 32> SrcEltBits[2];
  bool IsSrcConstant[2] = {
      getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
                                    SrcEltBits[0], true, false),
      getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
                                    SrcEltBits[1], true, false)};

  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];

    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
    if (M < 0) {
      assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
      if (SM_SentinelUndef == M)
        KnownUndef.setBit(i);
      if (SM_SentinelZero == M)
        KnownZero.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    // Note: SrcIdx must be computed from the unnormalized index
    // (0 selects V1's constant data, 1 selects V2's).
    unsigned SrcIdx = M / Size;
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // We are referencing an UNDEF input.
    if (V.isUndef()) {
      KnownUndef.setBit(i);
      continue;
    }

    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
    // TODO: We currently only set UNDEF for integer types - floats use the same
    // registers as vectors and many of the scalar folded loads rely on the
    // SCALAR_TO_VECTOR pattern.
    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        (Size % V.getValueType().getVectorNumElements()) == 0) {
      int Scale = Size / V.getValueType().getVectorNumElements();
      int Idx = M / Scale;
      if (Idx != 0 && !VT.isFloatingPoint())
        KnownUndef.setBit(i);
      else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
        KnownZero.setBit(i);
      continue;
    }

    // Attempt to extract from the source's constant bits.
    if (IsSrcConstant[SrcIdx]) {
      if (UndefSrcElts[SrcIdx][M])
        KnownUndef.setBit(i);
      else if (SrcEltBits[SrcIdx][M] == 0)
        KnownZero.setBit(i);
    }
  }

  assert(VT.getVectorNumElements() == (unsigned)Size &&
         "Different mask size from vector size!");
  return true;
}
| |
| // Replace target shuffle mask elements with known undef/zero sentinels. |
| static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask, |
| const APInt &KnownUndef, |
| const APInt &KnownZero, |
| bool ResolveKnownZeros= true) { |
| unsigned NumElts = Mask.size(); |
| assert(KnownUndef.getBitWidth() == NumElts && |
| KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch"); |
| |
| for (unsigned i = 0; i != NumElts; ++i) { |
| if (KnownUndef[i]) |
| Mask[i] = SM_SentinelUndef; |
| else if (ResolveKnownZeros && KnownZero[i]) |
| Mask[i] = SM_SentinelZero; |
| } |
| } |
| |
| // Extract target shuffle mask sentinel elements to known undef/zero bitmasks. |
| static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask, |
| APInt &KnownUndef, |
| APInt &KnownZero) { |
| unsigned NumElts = Mask.size(); |
| KnownUndef = KnownZero = APInt::getNullValue(NumElts); |
| |
| for (unsigned i = 0; i != NumElts; ++i) { |
| int M = Mask[i]; |
| if (SM_SentinelUndef == M) |
| KnownUndef.setBit(i); |
| if (SM_SentinelZero == M) |
| KnownZero.setBit(i); |
| } |
| } |
| |
// Forward declaration — getFauxShuffleMask recurses into this to decode
// nested shuffle operands (see the ISD::OR and ISD::INSERT_SUBVECTOR cases).
// TODO: Use DemandedElts variant.
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
                                   SmallVectorImpl<int> &Mask,
                                   SelectionDAG &DAG, unsigned Depth,
                                   bool ResolveKnownElts);
| |
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
// destination value type.
// On success, Ops holds the shuffle source vectors and Mask holds indices
// into the concatenation of those sources (plus SM_SentinelUndef /
// SM_SentinelZero entries).
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
                               SmallVectorImpl<int> &Mask,
                               SmallVectorImpl<SDValue> &Ops,
                               SelectionDAG &DAG, unsigned Depth,
                               bool ResolveKnownElts) {
  // Bound the recursion — this is re-entered via getTargetShuffleInputs.
  if (Depth > SelectionDAG::MaxRecursionDepth)
    return false;

  Mask.clear();
  Ops.clear();

  MVT VT = N.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumSizeInBits = VT.getSizeInBits();
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
  // Several cases below work at byte granularity, so require whole-byte
  // element and vector sizes.
  if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
    return false;
  assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");

  unsigned Opcode = N.getOpcode();
  switch (Opcode) {
  case ISD::VECTOR_SHUFFLE: {
    // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
    ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
    if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
      Mask.append(ShuffleMask.begin(), ShuffleMask.end());
      Ops.push_back(N.getOperand(0));
      Ops.push_back(N.getOperand(1));
      return true;
    }
    return false;
  }
  case ISD::AND:
  case X86ISD::ANDNP: {
    // Attempt to decode as a per-byte mask. Each constant mask byte must be
    // all-ones (keep) or all-zeros (clear); for ANDNP the constant is the
    // inverted (first) operand, so the keep/clear roles swap.
    APInt UndefElts;
    SmallVector<APInt, 32> EltBits;
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    bool IsAndN = (X86ISD::ANDNP == Opcode);
    uint64_t ZeroMask = IsAndN ? 255 : 0;
    if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
      return false;
    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
      if (UndefElts[i]) {
        Mask.push_back(SM_SentinelUndef);
        continue;
      }
      const APInt &ByteBits = EltBits[i];
      if (ByteBits != 0 && ByteBits != 255)
        return false;
      Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
    }
    // The single shuffle source is the non-constant operand.
    Ops.push_back(IsAndN ? N1 : N0);
    return true;
  }
  case ISD::OR: {
    // Inspect each operand at the byte level. We can merge these into a
    // blend shuffle mask if for each byte at least one is masked out (zero).
    KnownBits Known0 =
        DAG.computeKnownBits(N.getOperand(0), DemandedElts, Depth + 1);
    KnownBits Known1 =
        DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1);
    if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
      bool IsByteMask = true;
      unsigned NumSizeInBytes = NumSizeInBits / 8;
      unsigned NumBytesPerElt = NumBitsPerElt / 8;
      APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
      APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
      // Classify each byte of one element: select from RHS, known zero, or
      // select from LHS - any other combination blocks the transform.
      for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
        unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue();
        unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue();
        if (LHS == 255 && RHS == 0)
          SelectMask.setBit(i);
        else if (LHS == 255 && RHS == 255)
          ZeroMask.setBit(i);
        else if (!(LHS == 0 && RHS == 255))
          IsByteMask = false;
      }
      if (IsByteMask) {
        // Replicate the per-element byte pattern across the whole vector.
        for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) {
          for (unsigned j = 0; j != NumBytesPerElt; ++j) {
            unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0);
            int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs));
            Mask.push_back(Idx);
          }
        }
        Ops.push_back(N.getOperand(0));
        Ops.push_back(N.getOperand(1));
        return true;
      }
    }

    // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
    // is a valid shuffle index.
    SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
    SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
    if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
      return false;
    SmallVector<int, 64> SrcMask0, SrcMask1;
    SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
    if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
                                true) ||
        !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
                                true))
      return false;
    // Widen both masks to a common width before merging per-element.
    size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
    SmallVector<int, 64> Mask0, Mask1;
    scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
    scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
    for (size_t i = 0; i != MaskSize; ++i) {
      // OR with zero is the identity, so take the element from whichever
      // side is not known zero; both live is not representable as a shuffle.
      if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
        Mask.push_back(SM_SentinelUndef);
      else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
        Mask.push_back(SM_SentinelZero);
      else if (Mask1[i] == SM_SentinelZero)
        Mask.push_back(Mask0[i]);
      else if (Mask0[i] == SM_SentinelZero)
        Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size()));
      else
        return false;
    }
    Ops.append(SrcInputs0.begin(), SrcInputs0.end());
    Ops.append(SrcInputs1.begin(), SrcInputs1.end());
    return true;
  }
  case ISD::INSERT_SUBVECTOR: {
    SDValue Src = N.getOperand(0);
    SDValue Sub = N.getOperand(1);
    EVT SubVT = Sub.getValueType();
    unsigned NumSubElts = SubVT.getVectorNumElements();
    // Need a constant insertion index, and Sub must not have other users
    // (we may rewrite/wrap it below).
    if (!isa<ConstantSDNode>(N.getOperand(2)) ||
        !N->isOnlyUserOf(Sub.getNode()))
      return false;
    uint64_t InsertIdx = N.getConstantOperandVal(2);
    // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
    if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        Sub.getOperand(0).getValueType() == VT &&
        isa<ConstantSDNode>(Sub.getOperand(1))) {
      uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
      // Identity mask from Src, then overlay the extracted range.
      for (int i = 0; i != (int)NumElts; ++i)
        Mask.push_back(i);
      for (int i = 0; i != (int)NumSubElts; ++i)
        Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
      Ops.push_back(Src);
      Ops.push_back(Sub.getOperand(0));
      return true;
    }
    // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
    SmallVector<int, 64> SubMask;
    SmallVector<SDValue, 2> SubInputs;
    if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
                                SubMask, DAG, Depth + 1, ResolveKnownElts))
      return false;
    // Rescale so SubMask, NumSubElts, NumElts and InsertIdx agree on one
    // element granularity.
    if (SubMask.size() != NumSubElts) {
      assert(((SubMask.size() % NumSubElts) == 0 ||
              (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
      if ((NumSubElts % SubMask.size()) == 0) {
        int Scale = NumSubElts / SubMask.size();
        SmallVector<int,64> ScaledSubMask;
        scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask);
        SubMask = ScaledSubMask;
      } else {
        int Scale = SubMask.size() / NumSubElts;
        NumSubElts = SubMask.size();
        NumElts *= Scale;
        InsertIdx *= Scale;
      }
    }
    Ops.push_back(Src);
    // Widen each sub-shuffle input to the full vector width by inserting it
    // at index 0 of an undef vector of matching size.
    for (SDValue &SubInput : SubInputs) {
      EVT SubSVT = SubInput.getValueType().getScalarType();
      EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
                                   NumSizeInBits / SubSVT.getSizeInBits());
      Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
                                DAG.getUNDEF(AltVT), SubInput,
                                DAG.getIntPtrConstant(0, SDLoc(N))));
    }
    // Identity mask from Src, then overlay the (rebased) sub-shuffle mask.
    for (int i = 0; i != (int)NumElts; ++i)
      Mask.push_back(i);
    for (int i = 0; i != (int)NumSubElts; ++i) {
      int M = SubMask[i];
      if (0 <= M) {
        int InputIdx = M / NumSubElts;
        M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
      }
      Mask[i + InsertIdx] = M;
    }
    return true;
  }
  case ISD::SCALAR_TO_VECTOR: {
    // Match against a scalar_to_vector of an extract from a vector,
    // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
    SDValue N0 = N.getOperand(0);
    SDValue SrcExtract;

    if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
         N0.getOperand(0).getValueType() == VT) ||
        (N0.getOpcode() == X86ISD::PEXTRW &&
         N0.getOperand(0).getValueType() == MVT::v8i16) ||
        (N0.getOpcode() == X86ISD::PEXTRB &&
         N0.getOperand(0).getValueType() == MVT::v16i8)) {
      SrcExtract = N0;
    }

    if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
      return false;

    SDValue SrcVec = SrcExtract.getOperand(0);
    EVT SrcVT = SrcVec.getValueType();
    // PEXTRW/PEXTRB zero-extend the extracted scalar up to the element
    // width, so the upper sub-elements of element 0 are known zero.
    unsigned NumSrcElts = SrcVT.getVectorNumElements();
    unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;

    unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
    if (NumSrcElts <= SrcIdx)
      return false;

    Ops.push_back(SrcVec);
    Mask.push_back(SrcIdx);
    Mask.append(NumZeros, SM_SentinelZero);
    Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
    return true;
  }
  case X86ISD::PINSRB:
  case X86ISD::PINSRW: {
    SDValue InVec = N.getOperand(0);
    SDValue InScl = N.getOperand(1);
    SDValue InIndex = N.getOperand(2);
    if (!isa<ConstantSDNode>(InIndex) ||
        cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
      return false;
    uint64_t InIdx = N.getConstantOperandVal(2);

    // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
    if (X86::isZeroNode(InScl)) {
      Ops.push_back(InVec);
      for (unsigned i = 0; i != NumElts; ++i)
        Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
      return true;
    }

    // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
    // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
    unsigned ExOp =
        (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
    if (InScl.getOpcode() != ExOp)
      return false;

    SDValue ExVec = InScl.getOperand(0);
    SDValue ExIndex = InScl.getOperand(1);
    if (!isa<ConstantSDNode>(ExIndex) ||
        cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
      return false;
    uint64_t ExIdx = InScl.getConstantOperandVal(1);

    // Identity from InVec, except lane InIdx which comes from ExVec[ExIdx].
    Ops.push_back(InVec);
    Ops.push_back(ExVec);
    for (unsigned i = 0; i != NumElts; ++i)
      Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
    return true;
  }
  case X86ISD::PACKSS:
  case X86ISD::PACKUS: {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
           N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
           "Unexpected input value type");

    APInt EltsLHS, EltsRHS;
    getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);

    // If we know input saturation won't happen we can treat this
    // as a truncation shuffle.
    if (Opcode == X86ISD::PACKSS) {
      // PACKSS needs enough sign bits that truncation can't saturate.
      if ((!N0.isUndef() &&
           DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
          (!N1.isUndef() &&
           DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
        return false;
    } else {
      // PACKUS needs the upper half of each source element known zero.
      APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
      if ((!N0.isUndef() &&
           !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
          (!N1.isUndef() &&
           !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
        return false;
    }

    bool IsUnary = (N0 == N1);

    Ops.push_back(N0);
    if (!IsUnary)
      Ops.push_back(N1);

    createPackShuffleMask(VT, Mask, IsUnary);
    return true;
  }
  case X86ISD::VSHLI:
  case X86ISD::VSRLI: {
    uint64_t ShiftVal = N.getConstantOperandVal(1);
    // Out of range bit shifts are guaranteed to be zero.
    if (NumBitsPerElt <= ShiftVal) {
      Mask.append(NumElts, SM_SentinelZero);
      return true;
    }

    // We can only decode 'whole byte' bit shifts as shuffles.
    if ((ShiftVal % 8) != 0)
      break;

    uint64_t ByteShift = ShiftVal / 8;
    unsigned NumBytes = NumSizeInBits / 8;
    unsigned NumBytesPerElt = NumBitsPerElt / 8;
    Ops.push_back(N.getOperand(0));

    // Clear mask to all zeros and insert the shifted byte indices.
    Mask.append(NumBytes, SM_SentinelZero);

    if (X86ISD::VSHLI == Opcode) {
      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j] = i + j - ByteShift;
    } else {
      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j - ByteShift] = i + j;
    }
    return true;
  }
  case X86ISD::VBROADCAST: {
    SDValue Src = N.getOperand(0);
    MVT SrcVT = Src.getSimpleValueType();
    if (!SrcVT.isVector())
      return false;

    // Widen a narrower broadcast source to the full vector width so the
    // all-zeros splat mask indexes into a same-sized source.
    if (NumSizeInBits != SrcVT.getSizeInBits()) {
      assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
             "Illegal broadcast type");
      SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
                               NumSizeInBits / SrcVT.getScalarSizeInBits());
      Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
                        DAG.getUNDEF(SrcVT), Src,
                        DAG.getIntPtrConstant(0, SDLoc(N)));
    }

    Ops.push_back(Src);
    Mask.append(NumElts, 0);
    return true;
  }
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case ISD::ANY_EXTEND_VECTOR_INREG: {
    SDValue Src = N.getOperand(0);
    EVT SrcVT = Src.getValueType();

    // Extended source must be a simple vector.
    if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
        (SrcVT.getScalarSizeInBits() % 8) != 0)
      return false;

    unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
    bool IsAnyExtend =
        (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
    DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
                         Mask);

    // Widen a narrower source to the full vector width (mask indexes into a
    // same-sized source vector).
    if (NumSizeInBits != SrcVT.getSizeInBits()) {
      assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
             "Illegal zero-extension type");
      SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(),
                               NumSizeInBits / NumSrcBitsPerElt);
      Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
                        DAG.getUNDEF(SrcVT), Src,
                        DAG.getIntPtrConstant(0, SDLoc(N)));
    }

    Ops.push_back(Src);
    return true;
  }
  }

  return false;
}
| |
/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
/// Mask elements are encoded as (InputIndex * MaskWidth + Lane), so dropping
/// or merging an input requires rebasing every higher mask index.
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
                                              SmallVectorImpl<int> &Mask) {
  int MaskWidth = Mask.size();
  SmallVector<SDValue, 16> UsedInputs;
  for (int i = 0, e = Inputs.size(); i < e; ++i) {
    // [lo, hi) is the index range Inputs[i] would occupy if kept, i.e. if it
    // became the next entry of UsedInputs.
    int lo = UsedInputs.size() * MaskWidth;
    int hi = lo + MaskWidth;

    // Strip UNDEF input usage.
    if (Inputs[i].isUndef())
      for (int &M : Mask)
        if ((lo <= M) && (M < hi))
          M = SM_SentinelUndef;

    // Check for unused inputs.
    if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
      // Drop this input: shift every later reference down one input's worth.
      for (int &M : Mask)
        if (lo <= M)
          M -= MaskWidth;
      continue;
    }

    // Check for repeated inputs.
    bool IsRepeat = false;
    for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
      if (UsedInputs[j] != Inputs[i])
        continue;
      // Redirect references to the earlier copy (j) and shift later ones.
      for (int &M : Mask)
        if (lo <= M)
          M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
      IsRepeat = true;
      break;
    }
    if (IsRepeat)
      continue;

    UsedInputs.push_back(Inputs[i]);
  }
  Inputs = UsedInputs;
}
| |
| /// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs |
| /// and then sets the SM_SentinelUndef and SM_SentinelZero values. |
| /// Returns true if the target shuffle mask was decoded. |
| static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, |
| SmallVectorImpl<SDValue> &Inputs, |
| SmallVectorImpl<int> &Mask, |
| APInt &KnownUndef, APInt &KnownZero, |
| SelectionDAG &DAG, unsigned Depth, |
| bool ResolveKnownElts) { |
| EVT VT = Op.getValueType(); |
| if (!VT.isSimple() || !VT.isVector()) |
| return false; |
| |
| if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) { |
| if (ResolveKnownElts) |
| resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero); |
| return true; |
| } |
| if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth, |
| ResolveKnownElts)) { |
| resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero); |
| return true; |
| } |
| return false; |
| } |
| |
| static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, |
| SmallVectorImpl<int> &Mask, |
| SelectionDAG &DAG, unsigned Depth = 0, |
| bool ResolveKnownElts = true) { |
| EVT VT = Op.getValueType(); |
| if (!VT.isSimple() || !VT.isVector()) |
| return false; |
| |
| APInt KnownUndef, KnownZero; |
| unsigned NumElts = Op.getValueType().getVectorNumElements(); |
| APInt DemandedElts = APInt::getAllOnesValue(NumElts); |
| return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef, |
| KnownZero, DAG, Depth, ResolveKnownElts); |
| } |
| |
/// Returns the scalar element that will make up the ith
/// element of the result of the vector shuffle.
/// Returns SDValue() when the element cannot be traced to a scalar (or the
/// search depth limit is hit).
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
                                   unsigned Depth) {
  if (Depth == 6)
    return SDValue(); // Limit search depth.

  SDValue V = SDValue(N, 0);
  EVT VT = V.getValueType();
  unsigned Opcode = V.getOpcode();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
    int Elt = SV->getMaskElt(Index);

    if (Elt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    // Mask indices >= NumElems select from the second operand.
    unsigned NumElems = VT.getVectorNumElements();
    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
                                         : SV->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
  }

  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    MVT ShufVT = V.getSimpleValueType();
    MVT ShufSVT = ShufVT.getVectorElementType();
    int NumElems = (int)ShufVT.getVectorNumElements();
    SmallVector<int, 16> ShuffleMask;
    SmallVector<SDValue, 16> ShuffleOps;
    bool IsUnary;

    if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
      return SDValue();

    // Sentinel elements map directly to constant-zero/undef scalars.
    int Elt = ShuffleMask[Index];
    if (Elt == SM_SentinelZero)
      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
                                 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
    if (Elt == SM_SentinelUndef)
      return DAG.getUNDEF(ShufSVT);

    assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
    SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
                               Depth+1);
  }

  // Recurse into insert_subvector base/sub vector to find scalars.
  if (Opcode == ISD::INSERT_SUBVECTOR &&
      isa<ConstantSDNode>(N->getOperand(2))) {
    SDValue Vec = N->getOperand(0);
    SDValue Sub = N->getOperand(1);
    EVT SubVT = Sub.getValueType();
    unsigned NumSubElts = SubVT.getVectorNumElements();
    uint64_t SubIdx = N->getConstantOperandVal(2);

    // The requested lane comes from the inserted subvector iff it lies
    // inside [SubIdx, SubIdx + NumSubElts).
    if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
      return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1);
    return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1);
  }

  // Recurse into extract_subvector src vector to find scalars.
  if (Opcode == ISD::EXTRACT_SUBVECTOR &&
      isa<ConstantSDNode>(N->getOperand(1))) {
    SDValue Src = N->getOperand(0);
    uint64_t SrcIdx = N->getConstantOperandVal(1);
    return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1);
  }

  // Actual nodes that may contain scalar elements

  // Only look through a bitcast that preserves the element count (same-size
  // elements), otherwise Index would not map 1:1 onto a source element.
  if (Opcode == ISD::BITCAST) {
    V = V.getOperand(0);
    EVT SrcVT = V.getValueType();
    unsigned NumElems = VT.getVectorNumElements();

    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }

  // SCALAR_TO_VECTOR defines only lane 0; everything else is undef.
  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? V.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}
| |
// Use PINSRB/PINSRW/PINSRD to create a build vector.
// NonZeros is a bitmask of which elements are non-zero; NumZero is the count
// of known-zero elements.
static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
                                        unsigned NumNonZero, unsigned NumZero,
                                        SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
         "Illegal vector insertion");

  SDLoc dl(Op);
  SDValue V;
  bool First = true;

  for (unsigned i = 0; i < NumElts; ++i) {
    // Zero elements are provided by the initial zero vector (or left undef
    // via SCALAR_TO_VECTOR) - only insert the non-zero ones.
    bool IsNonZero = (NonZeros & (1 << i)) != 0;
    if (!IsNonZero)
      continue;

    // If the build vector contains zeros or our first insertion is not the
    // first index then insert into zero vector to break any register
    // dependency else use SCALAR_TO_VECTOR.
    if (First) {
      First = false;
      if (NumZero || 0 != i)
        V = getZeroVector(VT, Subtarget, DAG, dl);
      else {
        assert(0 == i && "Expected insertion into zero-index");
        // Element 0: materialize directly via SCALAR_TO_VECTOR (as i32) and
        // bitcast; no INSERT_VECTOR_ELT needed for this lane.
        V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
        V = DAG.getBitcast(VT, V);
        continue;
      }
    }
    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
                    DAG.getIntPtrConstant(i, dl));
  }

  return V;
}
| |
/// Custom lower build_vector of v16i8.
/// NonZeros is a bitmask of non-zero elements; NumNonZero/NumZero are the
/// corresponding counts.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // Without SSE4.1, bail if more than half the bytes are non-zero.
  if (NumNonZero > 8 && !Subtarget.hasSSE41())
    return SDValue();

  // SSE4.1 - use PINSRB to insert each byte directly.
  if (Subtarget.hasSSE41())
    return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
                                    Subtarget);

  SDLoc dl(Op);
  SDValue V;

  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
  for (unsigned i = 0; i < 16; i += 2) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
    if (!ThisIsNonZero && !NextIsNonZero)
      continue;

    // FIXME: Investigate combining the first 4 bytes as a i32 instead.
    SDValue Elt;
    if (ThisIsNonZero) {
      // Zero-extend when the upper byte must stay clean (it is merged with
      // or stands in for a zero); any-extend is enough otherwise.
      if (NumZero || NextIsNonZero)
        Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
      else
        Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
    }

    if (NextIsNonZero) {
      SDValue NextElt = Op.getOperand(i + 1);
      if (i == 0 && NumZero)
        NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
      else
        NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
      // Position the odd byte in bits [15:8] and merge with the even byte.
      NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
                            DAG.getConstant(8, dl, MVT::i8));
      if (ThisIsNonZero)
        Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
      else
        Elt = NextElt;
    }

    // If our first insertion is not the first index then insert into zero
    // vector to break any register dependency else use SCALAR_TO_VECTOR.
    if (!V) {
      if (i != 0)
        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
      else {
        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
        V = DAG.getBitcast(MVT::v8i16, V);
        continue;
      }
    }
    // Insert the merged byte pair as one v8i16 lane (PINSRW).
    Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
                    DAG.getIntPtrConstant(i / 2, dl));
  }

  return DAG.getBitcast(MVT::v16i8, V);
}
| |
| /// Custom lower build_vector of v8i16. |
| static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, |
| unsigned NumNonZero, unsigned NumZero, |
| SelectionDAG &DAG, |
| const X86Subtarget &Subtarget) { |
| if (NumNonZero > 4 && !Subtarget.hasSSE41()) |
| return SDValue(); |
| |
| // Use PINSRW to insert each byte directly. |
| return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG, |
| Subtarget); |
| } |
| |
/// Custom lower build_vector of v4i32 or v4f32.
/// Tries, in order: MOVDDUP of a repeated element pair, a blend-with-zero
/// shuffle, and finally a single INSERTPS. Returns SDValue() if none apply.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // If this is a splat of a pair of elements, use MOVDDUP (unless the target
  // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
  // Because we're creating a less complicated build vector here, we may enable
  // further folding of the MOVDDUP via shuffle transforms.
  if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
      Op.getOperand(0) == Op.getOperand(2) &&
      Op.getOperand(1) == Op.getOperand(3) &&
      Op.getOperand(0) != Op.getOperand(1)) {
    SDLoc DL(Op);
    MVT VT = Op.getSimpleValueType();
    MVT EltVT = VT.getVectorElementType();
    // Create a new build vector with the first 2 elements followed by undef
    // padding, bitcast to v2f64, duplicate, and bitcast back.
    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
    SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
    SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
    return DAG.getBitcast(VT, Dup);
  }

  // Find all zeroable elements.
  std::bitset<4> Zeroable, Undefs;
  for (int i = 0; i < 4; ++i) {
    SDValue Elt = Op.getOperand(i);
    Undefs[i] = Elt.isUndef();
    Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
  }
  assert(Zeroable.size() - Zeroable.count() > 1 &&
         "We expect at least two non-zero elements!");

  // We only know how to deal with build_vector nodes where elements are either
  // zeroable or extract_vector_elt with constant index.
  SDValue FirstNonZero;
  unsigned FirstNonZeroIdx;
  for (unsigned i = 0; i < 4; ++i) {
    if (Zeroable[i])
      continue;
    SDValue Elt = Op.getOperand(i);
    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Elt.getOperand(1)))
      return SDValue();
    // Make sure that this node is extracting from a 128-bit vector.
    MVT VT = Elt.getOperand(0).getSimpleValueType();
    if (!VT.is128BitVector())
      return SDValue();
    if (!FirstNonZero.getNode()) {
      FirstNonZero = Elt;
      FirstNonZeroIdx = i;
    }
  }

  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
  SDValue V1 = FirstNonZero.getOperand(0);
  MVT VT = V1.getSimpleValueType();

  // See if this build_vector can be lowered as a blend with zero: every
  // non-zeroable element must be extracted from V1 at its own index.
  SDValue Elt;
  unsigned EltMaskIdx, EltIdx;
  int Mask[4];
  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
    if (Zeroable[EltIdx]) {
      // The zero vector will be on the right hand side.
      Mask[EltIdx] = EltIdx+4;
      continue;
    }

    Elt = Op->getOperand(EltIdx);
    // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
    EltMaskIdx = Elt.getConstantOperandVal(1);
    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
      break;
    Mask[EltIdx] = EltIdx;
  }

  // EltIdx == 4 means the loop above completed: blend-with-zero applies.
  if (EltIdx == 4) {
    // Let the shuffle legalizer deal with blend operations.
    SDValue VZeroOrUndef = (Zeroable == Undefs)
                               ? DAG.getUNDEF(VT)
                               : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
    if (V1.getSimpleValueType() != VT)
      V1 = DAG.getBitcast(VT, V1);
    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
  }

  // See if we can lower this build_vector to a INSERTPS.
  if (!Subtarget.hasSSE41())
    return SDValue();

  // Elt/EltIdx now refer to the first element that broke the blend pattern;
  // it becomes the single element inserted from V2.
  SDValue V2 = Elt.getOperand(0);
  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
    V1 = SDValue();

  // All remaining non-zero elements must come from one common vector (V1)
  // at their own index, so only the single V2 insertion is needed.
  bool CanFold = true;
  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
    if (Zeroable[i])
      continue;

    SDValue Current = Op->getOperand(i);
    SDValue SrcVector = Current->getOperand(0);
    if (!V1.getNode())
      V1 = SrcVector;
    CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
  }

  if (!CanFold)
    return SDValue();

  assert(V1.getNode() && "Expected at least two non-zero elements!");
  if (V1.getSimpleValueType() != MVT::v4f32)
    V1 = DAG.getBitcast(MVT::v4f32, V1);
  if (V2.getSimpleValueType() != MVT::v4f32)
    V2 = DAG.getBitcast(MVT::v4f32, V2);

  // Ok, we can emit an INSERTPS instruction.
  unsigned ZMask = Zeroable.to_ulong();

  // INSERTPS immediate: source lane in bits [7:6], destination lane in
  // bits [5:4], zero mask in bits [3:0].
  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
  SDLoc DL(Op);
  SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                               DAG.getIntPtrConstant(InsertPSMask, DL, true));
  return DAG.getBitcast(VT, Result);
}
| |
| /// Return a vector logical shift node. |
| static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, |
| SelectionDAG &DAG, const TargetLowering &TLI, |
| const SDLoc &dl) { |
| assert(VT.is128BitVector() && "Unknown type for VShift"); |
| MVT ShVT = MVT::v16i8; |
| unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; |
| SrcOp = DAG.getBitcast(ShVT, SrcOp); |
| assert(NumBits % 8 == 0 && "Only support byte sized shifts"); |
| SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8); |
| return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); |
| } |
| |
// Attempt to lower a splat of a scalar load as a wide vector load plus a
// splat shuffle. Returns SDValue() if the load/address shape doesn't allow
// it.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
                                      SelectionDAG &DAG) {

  // Check if the scalar load can be widened into a vector load. And if
  // the address is "base + cst" see if the cst can be "absorbed" into
  // the shuffle mask.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
    SDValue Ptr = LD->getBasePtr();
    // Only simple (non-volatile/non-atomic), non-extending loads.
    if (!ISD::isNormalLoad(LD) || !LD->isSimple())
      return SDValue();
    EVT PVT = LD->getValueType(0);
    // Only 32-bit scalar loads are handled.
    if (PVT != MVT::i32 && PVT != MVT::f32)
      return SDValue();

    // The address must be a frame index, optionally plus a constant offset.
    int FI = -1;
    int64_t Offset = 0;
    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
      FI = FINode->getIndex();
      Offset = 0;
    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
      Offset = Ptr.getConstantOperandVal(1);
      Ptr = Ptr.getOperand(0);
    } else {
      return SDValue();
    }

    // FIXME: 256-bit vector instructions don't require a strict alignment,
    // improve this code to support it better.
    unsigned RequiredAlign = VT.getSizeInBits()/8;
    SDValue Chain = LD->getChain();
    // Make sure the stack object alignment is at least 16 or 32.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
      if (MFI.isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute
        // the exact stack offset and reference FI + adjust offset instead.
        // If someone *really* cares about this. That's the way to implement it.
        return SDValue();
      } else {
        MFI.setObjectAlignment(FI, RequiredAlign);
      }
    }

    // (Offset % 16 or 32) must be multiple of 4. Then address is then
    // Ptr + (Offset & ~15).
    if (Offset < 0)
      return SDValue();
    if ((Offset % RequiredAlign) & 3)
      return SDValue();
    // Round the offset down to the vector alignment boundary; the residue
    // is absorbed into the splat index below.
    int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
    if (StartOffset) {
      SDLoc DL(Ptr);
      Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                        DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
    }

    // Element index of the original scalar within the widened load
    // (4-byte elements, hence >> 2).
    int EltNo = (Offset - StartOffset) >> 2;
    unsigned NumElems = VT.getVectorNumElements();

    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
                             LD->getPointerInfo().getWithOffset(StartOffset));

    // Splat the selected element across all lanes.
    SmallVector<int, 8> Mask(NumElems, EltNo);

    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
  }

  return SDValue();
}
| |
// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
| static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { |
| if (ISD::isNON_EXTLoad(Elt.getNode())) { |
| auto *BaseLd = cast<LoadSDNode>(Elt); |
| if (!BaseLd->isSimple()) |
| return false; |
| Ld = BaseLd; |
| ByteOffset = 0; |
| return true; |
| } |
| |
| switch (Elt.getOpcode()) { |
| case ISD::BITCAST: |
| case ISD::TRUNCATE: |
| case ISD::SCALAR_TO_VECTOR: |
| return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset); |
| case ISD::SRL: |
| if (isa<ConstantSDNode>(Elt.getOperand(1))) { |
| uint64_t Idx = Elt.getConstantOperandVal(1); |
| if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) { |
| ByteOffset += Idx / 8; |
| return true; |
| } |
| } |
| break; |
| case ISD::EXTRACT_VECTOR_ELT: |
| if (isa<ConstantSDNode>(Elt.getOperand(1))) { |
| SDValue Src = Elt.getOperand(0); |
| unsigned SrcSizeInBits = Src.getScalarValueSizeInBits(); |
| unsigned DstSizeInBits = Elt.getScalarValueSizeInBits(); |
| if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 && |
| findEltLoadSrc(Src, Ld, ByteOffset)) { |
| uint64_t Idx = Elt.getConstantOperandVal(1); |
| ByteOffset += Idx * (SrcSizeInBits / 8); |
| return true; |
| } |
| } |
| break; |
| } |
| |
| return false; |
| } |
| |
| /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the |
| /// elements can be replaced by a single large load which has the same value as |
| /// a build_vector or insert_subvector whose loaded operands are 'Elts'. |
| /// |
| /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a |
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                        const SDLoc &DL, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget,
                                        bool isAfterLegalize) {
  // All of the bookkeeping below assumes byte-sized elements.
  if ((VT.getScalarSizeInBits() % 8) != 0)
    return SDValue();

  unsigned NumElems = Elts.size();

  int LastLoadedElt = -1;
  APInt LoadMask = APInt::getNullValue(NumElems);
  APInt ZeroMask = APInt::getNullValue(NumElems);
  APInt UndefMask = APInt::getNullValue(NumElems);

  // Per-element source load and the byte offset of the element within it.
  SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
  SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);

  // For each element in the initializer, see if we've found a load, zero or an
  // undef.
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = peekThroughBitcasts(Elts[i]);
    if (!Elt.getNode())
      return SDValue();
    if (Elt.isUndef()) {
      UndefMask.setBit(i);
      continue;
    }
    if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
      ZeroMask.setBit(i);
      continue;
    }

    // Each loaded element must be the correct fractional portion of the
    // requested vector load.
    unsigned EltSizeInBits = Elt.getValueSizeInBits();
    if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
      return SDValue();

    // The element must trace back to a load (possibly offset into it), and
    // the referenced bytes must lie entirely within that load.
    if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
      return SDValue();
    unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
    if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
      return SDValue();

    LoadMask.setBit(i);
    LastLoadedElt = i;
  }
  assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
          LoadMask.countPopulation()) == NumElems &&
         "Incomplete element masks");

  // Handle Special Cases - all undef or undef/zero.
  if (UndefMask.countPopulation() == NumElems)
    return DAG.getUNDEF(VT);

  // FIXME: Should we return this as a BUILD_VECTOR instead?
  if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
    return VT.isInteger() ? DAG.getConstant(0, DL, VT)
                          : DAG.getConstantFP(0.0, DL, VT);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  int FirstLoadedElt = LoadMask.countTrailingZeros();
  SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
  EVT EltBaseVT = EltBase.getValueType();
  assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
         "Register/Memory size mismatch");
  LoadSDNode *LDBase = Loads[FirstLoadedElt];
  assert(LDBase && "Did not find base load for merging consecutive loads");
  unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
  unsigned BaseSizeInBytes = BaseSizeInBits / 8;
  // Total bits covered from the first to the last loaded element (inclusive).
  int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
  assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");

  // TODO: Support offsetting the base load.
  if (ByteOffsets[FirstLoadedElt] != 0)
    return SDValue();

  // Check to see if the element's load is consecutive to the base load
  // or offset from a previous (already checked) load.
  auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
    LoadSDNode *Ld = Loads[EltIdx];
    int64_t ByteOffset = ByteOffsets[EltIdx];
    if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
      // The element is a chunk of an earlier element's load - it is
      // consecutive if that earlier element was itself accepted at offset 0.
      int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
      return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
              Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
    }
    return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
                                              EltIdx - FirstLoadedElt);
  };

  // Consecutive loads can contain UNDEFS but not ZERO elements.
  // Consecutive loads with UNDEFs and ZEROs elements require an
  // additional shuffle stage to clear the ZERO elements.
  bool IsConsecutiveLoad = true;
  bool IsConsecutiveLoadWithZeros = true;
  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
    if (LoadMask[i]) {
      if (!CheckConsecutiveLoad(LDBase, i)) {
        IsConsecutiveLoad = false;
        IsConsecutiveLoadWithZeros = false;
        break;
      }
    } else if (ZeroMask[i]) {
      IsConsecutiveLoad = false;
    }
  }

  // Build a full-width load from LDBase and transfer the memory ordering of
  // every merged load onto the new node.
  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
    auto MMOFlags = LDBase->getMemOperand()->getFlags();
    assert(LDBase->isSimple() &&
           "Cannot merge volatile or atomic loads.");
    SDValue NewLd =
        DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                    LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
    for (auto *LD : Loads)
      if (LD)
        DAG.makeEquivalentMemoryOrdering(LD, NewLd);
    return NewLd;
  };

  // Check if the base load is entirely dereferenceable.
  bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
      VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());

  // LOAD - all consecutive load/undefs (must start/end with a load or be
  // entirely dereferenceable). If we have found an entire vector of loads and
  // undefs, then return a large load of the entire vector width starting at the
  // base pointer. If the vector contains zeros, then attempt to shuffle those
  // elements.
  if (FirstLoadedElt == 0 &&
      (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&
      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
    if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
      return SDValue();

    // Don't create 256-bit non-temporal aligned loads without AVX2 as these
    // will lower to regular temporal loads and use the cache.
    if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
        VT.is256BitVector() && !Subtarget.hasInt256())
      return SDValue();

    if (NumElems == 1)
      return DAG.getBitcast(VT, Elts[FirstLoadedElt]);

    if (!ZeroMask)
      return CreateLoad(VT, LDBase);

    // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
    // vector and a zero vector to clear out the zero elements.
    if (!isAfterLegalize && VT.isVector()) {
      unsigned NumMaskElts = VT.getVectorNumElements();
      if ((NumMaskElts % NumElems) == 0) {
        unsigned Scale = NumMaskElts / NumElems;
        SmallVector<int, 4> ClearMask(NumMaskElts, -1);
        for (unsigned i = 0; i < NumElems; ++i) {
          if (UndefMask[i])
            continue;
          // Zero elements select from the second (zero) shuffle operand.
          int Offset = ZeroMask[i] ? NumMaskElts : 0;
          for (unsigned j = 0; j != Scale; ++j)
            ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
        }
        SDValue V = CreateLoad(VT, LDBase);
        SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
                                   : DAG.getConstantFP(0.0, DL, VT);
        return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
      }
    }
  }

  // If the upper half of a ymm/zmm load is undef then just load the lower half.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    unsigned HalfNumElems = NumElems / 2;
    if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
      EVT HalfVT =
          EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
      SDValue HalfLD =
          EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
                                   DAG, Subtarget, isAfterLegalize);
      if (HalfLD)
        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
                           HalfLD, DAG.getIntPtrConstant(0, DL));
    }
  }

  // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
      (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
    MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
                                      : MVT::getIntegerVT(LoadSizeInBits);
    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
    if (TLI.isTypeLegal(VecVT)) {
      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
      SDValue ResNode =
          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
                                  LDBase->getPointerInfo(),
                                  LDBase->getAlignment(),
                                  MachineMemOperand::MOLoad);
      for (auto *LD : Loads)
        if (LD)
          DAG.makeEquivalentMemoryOrdering(LD, ResNode);
      return DAG.getBitcast(VT, ResNode);
    }
  }

  // BROADCAST - match the smallest possible repetition pattern, load that
  // scalar/subvector element and then broadcast to the entire vector.
  if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
      (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
    // Try repetition lengths of 1, 2, 4, ... elements.
    for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
      unsigned RepeatSize = SubElems * BaseSizeInBits;
      unsigned ScalarSize = std::min(RepeatSize, 64u);
      // Pre-AVX2 only floating point (32/64-bit) broadcasts are available.
      if (!Subtarget.hasAVX2() && ScalarSize < 32)
        continue;

      bool Match = true;
      SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
      for (unsigned i = 0; i != NumElems && Match; ++i) {
        if (!LoadMask[i])
          continue;
        SDValue Elt = peekThroughBitcasts(Elts[i]);
        if (RepeatedLoads[i % SubElems].isUndef())
          RepeatedLoads[i % SubElems] = Elt;
        else
          Match &= (RepeatedLoads[i % SubElems] == Elt);
      }

      // We must have loads at both ends of the repetition.
      Match &= !RepeatedLoads.front().isUndef();
      Match &= !RepeatedLoads.back().isUndef();
      if (!Match)
        continue;

      EVT RepeatVT =
          VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
              ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
              : EVT::getFloatingPointVT(ScalarSize);
      if (RepeatSize > ScalarSize)
        RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
                                    RepeatSize / ScalarSize);
      EVT BroadcastVT =
          EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
                           VT.getSizeInBits() / ScalarSize);
      if (TLI.isTypeLegal(BroadcastVT)) {
        // Recurse to merge the repeated pattern itself into a single load.
        if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
                RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
          unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
                                                    : X86ISD::VBROADCAST;
          SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
          return DAG.getBitcast(VT, Broadcast);
        }
      }
    }
  }

  return SDValue();
}
| |
| // Combine a vector ops (shuffles etc.) that is equal to build_vector load1, |
| // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses |
| // are consecutive, non-overlapping, and in the right order. |
| static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL, |
| SelectionDAG &DAG, |
| const X86Subtarget &Subtarget, |
| bool isAfterLegalize) { |
| SmallVector<SDValue, 64> Elts; |
| for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { |
| if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { |
| Elts.push_back(Elt); |
| continue; |
| } |
| return SDValue(); |
| } |
| assert(Elts.size() == VT.getVectorNumElements()); |
| return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget, |
| isAfterLegalize); |
| } |
| |
| static Constant *getConstantVector(MVT VT, const APInt &SplatValue, |
| unsigned SplatBitSize, LLVMContext &C) { |
| unsigned ScalarSize = VT.getScalarSizeInBits(); |
| unsigned NumElm = SplatBitSize / ScalarSize; |
| |
| SmallVector<Constant *, 32> ConstantVec; |
| for (unsigned i = 0; i < NumElm; i++) { |
| APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i); |
| Constant *Const; |
| if (VT.isFloatingPoint()) { |
| if (ScalarSize == 32) { |
| Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val)); |
| } else { |
| assert(ScalarSize == 64 && "Unsupported floating point scalar size"); |
| Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val)); |
| } |
| } else |
| Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val); |
| ConstantVec.push_back(Const); |
| } |
| return ConstantVector::get(ArrayRef<Constant *>(ConstantVec)); |
| } |
| |
// Returns true if some user of N is a target shuffle that could fold N as an
// operand (looking through bitcasts), or if N has a single user. Index
// operands of VPERMV/VPERMV3 are excluded since those can never be folded.
static bool isFoldableUseOfShuffle(SDNode *N) {
  for (auto *U : N->uses()) {
    unsigned Opc = U->getOpcode();
    // VPERMV/VPERMV3 shuffles can never fold their index operands.
    if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
      return false;
    if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
      return false;
    if (isTargetShuffle(Opc))
      return true;
    if (Opc == ISD::BITCAST) // Ignore bitcasts
      return isFoldableUseOfShuffle(U);
    // NOTE(review): this tests N (loop-invariant), not U - presumably "a sole
    // non-shuffle use is still worth it"; confirm it isn't meant to be
    // U->hasOneUse().
    if (N->hasOneUse())
      return true;
  }
  return false;
}
| |
// Check if the current node of build vector is a zero extended vector.
// If so, return the value extended.
// For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
// NumElt - return the number of zero extended identical values.
// EltType - return the type of the value include the zero extend.
| static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op, |
| unsigned &NumElt, MVT &EltType) { |
| SDValue ExtValue = Op->getOperand(0); |
| unsigned NumElts = Op->getNumOperands(); |
| unsigned Delta = NumElts; |
| |
| for (unsigned i = 1; i < NumElts; i++) { |
| if (Op->getOperand(i) == ExtValue) { |
| Delta = i; |
| break; |
| } |
| if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i)))) |
| return SDValue(); |
| } |
| if (!isPowerOf2_32(Delta) || Delta == 1) |
| return SDValue(); |
| |
| for (unsigned i = Delta; i < NumElts; i++) { |
| if (i % Delta == 0) { |
| if (Op->getOperand(i) != ExtValue) |
| return SDValue(); |
| } else if (!(isNullConstant(Op->getOperand(i)) || |
| Op->getOperand(i).isUndef())) |
| return SDValue(); |
| } |
| unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits(); |
| unsigned ExtVTSize = EltSize * Delta; |
| EltType = MVT::getIntegerVT(ExtVTSize); |
| NumElt = NumElts / Delta; |
| return ExtValue; |
| } |
| |
| /// Attempt to use the vbroadcast instruction to generate a splat value |
| /// from a splat BUILD_VECTOR which uses: |
| /// a. A single scalar load, or a constant. |
| /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>). |
| /// |
| /// The VBROADCAST node is returned when a pattern is found, |
| /// or SDValue() otherwise. |
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  // VBROADCAST requires AVX.
  // TODO: Splats could be generated for non-AVX CPUs using SSE
  // instructions, but there's less potential gain for only 128-bit vectors.
  if (!Subtarget.hasAVX())
    return SDValue();

  MVT VT = BVOp->getSimpleValueType(0);
  SDLoc dl(BVOp);

  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Unsupported vector type for broadcast.");

  // Ld is the single repeated (non-undef) scalar, if any.
  BitVector UndefElements;
  SDValue Ld = BVOp->getSplatValue(&UndefElements);

  // Attempt to use VBROADCASTM
  // From this pattern:
  // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
  // b. t1 = (build_vector t0 t0)
  //
  // Create (VBROADCASTM v2i1 X)
  if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
    // Note: NumElts/EltType may be overwritten by isSplatZeroExtended below
    // (they shadow the identically-named variables declared after this block).
    MVT EltType = VT.getScalarType();
    unsigned NumElts = VT.getVectorNumElements();
    SDValue BOperand;
    SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
    if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
        (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
         Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
      if (ZeroExtended)
        BOperand = ZeroExtended.getOperand(0);
      else
        BOperand = Ld.getOperand(0).getOperand(0);
      MVT MaskVT = BOperand.getSimpleValueType();
      if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
          (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
        SDValue Brdcst =
            DAG.getNode(X86ISD::VBROADCASTM, dl,
                        MVT::getVectorVT(EltType, NumElts), BOperand);
        return DAG.getBitcast(VT, Brdcst);
      }
    }
  }

  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumUndefElts = UndefElements.count();
  // No splat scalar (or at most one defined element): try broadcasting a
  // repeated constant pattern from the constant pool instead.
  if (!Ld || (NumElts - NumUndefElts) <= 1) {
    APInt SplatValue, Undef;
    unsigned SplatBitSize;
    bool HasUndef;
    // Check if this is a repeated constant pattern suitable for broadcasting.
    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
        SplatBitSize > VT.getScalarSizeInBits() &&
        SplatBitSize < VT.getSizeInBits()) {
      // Avoid replacing with broadcast when it's a use of a shuffle
      // instruction to preserve the present custom lowering of shuffles.
      if (isFoldableUseOfShuffle(BVOp))
        return SDValue();
      // replace BUILD_VECTOR with broadcast of the repeated constants.
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      LLVMContext *Ctx = DAG.getContext();
      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
      if (Subtarget.hasAVX()) {
        if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
            !(SplatBitSize == 64 && Subtarget.is32Bit())) {
          // Splatted value can fit in one INTEGER constant in constant pool.
          // Load the constant and broadcast it.
          MVT CVT = MVT::getIntegerVT(SplatBitSize);
          Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
          Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
          SDValue CP = DAG.getConstantPool(C, PVT);
          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
          Ld = DAG.getLoad(
              CVT, dl, DAG.getEntryNode(), CP,
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
                                       MVT::getVectorVT(CVT, Repeat), Ld);
          return DAG.getBitcast(VT, Brdcst);
        } else if (SplatBitSize == 32 || SplatBitSize == 64) {
          // Splatted value can fit in one FLOAT constant in constant pool.
          // Load the constant and broadcast it.
          // AVX have support for 32 and 64 bit broadcast for floats only.
          // No 64bit integer in 32bit subtarget.
          MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
          // Lower the splat via APFloat directly, to avoid any conversion.
          Constant *C =
              SplatBitSize == 32
                  ? ConstantFP::get(*Ctx,
                                    APFloat(APFloat::IEEEsingle(), SplatValue))
                  : ConstantFP::get(*Ctx,
                                    APFloat(APFloat::IEEEdouble(), SplatValue));
          SDValue CP = DAG.getConstantPool(C, PVT);
          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
          Ld = DAG.getLoad(
              CVT, dl, DAG.getEntryNode(), CP,
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
                                       MVT::getVectorVT(CVT, Repeat), Ld);
          return DAG.getBitcast(VT, Brdcst);
        } else if (SplatBitSize > 64) {
          // Load the vector of constants and broadcast it.
          MVT CVT = VT.getScalarType();
          Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
                                             *Ctx);
          SDValue VCP = DAG.getConstantPool(VecC, PVT);
          unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
          unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
          Ld = DAG.getLoad(
              MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
          SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
          return DAG.getBitcast(VT, Brdcst);
        }
      }
    }

    // If we are moving a scalar into a vector (Ld must be set and all elements
    // but 1 are undef) and that operation is not obviously supported by
    // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
    // That's better than general shuffling and may eliminate a load to GPR and
    // move from scalar to vector register.
    if (!Ld || NumElts - NumUndefElts != 1)
      return SDValue();
    unsigned ScalarSize = Ld.getValueSizeInBits();
    // Give up when the single element lands in lane 0 with a 32/64-bit type -
    // a plain scalar-to-vector move already handles that case.
    if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
      return SDValue();
  }

  bool ConstSplatVal =
      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);

  // Make sure that all of the users of a non-constant load are from the
  // BUILD_VECTOR node.
  if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
    return SDValue();

  unsigned ScalarSize = Ld.getValueSizeInBits();
  bool IsGE256 = (VT.getSizeInBits() >= 256);

  // When optimizing for size, generate up to 5 extra bytes for a broadcast
  // instruction to save 8 or more bytes of constant pool data.
  // TODO: If multiple splats are generated to load the same constant,
  // it may be detrimental to overall size. There needs to be a way to detect
  // that condition to know if this is truly a size win.
  bool OptForSize = DAG.shouldOptForSize();

  // Handle broadcasting a single constant scalar from the constant pool
  // into a vector.
  // On Sandybridge (no AVX2), it is still better to load a constant vector
  // from the constant pool and not to broadcast it from a scalar.
  // But override that restriction when optimizing for size.
  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
    EVT CVT = Ld.getValueType();
    assert(!CVT.isVector() && "Must not broadcast a vector type");

    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
    // For size optimization, also splat v2f64 and v2i64, and for size opt
    // with AVX2, also splat i8 and i16.
    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
      const Constant *C = nullptr;
      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
        C = CI->getConstantIntValue();
      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
        C = CF->getConstantFPValue();

      assert(C && "Invalid constant type");

      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      SDValue CP =
          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
      Ld = DAG.getLoad(
          CVT, dl, DAG.getEntryNode(), CP,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
          Alignment);

      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
    }
  }

  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

  // Handle AVX2 in-register broadcasts.
  if (!IsLoad && Subtarget.hasInt256() &&
      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The scalar source must be a normal load.
  if (!IsLoad)
    return SDValue();

  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
      (Subtarget.hasVLX() && ScalarSize == 64))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
  // double since there is no vbroadcastsd xmm
  if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
  }

  // Unsupported broadcast.
  return SDValue();
}
| |
| /// For an EXTRACT_VECTOR_ELT with a constant index return the real |
| /// underlying vector and index. |
| /// |
| /// Modifies \p ExtractedFromVec to the real vector and returns the real |
| /// index. |
| static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, |
| SDValue ExtIdx) { |
| int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); |
| if (!isa<ShuffleVectorSDNode>(ExtractedFromVec)) |
| return Idx; |
| |
| // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already |
| // lowered this: |
| // (extract_vector_elt (v8f32 %1), Constant<6>) |
| // to: |
| // (extract_vector_elt (vector_shuffle<2,u,u,u> |
| // (extract_subvector (v8f32 %0), Constant<4>), |
| // undef) |
| // Constant<0>) |
| // In this case the vector is the extract_subvector expression and the index |
| // is 2, as specified by the shuffle. |
| ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec); |
| SDValue ShuffleVec = SVOp->getOperand(0); |
| MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); |
| assert(ShuffleVecVT.getVectorElementType() == |
| ExtractedFromVec.getSimpleValueType().getVectorElementType()); |
| |
| int ShuffleIdx = SVOp->getMaskElt(Idx); |
| if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { |
| ExtractedFromVec = ShuffleVec; |
| return ShuffleIdx; |
| } |
| return Idx; |
| } |
| |
/// Lower a BUILD_VECTOR whose operands are mostly EXTRACT_VECTOR_ELTs (from
/// at most two source vectors of the same type) into a vector shuffle,
/// followed by an INSERT_VECTOR_ELT for each of the (at most two) remaining
/// scalar operands. Returns SDValue() when the pattern doesn't apply.
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Skip if insert_vec_elt is not supported.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
    return SDValue();

  SDLoc DL(Op);
  unsigned NumElems = Op.getNumOperands();

  SDValue VecIn1;
  SDValue VecIn2;
  SmallVector<unsigned, 4> InsertIndices;
  SmallVector<int, 8> Mask(NumElems, -1);

  for (unsigned i = 0; i != NumElems; ++i) {
    unsigned Opc = Op.getOperand(i).getOpcode();

    if (Opc == ISD::UNDEF)
      continue;

    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if more than 1 elements need inserting.
      if (InsertIndices.size() > 1)
        return SDValue();

      InsertIndices.push_back(i);
      continue;
    }

    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
    SDValue ExtIdx = Op.getOperand(i).getOperand(1);

    // Quit if non-constant index.
    if (!isa<ConstantSDNode>(ExtIdx))
      return SDValue();
    // Look through a shuffle to the real source vector/index (may update
    // ExtractedFromVec in place).
    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);

    // Quit if extracted from vector of different type.
    if (ExtractedFromVec.getValueType() != VT)
      return SDValue();

    // Record up to two distinct source vectors.
    if (!VecIn1.getNode())
      VecIn1 = ExtractedFromVec;
    else if (VecIn1 != ExtractedFromVec) {
      if (!VecIn2.getNode())
        VecIn2 = ExtractedFromVec;
      else if (VecIn2 != ExtractedFromVec)
        // Quit if more than 2 vectors to shuffle
        return SDValue();
    }

    // Elements from the second source are offset by NumElems in the mask.
    if (ExtractedFromVec == VecIn1)
      Mask[i] = Idx;
    else if (ExtractedFromVec == VecIn2)
      Mask[i] = Idx + NumElems;
  }

  if (!VecIn1.getNode())
    return SDValue();

  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);

  // Insert the remaining scalar operands into the shuffled result.
  for (unsigned Idx : InsertIndices)
    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
                     DAG.getIntPtrConstant(Idx, DL));

  return NV;
}
| |
| static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { |
| assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && |
| Op.getScalarValueSizeInBits() == 1 && |
| "Can not convert non-constant vector"); |
| uint64_t Immediate = 0; |
| for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { |
| SDValue In = Op.getOperand(idx); |
| if (!In.isUndef()) |
| Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx; |
| } |
| SDLoc dl(Op); |
| MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8)); |
| return DAG.getConstant(Immediate, dl, VT); |
| } |
// Lower BUILD_VECTOR operations for vXi1 (AVX-512 mask) types.
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {

  MVT VT = Op.getSimpleValueType();
  assert((VT.getVectorElementType() == MVT::i1) &&
         "Unexpected type in LowerBUILD_VECTORvXi1!");

  SDLoc dl(Op);
  // All-zeros/all-ones masks are left for the standard lowering paths.
  if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
      ISD::isBuildVectorAllOnes(Op.getNode()))
    return Op;

  // Gather constant operands into an immediate bitmask, record the positions
  // of non-constant operands, and detect whether all defined operands are the
  // same value (a splat).
  uint64_t Immediate = 0;
  SmallVector<unsigned, 16> NonConstIdx;
  bool IsSplat = true;
  bool HasConstElts = false;
  int SplatIdx = -1;
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    SDValue In = Op.getOperand(idx);
    if (In.isUndef())
      continue;
    if (!isa<ConstantSDNode>(In))
      NonConstIdx.push_back(idx);
    else {
      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
      HasConstElts = true;
    }
    if (SplatIdx < 0)
      SplatIdx = idx;
    else if (In != Op.getOperand(SplatIdx))
      IsSplat = false;
  }

  // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
  if (IsSplat) {
    // The build_vector allows the scalar element to be larger than the vector
    // element type. We need to mask it to use as a condition unless we know
    // the upper bits are zero.
    // FIXME: Use computeKnownBits instead of checking specific opcode?
    SDValue Cond = Op.getOperand(SplatIdx);
    assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
    if (Cond.getOpcode() != ISD::SETCC)
      Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
                         DAG.getConstant(1, dl, MVT::i8));
    return DAG.getSelect(dl, VT, Cond,
                         DAG.getConstant(1, dl, VT),
                         DAG.getConstant(0, dl, VT));
  }

  // insert elements one by one
  SDValue DstVec;
  if (HasConstElts) {
    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
      // On 32-bit targets materialize the 64-bit immediate as two i32
      // halves, bitcast each to v32i1 and concatenate.
      SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
      SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
      ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
      ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
      DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
    } else {
      // Bitcast the immediate (widened to at least 8 bits) to a mask vector,
      // then extract the requested-width subvector.
      MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
      SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
      MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
      DstVec = DAG.getBitcast(VecVT, Imm);
      DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
                           DAG.getIntPtrConstant(0, dl));
    }
  } else
    DstVec = DAG.getUNDEF(VT);

  // Insert the non-constant elements on top of the constant base vector.
  for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
    unsigned InsertIdx = NonConstIdx[i];
    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
                         Op.getOperand(InsertIdx),
                         DAG.getIntPtrConstant(InsertIdx, dl));
  }
  return DstVec;
}
| |
/// This is a helper function of LowerToHorizontalOp().
/// This function checks that the build_vector \p N in input implements a
/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
/// may not match the layout of an x86 256-bit horizontal instruction.
/// In other words, if this returns true, then some extraction/insertion will
/// be required to produce a valid horizontal instruction.
///
/// Parameter \p Opcode defines the kind of horizontal operation to match.
/// For example, if \p Opcode is equal to ISD::ADD, then this function
/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
///
/// This function only analyzes elements of \p N whose indices are
/// in range [BaseIdx, LastIdx).
///
/// On success, \p V0 receives the vector feeding the first half of the
/// analyzed range and \p V1 the vector feeding the second half (either may be
/// left UNDEF if that half contains only undef elements).
///
/// TODO: This function was originally used to match both real and fake partial
/// horizontal operations, but the index-matching logic is incorrect for that.
/// See the corrected implementation in isHopBuildVector(). Can we reduce this
/// code because it is only used for partial h-op matching now?
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
                                  SelectionDAG &DAG,
                                  unsigned BaseIdx, unsigned LastIdx,
                                  SDValue &V0, SDValue &V1) {
  EVT VT = N->getValueType(0);
  assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
         "Invalid Vector in input!");

  // Only ADD/FADD may have the two extracts feeding an element swapped.
  bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
  bool CanFold = true;
  // Element index the next first-operand extract is expected to use if the
  // build_vector really is a horizontal op. Advances by 2 per element.
  unsigned ExpectedVExtractIdx = BaseIdx;
  unsigned NumElts = LastIdx - BaseIdx;
  V0 = DAG.getUNDEF(VT);
  V1 = DAG.getUNDEF(VT);

  // Check if N implements a horizontal binop.
  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
    SDValue Op = N->getOperand(i + BaseIdx);

    // Skip UNDEFs.
    if (Op->isUndef()) {
      // Update the expected vector extract index.
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
      ExpectedVExtractIdx += 2;
      continue;
    }

    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

    if (!CanFold)
      break;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
    CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op0.getOperand(0) == Op1.getOperand(0) &&
               isa<ConstantSDNode>(Op0.getOperand(1)) &&
               isa<ConstantSDNode>(Op1.getOperand(1)));
    if (!CanFold)
      break;

    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();

    if (i * 2 < NumElts) {
      // First half of the analyzed range: bind (and type-check) V0.
      if (V0.isUndef()) {
        V0 = Op0.getOperand(0);
        if (V0.getValueType() != VT)
          return false;
      }
    } else {
      // Second half of the analyzed range: bind (and type-check) V1.
      if (V1.isUndef()) {
        V1 = Op0.getOperand(0);
        if (V1.getValueType() != VT)
          return false;
      }
      // The extract indices restart from BaseIdx for the second source.
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
    }

    SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
    if (I0 == ExpectedVExtractIdx)
      CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
      // Try to match the following dag sequence:
      // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
    } else
      CanFold = false;

    ExpectedVExtractIdx += 2;
  }

  return CanFold;
}
| |
| /// Emit a sequence of two 128-bit horizontal add/sub followed by |
| /// a concat_vector. |
| /// |
| /// This is a helper function of LowerToHorizontalOp(). |
| /// This function expects two 256-bit vectors called V0 and V1. |
| /// At first, each vector is split into two separate 128-bit vectors. |
| /// Then, the resulting 128-bit vectors are used to implement two |
| /// horizontal binary operations. |
| /// |
| /// The kind of horizontal binary operation is defined by \p X86Opcode. |
| /// |
| /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to |
| /// the two new horizontal binop. |
| /// When Mode is set, the first horizontal binop dag node would take as input |
| /// the lower 128-bit of V0 and the upper 128-bit of V0. The second |
| /// horizontal binop dag node would take as input the lower 128-bit of V1 |
| /// and the upper 128-bit of V1. |
| /// Example: |
| /// HADD V0_LO, V0_HI |
| /// HADD V1_LO, V1_HI |
| /// |
| /// Otherwise, the first horizontal binop dag node takes as input the lower |
| /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop |
| /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1. |
| /// Example: |
| /// HADD V0_LO, V1_LO |
| /// HADD V0_HI, V1_HI |
| /// |
| /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower |
| /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to |
| /// the upper 128-bits of the result. |
| static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, |
| const SDLoc &DL, SelectionDAG &DAG, |
| unsigned X86Opcode, bool Mode, |
| bool isUndefLO, bool isUndefHI) { |
| MVT VT = V0.getSimpleValueType(); |
| assert(VT.is256BitVector() && VT == V1.getSimpleValueType() && |
| "Invalid nodes in input!"); |
| |
| unsigned NumElts = VT.getVectorNumElements(); |
| SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL); |
| SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL); |
| SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL); |
| SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL); |
| MVT NewVT = V0_LO.getSimpleValueType(); |
| |
| SDValue LO = DAG.getUNDEF(NewVT); |
| SDValue HI = DAG.getUNDEF(NewVT); |
| |
| if (Mode) { |
| // Don't emit a horizontal binop if the result is expected to be UNDEF. |
| if (!isUndefLO && !V0->isUndef()) |
| LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); |
| if (!isUndefHI && !V1->isUndef()) |
| HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); |
| } else { |
| // Don't emit a horizontal binop if the result is expected to be UNDEF. |
| if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef())) |
| LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); |
| |
| if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef())) |
| HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); |
| } |
| |
| return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); |
| } |
| |
/// Returns true iff \p BV builds a vector with the result equivalent to
/// the result of ADDSUB/SUBADD operation.
/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
/// \p Opnd0 and \p Opnd1.
/// \p NumExtracts receives the number of non-undef elements matched, i.e. the
/// number of extract_vector_elt pairs consumed; \p IsSubAdd is set to true
/// when even lanes subtract and odd lanes add (FSUB at parity 0).
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
                             const X86Subtarget &Subtarget, SelectionDAG &DAG,
                             SDValue &Opnd0, SDValue &Opnd1,
                             unsigned &NumExtracts,
                             bool &IsSubAdd) {

  MVT VT = BV->getSimpleValueType(0);
  // ADDSUB/SUBADD only exist for FP vectors and require SSE3.
  if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  SDValue InVec0 = DAG.getUNDEF(VT);
  SDValue InVec1 = DAG.getUNDEF(VT);

  NumExtracts = 0;

  // Odd-numbered elements in the input build vector are obtained from
  // adding/subtracting two integer/float elements.
  // Even-numbered elements in the input build vector are obtained from
  // subtracting/adding two integer/float elements.
  // Opc[0]/Opc[1] record the opcode seen at even/odd parity (0 = none yet).
  unsigned Opc[2] = {0, 0};
  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Op = BV->getOperand(i);

    // Skip 'undef' values.
    unsigned Opcode = Op.getOpcode();
    if (Opcode == ISD::UNDEF)
      continue;

    // Early exit if we found an unexpected opcode.
    if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
      return false;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
    // Early exit if we cannot match that sequence.
    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
        !isa<ConstantSDNode>(Op1.getOperand(1)) ||
        Op0.getOperand(1) != Op1.getOperand(1))
      return false;

    // Both extracts must read the lane matching the build_vector element.
    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    if (I0 != i)
      return false;

    // We found a valid add/sub node, make sure its the same opcode as previous
    // elements for this parity.
    if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
      return false;
    Opc[i % 2] = Opcode;

    // Update InVec0 and InVec1.
    if (InVec0.isUndef()) {
      InVec0 = Op0.getOperand(0);
      if (InVec0.getSimpleValueType() != VT)
        return false;
    }
    if (InVec1.isUndef()) {
      InVec1 = Op1.getOperand(0);
      if (InVec1.getSimpleValueType() != VT)
        return false;
    }

    // Make sure that operands in input to each add/sub node always
    // come from a same pair of vectors.
    if (InVec0 != Op0.getOperand(0)) {
      if (Opcode == ISD::FSUB)
        return false;

      // FADD is commutable. Try to commute the operands
      // and then test again.
      std::swap(Op0, Op1);
      if (InVec0 != Op0.getOperand(0))
        return false;
    }

    if (InVec1 != Op1.getOperand(0))
      return false;

    // Increment the number of extractions done.
    ++NumExtracts;
  }

  // Ensure we have found an opcode for both parities and that they are
  // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
  // inputs are undef.
  if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
      InVec0.isUndef() || InVec1.isUndef())
    return false;

  // FADD at even parity means even lanes add => the op is SUBADD's mirror.
  IsSubAdd = Opc[0] == ISD::FADD;

  Opnd0 = InVec0;
  Opnd1 = InVec1;
  return true;
}
| |
| /// Returns true if is possible to fold MUL and an idiom that has already been |
| /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into |
| /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the |
| /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2. |
| /// |
| /// Prior to calling this function it should be known that there is some |
| /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation |
| /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called |
| /// before replacement of such SDNode with ADDSUB operation. Thus the number |
| /// of \p Opnd0 uses is expected to be equal to 2. |
| /// For example, this function may be called for the following IR: |
| /// %AB = fmul fast <2 x double> %A, %B |
| /// %Sub = fsub fast <2 x double> %AB, %C |
| /// %Add = fadd fast <2 x double> %AB, %C |
| /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, |
| /// <2 x i32> <i32 0, i32 3> |
| /// There is a def for %Addsub here, which potentially can be replaced by |
| /// X86ISD::ADDSUB operation: |
| /// %Addsub = X86ISD::ADDSUB %AB, %C |
| /// and such ADDSUB can further be replaced with FMADDSUB: |
| /// %Addsub = FMADDSUB %A, %B, %C. |
| /// |
| /// The main reason why this method is called before the replacement of the |
| /// recognized ADDSUB idiom with ADDSUB operation is that such replacement |
| /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit |
| /// FMADDSUB is. |
| static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, |
| SelectionDAG &DAG, |
| SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, |
| unsigned ExpectedUses) { |
| if (Opnd0.getOpcode() != ISD::FMUL || |
| !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA()) |
| return false; |
| |
| // FIXME: These checks must match the similar ones in |
| // DAGCombiner::visitFADDForFMACombine. It would be good to have one |
| // function that would answer if it is Ok to fuse MUL + ADD to FMADD |
| // or MUL + ADDSUB to FMADDSUB. |
| const TargetOptions &Options = DAG.getTarget().Options; |
| bool AllowFusion = |
| (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); |
| if (!AllowFusion) |
| return false; |
| |
| Opnd2 = Opnd1; |
| Opnd1 = Opnd0.getOperand(1); |
| Opnd0 = Opnd0.getOperand(0); |
| |
| return true; |
| } |
| |
| /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or |
| /// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or |
| /// X86ISD::FMSUBADD node. |
| static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, |
| const X86Subtarget &Subtarget, |
| SelectionDAG &DAG) { |
| SDValue Opnd0, Opnd1; |
| unsigned NumExtracts; |
| bool IsSubAdd; |
| if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, |
| IsSubAdd)) |
| return SDValue(); |
| |
| MVT VT = BV->getSimpleValueType(0); |
| SDLoc DL(BV); |
| |
| // Try to generate X86ISD::FMADDSUB node here. |
| SDValue Opnd2; |
| if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) { |
| unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; |
| return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2); |
| } |
| |
| // We only support ADDSUB. |
| if (IsSubAdd) |
| return SDValue(); |
| |
| // Do not generate X86ISD::ADDSUB node for 512-bit types even though |
| // the ADDSUB idiom has been successfully recognized. There are no known |
| // X86 targets with 512-bit ADDSUB instructions! |
| // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom |
| // recognition. |
| if (VT.is512BitVector()) |
| return SDValue(); |
| |
| return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); |
| } |
| |
/// Return true if \p BV matches a horizontal add/sub op. On success, \p HOpcode
/// receives the X86ISD horizontal opcode and \p V0 / \p V1 the two source
/// vectors (either may remain UNDEF if only undef lanes reference it).
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
                             unsigned &HOpcode, SDValue &V0, SDValue &V1) {
  // Initialize outputs to known values.
  MVT VT = BV->getSimpleValueType(0);
  HOpcode = ISD::DELETED_NODE;
  V0 = DAG.getUNDEF(VT);
  V1 = DAG.getUNDEF(VT);

  // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
  // half of the result is calculated independently from the 128-bit halves of
  // the inputs, so that makes the index-checking logic below more complicated.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned GenericOpcode = ISD::DELETED_NODE;
  unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
  unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
  unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
  for (unsigned i = 0; i != Num128BitChunks; ++i) {
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      // Ignore undef elements.
      SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
      if (Op.isUndef())
        continue;

      // If there's an opcode mismatch, we're done.
      if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
        return false;

      // Initialize horizontal opcode.
      if (HOpcode == ISD::DELETED_NODE) {
        GenericOpcode = Op.getOpcode();
        switch (GenericOpcode) {
        case ISD::ADD: HOpcode = X86ISD::HADD; break;
        case ISD::SUB: HOpcode = X86ISD::HSUB; break;
        case ISD::FADD: HOpcode = X86ISD::FHADD; break;
        case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
        default: return false;
        }
      }

      // Each element must be a one-use binop of two extracts from the same
      // vector with constant indices.
      SDValue Op0 = Op.getOperand(0);
      SDValue Op1 = Op.getOperand(1);
      if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Op0.getOperand(0) != Op1.getOperand(0) ||
          !isa<ConstantSDNode>(Op0.getOperand(1)) ||
          !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
        return false;

      // The source vector is chosen based on which 64-bit half of the
      // destination vector is being calculated.
      if (j < NumEltsIn64Bits) {
        if (V0.isUndef())
          V0 = Op0.getOperand(0);
      } else {
        if (V1.isUndef())
          V1 = Op0.getOperand(0);
      }

      SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
      if (SourceVec != Op0.getOperand(0))
        return false;

      // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
      unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
      unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
      unsigned ExpectedIndex = i * NumEltsIn128Bits +
                               (j % NumEltsIn64Bits) * 2;
      if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
        continue;

      // If this is not a commutative op, this does not match.
      if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
        return false;

      // Addition is commutative, so try swapping the extract indexes.
      // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
      if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
        continue;

      // Extract indexes do not match horizontal requirement.
      return false;
    }
  }
  // We matched. Opcode and operands are returned by reference as arguments.
  return true;
}
| |
| static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, |
| SelectionDAG &DAG, unsigned HOpcode, |
| SDValue V0, SDValue V1) { |
| // If either input vector is not the same size as the build vector, |
| // extract/insert the low bits to the correct size. |
| // This is free (examples: zmm --> xmm, xmm --> ymm). |
| MVT VT = BV->getSimpleValueType(0); |
| unsigned Width = VT.getSizeInBits(); |
| if (V0.getValueSizeInBits() > Width) |
| V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width); |
| else if (V0.getValueSizeInBits() < Width) |
| V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width); |
| |
| if (V1.getValueSizeInBits() > Width) |
| V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width); |
| else if (V1.getValueSizeInBits() < Width) |
| V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width); |
| |
| unsigned NumElts = VT.getVectorNumElements(); |
| APInt DemandedElts = APInt::getAllOnesValue(NumElts); |
| for (unsigned i = 0; i != NumElts; ++i) |
| if (BV->getOperand(i).isUndef()) |
| DemandedElts.clearBit(i); |
| |
| // If we don't need the upper xmm, then perform as a xmm hop. |
| unsigned HalfNumElts = NumElts / 2; |
| if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) { |
| MVT HalfVT = VT.getHalfNumVectorElementsVT(); |
| V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128); |
| V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128); |
| SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1); |
| return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256); |
| } |
| |
| return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1); |
| } |
| |
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  // We need at least 2 non-undef elements to make this worthwhile by default.
  unsigned NumNonUndefs =
      count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
  if (NumNonUndefs < 2)
    return SDValue();

  // There are 4 sets of horizontal math operations distinguished by type:
  // int/FP at 128-bit/256-bit. Each type was introduced with a different
  // subtarget feature. Try to match those "native" patterns first.
  MVT VT = BV->getSimpleValueType(0);
  if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
      ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
      ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
      ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
    unsigned HOpcode;
    SDValue V0, V1;
    if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
      return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
  }

  // Try harder to match 256-bit ops by using extract/concat.
  if (!Subtarget.hasAVX() || !VT.is256BitVector())
    return SDValue();

  // Count the number of UNDEF operands in the build_vector in input.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned Half = NumElts / 2;
  unsigned NumUndefsLO = 0;
  unsigned NumUndefsHI = 0;
  for (unsigned i = 0, e = Half; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsLO++;

  for (unsigned i = Half, e = NumElts; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsHI++;

  SDLoc DL(BV);
  SDValue InVec0, InVec1;
  // Integer 256-bit case: match each 128-bit half independently, then verify
  // that both halves draw from the same (or undef) source vectors.
  if (VT == MVT::v8i32 || VT == MVT::v16i16) {
    SDValue InVec2, InVec3;
    unsigned X86Opcode;
    bool CanFold = true;

    if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
                              InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
                                   InVec1) &&
             isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
                                   InVec3) &&
             ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
             ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HSUB;
    else
      CanFold = false;

    if (CanFold) {
      // Do not try to expand this build_vector into a pair of horizontal
      // add/sub if we can emit a pair of scalar add/sub.
      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
        return SDValue();

      // Convert this build_vector into a pair of horizontal binops followed by
      // a concat vector. We must adjust the outputs from the partial horizontal
      // matching calls above to account for undefined vector halves.
      SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
      SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
      bool isUndefLO = NumUndefsLO == Half;
      bool isUndefHI = NumUndefsHI == Half;
      return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
                                   isUndefHI);
    }
  }

  // FP (and remaining integer) 256-bit case: match the whole vector as one
  // partial horizontal op, then expand with per-input mode (Mode = true).
  if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
      VT == MVT::v16i16) {
    unsigned X86Opcode;
    if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
                                   InVec1))
      X86Opcode = X86ISD::HSUB;
    else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
                                   InVec1))
      X86Opcode = X86ISD::FHADD;
    else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
                                   InVec1))
      X86Opcode = X86ISD::FHSUB;
    else
      return SDValue();

    // Don't try to expand this build_vector into a pair of horizontal add/sub
    // if we can simply emit a pair of scalar add/sub.
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      return SDValue();

    // Convert this build_vector into two horizontal add/sub followed by
    // a concat vector.
    bool isUndefLO = NumUndefsLO == Half;
    bool isUndefHI = NumUndefsHI == Half;
    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
                                 isUndefLO, isUndefHI);
  }

  return SDValue();
}
| |
| /// If a BUILD_VECTOR's source elements all apply the same bit operation and |
| /// one of their operands is constant, lower to a pair of BUILD_VECTOR and |
| /// just apply the bit to the vectors. |
| /// NOTE: Its not in our interest to start make a general purpose vectorizer |
| /// from this, but enough scalar bit operations are created from the later |
| /// legalization + scalarization stages to need basic support. |
| static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, |
| SelectionDAG &DAG) { |
| SDLoc DL(Op); |
| MVT VT = Op->getSimpleValueType(0); |
| unsigned NumElems = VT.getVectorNumElements(); |
| const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
| |
| // Check that all elements have the same opcode. |
| // TODO: Should we allow UNDEFS and if so how many? |
| unsigned Opcode = Op->getOperand(0).getOpcode(); |
| for (unsigned i = 1; i < NumElems; ++i) |
| if (Opcode != Op->getOperand(i).getOpcode()) |
| return SDValue(); |
| |
| // TODO: We may be able to add support for other Ops (ADD/SUB + shifts). |
| bool IsShift = false; |
| switch (Opcode) { |
| default: |
| return SDValue(); |
| case ISD::SHL: |
| case ISD::SRL: |
| case ISD::SRA: |
| IsShift = true; |
| break; |
| case ISD::AND: |
| case ISD::XOR: |
| case ISD::OR: |
| // Don't do this if the buildvector is a splat - we'd replace one |
| // constant with an entire vector. |
| if (Op->getSplatValue()) |
| return SDValue(); |
| if (!TLI.isOperationLegalOrPromote(Opcode, VT)) |
| return SDValue(); |
| break; |
| } |
| |
| SmallVector<SDValue, 4> LHSElts, RHSElts; |
| for (SDValue Elt : Op->ops()) { |
| SDValue LHS = Elt.getOperand(0); |
| SDValue RHS = Elt.getOperand(1); |
| |
| // We expect the canonicalized RHS operand to be the constant. |
| if (!isa<ConstantSDNode>(RHS)) |
| return SDValue(); |
| |
| // Extend shift amounts. |
| if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) { |
| if (!IsShift) |
| return SDValue(); |
| RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType()); |
| } |
| |
| LHSElts.push_back(LHS); |
| RHSElts.push_back(RHS); |
| } |
| |
| // Limit to shifts by uniform immediates. |
| // TODO: Only accept vXi8/vXi64 special cases? |
| // TODO: Permit non-uniform XOP/AVX2/MULLO cases? |
| if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; })) |
| return SDValue(); |
| |
| SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts); |
| SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts); |
| return DAG.getNode(Opcode, DL, VT, LHS, RHS); |
| } |
| |
| /// Create a vector constant without a load. SSE/AVX provide the bare minimum |
| /// functionality to do this, so it's all zeros, all ones, or some derivation |
| /// that is cheap to calculate. |
| static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, |
| const X86Subtarget &Subtarget) { |
| SDLoc DL(Op); |
| MVT VT = Op.getSimpleValueType(); |
| |
| // Vectors containing all zeros can be matched by pxor and xorps. |
| if (ISD::isBuildVectorAllZeros(Op.getNode())) |
| return Op; |
| |
| // Vectors containing all ones can be matched by pcmpeqd on 128-bit width |
| // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use |
| // vpcmpeqd on 256-bit vectors. |
| if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { |
| if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) |
| return Op; |
| |
| return getOnesVector(VT, DAG, DL); |
| } |
| |
| return SDValue(); |
| } |
| |
| /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute |
| /// from a vector of source values and a vector of extraction indices. |
| /// The vectors might be manipulated to match the type of the permute op. |
| static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, |
| SDLoc &DL, SelectionDAG &DAG, |
| const X86Subtarget &Subtarget) { |
| MVT ShuffleVT = VT; |
| EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger(); |
| unsigned NumElts = VT.getVectorNumElements(); |
| unsigned SizeInBits = VT.getSizeInBits(); |
| |
| // Adjust IndicesVec to match VT size. |
| assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts && |
| "Illegal variable permute mask size"); |
| if (IndicesVec.getValueType().getVectorNumElements() > NumElts) |
| IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec), |
| NumElts * VT.getScalarSizeInBits()); |
| IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT); |
| |
| // Handle SrcVec that don't match VT type. |
| if (SrcVec.getValueSizeInBits() != SizeInBits) { |
| if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) { |
| // Handle larger SrcVec by treating it as a larger permute. |
| unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits; |
| VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts); |
| IndicesVT = EVT(VT).changeVectorElementTypeToInteger(); |
| IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false, |
| Subtarget, DAG, SDLoc(IndicesVec)); |
| return extractSubVector( |
| createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0, |
| DAG, DL, SizeInBits); |
| } else if (SrcVec.getValueSizeInBits() < SizeInBits) { |
| // Widen smaller SrcVec to match VT. |
| SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec)); |
| } else |
| return SDValue(); |
| } |
| |
| auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) { |
| assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale"); |
| EVT SrcVT = Idx.getValueType(); |
| unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale; |
| uint64_t IndexScale = 0; |
| uint64_t IndexOffset = 0; |
| |
| // If we're scaling a smaller permute op, then we need to repeat the |
| // indices, scaling and offsetting them as well. |
| // e.g. v4i32 -> v16i8 (Scale = 4) |
| // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4) |
| // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0) |
| for (uint64_t i = 0; i != Scale; ++i) { |
| IndexScale |= Scale << (i * NumDstBits); |
| IndexOffset |= i << (i * NumDstBits); |
| } |
| |
| Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx, |
| DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT)); |
| Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx, |
| DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT)); |
| return Idx; |
| }; |
| |
| unsigned Opcode = 0; |
| switch (VT.SimpleTy) { |
| default: |
| break; |
| case MVT::v16i8: |
| if (Subtarget.hasSSSE3()) |
| Opcode = X86ISD::PSHUFB; |
| break; |
| case MVT::v8i16: |
| if (Subtarget.hasVLX() && Subtarget.hasBWI()) |
| Opcode = X86ISD::VPERMV; |
| else if (Subtarget.hasSSSE3()) { |
| Opcode = X86ISD::PSHUFB; |
| ShuffleVT = MVT::v16i8; |
| } |
| break; |
| case MVT::v4f32: |
| case MVT::v4i32: |
| if (Subtarget.hasAVX()) { |
| Opcode = X86ISD::VPERMILPV; |
| ShuffleVT = MVT::v4f32; |
| } else if (Subtarget.hasSSSE3()) { |
| Opcode = X86ISD::PSHUFB; |
| ShuffleVT = MVT::v16i8; |
| } |
| break; |
| case MVT::v2f64: |
| case MVT::v2i64: |
| if (Subtarget.hasAVX()) { |
| // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec. |
| IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec); |
| Opcode = X86ISD::VPERMILPV; |
| ShuffleVT = MVT::v2f64; |
| } else if (Subtarget.hasSSE41()) { |
| // SSE41 can compare v2i64 - select between indices 0 and 1. |
| return DAG.getSelectCC( |
| DL, IndicesVec, |
| getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL), |
| DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}), |
| DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}), |
| ISD::CondCode::SETEQ); |
| } |
| break; |
| case MVT::v32i8: |
| if (Subtarget.hasVLX() && Subtarget.hasVBMI()) |
| Opcode = X86ISD::VPERMV; |
| else if (Subtarget.hasXOP()) { |
| SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL); |
| SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL); |
| SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL); |
| SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL); |
| return DAG.getNode( |
| ISD::CONCAT_VECTORS, DL, VT, |
| DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx), |
| DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx)); |
| } else if (Subtarget.hasAVX()) { |
| SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL); |
| SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL); |
| SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo); |
| SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi); |
| auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL, |
| ArrayRef<SDValue> Ops) { |
| // Permute Lo and Hi and then select based on index range. |
| // This works as SHUFB uses bits[3:0] to permute elements and we don't |
| // care about the bit[7] as its just an index vector. |
| SDValue Idx = Ops[2]; |
| EVT VT = Idx.getValueType(); |
| return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT), |
| DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx), |
| DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx), |
| ISD::CondCode::SETGT); |
| }; |
| SDValue Ops[] = {LoLo, HiHi, IndicesVec}; |
| return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops, |
| PSHUFBBuilder); |
| } |
| break; |
| case MVT::v16i16: |
| if (Subtarget.hasVLX() && Subtarget.hasBWI()) |
| Opcode = X86ISD::VPERMV; |
| else if (Subtarget.hasAVX()) { |
| // Scale to v32i8 and perform as v32i8. |
| IndicesVec = ScaleIndices(IndicesVec, 2); |
| return DAG.getBitcast( |
| VT, createVariablePermute( |
| MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec), |
| DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget)); |
| } |
| break; |
| case MVT::v8f32: |
| case MVT::v8i32: |
| if (Subtarget.hasAVX2()) |
| Opcode = X86ISD::VPERMV; |
| else if (Subtarget.hasAVX()) { |
| SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec); |
| SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec, |
| {0, 1, 2, 3, 0, 1, 2, 3}); |
| SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec, |
| {4, 5, 6, 7, 4, 5, 6, 7}); |
| if (Subtarget.hasXOP()) |
| return DAG.getBitcast( |
| VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi, |
| IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8))); |
| // Permute Lo and Hi and then select based on index range. |
| // This works as VPERMILPS only uses index bits[0:1] to permute elements. |
| SDValue Res = DAG.getSelectCC( |
| DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32), |
| DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec), |
| DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec), |
| ISD::CondCode::SETGT); |
| return DAG.getBitcast(VT, Res); |
| } |
| break; |
| case MVT::v4i64: |
| case MVT::v4f64: |
| if (Subtarget.hasAVX512()) { |
| if (!Subtarget.hasVLX()) { |
| MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8); |
| SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG, |
| SDLoc(SrcVec)); |
| IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget, |
| DAG, SDLoc(IndicesVec)); |
| SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL, |
| DAG, Subtarget); |
| return extract256BitVector(Res, 0, DAG, DL); |
| } |
| Opcode = X86ISD::VPERMV; |
| } else if (Subtarget.hasAVX()) { |
| SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec); |
| SDValue LoLo = |
| DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1}); |
| SDValue HiHi = |
| DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3}); |
| // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec. |
| IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec); |
| if (Subtarget.hasXOP()) |
| return DAG.getBitcast( |
| VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi, |
| IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8))); |
| // Permute Lo and Hi and then select based on index range. |
| // This works as VPERMILPD only uses index bit[1] to permute elements. |
| SDValue Res = DAG.getSelectCC( |
| DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64), |
| DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec), |
| DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec), |
| ISD::CondCode::SETGT); |
| return DAG.getBitcast(VT, Res); |
| } |
| break; |
| case MVT::v64i8: |
| if (Subtarget.hasVBMI()) |
| Opcode = X86ISD::VPERMV; |
| break; |
| case MVT::v32i16: |
| if (Subtarget.hasBWI()) |
| Opcode = X86ISD::VPERMV; |
| break; |
| case MVT::v16f32: |
| case MVT::v16i32: |
| case MVT::v8f64: |
| case MVT::v8i64: |
| if (Subtarget.hasAVX512()) |
| Opcode = X86ISD::VPERMV; |
| break; |
| } |
| if (!Opcode) |
| return SDValue(); |
| |
| assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) && |
| (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 && |
| "Illegal variable permute shuffle type"); |
| |
| uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits(); |
| if (Scale > 1) |
| IndicesVec = ScaleIndices(IndicesVec, Scale); |
| |
| EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger(); |
| IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec); |
| |
| SrcVec = DAG.getBitcast(ShuffleVT, SrcVec); |
| SDValue Res = Opcode == X86ISD::VPERMV |
| ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec) |
| : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec); |
| return DAG.getBitcast(VT, Res); |
| } |
| |
| // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be |
| // reasoned to be a permutation of a vector by indices in a non-constant vector. |
| // (build_vector (extract_elt V, (extract_elt I, 0)), |
| // (extract_elt V, (extract_elt I, 1)), |
| // ... |
| // -> |
| // (vpermv I, V) |
| // |
| // TODO: Handle undefs |
| // TODO: Utilize pshufb and zero mask blending to support more efficient |
| // construction of vectors with constant-0 elements. |
| static SDValue |
| LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG, |
| const X86Subtarget &Subtarget) { |
| SDValue SrcVec, IndicesVec; |
| // Check for a match of the permute source vector and permute index elements. |
| // This is done by checking that the i-th build_vector operand is of the form: |
| // (extract_elt SrcVec, (extract_elt IndicesVec, i)). |
| for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) { |
| SDValue Op = V.getOperand(Idx); |
| if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
| return SDValue(); |
| |
| // If this is the first extract encountered in V, set the source vector, |
| // otherwise verify the extract is from the previously defined source |
| // vector. |
| if (!SrcVec) |
| SrcVec = Op.getOperand(0); |
| else if (SrcVec != Op.getOperand(0)) |
| return SDValue(); |
| SDValue ExtractedIndex = Op->getOperand(1); |
| // Peek through extends. |
| if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND || |
| ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND) |
| ExtractedIndex = ExtractedIndex.getOperand(0); |
| if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
| return SDValue(); |
| |
| // If this is the first extract from the index vector candidate, set the |
| // indices vector, otherwise verify the extract is from the previously |
| // defined indices vector. |
| if (!IndicesVec) |
| IndicesVec = ExtractedIndex.getOperand(0); |
| else if (IndicesVec != ExtractedIndex.getOperand(0)) |
| return SDValue(); |
| |
| auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1)); |
| if (!PermIdx || PermIdx->getAPIntValue() != Idx) |
| return SDValue(); |
| } |
| |
| SDLoc DL(V); |
| MVT VT = V.getSimpleValueType(); |
| return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget); |
| } |
| |
/// Lower a BUILD_VECTOR node by trying a cascade of strategies, from special
/// cases (predicate vectors, constants, broadcasts, single-element inserts)
/// down to the generic unpack-based expansion. The order of the attempts
/// matters: each later strategy assumes the earlier, cheaper ones did not
/// apply.
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);

  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = Op.getNumOperands();

  // Generate vectors for predicate vectors.
  if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
    return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);

  if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
    return VectorConstant;

  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
  if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
    return AddSub;
  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
    return HorizontalOp;
  if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
    return Broadcast;
  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
    return BitOp;

  unsigned EVTBits = EltVT.getSizeInBits();

  // Scan the operands once, gathering the statistics that drive the
  // strategy choices below: zero/non-zero counts, a bitmask of non-zero
  // positions, the distinct values seen, and whether everything is constant.
  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  uint64_t NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  unsigned NumConstants = NumElems;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.isUndef())
      continue;
    Values.insert(Elt);
    if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
      IsAllConstants = false;
      NumConstants--;
    }
    if (X86::isZeroNode(Elt))
      NumZero++;
    else {
      assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
      NonZeros |= ((uint64_t)1 << i);
      NumNonZero++;
    }
  }

  // All undef vector. Return an UNDEF. All zero vectors were handled above.
  if (NumNonZero == 0)
    return DAG.getUNDEF(VT);

  // If we are inserting one variable into a vector of non-zero constants, try
  // to avoid loading each constant element as a scalar. Load the constants as a
  // vector and then insert the variable scalar element. If insertion is not
  // supported, fall back to a shuffle to get the scalar blended with the
  // constants. Insertion into a zero vector is handled as a special-case
  // somewhere below here.
  if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
      (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
       isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
    // Create an all-constant vector. The variable element in the old
    // build vector is replaced by undef in the constant vector. Save the
    // variable scalar element and its index for use in the insertelement.
    LLVMContext &Context = *DAG.getContext();
    Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
    SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
    SDValue VarElt;
    SDValue InsIndex;
    for (unsigned i = 0; i != NumElems; ++i) {
      SDValue Elt = Op.getOperand(i);
      if (auto *C = dyn_cast<ConstantSDNode>(Elt))
        ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
      else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
        ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
      else if (!Elt.isUndef()) {
        assert(!VarElt.getNode() && !InsIndex.getNode() &&
               "Expected one variable element in this vector");
        VarElt = Elt;
        InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
      }
    }
    Constant *CV = ConstantVector::get(ConstVecOps);
    SDValue DAGConstVec = DAG.getConstantPool(CV, VT);

    // The constants we just created may not be legal (eg, floating point). We
    // must lower the vector right here because we can not guarantee that we'll
    // legalize it before loading it. This is also why we could not just create
    // a new build vector here. If the build vector contains illegal constants,
    // it could get split back up into a series of insert elements.
    // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
    SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
    MachineFunction &MF = DAG.getMachineFunction();
    MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
    SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
    unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
    unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
    if (InsertC < NumEltsInLow128Bits)
      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);

    // There's no good way to insert into the high elements of a >128-bit
    // vector, so use shuffles to avoid an extract/insert sequence.
    assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
    assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
    SmallVector<int, 8> ShuffleMask;
    unsigned NumElts = VT.getVectorNumElements();
    for (unsigned i = 0; i != NumElts; ++i)
      ShuffleMask.push_back(i == InsertC ? NumElts : i);
    SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
    return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
  }

  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1) {
    unsigned Idx = countTrailingZeros(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements. This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.
    if (Idx == 0) {
      if (NumZero == 0)
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
          (EltVT == MVT::i64 && Subtarget.is64Bit())) {
        assert((VT.is128BitVector() || VT.is256BitVector() ||
                VT.is512BitVector()) &&
               "Expected an SSE value type!");
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
      }

      // We can't directly insert an i8 or i16 into a vector, so zero extend
      // it to i32 first.
      if (EltVT == MVT::i16 || EltVT == MVT::i8) {
        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
        MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
        Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
        return DAG.getBitcast(VT, Item);
      }
    }

    // Is it a vector logical left shift?
    if (NumElems == 2 && Idx == 1 &&
        X86::isZeroNode(Op.getOperand(0)) &&
        !X86::isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits/2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the element
    // is a non-constant being inserted into an element other than the low one,
    // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
    }
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1) {
    if (EVTBits == 32) {
      // Instead of a shuffle like this:
      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // Check if it's possible to issue this instead.
      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue Item = Op.getOperand(Idx);
      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
    }
    return SDValue();
  }

  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
    return V;

  // See if we can use a vector load to get all of the elements.
  {
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
    if (SDValue LD =
            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
      return LD;
  }

  // If this is a splat of pairs of 32-bit elements, we can use a narrower
  // build_vector and broadcast it.
  // TODO: We could probably generalize this more.
  if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
    auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
      // Make sure all the even/odd operands match.
      for (unsigned i = 2; i != NumElems; ++i)
        if (Ops[i % 2] != Op.getOperand(i))
          return false;
      return true;
    };
    if (CanSplat(Op, NumElems, Ops)) {
      MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
      MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
      // Create a new build vector and cast to v2i64/v2f64.
      SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
                                     DAG.getBuildVector(NarrowVT, dl, Ops));
      // Broadcast from v2i64/v2f64 and cast to final VT.
      MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
      return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
                                            NewBV));
    }
  }

  // For AVX-length vectors, build the individual 128-bit pieces and use
  // shuffles to put them in place.
  if (VT.getSizeInBits() > 128) {
    MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);

    // Build both the lower and upper subvector.
    SDValue Lower =
        DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
    SDValue Upper = DAG.getBuildVector(
        HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));

    // Recreate the wider vector with the lower and upper part.
    return concatSubVectors(Lower, Upper, DAG, dl);
  }

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                               Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
    }
    return SDValue();
  }

  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16)
    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;

  if (EVTBits == 16 && NumElems == 8)
    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;

  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
  if (EVTBits == 32 && NumElems == 4)
    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
      return V;

  // If element VT is == 32 bits, turn it into a number of shuffles.
  if (NumElems == 4 && NumZero > 0) {
    // Materialize each element as either a zero vector or a
    // scalar_to_vector, then combine adjacent pairs.
    SmallVector<SDValue, 8> Ops(NumElems);
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1ULL << i));
      if (isZero)
        Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
      else
        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    // Combine pairs (0,1) and (2,3) based on which of each pair is non-zero:
    // both zero -> keep the zero vector, one non-zero -> MOVL it into the
    // low slot, both non-zero -> unpack them together.
    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros >> (i*2)) & 0x3) {
        default: llvm_unreachable("Unexpected NonZero count");
        case 0:
          Ops[i] = Ops[i*2];  // Must be a zero vector.
          break;
        case 1:
          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
          break;
        case 2:
          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
          break;
        case 3:
          Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
          break;
      }
    }

    // The MOVL cases above put the non-zero element in the low slot, so the
    // final shuffle may need to swap each pair back into source order.
    bool Reverse1 = (NonZeros & 0x3) == 2;
    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    int MaskVec[] = {
      Reverse1 ? 1 : 0,
      Reverse1 ? 0 : 1,
      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
      static_cast<int>(Reverse2 ? NumElems : NumElems+1)
    };
    return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
  }

  assert(Values.size() > 1 && "Expected non-undef and non-splat vector");

  // Check for a build vector from mostly shuffle plus few inserting.
  if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
    return Sh;

  // For SSE 4.1, use insertps to put the high elements into the low element.
  if (Subtarget.hasSSE41()) {
    SDValue Result;
    if (!Op.getOperand(0).isUndef())
      Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
    else
      Result = DAG.getUNDEF(VT);

    for (unsigned i = 1; i < NumElems; ++i) {
      if (Op.getOperand(i).isUndef()) continue;
      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                           Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
    }
    return Result;
  }

  // Otherwise, expand into a number of unpckl*, start by extending each of
  // our (non-undef) elements to the full vector width with the element in the
  // bottom slot of the vector (which generates no code for SSE).
  SmallVector<SDValue, 8> Ops(NumElems);
  for (unsigned i = 0; i < NumElems; ++i) {
    if (!Op.getOperand(i).isUndef())
      Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    else
      Ops[i] = DAG.getUNDEF(VT);
  }

  // Next, we iteratively mix elements, e.g. for v4f32:
  //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
  //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
  //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
    // Generate scaled UNPCKL shuffle mask.
    SmallVector<int, 16> Mask;
    for(unsigned i = 0; i != Scale; ++i)
      Mask.push_back(i);
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(NumElems+i);
    Mask.append(NumElems - Mask.size(), SM_SentinelUndef);

    for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
      Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
  }
  return Ops[0];
}
| |
| // 256-bit AVX can use the vinsertf128 instruction |
| // to create 256-bit vectors from two other 128-bit ones. |
| // TODO: Detect subvector broadcast here instead of DAG combine? |
| static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, |
| const X86Subtarget &Subtarget) { |
| SDLoc dl(Op); |
| MVT ResVT = Op.getSimpleValueType(); |
| |
| assert((ResVT.is256BitVector() || |
| ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); |
| |
| unsigned NumOperands = Op.getNumOperands(); |
| unsigned NumZero = 0; |
| unsigned NumNonZero = 0; |
| unsigned NonZeros = 0; |
| for (unsigned i = 0; i != NumOperands; ++i) { |
| SDValue SubVec = Op.getOperand(i); |
| if (SubVec.isUndef()) |
| continue; |
| if (ISD::isBuildVectorAllZeros(SubVec.getNode())) |
| ++NumZero; |
| else { |
| assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. |
| NonZeros |= 1 << i; |
| ++NumNonZero; |
| } |
| } |
| |
| // If we have more than 2 non-zeros, build each half separately. |
| if (NumNonZero > 2) { |
| MVT HalfVT = ResVT.getHalfNumVectorElementsVT(); |
| ArrayRef<SDUse> Ops = Op->ops(); |
| SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, |
| Ops.slice(0, NumOperands/2)); |
| SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, |
| Ops.slice(NumOperands/2)); |
| return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); |
| } |
| |
| // Otherwise, build it up through insert_subvectors. |
| SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl) |
| : DAG.getUNDEF(ResVT); |
| |
| MVT SubVT = Op.getOperand(0).getSimpleValueType(); |
| unsigned NumSubElems = SubVT.getVectorNumElements(); |
| for (unsigned i = 0; i != NumOperands; ++i) { |
| if ((NonZeros & (1 << i)) == 0) |
| continue; |
| |
| Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, |
| Op.getOperand(i), |
| DAG.getIntPtrConstant(i * NumSubElems, dl)); |
| } |
| |
| return Vec; |
| } |
| |
// Returns true if the given node is a type promotion (by concatenating i1
// zeros) of the result of a node that already zeros all upper bits of
// k-register.
// Lowers CONCAT_VECTORS of vXi1 mask vectors, preferring single-shift or
// single-insert forms over the generic two-kshift expansion.
// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG & DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
  unsigned NumOperands = Op.getNumOperands();

  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  // Build bitmasks of which operand positions are all-zero and which are
  // non-zero; undef operands appear in neither mask.
  uint64_t Zeros = 0;
  uint64_t NonZeros = 0;
  for (unsigned i = 0; i != NumOperands; ++i) {
    SDValue SubVec = Op.getOperand(i);
    if (SubVec.isUndef())
      continue;
    assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
      Zeros |= (uint64_t)1 << i;
    else
      NonZeros |= (uint64_t)1 << i;
  }

  unsigned NumElems = ResVT.getVectorNumElements();

  // If we are inserting non-zero vector and there are zeros in LSBs and undef
  // in the MSBs we need to emit a KSHIFTL. The generic lowering to
  // insert_subvector will give us two kshifts.
  // isPowerOf2_64(NonZeros) means exactly one operand is non-zero;
  // NonZeros > Zeros means all the zero operands sit below it.
  if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
      Log2_64(NonZeros) != NumOperands - 1) {
    MVT ShiftVT = ResVT;
    // KSHIFT only operates on at-least-8-element masks (v8i1 needs DQI), so
    // widen the shift type if necessary.
    if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
      ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
    unsigned Idx = Log2_64(NonZeros);
    SDValue SubVec = Op.getOperand(Idx);
    unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
                         DAG.getUNDEF(ShiftVT), SubVec,
                         DAG.getIntPtrConstant(0, dl));
    Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
                     DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
                       DAG.getIntPtrConstant(0, dl));
  }

  // If there are zero or one non-zeros we can handle this very simply.
  if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
    SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
    if (!NonZeros)
      return Vec;
    unsigned Idx = Log2_64(NonZeros);
    SDValue SubVec = Op.getOperand(Idx);
    unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
                       DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
  }

  // More than two operands with multiple non-zeros: concatenate each half
  // recursively and join the results.
  if (NumOperands > 2) {
    MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
    ArrayRef<SDUse> Ops = Op->ops();
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(0, NumOperands/2));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(NumOperands/2));
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");

  if (ResVT.getVectorNumElements() >= 16)
    return Op; // The operation is legal with KUNPCK

  // Two non-zero halves of a small mask: expand to a pair of subvector
  // inserts into an undef vector.
  SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
                            DAG.getUNDEF(ResVT), Op.getOperand(0),
                            DAG.getIntPtrConstant(0, dl));
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
                     DAG.getIntPtrConstant(NumElems/2, dl));
}
| |
| static SDValue LowerCONCAT_VECTORS(SDValue Op, |
| const X86Subtarget &Subtarget, |
| SelectionDAG &DAG) { |
| MVT VT = Op.getSimpleValueType(); |
| if (VT.getVectorElementType() == MVT::i1) |
| return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); |
| |
| assert((VT.is256BitVector() && Op.getNumOperands() == 2) || |
| (VT.is512BitVector() && (Op.getNumOperands() == 2 || |
| Op.getNumOperands() == 4))); |
| |
| // AVX can use the vinsertf128 instruction to create 256-bit vectors |
| // from two other 128-bit ones. |
| |
| // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors |
| return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Vector shuffle lowering |
| // |
| // This is an experimental code path for lowering vector shuffles on x86. It is |
| // designed to handle arbitrary vector shuffles and blends, gracefully |
| // degrading performance as necessary. It works hard to recognize idiomatic |
| // shuffles and lower them to optimal instruction patterns without leaving |
| // a framework that allows reasonably efficient handling of all vector shuffle |
| // patterns. |
| //===----------------------------------------------------------------------===// |
| |
| /// Tiny helper function to identify a no-op mask. |
| /// |
| /// This is a somewhat boring predicate function. It checks whether the mask |
| /// array input, which is assumed to be a single-input shuffle mask of the kind |
| /// used by the X86 shuffle instructions (not a fully general |
| /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an |
| /// in-place shuffle are 'no-op's. |
| static bool isNoopShuffleMask(ArrayRef<int> Mask) { |
| for (int i = 0, Size = Mask.size(); i < Size; ++i) { |
| assert(Mask[i] >= -1 && "Out of bound mask element!"); |
| if (Mask[i] >= 0 && Mask[i] != i) |
| return false; |
| } |
| return true; |
| } |
| |
| /// Test whether there are elements crossing LaneSizeInBits lanes in this |
| /// shuffle mask. |
| /// |
| /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations |
| /// and we routinely test for these. |
| static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, |
| unsigned ScalarSizeInBits, |
| ArrayRef<int> Mask) { |
| assert(LaneSizeInBits && ScalarSizeInBits && |
| (LaneSizeInBits % ScalarSizeInBits) == 0 && |
| "Illegal shuffle lane size"); |
| int LaneSize = LaneSizeInBits / ScalarSizeInBits; |
| int Size = Mask.size(); |
| for (int i = 0; i < Size; ++i) |
| if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) |
| return true; |
| return false; |
| } |
| |
/// Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
  // Convenience wrapper with the lane width fixed at 128 bits (XMM lanes).
  return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
}
| |
/// Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
/// suitable for use with existing 128-bit shuffles as entries from the second
/// vector have been remapped to [LaneSize, 2*LaneSize).
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                  ArrayRef<int> Mask,
                                  SmallVectorImpl<int> &RepeatedMask) {
  // Number of mask elements per sub-lane.
  auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, -1);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] < 0)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
| |
/// Test whether a shuffle mask is equivalent within each 128-bit lane.
/// On success, \p RepeatedMask holds the 128-bit lane-relative mask.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
| |
/// Test whether a shuffle mask is equivalent within each 128-bit lane,
/// discarding the computed lane-relative mask.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
  SmallVector<int, 32> RepeatedMask;
  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
| |
/// Test whether a shuffle mask is equivalent within each 256-bit lane.
/// On success, \p RepeatedMask holds the 256-bit lane-relative mask.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}
| |
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                        ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &RepeatedMask) {
  // Number of mask elements per sub-lane.
  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
    if (Mask[i] == SM_SentinelUndef)
      continue;
    if (Mask[i] == SM_SentinelZero) {
      // A zero element may merge into a slot that is undef or already zero,
      // but not into one holding a real index.
      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
        return false;
      RepeatedMask[i % LaneSize] = SM_SentinelZero;
      continue;
    }
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM =
        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
| |
/// Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
                                ArrayRef<int> ExpectedMask) {
  if (Mask.size() != ExpectedMask.size())
    return false;

  int Size = Mask.size();

  // If the values are build vectors, we can look through them to find
  // equivalent inputs that make the shuffles equivalent.
  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
      // Mismatched indices may still be equivalent if both refer to identical
      // scalar operands of the build vectors.
      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
      if (!MaskBV || !ExpectedBV ||
          MaskBV->getOperand(Mask[i] % Size) !=
              ExpectedBV->getOperand(ExpectedMask[i] % Size))
        return false;
    }
  }

  return true;
}
| |
/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
                                      ArrayRef<int> ExpectedMask,
                                      SDValue V1 = SDValue(),
                                      SDValue V2 = SDValue()) {
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
    return false;
  assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
         "Illegal target shuffle mask");

  // Check for out-of-range target shuffle mask indices.
  if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
    return false;

  // If the values are build vectors, we can look through them to find
  // equivalent inputs that make the shuffles equivalent.
  auto *BV1 = dyn_cast_or_null<BuildVectorSDNode>(V1);
  auto *BV2 = dyn_cast_or_null<BuildVectorSDNode>(V2);
  // Only use a build vector if its operand count matches the mask width, so
  // that the per-element indexing below stays in bounds.
  BV1 = ((BV1 && Size != (int)BV1->getNumOperands()) ? nullptr : BV1);
  BV2 = ((BV2 && Size != (int)BV2->getNumOperands()) ? nullptr : BV2);

  for (int i = 0; i < Size; ++i) {
    if (Mask[i] == SM_SentinelUndef || Mask[i] == ExpectedMask[i])
      continue;
    if (0 <= Mask[i] && 0 <= ExpectedMask[i]) {
      // Mismatched indices may still be equivalent if both refer to identical
      // scalar operands of the build vectors.
      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
      if (MaskBV && ExpectedBV &&
          MaskBV->getOperand(Mask[i] % Size) ==
              ExpectedBV->getOperand(ExpectedMask[i] % Size))
        continue;
    }
    // TODO - handle SM_Sentinel equivalences.
    return false;
  }
  return true;
}
| |
| // Attempt to create a shuffle mask from a VSELECT condition mask. |
| static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask, |
| SDValue Cond) { |
| if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) |
| return false; |
| |
| unsigned Size = Cond.getValueType().getVectorNumElements(); |
| Mask.resize(Size, SM_SentinelUndef); |
| |
| for (int i = 0; i != (int)Size; ++i) { |
| SDValue CondElt = Cond.getOperand(i); |
| Mask[i] = i; |
| // Arbitrarily choose from the 2nd operand if the select condition element |
| // is undef. |
| // TODO: Can we do better by matching patterns such as even/odd? |
| if (CondElt.isUndef() || isNullConstant(CondElt)) |
| Mask[i] += Size; |
| } |
| |
| return true; |
| } |
| |
| // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd |
| // instructions. |
| static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) { |
| if (VT != MVT::v8i32 && VT != MVT::v8f32) |
| return false; |
| |
| SmallVector<int, 8> Unpcklwd; |
| createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true, |
| /* Unary = */ false); |
| SmallVector<int, 8> Unpckhwd; |
| createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false, |
| /* Unary = */ false); |
| bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) || |
| isTargetShuffleEquivalent(Mask, Unpckhwd)); |
| return IsUnpackwdMask; |
| } |
| |
// Return true if the mask matches any 128-bit UNPCKL/UNPCKH pattern,
// unary or binary, in either operand order.
static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
  // Create 128-bit vector type based on mask size.
  MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
  MVT VT = MVT::getVectorVT(EltVT, Mask.size());

  // We can't assume a canonical shuffle mask, so try the commuted version too.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);

  // Match any of unary/binary or low/high.
  for (unsigned i = 0; i != 4; ++i) {
    SmallVector<int, 16> UnpackMask;
    // Bit 1 of i selects lo vs. hi, bit 0 selects binary vs. unary.
    createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
    if (isTargetShuffleEquivalent(Mask, UnpackMask) ||
        isTargetShuffleEquivalent(CommutedMask, UnpackMask))
      return true;
  }
  return false;
}
| |
| /// Return true if a shuffle mask chooses elements identically in its top and |
| /// bottom halves. For example, any splat mask has the same top and bottom |
| /// halves. If an element is undefined in only one half of the mask, the halves |
| /// are not considered identical. |
| static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) { |
| assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask"); |
| unsigned HalfSize = Mask.size() / 2; |
| for (unsigned i = 0; i != HalfSize; ++i) { |
| if (Mask[i] != Mask[i + HalfSize]) |
| return false; |
| } |
| return true; |
| } |
| |
| /// Get a 4-lane 8-bit shuffle immediate for a mask. |
| /// |
| /// This helper function produces an 8-bit shuffle immediate corresponding to |
| /// the ubiquitous shuffle encoding scheme used in x86 instructions for |
| /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for |
| /// example. |
| /// |
| /// NB: We rely heavily on "undef" masks preserving the input lane. |
| static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) { |
| assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); |
| assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); |
| assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); |
| assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); |
| assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); |
| |
| unsigned Imm = 0; |
| Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0; |
| Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2; |
| Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4; |
| Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6; |
| return Imm; |
| } |
| |
/// Wrap getV4X86ShuffleImm into an i8 target constant suitable for use as an
/// instruction immediate operand.
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
                                          SelectionDAG &DAG) {
  return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
| |
// The shuffle result is as follows:
// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in
// ascending order. Each of Zeroable's elements corresponds to a particular
// Mask element, as described in the computeZeroableShuffleElements function.
//
// The function looks for a sub-mask whose nonzero elements are in increasing
// order. If such a sub-mask exists, the function returns true.
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
                                     ArrayRef<int> Mask, const EVT &VectorType,
                                     bool &IsZeroSideLeft) {
  int NextElement = -1;
  // Check if the Mask's nonzero elements are in increasing order.
  for (int i = 0, e = Mask.size(); i < e; i++) {
    // Checks if the mask's zeros elements are built from only zeros.
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] < 0)
      return false;
    if (Zeroable[i])
      continue;
    // Find the lowest non zero element
    if (NextElement < 0) {
      // If the first nonzero element is not 0, it must read the start of the
      // second source (index == NumElements), i.e. the zeros are on the left.
      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
      IsZeroSideLeft = NextElement != 0;
    }
    // Exit if the mask's non zero elements are not in increasing order.
    if (NextElement != Mask[i])
      return false;
    NextElement++;
  }
  return true;
}
| |
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
///
/// Returns SDValue() if the mask needs both inputs or crosses 128-bit lanes.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                      ArrayRef<int> Mask, SDValue V1,
                                      SDValue V2, const APInt &Zeroable,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  const int NumBytes = VT.getSizeInBits() / 8;
  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
         (Subtarget.hasBWI() && VT.is512BitVector()));

  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
  // Sign bit set in i8 mask means zero element.
  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

  SDValue V;
  // Build the per-byte PSHUFB control mask, scaling each mask element to
  // NumEltBytes byte selectors.
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / NumEltBytes];
    if (M < 0) {
      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
      continue;
    }
    if (Zeroable[i / NumEltBytes]) {
      PSHUFBMask[i] = ZeroMask;
      continue;
    }

    // We can only use a single input of V1 or V2.
    SDValue SrcV = (M >= Size ? V2 : V1);
    if (V && V != SrcV)
      return SDValue();
    V = SrcV;
    // Fold the index into the chosen operand's range.
    M %= Size;

    // PSHUFB can't cross lanes, ensure this doesn't happen.
    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
      return SDValue();

    // Convert the element index into a lane-relative byte index.
    M = M % LaneSize;
    M = M * NumEltBytes + (i % NumEltBytes);
    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
  }
  assert(V && "Failed to find a source input");

  // Perform the byte shuffle in a vXi8 type and bitcast back.
  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
  return DAG.getBitcast(
      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
| |
| static SDValue getMaskNode(SDValue Mask, MVT MaskVT, |
| const X86Subtarget &Subtarget, SelectionDAG &DAG, |
| const SDLoc &dl); |
| |
// X86 has dedicated shuffle that can be lowered to VEXPAND
static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
                                    const APInt &Zeroable,
                                    ArrayRef<int> Mask, SDValue &V1,
                                    SDValue &V2, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  bool IsLeftZeroSide = true;
  // Only masks whose non-zero elements read consecutive ascending source
  // elements can be expressed as an expand.
  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
                                IsLeftZeroSide))
    return SDValue();
  // The expand write-mask selects the result elements that receive source
  // elements; all other result elements are zeroed.
  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
  // Mask integers are at least 8 bits wide.
  MVT IntegerType =
      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
  unsigned NumElts = VT.getVectorNumElements();
  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
         "Unexpected number of vector elements");
  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
                              Subtarget, DAG, DL);
  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
  // Expand whichever operand holds the non-zero elements.
  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
  return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
}
| |
/// Attempt to match \p TargetMask against the UNPCKL/UNPCKH patterns,
/// setting \p UnpackOpcode and updating \p V1 / \p V2 on success.
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
                                  unsigned &UnpackOpcode, bool IsUnary,
                                  ArrayRef<int> TargetMask, const SDLoc &DL,
                                  SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  int NumElts = VT.getVectorNumElements();

  // Determine whether the even (first-source) and odd (second-source)
  // interleave slots are entirely undef and/or zeroable.
  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
  for (int i = 0; i != NumElts; i += 2) {
    int M1 = TargetMask[i + 0];
    int M2 = TargetMask[i + 1];
    Undef1 &= (SM_SentinelUndef == M1);
    Undef2 &= (SM_SentinelUndef == M2);
    Zero1 &= isUndefOrZero(M1);
    Zero2 &= isUndefOrZero(M2);
  }
  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
         "Zeroable shuffle detected");

  // Attempt to match the target mask against the unpack lo/hi mask patterns.
  SmallVector<int, 64> Unpckl, Unpckh;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
  if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
    UnpackOpcode = X86ISD::UNPCKL;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
  if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
    UnpackOpcode = X86ISD::UNPCKH;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
  if (IsUnary && (Zero1 || Zero2)) {
    // Don't bother if we can blend instead.
    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
      return false;

    bool MatchLo = true, MatchHi = true;
    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
      int M = TargetMask[i];

      // Ignore if the input is known to be zero or the index is undef.
      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
          (M == SM_SentinelUndef))
        continue;

      MatchLo &= (M == Unpckl[i]);
      MatchHi &= (M == Unpckh[i]);
    }

    if (MatchLo || MatchHi) {
      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
      // Replace the zeroable side with a real zero vector; the non-zero side
      // is always the unary V1 input.
      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      return true;
    }
  }

  // If a binary shuffle, commute and try again.
  if (!IsUnary) {
    ShuffleVectorSDNode::commuteMask(Unpckl);
    if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
      UnpackOpcode = X86ISD::UNPCKL;
      std::swap(V1, V2);
      return true;
    }

    ShuffleVectorSDNode::commuteMask(Unpckh);
    if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
      UnpackOpcode = X86ISD::UNPCKH;
      std::swap(V1, V2);
      return true;
    }
  }

  return false;
}
| |
| // X86 has dedicated unpack instructions that can handle specific blend |
| // operations: UNPCKH and UNPCKL. |
| static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, |
| ArrayRef<int> Mask, SDValue V1, SDValue V2, |
| SelectionDAG &DAG) { |
| SmallVector<int, 8> Unpckl; |
| createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false); |
| if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) |
| return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); |
| |
| SmallVector<int, 8> Unpckh; |
| createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false); |
| if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) |
| return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); |
| |
| // Commute and try again. |
| ShuffleVectorSDNode::commuteMask(Unpckl); |
| if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) |
| return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); |
| |
| ShuffleVectorSDNode::commuteMask(Unpckh); |
| if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) |
| return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); |
| |
| return SDValue(); |
| } |
| |
/// Match a mask whose first Size/Delta elements select every Delta'th element
/// of the truncated vector (which lives in the second operand when
/// \p SwappedOps is set) and whose remaining elements never read from it.
static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
                                int Delta) {
  int Size = (int)Mask.size();
  int Split = Size / Delta;
  int TruncatedVectorStart = SwappedOps ? Size : 0;

  // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
  if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
    return false;

  // The rest of the mask should not refer to the truncated vector's elements.
  if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
                   TruncatedVectorStart + Size))
    return false;

  return true;
}
| |
// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
//
// An example is the following:
//
// t0: ch = EntryToken
// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
// t25: v4i32 = truncate t2
// t41: v8i16 = bitcast t25
// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
// t18: v2i64 = bitcast t51
//
// Without avx512vl, this is lowered to:
//
// vpmovqd %zmm0, %ymm0
// vpshufb {{.*#+}} xmm0 =
// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
//
// But when avx512vl is available, one can just use a single vpmovdw
// instruction.
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
                                     MVT VT, SDValue V1, SDValue V2,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (VT != MVT::v16i8 && VT != MVT::v8i16)
    return SDValue();

  if (Mask.size() != VT.getVectorNumElements())
    return SDValue();

  bool SwappedOps = false;

  // Canonicalize so that V2 is the all-zeros build vector.
  if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
    if (!ISD::isBuildVectorAllZeros(V1.getNode()))
      return SDValue();

    std::swap(V1, V2);
    SwappedOps = true;
  }

  // Look for:
  //
  // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
  // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
  //
  // and similar ones.
  if (V1.getOpcode() != ISD::BITCAST)
    return SDValue();
  if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
    return SDValue();

  SDValue Src = V1.getOperand(0).getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();

  // The vptrunc** instructions truncating 128 bit and 256 bit vectors
  // are only available with avx512vl.
  if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
    return SDValue();

  // Down Convert Word to Byte is only available with avx512bw. The case with
  // 256-bit output doesn't contain a shuffle and is therefore not handled here.
  if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
      !Subtarget.hasBWI())
    return SDValue();

  // The first half/quarter of the mask should refer to every second/fourth
  // element of the vector truncated and bitcasted.
  if (!matchShuffleAsVPMOV(Mask, SwappedOps, 2) &&
      !matchShuffleAsVPMOV(Mask, SwappedOps, 4))
    return SDValue();

  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
}
| |
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
                                 unsigned &PackOpcode, ArrayRef<int> TargetMask,
                                 SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned BitSize = VT.getScalarSizeInBits();
  // The pack source type has elements twice as wide, half as many.
  MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
  MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);

  auto MatchPACK = [&](SDValue N1, SDValue N2) {
    SDValue VV1 = DAG.getBitcast(PackVT, N1);
    SDValue VV2 = DAG.getBitcast(PackVT, N2);
    if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
      // PACKUS saturates unsigned: it only acts as a plain truncation if the
      // discarded upper bits are known zero.
      APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
      if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
          (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
        V1 = VV1;
        V2 = VV2;
        SrcVT = PackVT;
        PackOpcode = X86ISD::PACKUS;
        return true;
      }
    }
    // PACKSS saturates signed: it only acts as a plain truncation if the
    // source is known sign-extended from the narrower width.
    if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
        (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
      V1 = VV1;
      V2 = VV2;
      SrcVT = PackVT;
      PackOpcode = X86ISD::PACKSS;
      return true;
    }
    return false;
  };

  // Try binary shuffle.
  SmallVector<int, 32> BinaryMask;
  createPackShuffleMask(VT, BinaryMask, false);
  if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
    if (MatchPACK(V1, V2))
      return true;

  // Try unary shuffle.
  SmallVector<int, 32> UnaryMask;
  createPackShuffleMask(VT, UnaryMask, true);
  if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
    if (MatchPACK(V1, V1))
      return true;

  return false;
}
| |
/// Lower a shuffle to PACKSS/PACKUS if the mask matches a pack pattern.
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
                                    SDValue V1, SDValue V2, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT PackVT;
  unsigned PackOpcode;
  // On success, matchShuffleWithPACK has already replaced V1/V2 with values
  // bitcast to PackVT, so the bitcasts below are no-ops kept for clarity.
  if (matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
                           Subtarget))
    return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
                       DAG.getBitcast(PackVT, V2));

  return SDValue();
}
| |
/// Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
                                     SDValue V2, ArrayRef<int> Mask,
                                     const APInt &Zeroable,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  MVT MaskVT = VT;
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero, AllOnes;
  // Use f64 if i64 isn't legal.
  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
    EltVT = MVT::f64;
    MaskVT = MVT::getVectorVT(EltVT, Mask.size());
  }

  MVT LogicVT = VT;
  if (EltVT == MVT::f32 || EltVT == MVT::f64) {
    // Build float constants for the mask but perform the AND in the
    // equivalent-width integer type.
    Zero = DAG.getConstantFP(0.0, DL, EltVT);
    AllOnes = DAG.getConstantFP(
        APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT);
    LogicVT =
        MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
  } else {
    Zero = DAG.getConstant(0, DL, EltVT);
    AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  }

  // All-ones lanes pass the source through; zero lanes clear the result.
  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
  SDValue V;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Zeroable[i])
      continue;
    if (Mask[i] % Size != i)
      return SDValue(); // Not a blend.
    if (!V)
      V = Mask[i] < Size ? V1 : V2;
    else if (V != (Mask[i] < Size ? V1 : V2))
      return SDValue(); // Can only let one input through the mask.

    VMaskOps[i] = AllOnes;
  }
  if (!V)
    return SDValue(); // No non-zeroable elements!

  SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
  VMask = DAG.getBitcast(LogicVT, VMask);
  V = DAG.getBitcast(LogicVT, V);
  SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
  return DAG.getBitcast(VT, And);
}
| |
/// Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                      SDValue V2, ArrayRef<int> Mask,
                                      SelectionDAG &DAG) {
  assert(VT.isInteger() && "Only supports integer vector types!");
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  // Build a constant selection mask: all-ones lanes take V1, zero lanes V2.
  SmallVector<SDValue, 16> MaskOps;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
      return SDValue(); // Shuffled input!
    MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
  }

  // Result = (V1 & M) | (V2 & ~M).
  SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
  V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
  V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
| |
| static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, |
| SDValue PreservedSrc, |
| const X86Subtarget &Subtarget, |
| SelectionDAG &DAG); |
| |
/// Attempt to match \p Mask as a blend of V1/V2, computing the immediate
/// \p BlendMask (bit i set selects V2's lane i). Zeroable elements may be
/// redirected at an all-zeros/undef input; in that case \p Mask is rewritten
/// in place and \p ForceV1Zero / \p ForceV2Zero report that the input must be
/// replaced with a real zero vector by the caller.
static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
                                MutableArrayRef<int> Mask,
                                const APInt &Zeroable, bool &ForceV1Zero,
                                bool &ForceV2Zero, uint64_t &BlendMask) {
  bool V1IsZeroOrUndef =
      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZeroOrUndef =
      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

  BlendMask = 0;
  ForceV1Zero = false, ForceV2Zero = false;
  assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");

  // Attempt to generate the binary blend mask. If an input is zero then
  // we can use any lane.
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    if (M == i)
      continue;
    if (M == i + Size) {
      BlendMask |= 1ull << i;
      continue;
    }
    if (Zeroable[i]) {
      if (V1IsZeroOrUndef) {
        ForceV1Zero = true;
        Mask[i] = i;
        continue;
      }
      if (V2IsZeroOrUndef) {
        ForceV2Zero = true;
        BlendMask |= 1ull << i;
        Mask[i] = i + Size;
        continue;
      }
    }
    // The element reads from a non-matching lane - not a blend.
    return false;
  }
  return true;
}
| |
/// Widen a blend immediate so that each of the \p Size source bits covers
/// \p Scale consecutive bits of the result (e.g. scaling a v4 blend mask up
/// to the equivalent v8 blend mask).
static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
                                            int Scale) {
  uint64_t ScaledMask = 0;
  const uint64_t Run = (1ull << Scale) - 1; // Scale consecutive set bits.
  for (int i = 0; i != Size; ++i)
    if ((BlendMask >> i) & 1)
      ScaledMask |= Run << (i * Scale);
  return ScaledMask;
}
| |
| /// Try to emit a blend instruction for a shuffle. |
| /// |
| /// This doesn't do any checks for the availability of instructions for blending |
| /// these values. It relies on the availability of the X86ISD::BLENDI pattern to |
| /// be matched in the backend with the type given. What it does check for is |
| /// that the shuffle mask is a blend, or convertible into a blend with zero. |
| static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, |
| SDValue V2, ArrayRef<int> Original, |
| const APInt &Zeroable, |
| const X86Subtarget &Subtarget, |
| SelectionDAG &DAG) { |
| uint64_t BlendMask = 0; |
| bool ForceV1Zero = false, ForceV2Zero = false; |
| SmallVector<int, 64> Mask(Original.begin(), Original.end()); |
| if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero, |
| BlendMask)) |
| return SDValue(); |
| |
| // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. |
| if (ForceV1Zero) |
| V1 = getZeroVector(VT, Subtarget, DAG, DL); |
| if (ForceV2Zero) |
| V2 = getZeroVector(VT, Subtarget, DAG, DL); |
| |
| switch (VT.SimpleTy) { |
| case MVT::v4i64: |
| case MVT::v8i32: |
| assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); |
| LLVM_FALLTHROUGH; |
| case MVT::v4f64: |
| case MVT::v8f32: |
| assert(Subtarget.hasAVX() && "256-bit float blends require AVX!"); |
| LLVM_FALLTHROUGH; |
| case MVT::v2f64: |
| case MVT::v2i64: |
| case MVT::v4f32: |
| case MVT::v4i32: |
| case MVT::v8i16: |
| assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!"); |
| return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, |
| DAG.getTargetConstant(BlendMask, DL, MVT::i8)); |
| case MVT::v16i16: { |
| assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!"); |
| SmallVector<int, 8> RepeatedMask; |
| if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { |
| // We can lower these with PBLENDW which is mirrored across 128-bit lanes. |
| assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); |
| BlendMask = 0; |
| for (int i = 0; i < 8; ++i) |
| if (RepeatedMask[i] >= 8) |
| BlendMask |= 1ull << i; |
| return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, |
| DAG.getTargetConstant(BlendMask, DL, MVT::i8)); |
| } |
| // Use PBLENDW for lower/upper lanes and then blend lanes. |
| // TODO - we should allow 2 PBLENDW here and leave shuffle combine to |
| // merge to VSELECT where useful. |
| uint64_t LoMask = BlendMask & 0xFF; |
| uint64_t HiMask = (BlendMask >> 8) & 0xFF; |
| if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) { |
| SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, |
| DAG.getTargetConstant(LoMask, DL, MVT::i8)); |
| SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, |
| DAG.getTargetConstant(HiMask, DL, MVT::i8)); |
| return DAG.getVectorShuffle( |
| MVT::v16i16, DL, Lo, Hi, |
| {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}); |
| } |
| LLVM_FALLTHROUGH; |
| } |
| case MVT::v32i8: |
| assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!"); |
| LLVM_FALLTHROUGH; |
| case MVT::v16i8: { |
| assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!"); |
| |
| // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. |
| if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, |
| Subtarget, DAG)) |
| return Masked; |
| |
| if (Subtarget.hasBWI() && Subtarget.hasVLX()) { |
| MVT IntegerType = |
| MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); |
| SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); |
| return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); |
| } |
| |
| // Scale the blend by the number of bytes per element. |
| int Scale = VT.getScalarSizeInBits() / 8; |
| |
| // This form of blend is always done on bytes. Compute the byte vector |
| // type. |
| MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); |
| |
| // x86 allows load folding with blendvb from the 2nd source operand. But |
| // we are still using LLVM select here (see comment below), so that's V1. |
| // If V2 can be load-folded and V1 cannot be load-folded, then commute to |
| // allow that load-folding possibility. |
| if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) { |
| ShuffleVectorSDNode::commuteMask(Mask); |
| std::swap(V1, V2); |
| } |
| |
| // Compute the VSELECT mask. Note that VSELECT is really confusing in the |
| // mix of LLVM's code generator and the x86 backend. We tell the code |
| // generator that boolean values in the elements of an x86 vector register |
| // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' |
| // mapping a select to operand #1, and 'false' mapping to operand #2. The |
| // reality in x86 is that vector masks (pre-AVX-512) use only the high bit |
| // of the element (the remaining are ignored) and 0 in that high bit would |
| // mean operand #1 while 1 in the high bit would mean operand #2. So while |
| // the LLVM model for boolean values in vector elements gets the relevant |
| // bit set, it is set backwards and over constrained relative to x86's |
| // actual model. |
| SmallVector<SDValue, 32> VSELECTMask; |
| for (int i = 0, Size = Mask.size(); i < Size; ++i) |
| for (int j = 0; j < Scale; ++j) |
| VSELECTMask.push_back( |
| Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) |
| : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, |
| MVT::i8)); |
| |
| V1 = DAG.getBitcast(BlendVT, V1); |
| V2 = DAG.getBitcast(BlendVT, V2); |
| return DAG.getBitcast( |
| VT, |
| DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask), |
| V1, V2)); |
| } |
| case MVT::v16f32: |
| case MVT::v8f64: |
| case MVT::v8i64: |
| case MVT::v16i32: |
| case MVT::v32i16: |
| case MVT::v64i8: { |
| // Attempt to lower to a bitmask if we can. Only if not optimizing for size. |
| bool OptForSize = DAG.shouldOptForSize(); |
| if (!OptForSize) { |
| if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, |
| Subtarget, DAG)) |
| return Masked; |
| } |
| |
| // Otherwise load an immediate into a GPR, cast to k-register, and use a |
| // masked move. |
| MVT IntegerType = |
| MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); |
| SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); |
| return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); |
| } |
| default: |
| llvm_unreachable("Not a supported integer vector type!"); |
| } |
| } |
| |
| /// Try to lower as a blend of elements from two inputs followed by |
| /// a single-input permutation. |
| /// |
| /// This matches the pattern where we can blend elements from two inputs and |
| /// then reduce the shuffle to a single-input permutation. |
| static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, |
| SDValue V1, SDValue V2, |
| ArrayRef<int> Mask, |
| SelectionDAG &DAG, |
| bool ImmBlends = false) { |
| // We build up the blend mask while checking whether a blend is a viable way |
| // to reduce the shuffle. |
| SmallVector<int, 32> BlendMask(Mask.size(), -1); |
| SmallVector<int, 32> PermuteMask(Mask.size(), -1); |
| |
| for (int i = 0, Size = Mask.size(); i < Size; ++i) { |
| if (Mask[i] < 0) |
| continue; |
| |
| assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds."); |
| |
| if (BlendMask[Mask[i] % Size] < 0) |
| BlendMask[Mask[i] % Size] = Mask[i]; |
| else if (BlendMask[Mask[i] % Size] != Mask[i]) |
| return SDValue(); // Can't blend in the needed input! |
| |
| PermuteMask[i] = Mask[i] % Size; |
| } |
| |
| // If only immediate blends, then bail if the blend mask can't be widened to |
| // i16. |
| unsigned EltSize = VT.getScalarSizeInBits(); |
| if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask)) |
| return SDValue(); |
| |
| SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); |
| return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); |
| } |
| |
/// Try to lower as an unpack of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can unpack elements from two inputs and
/// then reduce the shuffle to a single-input (wider) permutation.
///
/// Returns SDValue() if the mask cannot be expressed as a single
/// UNPCKL/UNPCKH of the two inputs followed by an in-lane permute.
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             SelectionDAG &DAG) {
  int NumElts = Mask.size();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = NumElts / NumLanes;
  int NumHalfLaneElts = NumLaneElts / 2;

  bool MatchLo = true, MatchHi = true;
  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};

  // Determine UNPCKL/UNPCKH type and operand order.
  for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
    for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
      int M = Mask[Lane + Elt];
      if (M < 0)
        continue;

      // Unpack interleaves its two operands, so even result slots must come
      // from Ops[0] and odd slots from Ops[1]; bind each source accordingly
      // and fail if a slot would need both sources.
      SDValue &Op = Ops[Elt & 1];
      if (M < NumElts && (Op.isUndef() || Op == V1))
        Op = V1;
      else if (NumElts <= M && (Op.isUndef() || Op == V2))
        Op = V2;
      else
        return SDValue();

      // Every referenced element must live in the low halves of its lane
      // (both sources) for UNPCKL, or the high halves for UNPCKH.
      int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
      MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
                 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
      MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
                 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
      if (!MatchLo && !MatchHi)
        return SDValue();
    }
  }
  assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");

  // Now check that each pair of elts come from the same unpack pair
  // and set the permute mask based on each pair.
  // TODO - Investigate cases where we permute individual elements.
  SmallVector<int, 32> PermuteMask(NumElts, -1);
  for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
    for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
      int M0 = Mask[Lane + Elt + 0];
      int M1 = Mask[Lane + Elt + 1];
      // Elements M0/M1 end up adjacent in the unpack result only if they had
      // the same index within their respective half-lanes.
      if (0 <= M0 && 0 <= M1 &&
          (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
        return SDValue();
      // After unpacking, half-lane element k sits at slot 2*k (from Ops[0])
      // or 2*k+1 (from Ops[1]) within the lane.
      if (0 <= M0)
        PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
      if (0 <= M1)
        PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
    }
  }

  unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
  SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
  return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
}
| |
/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
/// permuting the elements of the result in place.
///
/// Returns SDValue() when the subtarget lacks the byte-align instruction for
/// this width, the mask crosses 128-bit lanes, or the per-lane index ranges
/// used from the two inputs overlap (so one rotate can't expose both).
static SDValue lowerShuffleAsByteRotateAndPermute(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  // PALIGNR requires SSSE3 (128-bit), AVX2 (256-bit) or BWI (512-bit).
  if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
      (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
      (VT.is512BitVector() && !Subtarget.hasBWI()))
    return SDValue();

  // We don't currently support lane crossing permutes.
  if (is128BitLaneCrossingShuffleMask(VT, Mask))
    return SDValue();

  int Scale = VT.getScalarSizeInBits() / 8; // Bytes per element.
  int NumLanes = VT.getSizeInBits() / 128;
  int NumElts = VT.getVectorNumElements();
  int NumEltsPerLane = NumElts / NumLanes;

  // Determine range of mask elts. RangeN tracks the min/max in-lane index
  // referenced from input N; BlendN records whether input N's elements are
  // all already in their final position (i.e. a plain blend would do).
  bool Blend1 = true;
  bool Blend2 = true;
  std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
  std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
  for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
    for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
      int M = Mask[Lane + Elt];
      if (M < 0)
        continue;
      if (M < NumElts) {
        Blend1 &= (M == (Lane + Elt));
        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
        M = M % NumEltsPerLane;
        Range1.first = std::min(Range1.first, M);
        Range1.second = std::max(Range1.second, M);
      } else {
        M -= NumElts;
        Blend2 &= (M == (Lane + Elt));
        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
        M = M % NumEltsPerLane;
        Range2.first = std::min(Range2.first, M);
        Range2.second = std::max(Range2.second, M);
      }
    }
  }

  // Bail if we don't need both elements.
  // TODO - it might be worth doing this for unary shuffles if the permute
  // can be widened.
  if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
      !(0 <= Range2.first && Range2.second < NumEltsPerLane))
    return SDValue();

  // For wider vectors, if one side is already a no-op blend, other strategies
  // are preferable to a rotate+permute.
  if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
    return SDValue();

  // Rotate the 2 ops so we can access both ranges, then permute the result.
  // Lo/Hi are the concatenation order fed to PALIGNR; RotAmt is in elements
  // and Ofs is the mask-index bias (0 for V1-first, NumElts for V2-first).
  auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
    SDValue Rotate = DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
                        DAG.getBitcast(ByteVT, Lo),
                        DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
    // Recompute each mask element's position within the rotated vector.
    SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
    for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
      for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
        int M = Mask[Lane + Elt];
        if (M < 0)
          continue;
        if (M < NumElts)
          PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
        else
          PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
      }
    }
    return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
  };

  // Check if the ranges are small enough to rotate from either direction.
  if (Range2.second < Range1.first)
    return RotateAndPermute(V1, V2, Range1.first, 0);
  if (Range1.second < Range2.first)
    return RotateAndPermute(V2, V1, Range2.first, NumElts);
  return SDValue();
}
| |
| /// Generic routine to decompose a shuffle and blend into independent |
| /// blends and permutes. |
| /// |
| /// This matches the extremely common pattern for handling combined |
| /// shuffle+blend operations on newer X86 ISAs where we have very fast blend |
| /// operations. It will try to pick the best arrangement of shuffles and |
| /// blends. |
| static SDValue lowerShuffleAsDecomposedShuffleBlend( |
| const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, |
| const X86Subtarget &Subtarget, SelectionDAG &DAG) { |
| // Shuffle the input elements into the desired positions in V1 and V2 and |
| // blend them together. |
| SmallVector<int, 32> V1Mask(Mask.size(), -1); |
| SmallVector<int, 32> V2Mask(Mask.size(), -1); |
| SmallVector<int, 32> BlendMask(Mask.size(), -1); |
| for (int i = 0, Size = Mask.size(); i < Size; ++i) |
| if (Mask[i] >= 0 && Mask[i] < Size) { |
| V1Mask[i] = Mask[i]; |
| BlendMask[i] = i; |
| } else if (Mask[i] >= Size) { |
| V2Mask[i] = Mask[i] - Size; |
| BlendMask[i] = i + Size; |
| } |
| |
| // Try to lower with the simpler initial blend/unpack/rotate strategies unless |
| // one of the input shuffles would be a no-op. We prefer to shuffle inputs as |
| // the shuffle may be able to fold with a load or other benefit. However, when |
| // we'll have to do 2x as many shuffles in order to achieve this, a 2-input |
| // pre-shuffle first is a better strategy. |
| if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) { |
| // Only prefer immediate blends to unpack/rotate. |
| if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, |
| DAG, true)) |
| return BlendPerm; |
| if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, |
| DAG)) |
| return UnpackPerm; |
| if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute( |
| DL, VT, V1, V2, Mask, Subtarget, DAG)) |
| return RotatePerm; |
| // Unpack/rotate failed - try again with variable blends. |
| if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, |
| DAG)) |
| return BlendPerm; |
| } |
| |
| V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); |
| V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); |
| return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); |
| } |
| |
/// Try to lower a vector shuffle as a rotation.
///
/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
///
/// On success, returns the positive rotation amount (in elements) and updates
/// \p V1 / \p V2 to be the low and high inputs of the rotation (which may be
/// the same value for a unary rotate); returns -1 if the mask is not a
/// single consistent rotation of the two inputs.
static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
  int NumElts = Mask.size();

  // We need to detect various ways of spelling a rotation:
  //   [11, 12, 13, 14, 15,  0,  1,  2]
  //   [-1, 12, 13, 14, -1, -1,  1, -1]
  //   [-1, -1, -1, -1, -1, -1,  1,  2]
  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
  //   [-1,  4,  5,  6, -1, -1,  9, -1]
  //   [-1,  4,  5,  6, -1, -1, -1, -1]
  int Rotation = 0;
  SDValue Lo, Hi;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;

    // Determine where a rotated vector would have started.
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      // The identity rotation isn't interesting, stop.
      return -1;

    // If we found the tail of a vector the rotation must be the missing
    // front. If we found the head of a vector, it must be how much of the
    // head.
    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

    // Every defined element must imply the same rotation amount.
    if (Rotation == 0)
      Rotation = CandidateRotation;
    else if (Rotation != CandidateRotation)
      // The rotations don't match, so we can't match this mask.
      return -1;

    // Compute which value this mask is pointing at.
    SDValue MaskV = M < NumElts ? V1 : V2;

    // Compute which of the two target values this index should be assigned
    // to. This reflects whether the high elements are remaining or the low
    // elements are remaining.
    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

    // Either set up this value if we've not encountered it before, or check
    // that it remains consistent.
    if (!TargetV)
      TargetV = MaskV;
    else if (TargetV != MaskV)
      // This may be a rotation, but it pulls from the inputs in some
      // unsupported interleaving.
      return -1;
  }

  // Check that we successfully analyzed the mask, and normalize the results.
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  // If only one side was referenced, use it for both (a unary rotate).
  if (!Lo)
    Lo = Hi;
  else if (!Hi)
    Hi = Lo;

  V1 = Lo;
  V2 = Hi;

  return Rotation;
}
| |
| /// Try to lower a vector shuffle as a byte rotation. |
| /// |
| /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary |
| /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use |
| /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will |
| /// try to generically lower a vector shuffle through such an pattern. It |
| /// does not check for the profitability of lowering either as PALIGNR or |
| /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. |
| /// This matches shuffle vectors that look like: |
| /// |
| /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] |
| /// |
| /// Essentially it concatenates V1 and V2, shifts right by some number of |
| /// elements, and takes the low elements as the result. Note that while this is |
| /// specified as a *right shift* because x86 is little-endian, it is a *left |
| /// rotate* of the vector lanes. |
| static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, |
| ArrayRef<int> Mask) { |
| // Don't accept any shuffles with zero elements. |
| if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) |
| return -1; |
| |
| // PALIGNR works on 128-bit lanes. |
| SmallVector<int, 16> RepeatedMask; |
| if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) |
| return -1; |
| |
| int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask); |
| if (Rotation <= 0) |
| return -1; |
| |
| // PALIGNR rotates bytes, so we need to scale the |
| // rotation based on how many bytes are in the vector lane. |
| int NumElts = RepeatedMask.size(); |
| int Scale = 16 / NumElts; |
| return Rotation * Scale; |
| } |
| |
| static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, |
| SDValue V2, ArrayRef<int> Mask, |
| const X86Subtarget &Subtarget, |
| SelectionDAG &DAG) { |
| assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); |
| |
| SDValue Lo = V1, Hi = V2; |
| int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask); |
| if (ByteRotation <= 0) |
| return SDValue(); |
| |
| // Cast the inputs to i8 vector of correct length to match PALIGNR or |
| // PSLLDQ/PSRLDQ. |
| MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); |
| Lo = DAG.getBitcast(ByteVT, Lo); |
| Hi = DAG.getBitcast(ByteVT, Hi); |
| |
| // SSSE3 targets can use the palignr instruction. |
| if (Subtarget.hasSSSE3()) { |
| assert((!VT.is512BitVector() || Subtarget.hasBWI()) && |
| "512-bit PALIGNR requires BWI instructions"); |
| return DAG.getBitcast( |
| VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, |
| DAG.getTargetConstant(ByteRotation, DL, MVT::i8))); |
| } |
| |
| assert(VT.is128BitVector() && |
| "Rotate-based lowering only supports 128-bit lowering!"); |
| assert(Mask.size() <= 16 && |
| "Can shuffle at most 16 bytes in a 128-bit vector!"); |
| assert(ByteVT == MVT::v16i8 && |
| "SSE2 rotate lowering only needed for v16i8!"); |
| |
| // Default SSE2 implementation |
| int LoByteShift = 16 - ByteRotation; |
| int HiByteShift = ByteRotation; |
| |
| SDValue LoShift = |
| DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, |
| DAG.getTargetConstant(LoByteShift, DL, MVT::i8)); |
| SDValue HiShift = |
| DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, |
| DAG.getTargetConstant(HiByteShift, DL, MVT::i8)); |
| return DAG.getBitcast(VT, |
| DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift)); |
| } |
| |
| /// Try to lower a vector shuffle as a dword/qword rotation. |
| /// |
| /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary |
| /// rotation of the concatenation of two vectors; This routine will |
| /// try to generically lower a vector shuffle through such an pattern. |
| /// |
| /// Essentially it concatenates V1 and V2, shifts right by some number of |
| /// elements, and takes the low elements as the result. Note that while this is |
| /// specified as a *right shift* because x86 is little-endian, it is a *left |
| /// rotate* of the vector lanes. |
| static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, |
| SDValue V2, ArrayRef<int> Mask, |
| const X86Subtarget &Subtarget, |
| SelectionDAG &DAG) { |
| assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && |
| "Only 32-bit and 64-bit elements are supported!"); |
| |
| // 128/256-bit vectors are only supported with VLX. |
| assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) |
| && "VLX required for 128/256-bit vectors"); |
| |
| SDValue Lo = V1, Hi = V2; |
| int Rotation = matchShuffleAsRotate(Lo, Hi, Mask); |
| if (Rotation <= 0) |
| return SDValue(); |
| |
| return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi, |
| DAG.getTargetConstant(Rotation, DL, MVT::i8)); |
| } |
| |
/// Try to lower a vector shuffle as a byte shift sequence.
///
/// Matches masks that keep one contiguous, sequential run of elements from a
/// single source and zero everything below/above it, and materializes them
/// with 2-3 whole-vector byte shifts (VSHLDQ/VSRLDQ) that shift in zeros.
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
  assert(VT.is128BitVector() && "Only 128-bit vectors supported");

  // We need a shuffle that has zeros at one/both ends and a sequential
  // shuffle from one source within.
  unsigned ZeroLo = Zeroable.countTrailingOnes();
  unsigned ZeroHi = Zeroable.countLeadingOnes();
  if (!ZeroLo && !ZeroHi)
    return SDValue();

  // The inner run must be sequential (undefs allowed).
  unsigned NumElts = Mask.size();
  unsigned Len = NumElts - (ZeroLo + ZeroHi);
  if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
    return SDValue();

  unsigned Scale = VT.getScalarSizeInBits() / 8; // Bytes per element.
  // The inner run must come entirely from one source.
  ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
  if (!isUndefOrInRange(StubMask, 0, NumElts) &&
      !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
    return SDValue();

  // Byte shifts work on v16i8 regardless of the element type.
  SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
  Res = DAG.getBitcast(MVT::v16i8, Res);

  // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
  // inner sequential set of elements, possibly offset:
  // 01234567 --> zzzzzz01 --> 1zzzzzzz
  // 01234567 --> 4567zzzz --> zzzzz456
  // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
  if (ZeroLo == 0) {
    // Zeros only at the top: shift left to clear the high end, then shift
    // right to drop the run into place.
    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
  } else if (ZeroHi == 0) {
    // Zeros only at the bottom: shift right to clear the low end, then shift
    // left to drop the run into place.
    unsigned Shift = Mask[ZeroLo] % NumElts;
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
  } else if (!Subtarget.hasSSSE3()) {
    // If we don't have PSHUFB then its worth avoiding an AND constant mask
    // by performing 3 byte shifts. Shuffle combining can kick in above that.
    // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
    Shift += Mask[ZeroLo] % NumElts;
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
  } else
    // Zeros at both ends with PSHUFB available: other lowerings win.
    return SDValue();

  return DAG.getBitcast(VT, Res);
}
| |
/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSHL : (little-endian) left bit shift.
/// [ zz, 0, zz,  2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [  1, zz,  3, zz]
/// [ -1, -1,  7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz,  0,  1,  2,  3,  4,  5,  6]
/// [ zz, zz, -1, -1,  2,  3,  4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1,  1]
/// PSRLDQ : (little-endian) right byte shift
/// [  5, 6,  7, zz, zz, zz, zz, zz]
/// [ -1, 5,  6,  7, zz, zz, zz, zz]
/// [  1, 2, -1, -1, -1, -1, zz, zz]
///
/// \p MaskOffset selects which source is matched (0 for V1, Mask.size() for
/// V2). On success, sets \p ShiftVT / \p Opcode and returns the positive
/// shift amount; returns -1 if no shift matches.
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                               unsigned ScalarSizeInBits, ArrayRef<int> Mask,
                               int MaskOffset, const APInt &Zeroable,
                               const X86Subtarget &Subtarget) {
  int Size = Mask.size();
  unsigned SizeInBits = Size * ScalarSizeInBits;

  // Check that the Shift elements shifted into each Scale-sized group (at the
  // low end for a left shift, high end for a right shift) are all zeroable.
  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;

    return true;
  };

  // Check the surviving elements of each group form the expected sequential
  // run from the selected source; on success set Opcode/ShiftVT and return
  // the shift amount (in bits, or bytes for VSHLDQ/VSRLDQ), else -1.
  auto MatchShift = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i != Size; i += Scale) {
      unsigned Pos = Left ? i + Shift : i;
      unsigned Low = Left ? i : i + Shift;
      unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }

    // Shifts of elements wider than 64 bits must use whole-byte shifts of
    // the full 128-bit lane (PSLLDQ/PSRLDQ).
    int ShiftEltBits = ScalarSizeInBits * Scale;
    bool ByteShift = ShiftEltBits > 64;
    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

    // Normalize the scale for byte shifts to still produce an i64 element
    // type.
    Scale = ByteShift ? Scale / 2 : Scale;

    // We need to round trip through the appropriate type for the shift.
    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
    return (int)ShiftAmt;
  };

  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
  // keep doubling the size of the integer elements up to that. We can
  // then shift the elements of the integer vector by whole multiples of
  // their width within the elements of the larger integer vector. Test each
  // multiple to see if we can find a match with the moved element indices
  // and that the shifted in elements are all zeroable.
  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
          if (0 < ShiftAmt)
            return ShiftAmt;
        }

  // no match
  return -1;
}
| |
| static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, |
| SDValue V2, ArrayRef<int> Mask, |
| const APInt &Zeroable, |
| const X86Subtarget &Subtarget, |
| SelectionDAG &DAG) { |
| int Size = Mask.size(); |
| assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); |
| |
| MVT ShiftVT; |
| SDValue V = V1; |
| unsigned Opcode; |
| |
| // Try to match shuffle against V1 shift. |
| int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), |
| Mask, 0, Zeroable, Subtarget); |
| |
| // If V1 failed, try to match shuffle against V2 shift. |
| if (ShiftAmt < 0) { |
| ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), |
| Mask, Size, Zeroable, Subtarget); |
| V = V2; |
| } |
| |
| if (ShiftAmt < 0) |
| return SDValue(); |
| |
| assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && |
| "Illegal integer vector type"); |
| V = DAG.getBitcast(ShiftVT, V); |
| V = DAG.getNode(Opcode, DL, ShiftVT, V, |
| DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); |
| return DAG.getBitcast(VT, V); |
| } |
| |
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
//
// On success, sets BitLen/BitIdx to the (6-bit) length and index operands of
// EXTRQ, measured in bits, and points V1 at the source being extracted from.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
                                ArrayRef<int> Mask, uint64_t &BitLen,
                                uint64_t &BitIdx, const APInt &Zeroable) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

  // Upper half must be undefined.
  if (!isUndefUpperHalf(Mask))
    return false;

  // Determine the extraction length from the part of the
  // lower half that isn't zeroable.
  int Len = HalfSize;
  for (; Len > 0; --Len)
    if (!Zeroable[Len - 1])
      break;
  assert(Len > 0 && "Zeroable shuffle mask");

  // Attempt to match first Len sequential elements from the lower half.
  SDValue Src;
  int Idx = -1;
  for (int i = 0; i != Len; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    // Fold the mask index down to its source and in-source position.
    SDValue &V = (M < Size ? V1 : V2);
    M = M % Size;

    // The extracted elements must start at a valid index and all mask
    // elements must be in the lower half.
    if (i > M || M >= HalfSize)
      return false;

    // All defined elements must come from one source at a single offset.
    if (Idx < 0 || (Src == V && Idx == (M - i))) {
      Src = V;
      Idx = M - i;
      continue;
    }
    return false;
  }

  // Fail if every element in the run was undef - there's nothing to anchor
  // the extraction.
  if (!Src || Idx < 0)
    return false;

  // Convert the element length/index into EXTRQ's bit-granular operands.
  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
  V1 = Src;
  return true;
}
| |
| // INSERTQ: Extract lowest Len elements from lower half of second source and |
| // insert over first source, starting at Idx. |
| // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } |
| static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, |
| ArrayRef<int> Mask, uint64_t &BitLen, |
| uint64_t &BitIdx) { |
| int Size = Mask.size(); |
| int HalfSize = Size / 2; |
| assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); |
| |
| // Upper half must be undefined. |
| if (!isUndefUpperHalf(Mask)) |
| return false; |
| |
| for (int Idx = 0; Idx != HalfSize; ++Idx) { |
| SDValue Base; |
| |
| // Attempt to match first source from mask before insertion point. |
| if (isUndefInRange(Mask, 0, Idx)) { |
| /* EMPTY */ |
| } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { |
| Base = V1; |
| } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { |
| Base = V2; |
| } else { |
| continue; |
| } |
| |
| // Extend the extraction length looking to match both the insertion of |
| // the second source and the remaining elements of the first. |
| for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { |
| SDValue Insert; |
| int Len = Hi - Idx; |
| |
| // Match insertion. |
| if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { |
| Insert = V1; |
| } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { |
| Insert = V2; |
| } else { |
| continue; |
| } |
| |
| // Match the remaining elements of the lower half. |
| if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { |
| /* EMPTY */ |
| } else if ((!Base || (Base == V1)) && |
| isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { |
| Base = V1; |
| } else if ((!Base || (Base == V2)) && |
| isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, |
| Size + Hi)) { |
| Base = V2; |
| } else { |
| continue |