blob: 215898db8fddd787e4f0c1a2a86f7686a9c62250 [file] [log] [blame]
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;
// Category string for this translation unit.
// NOTE(review): presumably consumed by the STATISTIC macro below and by
// debug-output filtering -- confirm against llvm/ADT/Statistic.h and
// llvm/Support/Debug.h.
#define DEBUG_TYPE "x86-isel"
// Counter named NumTailCalls, described as "Number of tail calls".
// NOTE(review): it is not updated anywhere in the visible part of this file;
// the increment site is expected to live further down -- verify.
STATISTIC(NumTailCalls, "Number of tail calls");
// Hidden command-line knob "x86-experimental-pref-loop-alignment": the
// preferred loop alignment for experiments, given as log2 of the byte
// alignment. Defaults to 4 (i.e. 2^4 = 16 bytes).
// NOTE(review): the help text has no space between "(as log2 bytes)" and
// "(the last"; it is reproduced unchanged here.
static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::Hidden, cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(as log2 bytes)"
             "(the last x86-experimental-pref-loop-alignment bits of the "
             "loop header PC will be 0)."));
// Added in 10.0.
// Hidden boolean knob "x86-enable-old-knl-abi" (off by default). Per its help
// text it selects passing v32i16 and v64i8 in two YMM registers rather than
// one ZMM register on targets that have AVX512F but not AVX512BW.
static cl::opt<bool> EnableOldKNLABI(
    "x86-enable-old-knl-abi", cl::Hidden, cl::init(false),
    cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers "
             "instead of one ZMM register on AVX512F, but not AVX512BW "
             "targets."));
// Hidden boolean knob "mul-constant-optimization" (on by default). Per its
// help text it controls replacing a multiply-by-constant with cheaper
// instructions such as shifts and LEA.
static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::Hidden, cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective "
             "instructions like SHIFT, LEA, etc."));
// Hidden boolean knob "x86-experimental-unordered-atomic-isel" (off by
// default). Per its help text it switches unordered atomic loads/stores from
// AtomicSDNode to plain LoadSDNode/StoreSDNode.
static cl::opt<bool> ExperimentalUnorderedISEL(
    "x86-experimental-unordered-atomic-isel", cl::Hidden, cl::init(false),
    cl::desc("Use LoadSDNode and StoreSDNode instead of AtomicSDNode for "
             "unordered atomic loads and stores respectively."));
/// Report that the user asked for something this target cannot do (for
/// example, returning a double without SSE2 enabled on x86_64).
///
/// Unlike report_fatal_error this is not fatal: the diagnostic is handed to
/// the LLVMContext and control returns to the caller, which should attempt to
/// recover without crashing.
///
/// \param DAG  DAG being built; supplies the current function and context.
/// \param dl   Location whose debug location is attached to the diagnostic.
/// \param Msg  Message text describing the unsupported construct.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  const Function &Fn = DAG.getMachineFunction().getFunction();
  // NOTE(review): the diagnostic is deliberately built as a temporary inside
  // the diagnose() call so that every temporary created from Msg lives for the
  // whole call; DiagnosticInfoUnsupported may keep the message by reference --
  // confirm before hoisting it into a named local.
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(Fn, Msg, dl.getDebugLoc()));
}
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
X86ScalarSSEf64 = Subtarget.hasSSE2();
X86ScalarSSEf32 = Subtarget.hasSSE1();
MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
// Set up the TargetLowering object.
// X86 is weird. It always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// For 64-bit, since we have so many registers, use the ILP scheduler.
// For 32-bit, use the register pressure specific scheduling.
// For Atom, always use ILP scheduling.
if (Subtarget.isAtom())
setSchedulingPreference(Sched::ILP);
else if (Subtarget.is64Bit())
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
// Bypass expensive divides and use cheaper ones.
if (TM.getOptLevel() >= CodeGenOpt::Default) {
if (Subtarget.hasSlowDivide32())
addBypassSlowDiv(32, 8);
if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
addBypassSlowDiv(64, 32);
}
if (Subtarget.isTargetWindowsMSVC() ||
Subtarget.isTargetWindowsItanium()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
setLibcallName(RTLIB::SREM_I64, "_allrem");
setLibcallName(RTLIB::UREM_I64, "_aullrem");
setLibcallName(RTLIB::MUL_I64, "_allmul");
setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
}
if (Subtarget.getTargetTriple().isOSMSVCRT()) {
// MSVCRT doesn't have powi; fall back to pow
setLibcallName(RTLIB::POWI_F32, nullptr);
setLibcallName(RTLIB::POWI_F64, nullptr);
}
// If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size
// to 32 bits so the AtomicExpandPass will expand it and we don't need
// cmpxchg8b.
// FIXME: Should we be limiting the atomic size on other configs? Default is
// 1024.
if (!Subtarget.hasCmpxchg8b())
setMaxAtomicSizeInBitsSupported(32);
// Set up the register classes.
addRegisterClass(MVT::i8, &X86::GR8RegClass);
addRegisterClass(MVT::i16, &X86::GR16RegClass);
addRegisterClass(MVT::i32, &X86::GR32RegClass);
if (Subtarget.is64Bit())
addRegisterClass(MVT::i64, &X86::GR64RegClass);
for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
// We don't accept any truncstore of integer registers.
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
setTruncStoreAction(MVT::i32, MVT::i16, Expand);
setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
setTruncStoreAction(MVT::i16, MVT::i8, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// SETOEQ and SETUNE require checking two conditions.
setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
// Integer absolute.
if (Subtarget.hasCMov()) {
setOperationAction(ISD::ABS , MVT::i16 , Custom);
setOperationAction(ISD::ABS , MVT::i32 , Custom);
}
setOperationAction(ISD::ABS , MVT::i64 , Custom);
// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
setOperationAction(ShiftOp , MVT::i16 , Custom);
setOperationAction(ShiftOp , MVT::i32 , Custom);
if (Subtarget.is64Bit())
setOperationAction(ShiftOp , MVT::i64 , Custom);
}
if (!Subtarget.useSoftFloat()) {
// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
// operation.
setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
// We have an algorithm for SSE2, and we turn this into a 64-bit
// FILD or VCVTUSI2SS/SD for other targets.
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
// Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
// this operation.
setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
// SSE has no i16 to fp conversion, only i32. We promote in the handler
// to allow f80 to use i16 and f64 to use i16 with sse1 only
setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
// f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
// Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
// this operation.
setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
// FIXME: This doesn't generate invalid exception when it should. PR44019.
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
// Handle FP_TO_UINT by promoting the destination to a larger signed
// conversion.
setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
// FIXME: This doesn't generate invalid exception when it should. PR44019.
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
// FIXME: This doesn't generate invalid exception when it should. PR44019.
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
}
// Handle address space casts between mixed sized pointers.
setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
if (!X86ScalarSSEf64) {
setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
// Without SSE, i64->f64 goes through memory.
setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
}
} else if (!Subtarget.is64Bit())
setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
// Scalar integer divide and remainder are lowered to use operations that
// produce two results, to match the available instructions. This exposes
// the two-result form to trivial CSE, which is able to combine x/y and x%y
// into a single instruction.
//
// Scalar integer multiply-high is also lowered to use two-result
// operations, to match the available instructions. However, plain multiply
// (low) operations are left as Legal, as there are single-result
// instructions for this in x86. Using the two-result multiply instructions
// when both high and low results are needed must be arranged by dagcombine.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
}
setOperationAction(ISD::BR_JT , MVT::Other, Expand);
setOperationAction(ISD::BRCOND , MVT::Other, Custom);
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::BR_CC, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
}
if (Subtarget.is64Bit())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
setOperationAction(ISD::FREM , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f64 , Expand);
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FREM , MVT::f128 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
// Promote the i8 variants and force them on up to i32 which has a shorter
// encoding.
setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
if (!Subtarget.hasBMI()) {
setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
}
}
if (Subtarget.hasLZCNT()) {
// When promoting the i8 variants, force them to i32 for a shorter
// encoding.
setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
} else {
setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
}
}
// Special handling for half-precision floating point conversions.
// If we don't have F16C support, then lower half float conversions
// into library calls.
if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
}
// There's never any support for operations beyond MVT::f32.
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
setTruncStoreAction(MVT::f128, MVT::f16, Expand);
if (Subtarget.hasPOPCNT()) {
setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
} else {
setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
if (Subtarget.is64Bit())
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
else
setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
if (!Subtarget.hasMOVBE())
setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
// X86 wants to expand cmov itself.
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
}
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
}
// Custom action for SELECT MMX and expand action for SELECT_CC MMX
setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
// NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
// LLVM/Clang supports zero-cost DWARF and SEH exception handling.
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
// Darwin ABI issue.
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::ConstantPool , VT, Custom);
setOperationAction(ISD::JumpTable , VT, Custom);
setOperationAction(ISD::GlobalAddress , VT, Custom);
setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
setOperationAction(ISD::ExternalSymbol , VT, Custom);
setOperationAction(ISD::BlockAddress , VT, Custom);
}
// 64-bit shl, sra, srl (iff 32-bit x86)
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SHL_PARTS, VT, Custom);
setOperationAction(ISD::SRA_PARTS, VT, Custom);
setOperationAction(ISD::SRL_PARTS, VT, Custom);
}
if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
// Expand certain atomics
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}
if (!Subtarget.is64Bit())
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
if (Subtarget.hasCmpxchg16b()) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
}
// FIXME - use subtarget debug flags
if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
}
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
setOperationAction(ISD::VAEND , MVT::Other, Expand);
bool Is64Bit = Subtarget.is64Bit();
setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
// f32 and f64 use SSE.
// Set up the FP register classes.
addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
: &X86::FR32RegClass);
addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
: &X86::FR64RegClass);
// Disable f32->f64 extload as we can only generate this in one instruction
// under optsize. So it's easier to pattern match (fpext (load)) for that
// case instead of needing to emit 2 instructions for extload in the
// non-optsize case.
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
for (auto VT : { MVT::f32, MVT::f64 }) {
// Use ANDPD to simulate FABS.
setOperationAction(ISD::FABS, VT, Custom);
// Use XORP to simulate FNEG.
setOperationAction(ISD::FNEG, VT, Custom);
// Use ANDPD and ORPD to simulate FCOPYSIGN.
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
// These might be better off as horizontal vector ops.
setOperationAction(ISD::FADD, VT, Custom);
setOperationAction(ISD::FSUB, VT, Custom);
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , VT, Expand);
setOperationAction(ISD::FCOS , VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
}
// Lower this to MOVMSK plus an AND.
setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
} else if (!useSoftFloat() && X86ScalarSSEf32 && (UseX87 || Is64Bit)) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
addRegisterClass(MVT::f32, &X86::FR32RegClass);
if (UseX87)
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
// Use ANDPS to simulate FABS.
setOperationAction(ISD::FABS , MVT::f32, Custom);
// Use XORP to simulate FNEG.
setOperationAction(ISD::FNEG , MVT::f32, Custom);
if (UseX87)
setOperationAction(ISD::UNDEF, MVT::f64, Expand);
// Use ANDPS and ORPS to simulate FCOPYSIGN.
if (UseX87)
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
if (UseX87) {
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN, MVT::f64, Expand);
setOperationAction(ISD::FCOS, MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
}
} else if (UseX87) {
// f32 and f64 in x87.
// Set up the FP register classes.
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
addRegisterClass(MVT::f32, &X86::RFP32RegClass);
for (auto VT : { MVT::f32, MVT::f64 }) {
setOperationAction(ISD::UNDEF, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN , VT, Expand);
setOperationAction(ISD::FCOS , VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
}
}
// Expand FP32 immediates into loads from the stack, save special cases.
if (isTypeLegal(MVT::f32)) {
if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
addLegalFPImmediate(APFloat(+0.0f)); // FLD0
addLegalFPImmediate(APFloat(+1.0f)); // FLD1
addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0f)); // xorps
}
// Expand FP64 immediates into loads from the stack, save special cases.
if (isTypeLegal(MVT::f64)) {
if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
addLegalFPImmediate(APFloat(+0.0)); // FLD0
addLegalFPImmediate(APFloat(+1.0)); // FLD1
addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
}
// Handle constrained floating-point operations of scalar.
setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
// We don't support FMA.
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FMA, MVT::f32, Expand);
// f80 always uses X87.
if (UseX87) {
addRegisterClass(MVT::f80, &X86::RFP80RegClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
{
APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
addLegalFPImmediate(TmpFlt); // FLD0
TmpFlt.changeSign();
addLegalFPImmediate(TmpFlt); // FLD0/FCHS
bool ignored;
APFloat TmpFlt2(+1.0);
TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
&ignored);
addLegalFPImmediate(TmpFlt2); // FLD1
TmpFlt2.changeSign();
addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
}
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN , MVT::f80, Expand);
setOperationAction(ISD::FCOS , MVT::f80, Expand);
setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
setOperationAction(ISD::FCEIL, MVT::f80, Expand);
setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
setOperationAction(ISD::FRINT, MVT::f80, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
setOperationAction(ISD::FMA, MVT::f80, Expand);
setOperationAction(ISD::LROUND, MVT::f80, Expand);
setOperationAction(ISD::LLROUND, MVT::f80, Expand);
setOperationAction(ISD::LRINT, MVT::f80, Expand);
setOperationAction(ISD::LLRINT, MVT::f80, Expand);
// Handle constrained floating-point operations of scalar.
setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
// FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
// as Custom.
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
}
// f128 uses xmm registers, but most operations require libcalls.
if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
setOperationAction(ISD::FADD, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
setOperationAction(ISD::FSUB, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
setOperationAction(ISD::FDIV, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
setOperationAction(ISD::FMUL, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
setOperationAction(ISD::FMA, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
setOperationAction(ISD::FABS, MVT::f128, Custom);
setOperationAction(ISD::FNEG, MVT::f128, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
setOperationAction(ISD::FSIN, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
setOperationAction(ISD::FCOS, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
// No STRICT_FSINCOS
setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
// We need to custom handle any FP_ROUND with an f128 input, but
// LegalizeDAG uses the result type to know when to run a custom handler.
// So we have to list all legal floating point result types here.
if (isTypeLegal(MVT::f32)) {
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
}
if (isTypeLegal(MVT::f64)) {
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
}
if (isTypeLegal(MVT::f80)) {
setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
}
setOperationAction(ISD::SETCC, MVT::f128, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
setTruncStoreAction(MVT::f128, MVT::f80, Expand);
}
// Always use a library call for pow.
setOperationAction(ISD::FPOW , MVT::f32 , Expand);
setOperationAction(ISD::FPOW , MVT::f64 , Expand);
setOperationAction(ISD::FPOW , MVT::f80 , Expand);
setOperationAction(ISD::FPOW , MVT::f128 , Expand);
setOperationAction(ISD::FLOG, MVT::f80, Expand);
setOperationAction(ISD::FLOG2, MVT::f80, Expand);
setOperationAction(ISD::FLOG10, MVT::f80, Expand);
setOperationAction(ISD::FEXP, MVT::f80, Expand);
setOperationAction(ISD::FEXP2, MVT::f80, Expand);
setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
// Some FP actions are always expanded for vector types.
for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
}
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
setOperationAction(ISD::FMA, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FNEARBYINT, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::SETCC, VT, Expand);
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
setOperationAction(ISD::SINT_TO_FP, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
setOperationAction(ISD::TRUNCATE, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
setOperationAction(ISD::ANY_EXTEND, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(InnerVT, VT, Expand);
setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
// N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
// types, we have to deal with them whether we ask for Expansion or not.
// Setting Expand causes its own optimisation problems though, so leave
// them legal.
if (VT.getVectorElementType() == MVT::i1)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
// EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
// split/scalarized right now.
if (VT.getVectorElementType() == MVT::f16)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
}
}
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
// with -msoft-float, disable use of MMX as well.
if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
// No operations on x86mmx supported, everything uses intrinsics.
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
setOperationAction(ISD::FABS, MVT::v4f32, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
// registers cannot be used even for integer operations.
addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
setOperationAction(ISD::SDIV, VT, Custom);
setOperationAction(ISD::SREM, VT, Custom);
setOperationAction(ISD::UDIV, VT, Custom);
setOperationAction(ISD::UREM, VT, Custom);
}
setOperationAction(ISD::MUL, MVT::v2i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i8, Custom);
setOperationAction(ISD::MUL, MVT::v8i8, Custom);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
}
setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ABS, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
if (VT == MVT::v2i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
// Custom lower selects for v2f64 and all 128-bit integer vector types.
setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
// Custom legalize these to avoid over promotion or custom promotion.
for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
}
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
// Fast v2f32 SINT_TO_FP/UINT_TO_FP( v2i32 ) custom conversion.
setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
// We want to legalize this to an f64 load rather than an i64 load on
// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
// store.
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i16, Custom);
setOperationAction(ISD::STORE, MVT::v8i8, Custom);
setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
// In the customized shift lowering, the legal v4i32/v2i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
}
setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
// With AVX512, expanding (and promoting the shifts) is better.
if (!Subtarget.hasAVX512())
setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
setOperationAction(ISD::ABS, MVT::v16i8, Legal);
setOperationAction(ISD::ABS, MVT::v8i16, Legal);
setOperationAction(ISD::ABS, MVT::v4i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
// These might be better off as horizontal vector ops.
setOperationAction(ISD::ADD, MVT::i16, Custom);
setOperationAction(ISD::ADD, MVT::i32, Custom);
setOperationAction(ISD::SUB, MVT::i16, Custom);
setOperationAction(ISD::SUB, MVT::i32, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
setOperationAction(ISD::FCEIL, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
setOperationAction(ISD::FRINT, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
}
setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
// We directly match byte blends in the backend as they match the VSELECT
// condition form.
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
// SSE41 brings specific instructions for doing vector sign extend even in
// cases where we don't have SRA.
for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
}
// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
}
// i8 vectors are custom because the source register and source
// memory operand types are not the same width.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
// We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
// do the pre and post work in the vector domain.
setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
// We need to mark SINT_TO_FP as Custom even though we want to expand it
// so that DAG combine doesn't try to turn it into uint_to_fp.
setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
setOperationAction(ISD::ROTL, VT, Custom);
// XOP can efficiently perform BITREVERSE with VPPERM.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
setOperationAction(ISD::BITREVERSE, VT, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
setOperationAction(ISD::BITREVERSE, VT, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
bool HasInt256 = Subtarget.hasInt256();
addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
// In the customized shift lowering, the legal v8i32/v4i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
}
// These types need custom splitting if their input is a 128-bit vector.
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
// With BWI, expanding (and promoting the shifts) is better.
if (!Subtarget.hasBWI())
setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
if (Subtarget.hasAnyFMA()) {
for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
MVT::v2f64, MVT::v4f64 }) {
setOperationAction(ISD::FMA, VT, Legal);
setOperationAction(ISD::STRICT_FMA, VT, Legal);
}
}
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
}
setOperationAction(ISD::MUL, MVT::v4i64, Custom);
setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v32i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
setOperationAction(ISD::ABS, MVT::v4i64, Custom);
setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
}
for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
}
if (HasInt256) {
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
}
}
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Legal);
}
// Extract subvector is special because the value type
// (result) is 128-bit but the source is 256-bit wide.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
}
// Custom lower several nodes for 256-bit types.
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
}
if (HasInt256) {
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
// Custom legalize 2x32 to get a little better code.
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::MGATHER, VT, Custom);
}
}
// This block controls legalization of the mask vector sizes that are
// available with AVX512. 512-bit vectors are in a separate block controlled
// by useAVX512Regs.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
// There is no byte sized k-register load or store without AVX512DQ.
if (!Subtarget.hasDQI()) {
setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
setOperationAction(ISD::STORE, MVT::v1i1, Custom);
setOperationAction(ISD::STORE, MVT::v2i1, Custom);
setOperationAction(ISD::STORE, MVT::v4i1, Custom);
setOperationAction(ISD::STORE, MVT::v8i1, Custom);
}
// Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::UADDSAT, VT, Custom);
setOperationAction(ISD::SADDSAT, VT, Custom);
setOperationAction(ISD::USUBSAT, VT, Custom);
setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
}
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
}
// This block controls legalization for 512-bit operations with 32/64 bit
// elements. 512-bits can be disabled based on prefer-vector-width and
// required-vector-width function attributes.
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
}
for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FMA, VT, Legal);
setOperationAction(ISD::STRICT_FMA, VT, Legal);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
}
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
// With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
// to 512-bit rather than use the AVX2 instructions so that we can use
// k-masks.
if (!Subtarget.hasVLX()) {
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Custom);
}
}
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
// Need to custom widen this if we don't have AVX512BW.
setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
}
// Without BWI we need to use custom lowering to handle MVT::v64i8 input.
for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
}
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
setOperationAction(ISD::MUL, MVT::v8i64, Custom);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
if (Subtarget.hasDQI()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
if (Subtarget.hasCDI()) {
// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
setOperationAction(ISD::CTLZ, VT, Legal);
}
} // Subtarget.hasCDI()
if (Subtarget.hasVPOPCNTDQ()) {
for (auto VT : { MVT::v16i32, MVT::v8i64 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
// Extract subvector is special because the value type
// (result) is 256-bit but the source is 512-bit wide.
// 128-bit was made Legal under AVX1.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
MVT::v8f32, MVT::v4f64 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
if (!Subtarget.hasBWI()) {
// Need to custom split v32i16/v64i8 bitcasts.
setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
// Better to split these into two 256-bit ops.
setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom);
}
if (Subtarget.hasVBMI2()) {
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}
}
}// has AVX-512
// This block controls legalization for operations that don't have
// pre-AVX512 equivalents. Without VLX we use 512-bit operations for
// narrower widths.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
}
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
}
// Custom legalize 2x32 to get a little better code.
setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::MSCATTER, VT, Custom);
if (Subtarget.hasDQI()) {
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SINT_TO_FP, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::UINT_TO_FP, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_SINT, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_UINT, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MUL, VT, Legal);
}
}
if (Subtarget.hasCDI()) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::CTLZ, VT, Legal);
}
} // Subtarget.hasCDI()
if (Subtarget.hasVPOPCNTDQ()) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
}
// This block controls legalization of v32i1/v64i1 which are available with
// AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
// useBWIRegs.
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::UADDSAT, VT, Custom);
setOperationAction(ISD::SADDSAT, VT, Custom);
setOperationAction(ISD::USUBSAT, VT, Custom);
setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
}
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
for (auto VT : { MVT::v16i1, MVT::v32i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
// Extends from v32i1 masks to 256-bit vectors.
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
}
// This block controls legalization for v32i16 and v64i8. 512-bits can be
// disabled based on prefer-vector-width and required-vector-width function
// attributes.
if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
// Extends from v64i1 masks to 512-bit vectors.
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::MUL, MVT::v32i16, Legal);
setOperationAction(ISD::MUL, MVT::v64i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
}
if (Subtarget.hasBITALG()) {
for (auto VT : { MVT::v64i8, MVT::v32i16 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
if (Subtarget.hasVBMI2()) {
setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
}
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
if (Subtarget.hasBITALG()) {
for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
if (Subtarget.hasDQI()) {
// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
// v2f32 UINT_TO_FP is already custom under SSE2.
assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
"Unexpected operation action!");
// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
}
if (Subtarget.hasBWI()) {
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
}
if (Subtarget.hasVBMI2()) {
// TODO: Make these legal even without VLX?
for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}
}
setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
}
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
if (!Subtarget.is64Bit()) {
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
}
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
//
// FIXME: We really should do custom legalization for addition and
// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
// than generic legalization for 64-bit multiplication-with-overflow, though.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
// Add/Sub/Mul with overflow operations are custom lowered.
setOperationAction(ISD::SADDO, VT, Custom);
setOperationAction(ISD::UADDO, VT, Custom);
setOperationAction(ISD::SSUBO, VT, Custom);
setOperationAction(ISD::USUBO, VT, Custom);
setOperationAction(ISD::SMULO, VT, Custom);
setOperationAction(ISD::UMULO, VT, Custom);
// Support carry in as value rather than glue.
setOperationAction(ISD::ADDCARRY, VT, Custom);
setOperationAction(ISD::SUBCARRY, VT, Custom);
setOperationAction(ISD::SETCCCARRY, VT, Custom);
}
if (!Subtarget.is64Bit()) {
// These libcalls are not available in 32-bit.
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
setLibcallName(RTLIB::MUL_I128, nullptr);
}
// Combine sin / cos into _sincos_stret if it is available.
if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
}
if (Subtarget.isTargetWin64()) {
setOperationAction(ISD::SDIV, MVT::i128, Custom);
setOperationAction(ISD::UDIV, MVT::i128, Custom);
setOperationAction(ISD::SREM, MVT::i128, Custom);
setOperationAction(ISD::UREM, MVT::i128, Custom);
setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
}
// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
// is. We should promote the value to 64-bits to solve this.
// This is what the CRT headers do - `fmodf` is an inline header
// function casting to f64 and calling `fmod`.
if (Subtarget.is32Bit() &&
(Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
for (ISD::NodeType Op :
{ISD::FCEIL, ISD::STRICT_FCEIL,
ISD::FCOS, ISD::STRICT_FCOS,
ISD::FEXP, ISD::STRICT_FEXP,
ISD::FFLOOR, ISD::STRICT_FFLOOR,
ISD::FREM, ISD::STRICT_FREM,
ISD::FLOG, ISD::STRICT_FLOG,
ISD::FLOG10, ISD::STRICT_FLOG10,
ISD::FPOW, ISD::STRICT_FPOW,
ISD::FSIN, ISD::STRICT_FSIN})
if (isOperationExpand(Op, MVT::f32))
setOperationAction(Op, MVT::f32, Promote);
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MLOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::MSTORE);
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::MSCATTER);
setTargetDAGCombine(ISD::MGATHER);
computeRegisterProperties(Subtarget.getRegisterInfo());
MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 4;
// TODO: These control memcmp expansion in CGP and could be raised higher, but
// that needs to be benchmarked and balanced with the potential use of vector
// load/store types (PR33329, PR33914).
MaxLoadsPerMemcmp = 2;
MaxLoadsPerMemcmpOptSize = 2;
// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
// An out-of-order CPU can speculatively execute past a predictable branch,
// but a conditional move could be stalled by an expensive earlier operation.
PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
EnableExtLdPromotion = true;
setPrefFunctionAlignment(Align(16));
verifyIntrinsicTables();
// Default to having -disable-strictnode-mutation on
IsStrictFPEnabled = true;
}
/// Returns true only when the subtarget is a MachO target and is 64-bit.
/// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
  if (!Subtarget.isTargetMachO())
    return false;
  return Subtarget.is64Bit();
}
/// Returns true when the target triple reports an MSVCRT-based OS and the
/// subtarget is not a MachO target.
bool X86TargetLowering::useStackGuardXorFP() const {
  // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
  if (!Subtarget.getTargetTriple().isOSMSVCRT())
    return false;
  return !Subtarget.isTargetMachO();
}
/// Builds a machine node that applies the XOR*_FP opcode to \p Val and returns
/// its first result. The node's value type is the pointer type for the DAG's
/// data layout.
SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
                                               const SDLoc &DL) const {
  // Select the opcode variant matching the subtarget's bitness.
  unsigned Opcode = X86::XOR32_FP;
  if (Subtarget.is64Bit())
    Opcode = X86::XOR64_FP;

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  return SDValue(DAG.getMachineNode(Opcode, DL, PtrVT, Val), 0);
}
/// Chooses the type-legalization action for the vector type \p VT:
///  - v32i1 is split when AVX512 is available without BWI;
///  - vectors with more than one element and a non-i1 element type are
///    widened;
///  - everything else defers to the TargetLoweringBase default.
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(MVT VT) const {
  if (VT == MVT::v32i1) {
    if (Subtarget.hasAVX512() && !Subtarget.hasBWI())
      return TypeSplitVector;
  }

  // Single-element vectors and i1 vectors take the generic path.
  if (VT.getVectorNumElements() == 1 || VT.getVectorElementType() == MVT::i1)
    return TargetLoweringBase::getPreferredVectorAction(VT);

  return TypeWidenVector;
}
/// Returns the register type used to pass or return a value of type \p VT
/// under calling convention \p CC. A handful of mask and 512-bit integer
/// vector types are special-cased; everything else defers to TargetLowering.
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                     CallingConv::ID CC,
                                                     EVT VT) const {
  const bool HasAVX512 = Subtarget.hasAVX512();
  const bool HasBWI = Subtarget.hasBWI();

  // v32i1 vectors should be promoted to v32i8 to match avx2.
  if (VT == MVT::v32i1 && HasAVX512 && !HasBWI)
    return MVT::v32i8;

  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  if (HasAVX512 && VT.isVector() && VT.getVectorElementType() == MVT::i1) {
    const unsigned NumElts = VT.getVectorNumElements();
    // Largest element count that is not considered "wide": 64 with BWI,
    // 16 without.
    const unsigned MaxMaskElts = HasBWI ? 64 : 16;
    if (!isPowerOf2_32(NumElts) || NumElts > MaxMaskElts)
      return MVT::i8;
  }

  // Split v64i1 vectors if we don't have v64i8 available.
  if (VT == MVT::v64i1 && HasBWI && !Subtarget.useAVX512Regs() &&
      CC != CallingConv::X86_RegCall)
    return MVT::v32i1;

  // FIXME: Should we just make these types legal and custom split operations?
  const bool IsV32i16OrV64i8 = VT == MVT::v32i16 || VT == MVT::v64i8;
  if (IsV32i16OrV64i8 && !EnableOldKNLABI && Subtarget.useAVX512Regs() &&
      !HasBWI)
    return MVT::v16i32;

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
/// Returns how many registers are used to pass or return a value of type
/// \p VT under calling convention \p CC. The special cases mirror the ones in
/// getRegisterTypeForCallingConv; everything else defers to TargetLowering.
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                          CallingConv::ID CC,
                                                          EVT VT) const {
  const bool HasAVX512 = Subtarget.hasAVX512();
  const bool HasBWI = Subtarget.hasBWI();

  // v32i1 vectors should be promoted to v32i8 to match avx2.
  if (VT == MVT::v32i1 && HasAVX512 && !HasBWI)
    return 1;

  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior:
  // one register per element.
  if (HasAVX512 && VT.isVector() && VT.getVectorElementType() == MVT::i1) {
    const unsigned NumElts = VT.getVectorNumElements();
    const unsigned MaxMaskElts = HasBWI ? 64 : 16;
    if (!isPowerOf2_32(NumElts) || NumElts > MaxMaskElts)
      return NumElts;
  }

  // Split v64i1 vectors if we don't have v64i8 available.
  if (VT == MVT::v64i1 && HasBWI && !Subtarget.useAVX512Regs() &&
      CC != CallingConv::X86_RegCall)
    return 2;

  // FIXME: Should we just make these types legal and custom split operations?
  const bool IsV32i16OrV64i8 = VT == MVT::v32i16 || VT == MVT::v64i8;
  if (IsV32i16OrV64i8 && !EnableOldKNLABI && Subtarget.useAVX512Regs() &&
      !HasBWI)
    return 1;

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
/// Describes how the vector type \p VT is broken down for calling convention
/// \p CC. Fills in \p IntermediateVT, \p NumIntermediates and \p RegisterVT
/// and returns a register count. Two vXi1 cases are handled here; everything
/// else defers to TargetLowering.
unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  const bool HasBWI = Subtarget.hasBWI();

  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      Subtarget.hasAVX512()) {
    const unsigned NumElts = VT.getVectorNumElements();
    const unsigned MaxMaskElts = HasBWI ? 64 : 16;
    if (!isPowerOf2_32(NumElts) || NumElts > MaxMaskElts) {
      // Each i1 element is carried in its own i8 register.
      IntermediateVT = MVT::i1;
      RegisterVT = MVT::i8;
      NumIntermediates = NumElts;
      return NumElts;
    }
  }

  // Split v64i1 vectors if we don't have v64i8 available.
  if (VT == MVT::v64i1 && HasBWI && !Subtarget.useAVX512Regs() &&
      CC != CallingConv::X86_RegCall) {
    IntermediateVT = MVT::v32i1;
    RegisterVT = MVT::v32i1;
    NumIntermediates = 2;
    return 2;
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
/// Returns the result type of a SETCC whose operands have type \p VT.
/// Scalars yield i8. Vectors yield a vXi1 type with the same element count
/// when the AVX512 conditions below hold; otherwise they yield \p VT with its
/// element type changed to integer.
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext& Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (!Subtarget.hasAVX512())
    return VT.changeVectorElementTypeToInteger();

  // Figure out what this type will be legalized to.
  EVT LegalVT = VT;
  for (; getTypeAction(Context, LegalVT) != TypeLegal;)
    LegalVT = getTypeToTransformTo(Context, LegalVT);
  const MVT LegalSimpleVT = LegalVT.getSimpleVT();

  // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
  bool UseMaskResult = LegalSimpleVT.is512BitVector();

  // If we legalized to less than a 512-bit vector, then with VLX we will use
  // a vXi1 compare for vXi32/vXi64 for sure. If we have BWI we will also
  // support vXi16/vXi8.
  if (!UseMaskResult && LegalSimpleVT.isVector() && Subtarget.hasVLX()) {
    MVT LegalEltVT = LegalSimpleVT.getVectorElementType();
    UseMaskResult = Subtarget.hasBWI() || LegalEltVT.getSizeInBits() >= 32;
  }

  if (UseMaskResult)
    return EVT::getVectorVT(Context, MVT::i1, VT.getVectorNumElements());
  return VT.changeVectorElementTypeToInteger();
}
/// Helper for getByValTypeAlignment. Raises \p MaxAlign to 16 when \p Ty is a
/// 128-bit vector, or is an array or struct that (recursively) contains one.
/// \p MaxAlign is left untouched otherwise, and the function returns
/// immediately if it is already 16.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;

  if (auto *VecTy = dyn_cast<VectorType>(Ty)) {
    if (VecTy->getBitWidth() == 128)
      MaxAlign = 16;
    return;
  }

  if (auto *ArrTy = dyn_cast<ArrayType>(Ty)) {
    // Compute the element's alignment from scratch, then fold it in.
    unsigned ElemAlign = 0;
    getMaxByValAlign(ArrTy->getElementType(), ElemAlign);
    if (MaxAlign < ElemAlign)
      MaxAlign = ElemAlign;
    return;
  }

  if (auto *StructTy = dyn_cast<StructType>(Ty)) {
    for (auto *FieldTy : StructTy->elements()) {
      unsigned FieldAlign = 0;
      getMaxByValAlign(FieldTy, FieldAlign);
      if (MaxAlign < FieldAlign)
        MaxAlign = FieldAlign;
      // 16 is the maximum this helper produces; no need to look further.
      if (MaxAlign == 16)
        break;
    }
  }
}
/// Return the desired alignment for ByVal aggregate function arguments in the
/// caller parameter area.
///
/// On 64-bit subtargets this is the larger of 8 and the ABI alignment of
/// \p Ty. Otherwise it is 4, raised to 16 by getMaxByValAlign when SSE1 is
/// available and the type contains a 128-bit vector.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  if (!Subtarget.is64Bit()) {
    unsigned ByValAlign = 4;
    if (Subtarget.hasSSE1())
      getMaxByValAlign(Ty, ByValAlign);
    return ByValAlign;
  }

  // Max of 8 and alignment of type.
  const unsigned ABIAlign = DL.getABITypeAlignment(Ty);
  return ABIAlign > 8 ? ABIAlign : 8;
}
/// Returns the target specific optimal type for the load and store operations
/// produced when lowering memset, memcpy, and memmove.
///
/// \p Size is compared against 8/16/32/64 to pick the widest usable type.
/// \p DstAlign / \p SrcAlign: a value of zero passes the 16-byte alignment
/// check, just like any alignment of at least 16.
/// \p IsMemset, \p ZeroMemset and \p MemcpyStrSrc only affect the f64 choice:
/// f64 is not used for a non-zero memset, nor when \p MemcpyStrSrc is set.
/// \p FuncAttributes: when it carries NoImplicitFloat, no vector or FP type
/// is returned.
///
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width. When no vector/FP type is selected the result is
/// i64 (64-bit subtarget and Size >= 8) or i32.
EVT X86TargetLowering::getOptimalMemOpType(
    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
    bool ZeroMemset, bool MemcpyStrSrc,
    const AttributeList &FuncAttributes) const {
  if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
    const bool DstAligned16 = DstAlign == 0 || DstAlign >= 16;
    const bool SrcAligned16 = SrcAlign == 0 || SrcAlign >= 16;
    const bool CanUse16ByteOps =
        Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
                       (DstAligned16 && SrcAligned16));

    if (CanUse16ByteOps) {
      const unsigned PreferredWidth = Subtarget.getPreferVectorWidth();

      // FIXME: Check if unaligned 64-byte accesses are slow.
      if (Size >= 64 && Subtarget.hasAVX512() && PreferredWidth >= 512) {
        if (Subtarget.hasBWI())
          return MVT::v64i8;
        return MVT::v16i32;
      }

      // FIXME: Check if unaligned 32-byte accesses are slow.
      // Although v32i8 isn't a well-supported type for AVX1, we'll let
      // legalization and shuffle lowering produce the optimal codegen. If we
      // choose an optimal type with a vector element larger than a byte,
      // getMemsetStores() may create an intermediate splat (using an integer
      // multiply) before we splat as a vector.
      if (Size >= 32 && Subtarget.hasAVX() && PreferredWidth >= 256)
        return MVT::v32i8;

      if (PreferredWidth >= 128) {
        if (Subtarget.hasSSE2())
          return MVT::v16i8;
        // TODO: Can SSE1 handle a byte vector?
        // If we have SSE1 registers we should be able to use them.
        if (Subtarget.hasSSE1() &&
            (Subtarget.is64Bit() || Subtarget.hasX87()))
          return MVT::v4f32;
      }
    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }

  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  const bool UseI64 = Subtarget.is64Bit() && Size >= 8;
  if (UseI64)
    return MVT::i64;
  return MVT::i32;
}
/// Returns whether \p VT is safe to use as a memory-op type. f32 and f64 are
/// gated on X86ScalarSSEf32 and X86ScalarSSEf64 respectively; every other
/// type is reported as safe.
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  switch (VT.SimpleTy) {
  case MVT::f32:
    return X86ScalarSSEf32;
  case MVT::f64:
    return X86ScalarSSEf64;
  default:
    return true;
  }
}
/// Reports whether a misaligned access of type \p VT is allowed.
///
/// If \p Fast is non-null it is set from the size of \p VT: 128-bit and
/// 256-bit accesses consult the subtarget's slow-unaligned flags, every other
/// size is reported as fast. The return value is true except for non-temporal
/// vector accesses, which are handled separately at the end.
bool X86TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
    bool *Fast) const {
  if (Fast) {
    switch (VT.getSizeInBits()) {
    case 128:
      *Fast = !Subtarget.isUnalignedMem16Slow();
      break;
    case 256:
      *Fast = !Subtarget.isUnalignedMem32Slow();
      break;
    // TODO: What about AVX-512 (512-bit) accesses?
    default:
      // 8-byte and under are always assumed to be fast.
      *Fast = true;
      break;
    }
  }

  // Misaligned accesses of any size are always allowed, unless this is a
  // non-temporal vector memory op, which must be aligned.
  const bool IsNonTemporalVector =
      !!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector();
  if (!IsNonTemporalVector)
    return true;

  // Non-temporal vector accesses that are not loads are never allowed to be
  // misaligned.
  if (!(Flags & MachineMemOperand::MOLoad))
    return false;

  // NT loads can only be vector aligned, so if its less aligned than the
  // minimum vector size (which we can split the vector down to), we might as
  // well use a regular unaligned vector load.
  // We don't have any NT loads pre-SSE41.
  return Align < 16 || !Subtarget.hasSSE41();
}
/// Return the jump table entry encoding to use for the current function, as a
/// member of the MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // Outside of GOT pic mode, defer to the normal jump table encoding
  // heuristics.
  if (!isPositionIndependent() || !Subtarget.isPICStyleGOT())
    return TargetLowering::getJumpTableEncoding();

  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  return MachineJumpTableInfo::EK_Custom32;
}
/// Forward the subtarget's soft-float setting.
bool X86TargetLowering::useSoftFloat() const {
  const bool SoftFloat = Subtarget.useSoftFloat();
  return SoftFloat;
}
/// Mark leading integer/pointer libcall arguments in \p Args as "inreg".
///
/// This only applies to 32-bit targets using the C or StdCall convention.
/// The register budget is the module's number of register parameters (zero
/// when the function has no parent module). An argument of at most 4 bytes
/// costs one register and one of at most 8 bytes costs two. Marking stops
/// for good at the first eligible argument that no longer fits.
void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {
  // Only relabel X86-32 for C / Stdcall CCs.
  const bool IsCOrStdCall =
      CC == CallingConv::C || CC == CallingConv::X86_StdCall;
  if (Subtarget.is64Bit() || !IsCOrStdCall)
    return;

  unsigned RegsLeft = 0;
  if (auto *ParentModule = MF->getFunction().getParent())
    RegsLeft = ParentModule->getNumberRegisterParameters();

  // Mark the first N int arguments as having reg.
  const auto &DL = MF->getDataLayout();
  for (auto &Arg : Args) {
    Type *ArgTy = Arg.Ty;
    if (!ArgTy->isIntOrPtrTy())
      continue;

    // Arguments wider than 8 bytes are left untouched and cost nothing.
    const uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize > 8)
      continue;

    const unsigned RegsNeeded = AllocSize > 4 ? 2 : 1;
    if (RegsLeft < RegsNeeded)
      return;
    RegsLeft -= RegsNeeded;
    Arg.IsInReg = true;
  }
}
/// Build the expression for one custom-encoded jump table entry: a @GOTOFF
/// reference to the symbol of \p MBB. Only valid for position-independent
/// code using the GOT PIC style. \p MJTI and \p uid are not used.
const MCExpr *X86TargetLowering::LowerCustomJumpTableEntry(
    const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB,
    unsigned uid, MCContext &Ctx) const {
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  auto *BlockSym = MBB->getSymbol();
  return MCSymbolRefExpr::create(BlockSym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
}
/// Return the relocation base for the given PIC jump table: \p Table itself on
/// 64-bit targets, otherwise a GlobalBaseReg node of pointer type.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (Subtarget.is64Bit())
    return Table;

  // This doesn't have SDLoc associated with it, but is not really the
  // same as a Register.
  const auto PtrVT = getPointerTy(DAG.getDataLayout());
  return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
}
/// MCExpr counterpart of getPICJumpTableRelocBase: return the relocation base
/// for the given PIC jump table as an expression.
const MCExpr *X86TargetLowering::getPICJumpTableRelocBaseExpr(
    const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const {
  // Without RIP-relative addressing, the reference is relative to the PIC
  // base.
  if (!Subtarget.isPICStyleRIPRel())
    return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);

  // X86-64 uses RIP relative addressing based on the jump table label.
  return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
}
/// Return the representative register class for \p VT paired with its cost.
/// Scalar integers map to GR64 or GR32 depending on the target width, x86mmx
/// maps to VR64, and the listed scalar FP and vector types map to VR128X.
/// All of these report a cost of 1. Other types are left to the generic
/// TargetLowering implementation.
std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const uint8_t UnitCost = 1;
  switch (VT.SimpleTy) {
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    return std::make_pair(
        Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass,
        UnitCost);
  case MVT::x86mmx:
    return std::make_pair(&X86::VR64RegClass, UnitCost);
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
  case MVT::v8f32: case MVT::v4f64:
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
  case MVT::v16f32: case MVT::v8f64:
    return std::make_pair(&X86::VR128XRegClass, UnitCost);
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  }
}
/// Return the address space number used for the segment-relative slots built
/// by SegmentOffset: 257 on 64-bit targets unless the Kernel code model is in
/// use, and 256 in every other case.
unsigned X86TargetLowering::getAddressSpace() const {
  const bool Use257 =
      Subtarget.is64Bit() &&
      getTargetMachine().getCodeModel() != CodeModel::Kernel;
  return Use257 ? 257 : 256;
}
/// Return true when \p TargetTriple names a platform for which the stack
/// guard is read from a fixed TLS slot: glibc, Fuchsia, or Android unless its
/// version is lower than 17.
static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
  if (TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia())
    return true;
  return TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17);
}
/// Build a constant pointer whose value is the 32-bit integer \p Offset,
/// typed as a pointer in \p AddressSpace to an i8*.
static Constant* SegmentOffset(IRBuilder<> &IRB,
                               unsigned Offset, unsigned AddressSpace) {
  auto &Ctx = IRB.getContext();
  Constant *OffsetValue = ConstantInt::get(Type::getInt32Ty(Ctx), Offset);
  Type *SlotPtrTy = Type::getInt8PtrTy(Ctx)->getPointerTo(AddressSpace);
  return ConstantExpr::getIntToPtr(OffsetValue, SlotPtrTy);
}
/// Return the IR value that addresses the stack guard. Platforms with a
/// dedicated TLS slot get a segment-relative constant address; all others use
/// the generic TargetLowering implementation.
Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
  // tcbhead_t; use it instead of the usual global variable (see
  // sysdeps/{i386,x86_64}/nptl/tls.h)
  if (!hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return TargetLowering::getIRStackGuard(IRB);

  unsigned GuardOffset;
  if (Subtarget.isTargetFuchsia()) {
    // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
    GuardOffset = 0x10;
  } else {
    // %fs:0x28, unless we're using a Kernel code model, in which case
    // it's %gs:0x28. gs:0x14 on i386.
    GuardOffset = Subtarget.is64Bit() ? 0x28 : 0x14;
  }
  return SegmentOffset(IRB, GuardOffset, getAddressSpace());
}
/// Insert into \p M the declarations that stack protection relies on.
///
/// For Windows MSVC and Windows Itanium environments this declares the
/// __security_cookie global and the __security_check_cookie function. For
/// platforms with a TLS stack guard slot nothing is declared. Everything else
/// uses the generic TargetLowering implementation.
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  const Triple &TT = Subtarget.getTargetTriple();

  // MSVC CRT provides functionalities for stack protection.
  if (TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment()) {
    auto &Ctx = M.getContext();

    // MSVC CRT has a global variable holding security cookie.
    M.getOrInsertGlobal("__security_cookie", Type::getInt8PtrTy(Ctx));

    // MSVC CRT has a function to validate security cookie.
    FunctionCallee CheckCookie =
        M.getOrInsertFunction("__security_check_cookie", Type::getVoidTy(Ctx),
                              Type::getInt8PtrTy(Ctx));
    if (auto *CheckFn = dyn_cast<Function>(CheckCookie.getCallee())) {
      CheckFn->setCallingConv(CallingConv::X86_FastCall);
      CheckFn->addAttribute(1, Attribute::AttrKind::InReg);
    }
    return;
  }

  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  if (hasStackGuardSlotTLS(TT))
    return;

  TargetLowering::insertSSPDeclarations(M);
}
/// Return the stack guard value used during SelectionDAG lowering: the
/// __security_cookie global of \p M for Windows MSVC / Windows Itanium
/// environments, otherwise whatever the generic implementation returns.
Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  const Triple &TT = Subtarget.getTargetTriple();
  const bool UsesMSVCCookie =
      TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment();
  if (!UsesMSVCCookie)
    return TargetLowering::getSDagStackGuard(M);

  // MSVC CRT has a global variable holding security cookie.
  return M.getGlobalVariable("__security_cookie");
}
/// Return the function that validates the stack guard: the
/// __security_check_cookie function of \p M for Windows MSVC / Windows Itanium
/// environments, otherwise whatever the generic implementation returns.
Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  const Triple &TT = Subtarget.getTargetTriple();
  const bool UsesMSVCCookie =
      TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment();
  if (!UsesMSVCCookie)
    return TargetLowering::getSSPStackGuardCheck(M);

  // MSVC CRT has a function to validate security cookie.
  return M.getFunction("__security_check_cookie");
}
/// Return the IR value that addresses the SafeStack unsafe-stack pointer.
/// Contiki uses the default location, Android and Fuchsia use a fixed
/// segment-relative slot, and every other target uses the generic
/// TargetLowering implementation.
Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
  if (Subtarget.getTargetTriple().isOSContiki())
    return getDefaultSafeStackPointerLocation(IRB, false);

  const bool IsAndroid = Subtarget.isTargetAndroid();
  if (!IsAndroid && !Subtarget.isTargetFuchsia())
    return TargetLowering::getSafeStackPointerLocation(IRB);

  unsigned SlotOffset;
  if (IsAndroid) {
    // Android provides a fixed TLS slot for the SafeStack pointer. See the
    // definition of TLS_SLOT_SAFESTACK in
    // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
    //
    // %fs:0x48, unless we're using a Kernel code model, in which case it's
    // %gs:0x48. %gs:0x24 on i386.
    SlotOffset = Subtarget.is64Bit() ? 0x48 : 0x24;
  } else {
    // Fuchsia is similar.
    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
    SlotOffset = 0x18;
  }
  return SegmentOffset(IRB, SlotOffset, getAddressSpace());
}
/// Return true when a cast between address spaces \p SrcAS and \p DestAS is a
/// no-op: both must have the same pointer size and both must be numbered
/// below 256. The two address spaces are required to differ.
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
  assert(SrcAS != DestAS && "Expected different address spaces!");

  const TargetMachine &TM = getTargetMachine();
  const bool SamePointerSize =
      TM.getPointerSize(SrcAS) == TM.getPointerSize(DestAS);
  return SamePointerSize && SrcAS < 256 && DestAS < 256;
}
//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
/// Return true when the return values described by \p Outs pass the
/// RetCC_X86 return convention check for \p CallConv.
bool X86TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> ReturnLocs;
  CCState ReturnState(CallConv, isVarArg, MF, ReturnLocs, Context);
  const bool Lowerable = ReturnState.CheckReturn(Outs, RetCC_X86);
  return Lowerable;
}
/// Return the scratch register list, which is the same for every calling
/// convention: R11 followed by a 0 entry.
const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchList[] = {X86::R11, 0};
  return ScratchList;
}
/// Lower a mask value (v*i1) to the register type it is assigned to.
/// \param ValArg The mask value to lower.
/// \param ValLoc The type of the location the value is assigned to.
/// \returns DAG node after lowering to register type.
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &Dl, SelectionDAG &DAG) {
  const EVT MaskVT = ValArg.getValueType();

  // A single-element mask is just its only element.
  if (MaskVT == MVT::v1i1)
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
                       DAG.getIntPtrConstant(0, Dl));

  const bool IsV8 = MaskVT == MVT::v8i1;
  const bool IsV16 = MaskVT == MVT::v16i1;
  const bool LocIsI32 = ValLoc == MVT::i32;
  if ((IsV8 && (ValLoc == MVT::i8 || LocIsI32)) ||
      (IsV16 && (ValLoc == MVT::i16 || LocIsI32))) {
    // Two stage lowering might be required
    // bitcast:   v8i1 -> i8 / v16i1 -> i16
    // anyextend: i8   -> i32 / i16   -> i32
    const EVT SameWidthVT = IsV8 ? MVT::i8 : MVT::i16;
    SDValue AsInteger = DAG.getBitcast(SameWidthVT, ValArg);
    if (!LocIsI32)
      return AsInteger;
    return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, AsInteger);
  }

  const bool IsV32ToI32 = MaskVT == MVT::v32i1 && LocIsI32;
  const bool IsV64ToI64 = MaskVT == MVT::v64i1 && ValLoc == MVT::i64;
  if (IsV32ToI32 || IsV64ToI64) {
    // One stage lowering is required
    // bitcast:   v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(ValLoc, ValArg);
  }

  // Every other combination is simply any-extended to the location type.
  return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
}
/// Breaks v64i1 value into two registers and adds the new node to the DAG
static void Passv64i1ArgInRegs(
const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, CCValAssign &VA,
CCValAssign &NextVA, const X86Subtarget &Subtarget) {
assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
assert(VA.isRegLoc() && NextVA.isRegLoc() &&
"The value should reside in two registers");
// Before splitting the value we cast it to i64
Arg = DAG.getBitcast(MVT::i64, Arg);
// Splitting the value into two i32 types
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
DAG.getConstant(0, Dl, MVT::i32));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
DAG.getConstant(1, Dl, MVT::i32));
// Attach the two i32 types into corresponding registers
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}
/// Lower an outgoing return. The values in \p OutVals are assigned locations
/// by analyzing \p Outs with RetCC_X86, converted to their location types and
/// attached to the final return node, which is X86ISD::RET_FLAG, or
/// X86ISD::IRET for the X86_INTR calling convention.
///
/// Operand 0 of the returned node is the chain and operand 1 is the
/// bytes-to-pop constant. Values assigned to FP0/FP1 are appended directly as
/// operands; all other values are copied with glued CopyToReg nodes and
/// appended as register operands. A trailing glue operand is added when any
/// copy was emitted.
SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
// In some cases we need to disable registers from the default CSR list.
// For example, when they are used for argument passing.
bool ShouldDisableCalleeSavedRegister =
CallConv == CallingConv::X86_RegCall ||
MF.getFunction().hasFnAttribute("no_caller_saved_registers");
// An X86_INTR function that returns a value is a fatal error.
if (CallConv == CallingConv::X86_INTR && !Outs.empty())
report_fatal_error("X86 interrupts may not return any value");
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
SDValue Flag;
SmallVector<SDValue, 6> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
// Operand #1 = Bytes To Pop
RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
MVT::i32));
// Copy the result values into the output registers.
// OutsIndex advances once per returned value, while I advances a second time
// inside the loop for a value that is split over two locations (the
// needsCustom case below).
for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = RVLocs[I];
assert(VA.isRegLoc() && "Can only return in registers!");
// Add the register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
SDValue ValToCopy = OutVals[OutsIndex];
EVT ValVT = ValToCopy.getValueType();
// Promote values to the appropriate types.
if (VA.getLocInfo() == CCValAssign::SExt)
ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::ZExt)
ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::AExt) {
// Mask vectors (v*i1) need the dedicated mask-to-register lowering.
if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
else
ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
}
else if (VA.getLocInfo() == CCValAssign::BCvt)
ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
assert(VA.getLocInfo() != CCValAssign::FPExt &&
"Unexpected FP-extend for return value.");
// Report an error if we have attempted to return a value via an XMM
// register and SSE was disabled.
if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
} else if (!Subtarget.hasSSE2() &&
X86::FR64XRegClass.contains(VA.getLocReg()) &&
ValVT == MVT::f64) {
// When returning a double via an XMM register, report an error if SSE2 is
// not enabled.
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
// the RET instruction and handled by the FP Stackifier.
if (VA.getLocReg() == X86::FP0 ||
VA.getLocReg() == X86::FP1) {
// If this is a copy from an xmm register to ST(0), use an FPExtend to
// change the value to the FP stack register class.
if (isScalarFPTypeInSSEReg(VA.getValVT()))
ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
RetOps.push_back(ValToCopy);
// Don't emit a copytoreg.
continue;
}
// 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
// which is returned in RAX / RDX.
if (Subtarget.is64Bit()) {
if (ValVT == MVT::x86mmx) {
if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
ValToCopy);
// If we don't have SSE2 available, convert to v4f32 so the generated
// register is legal.
if (!Subtarget.hasSSE2())
ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
}
}
}
// Collect the (register, value) pairs to copy for this return value: two
// pairs for a split v64i1 value, one pair otherwise.
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// The second half lives in the next location, so consume it here (++I).
Passv64i1ArgInRegs(dl, DAG, ValToCopy, RegsToPass, VA, RVLocs[++I],
Subtarget);
assert(2 == RegsToPass.size() &&
"Expecting two registers after Pass64BitArgInRegs");
// Add the second register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
} else {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
}
// Add nodes to the DAG and add the values into the RetOps list
for (auto &Reg : RegsToPass) {
// Each copy is glued to the previous one through Flag.
Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
}
}
// Swift calling convention does not require we copy the sret argument
// into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
// All x86 ABIs require that for returning structs by value we copy
// the sret argument into %rax/%eax (depending on ABI) for the return.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
//
// Checking Function.hasStructRetAttr() here is insufficient because the IR
// may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
// false, then an sret argument may be implicitly inserted in the SelDAG. In
// either case FuncInfo->setSRetReturnReg() will have been called.
if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
// When we have both sret and another return value, we should use the
// original Chain stored in RetOps[0], instead of the current Chain updated
// in the above loop. If we only have sret, RetOps[0] equals to Chain.
// For the case of sret and another return value, we have
// Chain_0 at the function entry
// Chain_1 = getCopyToReg(Chain_0) in the above loop
// If we use Chain_1 in getCopyFromReg, we will have
// Val = getCopyFromReg(Chain_1)
// Chain_2 = getCopyToReg(Chain_1, Val) from below
// getCopyToReg(Chain_0) will be glued together with
// getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
// in Unit B, and we will have cyclic dependency between Unit A and Unit B:
// Data dependency from Unit B to Unit A due to usage of Val in
// getCopyToReg(Chain_1, Val)
// Chain dependency from Unit A to Unit B
// So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
getPointerTy(MF.getDataLayout()));
// RAX is used on 64-bit targets that are not ILP32; EAX otherwise.
unsigned RetValReg
= (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
X86::RAX : X86::EAX;
Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
Flag = Chain.getValue(1);
// RAX/EAX now acts like a return value.
RetOps.push_back(
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
// Add the returned register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
}
// Append every register from the zero-terminated "callee-saved via copy"
// list as an extra operand of the return; only GR64 registers are expected.
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
for (; *I; ++I) {
if (X86::GR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
}
}
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
if (Flag.getNode())
RetOps.push_back(Flag);
// Interrupt handlers return with IRET instead of the regular return node.
X86ISD::NodeType opcode = X86ISD::RET_FLAG;
if (CallConv == CallingConv::X86_INTR)
opcode = X86ISD::IRET;
return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}
/// Return true when the single result of \p N is used only to feed a return.
/// The value must have exactly one user, which is either an unglued CopyToReg
/// or an FP_EXTEND, and every user of that node must be an X86ISD::RET_FLAG
/// returning a single value. On success \p Chain is set to the chain operand
/// of the CopyToReg, or left unchanged for the FP_EXTEND case.
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;

  SDNode *OnlyUser = *N->use_begin();
  SDValue CandidateChain = Chain;
  const unsigned UserOpcode = OnlyUser->getOpcode();
  if (UserOpcode == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    const unsigned LastOperand = OnlyUser->getNumOperands() - 1;
    if (OnlyUser->getOperand(LastOperand).getValueType() == MVT::Glue)
      return false;
    CandidateChain = OnlyUser->getOperand(0);
  } else if (UserOpcode != ISD::FP_EXTEND) {
    return false;
  }

  bool SawReturn = false;
  for (SDNode::use_iterator It = OnlyUser->use_begin(),
                            End = OnlyUser->use_end();
       It != End; ++It) {
    if (It->getOpcode() != X86ISD::RET_FLAG)
      return false;

    // If we are returning more than one value, we can definitely
    // not make a tail call see PR19530
    const unsigned NumRetOperands = It->getNumOperands();
    if (NumRetOperands > 4)
      return false;
    // With exactly four operands, the last one has to be the glue.
    if (NumRetOperands == 4 &&
        It->getOperand(NumRetOperands - 1).getValueType() != MVT::Glue)
      return false;
    SawReturn = true;
  }
  if (!SawReturn)
    return false;

  Chain = CandidateChain;
  return true;
}
/// Return the type an extended return value of type \p VT should have:
/// \p VT itself, unless it is narrower than the register type of the minimum
/// return type, in which case that register type is returned.
/// \p ExtendKind is not consulted.
EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                           ISD::NodeType ExtendKind) const {
  const bool IsDarwin = Subtarget.getTargetTriple().isOSDarwin();

  // The ABI does not require i1, i8 or i16 to be extended.
  //
  // On Darwin, there is code in the wild relying on Clang's old behaviour of
  // always extending i8/i16 return values, so keep doing that for now.
  // (PR26665).
  const bool NeedsNoExtension =
      VT == MVT::i1 || (!IsDarwin && (VT == MVT::i8 || VT == MVT::i16));
  const MVT MinReturnMVT = NeedsNoExtension ? MVT::i8 : MVT::i32;

  EVT MinRegVT = getRegisterType(Context, MinReturnMVT);
  if (VT.bitsLT(MinRegVT))
    return MinRegVT;
  return VT;
}
/// Read two 32-bit registers and combine them into a single v64i1 mask value.
/// \param VA The location holding the low 32 bits.
/// \param NextVA The location holding the high 32 bits.
/// \param Root The chain both register reads are attached to.
/// \param DAG The DAG the new nodes are added to.
/// \param Dl Source location for the new nodes.
/// \param Subtarget Only consulted by the sanity assertions.
/// \param [in,out] InFlag Glue to thread through the reads. When null, the
///                        physical registers are first made live-in and read
///                        through virtual registers. Otherwise the physical
///                        registers are read directly, each read is glued to
///                        \p InFlag, and \p InFlag is updated to the glue of
///                        the last read.
/// \return a new v64i1 SDValue: the low half concatenated with the high half.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
                                SDValue &Root, SelectionDAG &DAG,
                                const SDLoc &Dl, const X86Subtarget &Subtarget,
                                SDValue *InFlag = nullptr) {
  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(VA.getValVT() == MVT::v64i1 &&
         "Expecting first location of 64 bit width type");
  assert(NextVA.getValVT() == VA.getValVT() &&
         "The locations should have the same type");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The values should reside in two registers");

  // Read a 32 bit value from each of the two registers.
  SDValue LoBits, HiBits;
  if (InFlag) {
    // When a physical register is available read the value from it and glue
    // the reads together.
    LoBits = DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = LoBits.getValue(2);
    HiBits =
        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = HiBits.getValue(2);
  } else {
    // When no physical register is present,
    // create an intermediate virtual register.
    MachineFunction &MF = DAG.getMachineFunction();
    const TargetRegisterClass *GPR32 = &X86::GR32RegClass;
    unsigned LoVReg = MF.addLiveIn(VA.getLocReg(), GPR32);
    LoBits = DAG.getCopyFromReg(Root, Dl, LoVReg, MVT::i32);
    unsigned HiVReg = MF.addLiveIn(NextVA.getLocReg(), GPR32);
    HiBits = DAG.getCopyFromReg(Root, Dl, HiVReg, MVT::i32);
  }

  // Convert each i32 into a v32i1 and concatenate the two halves.
  SDValue LoMask = DAG.getBitcast(MVT::v32i1, LoBits);
  SDValue HiMask = DAG.getBitcast(MVT::v32i1, HiBits);
  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, LoMask, HiMask);
}
/// Lower a register value of various sizes (8/16/32/64 bits) to a mask value
/// of the expected type (v1i1/v8i1/v16i1/v32i1/v64i1).
/// \param ValArg The register value to convert.
/// \param ValVT The mask type to produce.
/// \param ValLoc The type of the location; only checked for v64i1.
/// \returns a DAG node containing the operand after lowering to mask type.
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
                               const EVT &ValLoc, const SDLoc &Dl,
                               SelectionDAG &DAG) {
  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValArg);

  if (ValVT == MVT::v64i1) {
    // In 32 bit machine, this case is handled by getv64i1Argument
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
    // In 64 bit machine, There is no need to truncate the value only bitcast
    return DAG.getBitcast(ValVT, ValArg);
  }

  // Pick the integer type that is exactly as wide as the mask.
  MVT SameWidthInt;
  switch (ValVT.getSimpleVT().SimpleTy) {
  case MVT::v8i1:
    SameWidthInt = MVT::i8;
    break;
  case MVT::v16i1:
    SameWidthInt = MVT::i16;
    break;
  case MVT::v32i1:
    SameWidthInt = MVT::i32;
    break;
  default:
    llvm_unreachable("Expecting a vector of i1 types");
  }

  // Truncate to that width, then reinterpret the bits as the mask type.
  SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, Dl, SameWidthInt, ValArg);
  return DAG.getBitcast(ValVT, Narrowed);
}
/// Lower the result values of a call into the appropriate copies out of the
/// appropriate physical registers.
///
/// Locations are assigned by analyzing \p Ins with RetCC_X86. Each value is
/// copied out of its register, converted back to its value type, and appended
/// to \p InVals.
///
/// \param Chain The incoming chain; each ordinary copy is chained to the
///        previous one.
/// \param InFlag Glue threaded through the register copies.
/// \param RegMask When non-null, the bits for every register used to return a
///        value (and for its sub-registers) are cleared in this mask.
/// \returns the chain after the last ordinary register copy.
SDValue X86TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
uint32_t *RegMask) const {
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
// I advances a second time inside the loop for a value that is split over
// two locations (the needsCustom case below).
for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
++I, ++InsIndex) {
CCValAssign &VA = RVLocs[I];
EVT CopyVT = VA.getLocVT();
// In some calling conventions we need to remove the used registers
// from the register mask.
if (RegMask) {
// The mask holds one bit per register, 32 registers per word; the iterator
// visits the register itself and each of its sub-registers.
for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
}
// Report an error if there was an attempt to return FP values via XMM
// registers.
if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
if (VA.getLocReg() == X86::XMM1)
VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
else
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
} else if (!Subtarget.hasSSE2() &&
X86::FR64XRegClass.contains(VA.getLocReg()) &&
CopyVT == MVT::f64) {
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
if (VA.getLocReg() == X86::XMM1)
VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
else
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
// use a truncate to move it from fp stack reg to xmm reg.
bool RoundAfterCopy = false;
if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
isScalarFPTypeInSSEReg(VA.getValVT())) {
if (!Subtarget.hasX87())
report_fatal_error("X87 register return with X87 disabled");
CopyVT = MVT::f80;
RoundAfterCopy = (CopyVT != VA.getLocVT());
}
SDValue Val;
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// The value occupies this location and the next one, so consume the next
// location here (++I). InFlag is updated by the callee.
Val =
getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
} else {
// Of the CopyFromReg results, 0 is the copied value, 1 is the output chain
// and 2 is the glue for the next copy.
Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
.getValue(1);
Val = Chain.getValue(0);
InFlag = Chain.getValue(2);
}
if (RoundAfterCopy)
Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
// This truncation won't change the value.
DAG.getIntPtrConstant(1, dl));
// Values whose scalar type is i1 and that were extended into their location
// are converted back: mask vectors through lowerRegToMasks, everything else
// through a plain truncate.
if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
if (VA.getValVT().isVector() &&
((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
// promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
} else
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
}
if (VA.getLocInfo() == CCValAssign::BCvt)
Val = DAG.getBitcast(VA.getValVT(), Val);
InVals.push_back(Val);
}
return Chain;
}
//===----------------------------------------------------------------------===//
// C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
// The StdCall calling convention is the standard for many Windows API
// routines. It differs from the C calling convention only slightly: the
// callee, not the caller, cleans up the stack, and symbols are decorated
// differently. It doesn't support any vector arguments.
// For info on the fast calling convention, see the Fast Calling Convention
// (tail call) implementation, LowerX86_32FastCCCallTo.
/// Describes whether, and how, a call or function returns a struct through an
/// sret argument. Values are computed by callIsStructReturn (for calls) and
/// argsAreStructReturn (for functions).
enum StructReturnType {
// There are no arguments, or the first argument is not marked sret.
NotStructReturn,
// The sret argument is marked inreg, or the IsMCU flag is set.
RegStructReturn,
// Any other sret argument.
StackStructReturn
};
/// Determine whether a call uses struct return semantics by inspecting the
/// flags of its first outgoing argument.
static StructReturnType
callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
  if (Outs.empty() || !Outs[0].Flags.isSRet())
    return NotStructReturn;

  const bool PassedInReg = Outs[0].Flags.isInReg() || IsMCU;
  return PassedInReg ? RegStructReturn : StackStructReturn;
}
/// Determine whether a function uses struct return semantics by inspecting the
/// flags of its first incoming argument.
static StructReturnType
argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
  if (Ins.empty() || !Ins[0].Flags.isSRet())
    return NotStructReturn;

  const bool PassedInReg = Ins[0].Flags.isInReg() || IsMCU;
  return PassedInReg ? RegStructReturn : StackStructReturn;
}
/// Copy an aggregate from address \p Src to address \p Dst, taking the size
/// and alignment from the byval information in \p Flags. The copy is emitted
/// as an always-inline, non-volatile memcpy and will be passed as a byval
/// function parameter.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  SDValue ByteCount = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
  return DAG.getMemcpy(Chain, dl, Dst, Src, ByteCount, Flags.getByValAlign(),
                       /*isVolatile*/ false, /*AlwaysInline=*/true,
                       /*isTailCall*/ false, MachinePointerInfo(),
                       MachinePointerInfo());
}
/// Return true if \p CC is a calling convention that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::Fast:
  case CallingConv::GHC:
  case CallingConv::X86_RegCall:
  case CallingConv::HiPE:
  case CallingConv::HHVM:
  case CallingConv::Tail:
    return true;
  default:
    return false;
  }
}
/// Return true if we might ever do TCO for calls with calling convention
/// \p CC.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  // C calling conventions:
  const bool IsCLike = CC == CallingConv::C || CC == CallingConv::Win64 ||
                       CC == CallingConv::X86_64_SysV;
  // Callee pop conventions:
  const bool IsCalleePop =
      CC == CallingConv::X86_ThisCall || CC == CallingConv::X86_StdCall ||
      CC == CallingConv::X86_VectorCall || CC == CallingConv::X86_FastCall;
  // Swift:
  const bool IsSwift = CC == CallingConv::Swift;

  if (IsCLike || IsCalleePop || IsSwift)
    return true;
  // Everything else qualifies only if TCO can be guaranteed for it.
  return canGuaranteeTCO(CC);
}
/// Return true if the function is being made into a tailcall target by
/// changing its ABI: always for the Tail convention, and otherwise when
/// \p GuaranteedTailCallOpt is set and TCO can be guaranteed for \p CC.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
  if (CC == CallingConv::Tail)
    return true;
  return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
}
/// Return true when \p CI is marked as a tail call and its calling convention
/// is one for which TCO might ever be performed.
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  return CI->isTailCall() &&
         mayTailCallThisCC(ImmutableCallSite(CI).getCallingConv());
}
/// Lower incoming argument number \p i, which the calling convention assigned
/// to the stack location described by \p VA, and return the resulting value.
///
/// Three shapes of result are produced:
///  * byval arguments: a frame index for a fixed object covering the bytes
///    (no load is emitted);
///  * copy-elision candidates passed directly in memory: a load from a fixed
///    stack object that spans the whole argument, so parts of one argument
///    share one object;
///  * everything else: a load from a fresh fixed object, followed by a
///    SCALAR_TO_VECTOR / TRUNCATE when an i1-based value was widened in memory.
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    const SDLoc &dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo &MFI, unsigned i) const {
  const ISD::InputArg &In = Ins[i];
  ISD::ArgFlagsTy Flags = In.Flags;

  // Under guaranteed tail call optimization every argument slot is treated as
  // mutable, since lowering the arguments of a tail call may overwrite it.
  // FIXME: For now, all byval parameter objects are marked mutable as well.
  // This can be changed with more analysis.
  bool ForceMutable = shouldGuaranteeTCO(
      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool SlotIsImmutable = !(ForceMutable || Flags.isByVal());

  MVT PtrVT = getPointerTy(DAG.getDataLayout());

  // An i1-based value that was extended in its location only needs converting
  // back when the value type and the location type differ in total size.
  bool ExtendedInMem =
      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

  // When the value is passed by pointer the slot holds an address rather than
  // the value itself, so the location type is what gets loaded.
  bool PassedIndirectly = VA.getLocInfo() == CCValAssign::Indirect;
  EVT ValVT =
      (PassedIndirectly || ExtendedInMem) ? VA.getLocVT() : VA.getValVT();

  if (Flags.isByVal()) {
    // Don't create zero-sized stack objects.
    unsigned DeclaredSize = Flags.getByValSize();
    unsigned Bytes = DeclaredSize == 0 ? 1 : DeclaredSize;
    // FIXME: For now, all byval parameter objects are marked as aliasing. This
    // can be improved with deeper analysis.
    int ByValFI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(),
                                        SlotIsImmutable, /*isAliased=*/true);
    return DAG.getFrameIndex(ByValFI, PtrVT);
  }

  // Copy elision is only possible when the argument sits in memory directly,
  // without extension. Large vector types, for example, may instead be passed
  // indirectly by pointer.
  bool CanElideCopy =
      Flags.isCopyElisionCandidate() && !PassedIndirectly && !ExtendedInMem;
  if (CanElideCopy) {
    if (In.PartOffset == 0) {
      // One-part value, or the first part of a multi-part value: create one
      // stack object for the entire argument type and load our portion of it.
      // This assumes that if the first part of an argument is in memory, the
      // rest will also be in memory.
      EVT WholeArgVT = In.ArgVT;
      int WholeFI =
          MFI.CreateFixedObject(WholeArgVT.getStoreSize(),
                                VA.getLocMemOffset(), /*IsImmutable=*/false);
      SDValue PartAddr = DAG.getFrameIndex(WholeFI, PtrVT);
      return DAG.getLoad(
          ValVT, dl, Chain, PartAddr,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), WholeFI));
    }

    // A later piece of an argument in memory. Look for an existing fixed
    // object that fully contains this part; if there is one, assume the
    // PartOffset == 0 case above created it and load from inside it.
    int64_t PartBegin = VA.getLocMemOffset();
    int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
    int EnclosingFI = MFI.getObjectIndexBegin();
    while (MFI.isFixedObjectIndex(EnclosingFI)) {
      int64_t ObjBegin = MFI.getObjectOffset(EnclosingFI);
      int64_t ObjEnd = ObjBegin + MFI.getObjectSize(EnclosingFI);
      if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
        break;
      ++EnclosingFI;
    }
    if (MFI.isFixedObjectIndex(EnclosingFI)) {
      SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT,
                                 DAG.getFrameIndex(EnclosingFI, PtrVT),
                                 DAG.getIntPtrConstant(In.PartOffset, dl));
      return DAG.getLoad(
          ValVT, dl, Chain, Addr,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
                                            EnclosingFI, In.PartOffset));
    }
    // No enclosing object was found: fall through to the generic path.
  }

  // Generic path: a dedicated fixed object for just this value.
  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
                                 VA.getLocMemOffset(), SlotIsImmutable);

  // Record on the frame object how the value was extended, if at all.
  switch (VA.getLocInfo()) {
  case CCValAssign::ZExt:
    MFI.setObjectZExt(FI, true);
    break;
  case CCValAssign::SExt:
    MFI.setObjectSExt(FI, true);
    break;
  default:
    break;
  }

  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  SDValue Loaded = DAG.getLoad(
      ValVT, dl, Chain, FIN,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  if (!ExtendedInMem)
    return Loaded;

  // Convert the widened in-memory representation back to the value type.
  unsigned ConvertOpc =
      VA.getValVT().isVector() ? ISD::SCALAR_TO_VECTOR : ISD::TRUNCATE;
  return DAG.getNode(ConvertOpc, dl, VA.getValVT(), Loaded);
}
/// Return the ordered list of 64-bit general purpose registers used to pass
/// integer arguments under \p CallConv: the four-register list when
/// \p Subtarget reports the convention as Win64, the six-register list
/// otherwise. Only valid for 64-bit subtargets.
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
                                                const X86Subtarget &Subtarget) {
  assert(Subtarget.is64Bit());

  static const MCPhysReg Win64ArgGPRs[] = {
    X86::RCX, X86::RDX, X86::R8, X86::R9
  };
  static const MCPhysReg DefaultArgGPRs[] = {
    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
  };

  if (Subtarget.isCallingConvWin64(CallConv))
    return makeArrayRef(Win64ArgGPRs);
  return makeArrayRef(DefaultArgGPRs);
}
/// Return the ordered list of XMM registers that may carry arguments under
/// \p CallConv, or an empty list when none may be used: for Win64
/// conventions, for soft-float, for functions marked noimplicitfloat, and
/// when SSE1 is unavailable. Only valid for 64-bit subtargets.
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
                                                CallingConv::ID CallConv,
                                                const X86Subtarget &Subtarget) {
  assert(Subtarget.is64Bit());

  // On Win64 the XMM registers which might contain var arg parameters are
  // shadowed in their paired GPR, so only the GPRs need to be saved to their
  // home slots.
  // TODO: __vectorcall will change this.
  if (Subtarget.isCallingConvWin64(CallConv))
    return None;

  bool FnIsNoImplicitFloat =
      MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
  bool UsesSoftFloat = Subtarget.useSoftFloat();
  assert(!(UsesSoftFloat && FnIsNoImplicitFloat) &&
         "SSE register cannot be used when SSE is disabled!");

  // Kernel mode asks for SSE to be disabled, so there are no XMM argument
  // registers in that case.
  bool XMMsUsable =
      !UsesSoftFloat && !FnIsNoImplicitFloat && Subtarget.hasSSE1();
  if (!XMMsUsable)
    return None;

  static const MCPhysReg XMMArgRegs64Bit[] = {
    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
  };
  return makeArrayRef(XMMArgRegs64Bit);
}
#ifndef NDEBUG
/// Debug-only helper: return true when the value numbers of \p ArgLocs never
/// decrease from one entry to the next (equal neighbours are accepted).
static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
  for (size_t Idx = 1, End = ArgLocs.size(); Idx < End; ++Idx)
    if (ArgLocs[Idx].getValNo() < ArgLocs[Idx - 1].getValNo())
      return false;
  return true;
}
#endif
/// Lower the incoming (formal) arguments of the current function.
///
/// The calling convention analysis assigns every entry of \p Ins a register or
/// stack location; one SDValue per entry of \p Ins is appended to \p InVals in
/// the same order. Besides that, this routine:
///  * saves the sret argument into a virtual register (except for Swift);
///  * creates the frame objects and register spills needed by va_start on
///    64-bit targets;
///  * records forwarded registers for musttail calls in vararg functions;
///  * records callee-pop byte counts and the argument stack size in the
///    X86MachineFunctionInfo;
///  * allocates the PSPSym slot for the CoreCLR EH personality;
///  * disables callee-saved treatment for live-in registers under regcall or
///    the "no_caller_saved_registers" attribute.
///
/// \returns the updated chain.
SDValue X86TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

  const Function &F = MF.getFunction();
  // An externally visible "main" on Cygwin/MinGW targets always gets a frame
  // pointer.
  if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
      F.getName() == "main")
    FuncInfo->setForceFramePointer(true);

  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool Is64Bit = Subtarget.is64Bit();
  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);

  assert(
      !(isVarArg && canGuaranteeTCO(CallConv)) &&
      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);

  CCInfo.AnalyzeArguments(Ins, CC_X86);

  // In vectorcall calling convention a second pass is required for the HVA
  // types.
  if (CallingConv::X86_VectorCall == CallConv) {
    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
  }

  // The next loop assumes that the locations are in the same order of the
  // input arguments.
  assert(isSortedByValueNo(ArgLocs) &&
         "Argument Location list must be sorted before lowering");

  // I walks ArgLocs while InsIndex walks Ins. They diverge when one argument
  // occupies two locations (the custom v64i1 case below), so Ins must always
  // be indexed with InsIndex, never with I.
  SDValue ArgValue;
  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++InsIndex) {
    assert(InsIndex < Ins.size() && "Invalid Ins index");
    CCValAssign &VA = ArgLocs[I];

    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();
      if (VA.needsCustom()) {
        assert(
            VA.getValVT() == MVT::v64i1 &&
            "Currently the only custom case is when we split v64i1 to 2 regs");

        // v64i1 values, in regcall calling convention, that are
        // compiled to 32 bit arch, are split up into two registers.
        // Note that this consumes a second ArgLocs entry.
        ArgValue =
            getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
      } else {
        // Pick the register class matching the location type.
        const TargetRegisterClass *RC;
        if (RegVT == MVT::i8)
          RC = &X86::GR8RegClass;
        else if (RegVT == MVT::i16)
          RC = &X86::GR16RegClass;
        else if (RegVT == MVT::i32)
          RC = &X86::GR32RegClass;
        else if (Is64Bit && RegVT == MVT::i64)
          RC = &X86::GR64RegClass;
        else if (RegVT == MVT::f32)
          RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
        else if (RegVT == MVT::f64)
          RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
        else if (RegVT == MVT::f80)
          RC = &X86::RFP80RegClass;
        else if (RegVT == MVT::f128)
          RC = &X86::VR128RegClass;
        else if (RegVT.is512BitVector())
          RC = &X86::VR512RegClass;
        else if (RegVT.is256BitVector())
          RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
        else if (RegVT.is128BitVector())
          RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
        else if (RegVT == MVT::x86mmx)
          RC = &X86::VR64RegClass;
        else if (RegVT == MVT::v1i1)
          RC = &X86::VK1RegClass;
        else if (RegVT == MVT::v8i1)
          RC = &X86::VK8RegClass;
        else if (RegVT == MVT::v16i1)
          RC = &X86::VK16RegClass;
        else if (RegVT == MVT::v32i1)
          RC = &X86::VK32RegClass;
        else if (RegVT == MVT::v64i1)
          RC = &X86::VK64RegClass;
        else
          llvm_unreachable("Unknown argument type!");

        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
      }

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits. Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::BCvt)
        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);

      if (VA.isExtInLoc()) {
        // Handle MMX values passed in XMM regs.
        if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
        else if (VA.getValVT().isVector() &&
                 VA.getValVT().getScalarType() == MVT::i1 &&
                 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
                  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
          ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
        } else
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
      }
    } else {
      assert(VA.isMemLoc());
      ArgValue =
          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
    }

    // If value is passed via pointer - do a load.
    // The flags belong to the input argument, so index Ins with InsIndex: I
    // may already have advanced past it after a custom two-register split.
    if (VA.getLocInfo() == CCValAssign::Indirect &&
        !Ins[InsIndex].Flags.isByVal())
      ArgValue =
          DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());

    InVals.push_back(ArgValue);
  }

  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
    // Swift calling convention does not require we copy the sret argument
    // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
    if (CallConv == CallingConv::Swift)
      continue;

    // All x86 ABIs require that for returning structs by value we copy the
    // sret argument into %rax/%eax (depending on ABI) for the return. Save
    // the argument into a virtual register so that we can access it from the
    // return points.
    if (Ins[I].Flags.isSRet()) {
      unsigned Reg = FuncInfo->getSRetReturnReg();
      if (!Reg) {
        MVT PtrTy = getPointerTy(DAG.getDataLayout());
        Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
        FuncInfo->setSRetReturnReg(Reg);
      }
      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
      // Only the first sret argument is recorded.
      break;
    }
  }

  unsigned StackSize = CCInfo.getNextStackOffset();
  // Align stack specially for tail calls.
  if (shouldGuaranteeTCO(CallConv,
                         MF.getTarget().Options.GuaranteedTailCallOpt))
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start. We
  // can skip this if there are no va_start calls.
  if (MFI.hasVAStart() &&
      (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
                   CallConv != CallingConv::X86_ThisCall))) {
    FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
  }

  // Figure out if XMM registers are in use.
  assert(!(Subtarget.useSoftFloat() &&
           F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
         "SSE register cannot be used when SSE is disabled!");

  // 64-bit calling conventions support varargs and register parameters, so we
  // have to do extra work to spill them in the prologue.
  if (Is64Bit && isVarArg && MFI.hasVAStart()) {
    // Find the first unallocated argument registers.
    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
    ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
    unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
           "SSE register cannot be used when SSE is disabled!");

    // Gather all the live in physical registers.
    SmallVector<SDValue, 6> LiveGPRs;
    SmallVector<SDValue, 8> LiveXMMRegs;
    SDValue ALVal;
    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
      unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
      LiveGPRs.push_back(
          DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
    }
    if (!ArgXMMs.empty()) {
      // AL is read alongside the XMM registers and handed to the XMM save
      // node below.
      unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
      ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
      for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
        unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
        LiveXMMRegs.push_back(
            DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
      }
    }

    if (IsWin64) {
      // Get to the caller-allocated home save location.  Add 8 to account
      // for the return address.
      int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
      FuncInfo->setRegSaveFrameIndex(
          MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
      // Fixup to set vararg frame on shadow area (4 x i64).
      if (NumIntRegs < 4)
        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
    } else {
      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so
      // they may be loaded by dereferencing the result of va_next.
      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
      FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
    }

    // Store the integer parameter registers.
    SmallVector<SDValue, 8> MemOps;
    SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                      getPointerTy(DAG.getDataLayout()));
    unsigned Offset = FuncInfo->getVarArgsGPOffset();
    for (SDValue Val : LiveGPRs) {
      SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                                RSFIN, DAG.getIntPtrConstant(Offset, dl));
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       MachinePointerInfo::getFixedStack(
                           DAG.getMachineFunction(),
                           FuncInfo->getRegSaveFrameIndex(), Offset));
      MemOps.push_back(Store);
      Offset += 8;
    }

    if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
      // Now store the XMM (fp + vector) parameter registers.
      SmallVector<SDValue, 12> SaveXMMOps;
      SaveXMMOps.push_back(Chain);
      SaveXMMOps.push_back(ALVal);
      SaveXMMOps.push_back(DAG.getIntPtrConstant(
                             FuncInfo->getRegSaveFrameIndex(), dl));
      SaveXMMOps.push_back(DAG.getIntPtrConstant(
                             FuncInfo->getVarArgsFPOffset(), dl));
      SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
                        LiveXMMRegs.end());
      MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                   MVT::Other, SaveXMMOps));
    }

    if (!MemOps.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  }

  if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
    // Find the largest legal vector type.
    MVT VecVT = MVT::Other;
    // FIXME: Only some x86_32 calling conventions support AVX512.
    if (Subtarget.useAVX512Regs() &&
        (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
                     CallConv == CallingConv::Intel_OCL_BI)))
      VecVT = MVT::v16f32;
    else if (Subtarget.hasAVX())
      VecVT = MVT::v8f32;
    else if (Subtarget.hasSSE2())
      VecVT = MVT::v4f32;

    // We forward some GPRs and some vector types.
    SmallVector<MVT, 2> RegParmTypes;
    MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
    RegParmTypes.push_back(IntVT);
    if (VecVT != MVT::Other)
      RegParmTypes.push_back(VecVT);

    // Compute the set of forwarded registers. The rest are scratch.
    SmallVectorImpl<ForwardedRegister> &Forwards =
        FuncInfo->getForwardedMustTailRegParms();
    CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);

    // Forward AL for SysV x86_64 targets, since it is used for varargs.
    if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) {
      unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
      Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
    }

    // Copy all forwards from physical to virtual registers.
    for (ForwardedRegister &FR : Forwards) {
      // FIXME: Can we use a less constrained schedule?
      SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT);
      FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT));
      Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal);
    }
  }

  // Some CCs need callee pop.
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
    // X86 interrupts must pop the error code (and the alignment padding) if
    // present.
    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
  } else {
    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
        !Subtarget.getTargetTriple().isOSMSVCRT() &&
        argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
      FuncInfo->setBytesToPopOnReturn(4);
  }

  if (!Is64Bit) {
    // RegSaveFrameIndex is X86-64 only.
    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
    if (CallConv == CallingConv::X86_FastCall ||
        CallConv == CallingConv::X86_ThisCall)
      // fastcc functions can't have varargs.
      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
  }

  FuncInfo->setArgumentStackSize(StackSize);

  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
    EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
    if (Personality == EHPersonality::CoreCLR) {
      assert(Is64Bit);
      // TODO: Add a mechanism to frame lowering that will allow us to indicate
      // that we'd prefer this slot be allocated towards the bottom of the frame
      // (i.e. near the stack pointer after allocating the frame).  Every
      // funclet needs a copy of this slot in its (mostly empty) frame, and the
      // offset from the bottom of this and each funclet's frame must be the
      // same, so the size of funclets' (mostly empty) frames is dictated by
      // how far this slot is from the bottom (since they allocate just enough
      // space to accommodate holding this slot at the correct offset).
      int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
      EHInfo->PSPSymFrameIdx = PSPSymFI;
    }
  }

  // For regcall functions and functions carrying the
  // "no_caller_saved_registers" attribute, stop treating any live-in register
  // as callee saved.
  if (CallConv == CallingConv::X86_RegCall ||
      F.hasFnAttribute("no_caller_saved_registers")) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
      MRI.disableCalleeSavedRegister(Pair.first);
  }

  return Chain;
}
/// Emit the node that places outgoing argument \p Arg into its stack slot.
///
/// The destination address is \p StackPtr plus the memory offset recorded in
/// \p VA. A byval argument is copied there with CreateCopyOfByValArgument;
/// any other argument is written with a plain store. Returns the chain of
/// the emitted copy or store.
SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
                                            SDValue Arg, const SDLoc &dl,
                                            SelectionDAG &DAG,
                                            const CCValAssign &VA,
                                            ISD::ArgFlagsTy Flags) const {
  unsigned SlotOffset = VA.getLocMemOffset();
  SDValue OffsetNode = DAG.getIntPtrConstant(SlotOffset, dl);
  SDValue SlotAddr = DAG.getNode(
      ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, OffsetNode);

  if (!Flags.isByVal())
    return DAG.getStore(
        Chain, dl, Arg, SlotAddr,
        MachinePointerInfo::getStack(DAG.getMachineFunction(), SlotOffset));

  return CreateCopyOfByValArgument(Arg, SlotAddr, Chain, Flags, DAG, dl);
}
/// Emit a load of the current return address from its frame slot.
///
/// On return \p OutRetAddr holds the loaded value, and the function result is
/// the chain output of that load. \p IsTailCall, \p Is64Bit and \p FPDiff are
/// not consulted here; callers decide whether the load is needed at all.
SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  // Address of the slot that currently holds the return address.
  SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);

  // Load the "old" return address and hand it back through the out-parameter.
  OutRetAddr =
      DAG.getLoad(PtrVT, dl, Chain, RetAddrSlot, MachinePointerInfo());

  // Result 1 of the load node is its output chain.
  return OutRetAddr.getValue(1);
}
/// Emit a store of the previously loaded return address (\p RetAddrFrIdx) to
/// its relocated stack slot for a tail call.
///
/// Nothing is emitted when \p FPDiff is zero; \p Chain is returned unchanged.
/// Otherwise a fixed object of \p SlotSize bytes is created at offset
/// FPDiff - SlotSize and the value is stored there. Returns the new chain.
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
                                        SDValue Chain, SDValue RetAddrFrIdx,
                                        EVT PtrVT, unsigned SlotSize,
                                        int FPDiff, const SDLoc &dl) {
  // The return address stays where it is when the frame does not move.
  if (FPDiff == 0)
    return Chain;

  // Create the stack slot that will hold the return address after the move.
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  int RelocatedFI = FrameInfo.CreateFixedObject(
      SlotSize, static_cast<int64_t>(FPDiff) - SlotSize, false);
  SDValue RelocatedSlot = DAG.getFrameIndex(RelocatedFI, PtrVT);

  return DAG.getStore(Chain, dl, RetAddrFrIdx, RelocatedSlot,
                      MachinePointerInfo::getFixedStack(
                          DAG.getMachineFunction(), RelocatedFI));
}
/// Build the vector shuffle node for a movs{s|d} / movd style operation on
/// vectors of type \p VT.
///
/// The mask is <NumElems, 1, 2, ..., NumElems-1>: lane 0 uses mask index
/// NumElems and every other lane keeps its own index.
static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
                       SDValue V2) {
  const unsigned LaneCount = VT.getVectorNumElements();

  SmallVector<int, 8> ShuffleMask;
  // Lane 0 is the only lane that does not map to its own index.
  ShuffleMask.push_back(LaneCount);
  for (unsigned Lane = 1; Lane != LaneCount; ++Lane)
    ShuffleMask.push_back(Lane);

  return DAG.getVectorShuffle(VT, dl, V1, V2, ShuffleMask);
}
SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &dl = CLI.DL;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
CallingConv::ID CallConv = CLI.CallConv;
bool &isTailCall = CLI.IsTailCall;
bool isVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget.is64Bit();
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
bool IsSibcall = false;
bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
CallConv == CallingConv::Tail;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
bool HasNoCfCheck =
(CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
const Module *M = MF.getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
MachineFunction::CallSiteInfo CSInfo;
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
// relocation, which forces early binding of the symbol. This breaks code
// that require lazy function symbol resolution. Using musttail or
// GuaranteedTailCallOpt will override this.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (!G || (!G->getGlobal()->hasLocalLinkage() &&
G->getGlobal()->hasDefaultVisibility()))
isTailCall = false;
}
bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
if (IsMustTail) {
// Force this to be a tail call. The verifier rules are enough to ensure
// that we can lower this successfully without moving the return address
// around.
isTailCall = true;
} else if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
isVarArg, SR != NotStructReturn,
MF.getFunction().hasStructRetAttr(), CLI.RetTy,
Outs, OutVals, Ins, DAG);
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
if (!IsGuaranteeTCO && isTailCall)
IsSibcall = true;
if (isTailCall)
++NumTailCalls;
}
assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64.
if (IsWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeArguments(Outs, CC_X86);
// In vectorcall calling convention a second pass is required for the HVA
// types.
if (CallingConv::X86_VectorCall == CallConv) {
CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
}
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
if (IsSibcall)
// This is a sibcall. The memory operands are available in caller's
// own caller's stack.
NumBytes = 0;
else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
if (isTailCall && !IsSibcall && !IsMustTail) {
// Lower arguments at fp - stackoffset + fpdiff.
unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
FPDiff = NumBytesCallerPushed - NumBytes;
// Set the delta of movement of the returnaddr stackslot.
// But only set if delta is greater than previous delta.
if (FPDiff < X86Info->getTCReturnAddrDelta())
X86Info->setTCReturnAddrDelta(FPDiff);
}
unsigned NumBytesToPush = NumBytes;
unsigned NumBytesToPop = NumBytes;
// If we have an inalloca argument, all stack space has already been allocated
// for us and be right at the top of the stack. We don't support multiple
// arguments passed in memory when using inalloca.
if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
NumBytesToPush = 0;
if (!ArgLocs.back().isMemLoc())
report_fatal_error("cannot use inalloca attribute on a register "
"parameter");
if (ArgLocs.back().getLocMemOffset() != 0)
report_fatal_error("any parameter with the inalloca attribute must be "
"the only memory argument");
}
if (!IsSibcall && !IsMustTail)
Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
NumBytes - NumBytesToPush, dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
if (isTailCall && FPDiff)
Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
Is64Bit, FPDiff, dl);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
// The next loop assumes that the locations are in the same order of the
// input arguments.
assert(isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering");
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handle later.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutIndex) {
assert(OutIndex < Outs.size() && "Invalid Out index");
// Skip inalloca arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
if (Flags.isInAlloca())
continue;
CCValAssign &VA = ArgLocs[I];
EVT RegVT = VA.getLocVT();
SDValue Arg = OutVals[OutIndex];
bool isByVal = Flags.isByVal();
// Promote the value if needed.
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::AExt:
if (Arg.getValueType().isVector() &&
Arg.getValueType().getVectorElementType() == MVT::i1)
Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
else if (RegVT.is128BitVector()) {
// Special case: passing MMX values in XMM registers.
Arg = DAG.getBitcast(MVT::i64, Arg);
Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
} else
Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::BCvt:
Arg = DAG.getBitcast(RegVT, Arg);
break;
case CCValAssign::Indirect: {
if (isByVal) {
// Memcpy the argument to a temporary stack slot to prevent
// the caller from seeing any modifications the callee may make
// as guaranteed by the `byval` attribute.
int FrameIdx = MF.getFrameInfo().CreateStackObject(
Flags.getByValSize(), std::max(16, (int)Flags.getByValAlign()),
false);
SDValue StackSlot =
DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
Chain =
CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
// From now on treat this as a regular pointer
Arg = StackSlot;
isByVal = false;
} else {
// Store the argument.
SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
Chain = DAG.getStore(
Chain, dl, Arg, SpillSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
Arg = SpillSlot;
}
break;
}
}
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// Split v64i1 value into two registers
Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
} else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
const TargetOptions &Options = DAG.getTarget().Options;
if (Options.EnableDebugEntryValues)
CSInfo.emplace_back(VA.getLocReg(), I);
if (isVarArg && IsWin64) {
// Win64 ABI requires argument XMM reg to be copied to the corresponding
// shadow reg if callee is a varargs function.
unsigned ShadowReg = 0;
switch (VA.getLocReg()) {
case X86::XMM0: ShadowReg = X86::RCX; break;
case X86::XMM1: ShadowReg = X86::RDX; break;
case X86::XMM2: ShadowReg = X86::R8; break;
case X86::XMM3: ShadowReg = X86::R9; break;
}
if (ShadowReg)
RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
}
} else if (!IsSibcall && (!isTailCall || isByVal)) {
assert(VA.isMemLoc());
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy(DAG.getDataLayout()));
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
dl, DAG, VA, Flags));
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
if (Subtarget.isPICStyleGOT()) {
// ELF / PIC requires GOT in the EBX register before function calls via PLT
// GOT pointer.
if (!isTailCall) {
RegsToPass.push_back(std::make_pair(
unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()))));
} else {
// If we are tail calling and generating PIC/GOT style code load the
// address of the callee into ECX. The value in ecx is used as target of
// the tail jump. This is done to circumvent the ebx/callee-saved problem
// for tail calls on PIC/GOT architectures. Normally we would just put the
// address of GOT into ebx and then call target@PLT. But for tail calls
// ebx would be restored (since ebx is callee saved) before jumping to the
// target@PLT.
// Note: The actual moving to ECX is done further down.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (G && !G->getGlobal()->hasLocalLinkage() &&
G->getGlobal()->hasDefaultVisibility())
Callee = LowerGlobalAddress(Callee, DAG);
else if (isa<ExternalSymbolSDNode>(Callee))
Callee = LowerExternalSymbol(Callee, DAG);
}
}
if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
// (prototype-less calls or calls to functions containing ellipsis (...) in
// the declaration) %al is used as hidden argument to specify the number
// of SSE registers used. The contents of %al do not need to match exactly
// the number of registers, but must be an ubound on the number of SSE
// registers used and is in the range 0 - 8 inclusive.
// Count the number of XMM registers allocated.
static const MCPhysReg XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget.hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
DAG.getConstant(NumXMMRegs, dl,
MVT::i8)));
}
if (isVarArg && IsMustTail) {
const auto &Forwards = X86Info->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
}
}
// For tail calls lower the arguments to the 'real' stack slots. Sibcalls
// don't need this because the eligibility check rejects calls that require
// shuffling arguments passed in memory.
if (!IsSibcall && isTailCall) {
// Force all the incoming stack arguments to be loaded from the stack
// before any new outgoing arguments are stored to the stack, because the
// outgoing stack slots may alias the incoming argument stack slots, and
// the alias isn't otherwise explicit. This is slightly more conservative
// than necessary, because it means that each store effectively depends
// on every argument instead of just those arguments it would clobber.
SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = ArgLocs[I];
if (VA.isRegLoc()) {
if (VA.needsCustom()) {
assert((CallConv == CallingConv::X86_RegCall) &&
"Expecting custom case only in regcall calling convention");
// This means that we are in special case where one argument was
// passed through two register locations - Skip the next location
++I;
}
continue;
}
assert(VA.isMemLoc());
SDValue Arg = OutVals[OutsIndex];
ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
// Skip inalloca arguments. They don't require any work.
if (Flags.isInAlloca())
continue;
// Create frame index.
int32_t Offset = VA.getLocMemOffset()+FPDiff;
uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
if (Flags.isByVal()) {
// Copy relative to framepointer.
SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy(DAG.getDataLayout()));
Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, Source);
MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
ArgChain,
Flags, DAG, dl));
} else {
// Store relative to framepointer.
MemOpChains2.push_back(DAG.getStore(
ArgChain, dl, Arg, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
}
}
if (!MemOpChains2.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
getPointerTy(DAG.getDataLayout()),
RegInfo->getSlotSize(), FPDiff, dl);
}
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into registers.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
}
if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
// In the 64-bit large code model, we have to make all calls
// through a register, since the call instruction's 32-bit
// pc-relative offset may not be large enough to hold the whole
// address.
} else if (Callee->getOpcode() == ISD::GlobalAddress ||
Callee->getOpcode() == ISD::ExternalSymbol) {
// Lower direct calls to global addresses and external symbols. Setting
// ForCall to true here has the effect of removing WrapperRIP when possible
// to allow direct calls to be selected without first materializing the
// address into a register.
Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
} else if (Subtarget.isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
}
// Returns a chain & a flag for retval copy to use.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SmallVector<SDValue, 8> Ops;
if (!IsSibcall && isTailCall && !IsMustTail) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
InFlag = Chain.getValue(1);
}
Ops.push_back(Chain);
Ops.push_back(Callee);
if (isTailCall)
Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
// If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
// set X86_INTR calling convention because it has the same CSR mask
// (same preserved registers).
const uint32_t *Mask = RegInfo->getCallPreservedMask(
MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
// If this is an invoke in a 32-bit function using a funclet-based
// personality, assume the function clobbers all registers. If an exception
// is thrown, the runtime will not restore CSRs.
// FIXME: Model this more precisely so that we can register allocate across
// the normal edge and spill and fill across the exceptional edge.
if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
const Function &CallerFn = MF.getFunction();
EHPersonality Pers =
CallerFn.hasPersonalityFn()
? classifyEHPersonality(CallerFn.getPersonalityFn())
: EHPersonality::Unknown;
if (isFuncletEHPersonality(Pers))
Mask = RegInfo->getNoPreservedMask();
}
// Define a new register mask from the existing mask.
uint32_t *RegMask = nullptr;
// In some calling conventions we need to remove the used physical registers
// from the reg mask.
if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Allocate a new Reg Mask and copy Mask.
RegMask = MF.allocateRegMask();
unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
// Make sure all sub registers of the argument registers are reset
// in the RegMask.
for (auto const &RegPair : RegsToPass)
for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
// Create the RegMask Operand according to our updated mask.
Ops.push_back(DAG.getRegisterMask(RegMask));
} else {
// Create the RegMask Operand according to the static mask.
Ops.push_back(DAG.getRegisterMask(Mask));
}
if (InFlag.getNode())
Ops.push_back(InFlag);
if (isTailCall) {
// We used to do:
//// If this is the first return lowered for this function, add the regs
//// to the liveout set for the function.
// This isn't right, although it's probably harmless on x86; liveouts
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
MF.getFrameInfo().setHasTailCall();
SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
return Ret;
}
if (HasNoCfCheck && IsCFProtectionSupported) {
Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
} else {
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
}
InFlag = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
// Save heapallocsite metadata.
if (CLI.CS)
if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite"))
DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPop;
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
DAG.getTarget().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPop = NumBytes; // Callee pops everything
else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget.getTargetTriple().isOSMSVCRT() &&
SR == StackStructReturn)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
// For MSVC Win32 targets, the caller pops the hidden struct pointer.
NumBytesForCalleeToPop = 4;
else
NumBytesForCalleeToPop = 0; // Callee pops nothing.
if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
// No need to reset the stack after the call if the call doesn't return. To
// make the MI verify, we'll pretend the callee does it for us.
NumBytesForCalleeToPop = NumBytes;
}
// Returns a flag for retval copy to use.
if (!IsSibcall) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
true),
InFlag, dl);
InFlag = Chain.getValue(1);
}
// Handle result values, copying them out of physregs into vregs that we
// return.
return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
InVals, RegMask);
}
//===----------------------------------------------------------------------===//
// Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//
// Like the stdcall convention (the callee cleans up the arguments), except
// that ECX is reserved for storing the tail-called function address. Only 2
// registers are free for argument passing (inreg). Tail call optimization is
// performed provided:
// * tailcallopt is enabled
// * caller/callee are fastcc
// On X86_64 architecture with GOT-style position independent code only local
// (within module) calls are supported at the moment.
// To keep the stack aligned according to platform abi the function
// GetAlignedArgumentStackSize ensures that argument delta is always multiples
// of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
// If a tail called function callee has more arguments than the caller the
// caller needs to make sure that there is room to move the RETADDR to. This is
// achieved by reserving an area the size of the argument delta right after the
// original RETADDR, but before the saved framepointer or the spilled registers
// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
// stack layout:
// arg1
// arg2
// RETADDR
// [ new RETADDR
// move area ]
// (possible EBP)
// ESI
// EDI
// local1 ..
/// Pad \p StackSize so that the argument area plus one additional stack slot
/// is a multiple of the stack alignment, e.g. the result has the form
/// 16n + 12 for a 16-byte alignment requirement and a 4-byte slot.
///
/// \p StackSize must already be a multiple of the slot size. \p DAG is unused.
unsigned X86TargetLowering::GetAlignedArgumentStackSize(
    const unsigned StackSize, SelectionDAG &DAG) const {
  const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
  assert(StackSize % SlotSize == 0 &&
         "StackSize must be a multiple of SlotSize");

  const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment());

  // Align the size including the extra slot, then take the slot back out.
  const uint64_t SizeWithSlot = StackSize + SlotSize;
  return alignTo(SizeWithSlot, StackAlignment) - SlotSize;
}
/// Return true if the given stack call argument is already available in the
/// same position (relatively) of the caller's incoming argument stack.
///
/// \p Arg is the outgoing value, \p Offset the stack offset it is compared
/// against and \p Flags its argument flags. The value is traced back to a
/// frame index (through a virtual-register definition, a load, or a byval
/// frame-index node). The result is true only when that frame index is a fixed
/// object located at \p Offset whose size equals the argument size, subject to
/// the immutability and extension-flag checks below.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
const X86InstrInfo *TII, const CCValAssign &VA) {
// Size of the argument value in bytes; replaced by the byval size for byval
// arguments further down.
unsigned Bytes = Arg.getValueSizeInBits() / 8;
for (;;) {
// Look through nodes that don't alter the bits of the incoming value.
unsigned Op = Arg.getOpcode();
if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
Arg = Arg.getOperand(0);
continue;
}
// Also look through a truncate of an AssertZext whose asserted type is the
// truncated type.
if (Op == ISD::TRUNCATE) {
const SDValue &TruncInput = Arg.getOperand(0);
if (TruncInput.getOpcode() == ISD::AssertZext &&
cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
Arg.getValueType()) {
Arg = TruncInput.getOperand(0);
continue;
}
}
break;
}
// INT_MAX is a sentinel meaning "no frame index found yet"; every path that
// reaches the assert below must have replaced it.
int FI = INT_MAX;
if (Arg.getOpcode() == ISD::CopyFromReg) {
// The value is copied from a register: only a virtual register with a known
// defining instruction can be matched.
unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
if (!Register::isVirtualRegister(VR))
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
if (!Def)
return false;
if (!Flags.isByVal()) {
// Non-byval: the definition must be a load from a stack slot; FI is passed
// by reference so the callee can report the slot.
if (!TII->isLoadFromStackSlot(*Def, FI))
return false;
} else {
// Byval: the definition must be an LEA whose operand 1 is a frame index.
unsigned Opcode = Def->getOpcode();
if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
Opcode == X86::LEA64_32r) &&
Def->getOperand(1).isFI()) {
FI = Def->getOperand(1).getIndex();
Bytes = Flags.getByValSize();
} else
return false;
}
} else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
if (Flags.isByVal())
// ByVal argument is passed in as a pointer but it's now being
// dereferenced. e.g.
// define @foo(%struct.X* %A) {
// tail call @bar(%struct.X* byval %A)
// }
return false;
// A plain load matches only when its base pointer is directly a frame index.
SDValue Ptr = Ld->getBasePtr();
FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
if (!FINode)
return false;
FI = FINode->getIndex();
} else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
// A byval argument given directly as a frame index.
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
FI = FINode->getIndex();
Bytes = Flags.getByValSize();
} else
return false;
assert(FI != INT_MAX);
// Only a fixed stack object can match.
if (!MFI.isFixedObjectIndex(FI))
return false;
// The object must be located exactly at the requested offset.
if (Offset != MFI.getObjectOffset(FI))
return false;
// If this is not byval, check that the argument stack object is immutable.
// inalloca and argument copy elision can create mutable argument stack
// objects. Byval objects can be mutated, but a byval call intends to pass the
// mutated memory.
if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
return false;
if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
// If the argument location is wider than the argument type, check that any
// extension flags match.
if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
Flags.isSExt() != MFI.isObjectSExt(FI)) {
return false;
}
}
// Finally, the stack object must be exactly as large as the argument.
return Bytes == MFI.getObjectSize(FI);
}
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
///
/// The checks run in this order: the callee convention must allow tail calls;
/// an x86_fp80 caller return type requires an x86_fp80 callee return type;
/// caller and callee must agree on Win64-ness. Under guaranteed TCO the answer
/// then depends only on the calling conventions. Otherwise the sibcall checks
/// follow: no dynamic stack realignment, no struct return, vararg arguments
/// all in registers, no unused x87 results, compatible result locations,
/// compatible preserved-register masks, stack arguments already in place, a
/// free register for an indirect 32-bit callee, and matching callee-pop
/// behaviour.
bool X86TargetLowering::IsEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
if (!mayTailCallThisCC(CalleeCC))
return false;
MachineFunction &MF = DAG.getMachineFunction();
const Function &CallerF = MF.getFunction();
// If the function return type is x86_fp80 and the callee return type is not,
// then the FP_EXTEND of the call result is not a nop. It's not safe to
// perform a tailcall optimization here.
if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
return false;
CallingConv::ID CallerCC = CallerF.getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
CalleeCC == CallingConv::Tail;
// Win64 functions have extra shadow space for argument homing. Don't do the
// sibcall if the caller and callee have mismatched expectations for this
// space.
if (IsCalleeWin64 != IsCallerWin64)
return false;
// With guaranteed TCO (the GuaranteedTailCallOpt option or the 'tail'
// calling convention) the call is eligible exactly when the callee's
// convention can guarantee TCO and matches the caller's convention; none of
// the sibcall checks below apply.
if (IsGuaranteeTCO) {
if (canGuaranteeTCO(CalleeCC) && CCMatch)
return true;
return false;
}
// Look for obvious safe cases to perform tail call optimization that do not
// require ABI changes. This is what gcc calls sibcall.
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
if (RegInfo->needsStackRealignment(MF))
return false;
// Also avoid sibcall optimization if either caller or callee uses struct
// return semantics.
if (isCalleeStructRet || isCallerStructRet)
return false;
// Do not sibcall optimize vararg calls unless all arguments are passed via
// registers.
LLVMContext &C = *DAG.getContext();
if (isVarArg && !Outs.empty()) {
// Optimizing for varargs on Win64 is unlikely to be safe without
// additional testing.
if (IsCalleeWin64 || IsCallerWin64)
return false;
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
if (!ArgLocs[i].isRegLoc())
return false;
}
// If the call result is in ST0 / ST1, it needs to be popped off the x87
// stack. Therefore, if any result of the call is unused it is not safe to
// optimize this into a sibcall.
bool Unused = false;
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
if (!Ins[i].Used) {
Unused = true;
break;
}
}
if (Unused) {
// Reject the sibcall if any result location is FP0 or FP1.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
return false;
}
}
// Check that the call results are passed in the same way.
if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
RetCC_X86, RetCC_X86))
return false;
// The callee has to preserve all registers the caller needs to preserve.
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (!CCMatch) {
const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
}
// Bytes of stack used by the outgoing arguments; stays 0 when the callee
// takes no arguments.
unsigned StackArgsSize = 0;
// If the callee takes no arguments there is nothing to check about argument
// placement; skip straight to the callee-pop check.
if (!Outs.empty()) {
// Assign locations to the outgoing arguments and record how much stack
// they need.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
// Allocate shadow area for Win64
if (IsCalleeWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
StackArgsSize = CCInfo.getNextStackOffset();
if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right way as
// the caller's fixed stack objects.
MachineFrameInfo &MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
// Arguments passed indirectly are never accepted here.
if (VA.getLocInfo() == CCValAssign::Indirect)
return false;
if (!VA.isRegLoc()) {
if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
MFI, MRI, TII, VA))
return false;
}
}
}
bool PositionIndependent = isPositionIndependent();
// If the tailcall address may be in a register, then make sure it's
// possible to register allocate for it. In 32-bit, the call address can
// only target EAX, EDX, or ECX since the tail call must be scheduled after
// callee-saved registers are restored. These happen to be the same
// registers used to pass 'inreg' arguments so watch out for those.
if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
!isa<ExternalSymbolSDNode>(Callee)) ||
PositionIndependent)) {
unsigned NumInRegs = 0;
// In PIC we need an extra register to formulate the address computation
// for the callee.
unsigned MaxInRegs = PositionIndependent ? 2 : 3;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (!VA.isRegLoc())
continue;
Register Reg = VA.getLocReg();
switch (Reg) {
default: break;
case X86::EAX: case X86::EDX: case X86::ECX:
// Give up as soon as the arguments occupy MaxInRegs of these registers.
if (++NumInRegs == MaxInRegs)
return false;
break;
}
}
}
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
return false;
}
bool CalleeWillPop =
X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt);
if (unsigned BytesToPop =
MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
// If we have bytes to pop, the callee must pop them.
bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
if (!CalleePopMatches)
return false;
} else if (CalleeWillPop && StackArgsSize > 0) {
// If we don't have bytes to pop, make sure the callee doesn't pop any.
return false;
}
return true;
}
/// Create a FastISel instance for X86 by forwarding both arguments to
/// X86::createFastISel.
FastISel *X86TargetLowering::createFastISel(
    FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const {
  FastISel *Selector = X86::createFastISel(funcInfo, libInfo);
  return Selector;
}
//===----------------------------------------------------------------------===//
// Other Lowering Hooks
//===----------------------------------------------------------------------===//
/// Return true if \p Op has a single use and its node is a normal load.
static bool MayFoldLoad(SDValue Op) {
  if (!Op.hasOneUse())
    return false;
  return ISD::isNormalLoad(Op.getNode());
}
/// Return true if \p Op has a single use and that user is a normal store.
static bool MayFoldIntoStore(SDValue Op) {
  if (!Op.hasOneUse())
    return false;
  const auto FirstUse = Op.getNode()->use_begin();
  return ISD::isNormalStore(*FirstUse);
}
/// Return true if \p Op has a single use and that user is an
/// ISD::ZERO_EXTEND node.
static bool MayFoldIntoZeroExtend(SDValue Op) {
  if (!Op.hasOneUse())
    return false;
  const unsigned UserOpcode = Op.getNode()->use_begin()->getOpcode();
  return UserOpcode == ISD::ZERO_EXTEND;
}
/// Return true if \p Opcode is one of the X86ISD opcodes that this file
/// treats as a target shuffle (the list below); false for anything else.
static bool isTargetShuffle(unsigned Opcode) {
  switch (Opcode) {
  case X86ISD::BLENDI:     case X86ISD::PSHUFB:     case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:    case X86ISD::PSHUFLW:    case X86ISD::SHUFP:
  case X86ISD::INSERTPS:   case X86ISD::EXTRQI:     case X86ISD::INSERTQI:
  case X86ISD::PALIGNR:    case X86ISD::VSHLDQ:     case X86ISD::VSRLDQ:
  case X86ISD::MOVLHPS:    case X86ISD::MOVHLPS:    case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:   case X86ISD::MOVDDUP:    case X86ISD::MOVSS:
  case X86ISD::MOVSD:      case X86ISD::UNPCKL:     case X86ISD::UNPCKH:
  case X86ISD::VBROADCAST: case X86ISD::VPERMILPI:  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128: case X86ISD::SHUF128:    case X86ISD::VPERMIL2:
  case X86ISD::VPERMI:     case X86ISD::VPPERM:     case X86ISD::VPERMV:
  case X86ISD::VPERMV3:    case X86ISD::VZEXT_MOVL:
    return true;
  default:
    return false;
  }
}
/// Return true if \p Opcode is one of the variable-mask shuffle opcodes listed
/// below: a handful of X86ISD target shuffles plus the 'faux' shuffles
/// OR, AND and ANDNP.
static bool isTargetShuffleVariableMask(unsigned Opcode) {
  switch (Opcode) {
  // Target Shuffles.
  case X86ISD::PSHUFB:   case X86ISD::VPERMILPV: case X86ISD::VPERMIL2:
  case X86ISD::VPPERM:   case X86ISD::VPERMV:    case X86ISD::VPERMV3:
  // 'Faux' Target Shuffles.
  case ISD::OR:          case ISD::AND:          case X86ISD::ANDNP:
    return true;
  default:
    return false;
  }
}
/// Return a frame-index node for the return address slot.
///
/// The index is cached in X86MachineFunctionInfo. While the cached index is
/// still 0, a fixed object of one slot is created at offset -SlotSize and
/// recorded before the node is built.
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  int RAIndex = FuncInfo->getRAIndex();
  if (RAIndex == 0) {
    // No frame object recorded yet: set one up for the return address.
    const unsigned SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
    const int64_t SlotOffset = -(int64_t)SlotSize;
    RAIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, SlotOffset, false);
    FuncInfo->setRAIndex(RAIndex);
  }

  return DAG.getFrameIndex(RAIndex, getPointerTy(DAG.getDataLayout()));
}
/// Return true if \p Offset can be used as a displacement under code model
/// \p M.
///
/// The offset must always fit in a signed 32-bit immediate. Without a
/// symbolic displacement that is the only requirement. With one, only the
/// small and kernel code models are accepted, each with its own range limit.
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                       bool hasSymbolicDisplacement) {
  // Offset should fit into 32 bit immediate field.
  if (!isInt<32>(Offset))
    return false;

  // Without a symbolic displacement there are no further restrictions.
  if (!hasSymbolicDisplacement)
    return true;

  switch (M) {
  case CodeModel::Small:
    // For the small code model we assume that the last object ends 16MB
    // before the 31-bit boundary. Pretty large negative constants are also
    // fine, knowing that all objects are in the positive half of the address
    // space.
    return Offset < 16 * 1024 * 1024;
  case CodeModel::Kernel:
    // For the kernel code model all objects reside in the negative half of
    // the 32-bit address space. Negative offsets are rejected, since they may
    // be just off; pretty large positive ones are accepted.
    return Offset >= 0;
  default:
    // FIXME: Some tweaks might be needed for medium code model.
    return false;
  }
}
/// Decide whether the callee is required to pop its own arguments.
/// Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv,
                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
  // When TCO must be guaranteed, non-vararg calls are forced to be callee pop
  // so that the guarantee can be honoured.
  const bool ForcedForTCO =
      !IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO);
  if (ForcedForTCO)
    return true;

  // None of the remaining conventions is callee pop in 64-bit mode.
  if (is64Bit)
    return false;

  switch (CallingConv) {
  case CallingConv::X86_StdCall:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_VectorCall:
    return true;
  default:
    return false;
  }
}
/// Return true if the condition is a signed comparison operation
/// (G, GE, L, LE) and false for the equality and unsigned conditions
/// (E, NE, B, A, BE, AE). Any other condition code is unreachable.
static bool isX86CCSigned(unsigned X86CC) {
  switch (X86CC) {
  case X86::COND_G:  case X86::COND_GE:
  case X86::COND_L:  case X86::COND_LE:
    return true;
  case X86::COND_E:  case X86::COND_NE:
  case X86::COND_B:  case X86::COND_A:
  case X86::COND_BE: case X86::COND_AE:
    return false;
  default:
    llvm_unreachable("Invalid integer condition!");
  }
}
/// Map an integer ISD::CondCode onto the matching X86 condition code. Any
/// condition code without an entry below is unreachable.
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
  switch (SetCCOpcode) {
  // Equality.
  case ISD::SETEQ:
    return X86::COND_E;
  case ISD::SETNE:
    return X86::COND_NE;
  // Signed orderings.
  case ISD::SETGT:
    return X86::COND_G;
  case ISD::SETGE:
    return X86::COND_GE;
  case ISD::SETLT:
    return X86::COND_L;
  case ISD::SETLE:
    return X86::COND_LE;
  // Unsigned orderings.
  case ISD::SETUGT:
    return X86::COND_A;
  case ISD::SETUGE:
    return X86::COND_AE;
  case ISD::SETULT:
    return X86::COND_B;
  case ISD::SETULE:
    return X86::COND_BE;
  default:
    llvm_unreachable("Invalid integer condition!");
  }
}
/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
///
/// For integer comparisons (\p isFP false) a few compares against the
/// constants -1, 0 and 1 are turned into sign-flag or compare-with-zero
/// tests, which may replace \p RHS with zero. For floating-point comparisons
/// \p LHS and \p RHS may be swapped in place. X86::COND_INVALID is returned
/// for SETOEQ and SETUNE, which have no single-condition mapping here.
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
bool isFP, SDValue &LHS, SDValue &RHS,
SelectionDAG &DAG) {
if (!isFP) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
// X > -1 -> compare X with 0, jump on !sign.
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_NS;
}
if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
// X < 0 -> compare X with 0 (RHS is already zero), jump on sign.
return X86::COND_S;
}
if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
// X >= 0 -> compare X with 0 (RHS is already zero), jump on !sign.
return X86::COND_NS;
}
if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_LE;
}
}
// No special case applied: use the plain integer mapping.
return TranslateIntegerX86CC(SetCCOpcode);
}
// First determine if it is required or is profitable to flip the operands.
// If LHS is a foldable load, but RHS is not, flip the condition.
if (ISD::isNON_EXTLoad(LHS.getNode()) &&
!ISD::isNON_EXTLoad(RHS.getNode())) {
SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
std::swap(LHS, RHS);
}
// These four conditions are handled by swapping the operands only;
// SetCCOpcode itself is left unchanged, which is why the mapping below lists
// them (marked "flipped") next to the opposite ordering.
switch (SetCCOpcode) {
default: break;
case ISD::SETOLT:
case ISD::SETOLE:
case ISD::SETUGT:
case ISD::SETUGE:
std::swap(LHS, RHS);
break;
}
// On a floating point condition, the flags are set as follows:
// ZF PF CF op
// 0 | 0 | 0 | X > Y
// 0 | 0 | 1 | X < Y
// 1 | 0 | 0 | X == Y
// 1 | 1 | 1 | unordered
switch (SetCCOpcode) {
default: llvm_unreachable("Condcode should be pre-legalized away");
case ISD::SETUEQ:
case ISD::SETEQ: return X86::COND_E;
case ISD::SETOLT: // flipped
case ISD::SETOGT:
case ISD::SETGT: return X86::COND_A;
case ISD::SETOLE: // flipped
case ISD::SETOGE:
case ISD::SETGE: return X86::COND_AE;
case ISD::SETUGT: // flipped
case ISD::SETULT:
case ISD::SETLT: return X86::COND_B;
case ISD::SETUGE: // flipped
case ISD::SETULE:
case ISD::SETLE: return X86::COND_BE;
case ISD::SETONE:
case ISD::SETNE: return X86::COND_NE;
case ISD::SETUO: return X86::COND_P;
case ISD::SETO: return X86::COND_NP;
// No single X86 condition code is returned for these two.
case ISD::SETOEQ:
case ISD::SETUNE: return X86::COND_INVALID;
}
}
/// Is there a floating point cmov for the specific X86 condition code?
/// Current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  case X86::COND_B:  case X86::COND_BE:
  case X86::COND_A:  case X86::COND_AE:
  case X86::COND_E:  case X86::COND_NE:
  case X86::COND_P:  case X86::COND_NP:
    return true;
  default:
    return false;
  }
}
/// Describe the memory access performed by a chained X86 intrinsic.
///
/// Looks \p Intrinsic up with getIntrinsicWithChain. For the truncate-to-memory,
/// gather and scatter intrinsic types it fills \p Info (opcode, pointer value,
/// memory VT, alignment and load/store flag) and returns true. Returns false
/// when the intrinsic is not found or has any other type; in the latter case
/// Info.flags and Info.offset have already been reset.
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  const IntrinsicData *Data = getIntrinsicWithChain(Intrinsic);
  if (Data == nullptr)
    return false;

  Info.flags = MachineMemOperand::MONone;
  Info.offset = 0;

  switch (Data->Type) {
  default:
    return false;

  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.ptrVal = I.getArgOperand(0);
    MVT SrcVT = MVT::getVT(I.getArgOperand(1)->getType());

    // The element type written to memory is given by the intrinsic type.
    MVT EltVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
    switch (Data->Type) {
    case TRUNCATE_TO_MEM_VI8:
      EltVT = MVT::i8;
      break;
    case TRUNCATE_TO_MEM_VI16:
      EltVT = MVT::i16;
      break;
    case TRUNCATE_TO_MEM_VI32:
      EltVT = MVT::i32;
      break;
    default:
      break;
    }

    Info.memVT = MVT::getVectorVT(EltVT, SrcVT.getVectorNumElements());
    Info.align = Align::None();
    Info.flags |= MachineMemOperand::MOStore;
    break;
  }

  case GATHER:
  case GATHER_AVX2: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    // The data type is the call's result type; the index vector is operand 2.
    MVT ResultVT = MVT::getVT(I.getType());
    MVT IdxVT = MVT::getVT(I.getArgOperand(2)->getType());
    // Use the smaller of the data and index element counts.
    unsigned EltCount = std::min(ResultVT.getVectorNumElements(),
                                 IdxVT.getVectorNumElements());
    Info.memVT = MVT::getVectorVT(ResultVT.getVectorElementType(), EltCount);
    Info.align = Align::None();
    Info.flags |= MachineMemOperand::MOLoad;
    break;
  }

  case SCATTER: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.ptrVal = nullptr;
    // The stored data is operand 3; the index vector is operand 2.
    MVT StoredVT = MVT::getVT(I.getArgOperand(3)->getType());
    MVT IdxVT = MVT::getVT(I.getArgOperand(2)->getType());
    // Use the smaller of the data and index element counts.
    unsigned EltCount = std::min(StoredVT.getVectorNumElements(),
                                 IdxVT.getVectorNumElements());
    Info.memVT = MVT::getVectorVT(StoredVT.getVectorElementType(), EltCount);
    Info.align = Align::None();
    Info.flags |= MachineMemOperand::MOStore;
    break;
  }
  }

  return true;
}
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
return true;
}
return false;
}
/// Decide whether narrowing \p Load is worthwhile.
///
/// Returns false for a RIP-relative load of a global address carrying the
/// MO_GOTTPOFF flag, and for a multi-use 256/512-bit vector load whose value
/// uses are all single-use EXTRACT_SUBVECTOR nodes feeding a store. Returns
/// true otherwise. \p ExtTy and \p NewVT are not consulted.
bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                              ISD::LoadExtType ExtTy,
                                              EVT NewVT) const {
  auto *LoadNode = cast<LoadSDNode>(Load);
  assert(LoadNode->isSimple() && "illegal to narrow");

  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
  // relocation target a movq or addq instruction: don't let the load shrink.
  // Any other wrapped global address is answered here as well, without
  // reaching the vector heuristic below.
  SDValue BasePtr = LoadNode->getBasePtr();
  if (BasePtr.getOpcode() == X86ISD::WrapperRIP) {
    const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0));
    if (GA)
      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
  }

  // The heuristic below only concerns (1) AVX-sized vector loads with
  // (2) multiple uses.
  EVT VT = Load->getValueType(0);
  const bool IsWideVector = VT.is256BitVector() || VT.is512BitVector();
  if (!IsWideVector || Load->hasOneUse())
    return true;

  // (3) If every value use is extracted directly into a store, the
  // extract + store can be store-folded, so splitting the load is probably not
  // worth it.
  for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
    // Skip uses of the chain value. Result 0 of the node is the load value.
    if (UI.getUse().getResNo() != 0)
      continue;

    const bool IsExtractIntoStore =
        UI->getOpcode() == ISD::EXTRACT_SUBVECTOR && UI->hasOneUse() &&
        UI->use_begin()->getOpcode() == ISD::STORE;
    // Any other kind of use makes splitting probably worthwhile.
    if (!IsExtractIntoStore)
      return true;
  }

  // All non-chain uses are extract + store.
  return false;
}
/// Returns true if it is beneficial to convert a load of a constant to just
/// the constant itself. This holds for integer types whose primitive size is
/// between 1 and 64 bits inclusive; \p Imm is not consulted.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  return BitSize != 0 && BitSize <= 64;
}
/// Return false only when the select's compare operand type \p CmpOpVT is a
/// floating-point type other than f128 and the subtarget is 64-bit LP64 with
/// AVX; return true in every other case.
bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
  // A compare that is not floating-point, or that is f128, never blocks the
  // transform.
  if (!CmpOpVT.isFloatingPoint() || CmpOpVT == MVT::f128)
    return true;

  // If we are using XMM registers in the ABI and the condition of the select
  // is a floating-point compare and we have blendv or conditional move, then
  // it is cheaper to select instead of doing a cross-register move and
  // creating a load that depends on the compare result.
  return !(Subtarget.isTarget64BitLP64() && Subtarget.hasAVX());
}
/// Return true unless \p VT is a vector type and the subtarget has AVX512.
bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
  // TODO: It might be a win to ease or lift this restriction, but the generic
  // folds in DAGCombiner conflict with vector folds for an AVX512 target.
  const bool IsAVX512Vector = VT.isVector() && Subtarget.hasAVX512();
  return !IsAVX512Vector;
}
/// Decide whether a multiply by the constant splat vector \p C should be
/// decomposed into shift plus add/sub.
///
/// Returns false if \p C is not a constant splat vector, or if ISD::MUL is
/// legal for the type \p VT is eventually legalized to. Otherwise returns true
/// exactly when the splat value is one away from a power of two, or from a
/// negated power of two.
bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
                                               SDValue C) const {
  // TODO: We handle scalars using custom code, but generic combining could make
  // that unnecessary.
  APInt MulC;
  if (!ISD::isConstantSplatVector(C.getNode(), MulC))
    return false;

  // Find the type this will be legalized to. Otherwise we might prematurely
  // convert this to shl+add/sub and then still have to type legalize those ops.
  // Another choice would be to defer the decision for illegal types until
  // after type legalization. But constant splat vectors of i64 can't make it
  // through type legalization on 32-bit targets so we would need to special
  // case vXi64.
  for (; getTypeAction(Context, VT) != TypeLegal;
       VT = getTypeToTransformTo(Context, VT)) {
  }

  // If vector multiply is legal, assume that's faster than shl + add/sub.
  // TODO: Multiply is a complex op with higher latency and lower throughput in
  // most implementations, so this check could be loosened based on type
  // and/or a CPU attribute.
  if (isOperationLegal(ISD::MUL, VT))
    return false;

  // shl+add, shl+sub, shl+add+neg
  if ((MulC + 1).isPowerOf2()) // MulC + 1 is a power of two.
    return true;
  if ((MulC - 1).isPowerOf2()) // MulC - 1 is a power of two.
    return true;
  if ((1 - MulC).isPowerOf2()) // 1 - MulC is a power of two.
    return true;
  return (-(MulC + 1)).isPowerOf2(); // -(MulC + 1) is a power of two.
}
/// Report whether extracting a \p ResVT subvector from \p SrcVT starting at
/// element \p Index is cheap.
///
/// \returns false when EXTRACT_SUBVECTOR is neither legal nor custom for
/// \p ResVT. For i1 element types the result is true at index 0 or when the
/// size/index test below holds; for all other element types it is true when
/// \p Index is a multiple of the number of result elements.
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  // Non-mask vectors: only extracts aligned to the result width are cheap.
  if (ResVT.getVectorElementType() != MVT::i1)
    return (Index % ResVT.getVectorNumElements()) == 0;

  // Mask vectors support all subregister combinations and operations that
  // extract half of vector.
  if (Index == 0)
    return true;

  // NOTE(review): this compares ResVT against *twice* the size of SrcVT, which
  // looks inverted for a "result is half of the source" check - confirm the
  // intent before changing it, since it affects codegen heuristics.
  return (ResVT.getSizeInBits() == SrcVT.getSizeInBits() * 2) &&
         (Index == ResVT.getVectorNumElements());
}
/// Decide whether the vector binary operation \p VecOp should be converted
/// to a scalar operation.
///
/// \returns false for target-specific opcodes; true when the vector form is
/// not legal/custom/promoted; otherwise whether the scalar form of the same
/// opcode is legal/custom/promoted.
bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
  const unsigned Opcode = VecOp.getOpcode();

  // Assume target opcodes can't be scalarized.
  // TODO - do we have any exceptions?
  if (Opcode >= ISD::BUILTIN_OP_END)
    return false;

  // An unsupported vector op is always worth scalarizing. A supported vector
  // op is only scalarized when the scalar op is supported too, otherwise the
  // transform may not be worthwhile.
  const EVT VT = VecOp.getValueType();
  return !isOperationLegalOrCustomOrPromote(Opcode, VT) ||
         isOperationLegalOrCustomOrPromote(Opcode, VT.getScalarType());
}
/// Decide whether an overflow-reporting node with opcode \p Opcode should be
/// formed for type \p VT.
///
/// \returns false for vectors; for scalars, true when \p VT is a simple type
/// or the operation is not marked Expand for \p VT.
bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const {
  // TODO: Allow vectors?
  return !VT.isVector() && (VT.isSimple() || !isOperationExpand(Opcode, VT));
}
/// Report whether count-trailing-zeros is cheap enough to speculate.
///
/// \returns true exactly when the subtarget has BMI.
bool X86TargetLowering::isCheapToSpeculateCttz() const {
  // Speculation is only worthwhile when TZCNT can be used directly.
  const bool CanUseTZCNT = Subtarget.hasBMI();
  return CanUseTZCNT;
}
/// Report whether count-leading-zeros is cheap enough to speculate.
///
/// \returns true exactly when the subtarget has LZCNT.
bool X86TargetLowering::isCheapToSpeculateCtlz() const {
  // Speculation is only worthwhile when LZCNT can be used directly.
  const bool CanUseLZCNT = Subtarget.hasLZCNT();
  return CanUseLZCNT;
}
/// Decide whether folding a bitcast into a load (loading \p BitcastVT instead
/// of \p LoadVT) is beneficial.
///
/// \returns false for a scalar load bitcast to a vXi1 type without AVX512 and
/// for i8 -> v8i1 without DQI; true when both types are legal vector types;
/// otherwise the answer of the generic TargetLowering implementation.
bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
                                                const SelectionDAG &DAG,
                                                const MachineMemOperand &MMO) const {
  const bool LoadIsVector = LoadVT.isVector();
  const bool CastIsVector = BitcastVT.isVector();

  // Scalar -> mask-vector bitcasts are rejected without AVX512.
  if (!Subtarget.hasAVX512() && !LoadIsVector && CastIsVector &&
      BitcastVT.getVectorElementType() == MVT::i1)
    return false;

  // i8 -> v8i1 is rejected without DQI.
  if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
    return false;

  // If both types are legal vectors, it's always ok to convert them.
  if (LoadIsVector && CastIsVector && isTypeLegal(LoadVT) &&
      isTypeLegal(BitcastVT))
    return true;

  // Everything else is left to the target-independent heuristic.
  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}
/// Decide whether stores may be merged into a single store of type \p MemVT.
///
/// \param AddressSpace Address space of the store (not inspected here).
/// \param MemVT        Candidate type of the merged store.
/// \param DAG          Used to reach the function's attributes.
/// \returns With the NoImplicitFloat attribute: true when \p MemVT is at most
/// 64 bits (64-bit targets) or 32 bits (otherwise). Without it: true when
/// \p MemVT does not exceed the subtarget's preferred vector width.
bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
                                         const SelectionDAG &DAG) const {
  const auto &F = DAG.getMachineFunction().getFunction();

  // When implicit float usage is forbidden, do not merge beyond the widest
  // integer register size.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat)) {
    const unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
    return MemVT.getSizeInBits() <= MaxIntSize;
  }

  // Make sure we don't merge greater than our preferred vector width.
  return !(MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth());
}
/// Report whether count-leading-zeros is fast on this subtarget.
///
/// \returns the subtarget's fast-LZCNT setting.
bool X86TargetLowering::isCtlzFast() const {
  const bool FastLZCNT = Subtarget.hasFastLZCNT();
  return FastLZCNT;
}
/// Report whether folding a mask-and-compare-with-zero pattern rooted at
/// \p AndI is beneficial.
///
/// \param AndI The 'and' instruction (not inspected by this implementation).
/// \returns always true.
bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  // The fold is unconditionally considered beneficial here.
  const bool Beneficial = true;
  return Beneficial;
}
/// Report whether an and-not can be used to compare against \p Y.
///
/// \returns true only for a non-constant scalar i32 or i64 value when the
/// subtarget has BMI.
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
  EVT VT = Y.getValueType();

  // Vectors are never handled here, and the scalar form requires BMI.
  if (VT.isVector() || !Subtarget.hasBMI())
    return false;

  // There are only 32-bit and 64-bit forms for 'andn'.
  const bool HasAndnForm = VT == MVT::i32 || VT == MVT::i64;

  // Constant operands are rejected.
  return HasAndnForm && !isa<ConstantSDNode>(Y);
}
/// Report whether the target has an and-not operation for the type of \p Y.
///
/// \returns For scalars, the result of hasAndNotCompare(). For vectors, false
/// without SSE1 or for types narrower than 128 bits; true for v4i32;
/// otherwise whether SSE2 is available.
bool X86TargetLowering::hasAndNot(SDValue Y) const {
  EVT VT = Y.getValueType();

  if (VT.isVector()) {
    // Vector and-not needs at least SSE1 and a full 128-bit vector.
    if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
      return false;

    // v4i32 is accepted with SSE1 alone; every other type requires SSE2.
    return VT == MVT::v4i32 || Subtarget.hasSSE2();
  }

  // Scalars share the rules of the compare variant.
  return hasAndNotCompare(Y);
}
/// Report whether a bit-test operation is available for \p X.
///
/// \param X The value whose bit is tested.
/// \param Y The bit position operand (not inspected here).
/// \returns true when \p X has a scalar integer type.
bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
  // Scalar integers can use 'bt'.
  EVT VT = X.getValueType();
  return VT.isScalarInteger();
}
/// Decide whether the "hoist constant out of shift on the LHS of an and" fold
/// should be performed.
///
/// All arguments are forwarded unchanged to the TargetLowering baseline; this
/// override itself only looks at the type of \p X, at \p Y and at
/// \p NewShiftOpcode.
///
/// \returns false when the baseline rejects the fold. Otherwise true for
/// scalar integers, for a splat \p Y (undef elements allowed), with AVX2, or
/// when the new shift opcode is ISD::SHL.
bool X86TargetLowering::
    shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
        SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
        SelectionDAG &DAG) const {
  // Does baseline recommend not to perform the fold by default?
  const bool BaselineAllows =
      TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
          X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG);
  if (!BaselineAllows)
    return false;

  // In order of evaluation:
  //  - for scalars this transform is always beneficial;
  //  - if all the shift amounts are identical, the transform is beneficial
  //    even with rudimentary SSE2 shifts;
  //  - AVX2 has powerful shift operations, so it is also good there;
  //  - pre-AVX2 vector codegen for this pattern is best for the 'shl' variant.
  return X.getValueType().isScalarInteger() ||
         DAG.isSplatValue(Y, /*AllowUndefs=*/true) || Subtarget.hasAVX2() ||
         NewShiftOpcode == ISD::SHL;
}
bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {
assert(((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) ||
(N->getOpcode() == ISD::SRL &&
N->getOperand(0).getOpcode() == ISD::SHL)) &&
"Expected shift-shift mask");
EVT VT = N->getValueType(0);
if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
(Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
// Only fold if the shift values are equal - so it folds to AND.
// TODO - we should fold if either is a non-uniform vector but we don't do
// the fold for non-splats yet.
return N->getOperand(1) == N->getOperand(0).getOperand(1);
}
return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
}
/// Decide whether a mask should be replaced by a pair of variable shifts for
/// the type of \p Y.
///
/// \returns false for vectors and for i64 on targets that are not 64-bit;
/// true otherwise.
bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
  EVT VT = Y.getValueType();

  // For vectors, we don't have a preference, but we probably want a mask.
  // 64-bit shifts on 32-bit targets produce really bad bloated code.
  return !VT.isVector() && (VT != MVT::i64 || Subtarget.is64Bit());
}
/// Decide whether a shift should be expanded inline.
///
/// \param DAG Used to reach the current function.
/// \param N   The shift node (not inspected here).
/// \returns false only when the function is marked minsize and the target OS
/// is not Windows.
bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
                                          SDNode *N) const {
  const bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
  return !MinSize || Subtarget.isOSWindows();
}
/// Decide whether an insert-element with a variable index should be lowered
/// through a splat for type \p VT.
///
/// \returns true exactly when \p VT is a legal type.
bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
  // Any legal vector type can be splatted more efficiently than
  // loading/spilling from memory.
  const bool IsLegal = isTypeLegal(VT);
  return IsLegal;
}
/// Pick a type in which an equality compare of \p NumBits bits is fast.
///
/// \returns the integer type of that width when it is legal; v16i8 for 128
/// bits or v32i8 for 256 bits when that vector type is legal; otherwise
/// MVT::INVALID_SIMPLE_VALUE_TYPE.
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
  MVT IntVT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(IntVT))
    return IntVT;

  switch (NumBits) {
  case 128:
    // PMOVMSKB can handle this.
    if (isTypeLegal(MVT::v16i8))
      return MVT::v16i8;
    break;
  case 256:
    // VPMOVMSKB can handle this.
    if (isTypeLegal(MVT::v32i8))
      return MVT::v32i8;
    break;
  default:
    break;
  }

  // TODO: Allow 64-bit type for 32-bit target.
  // TODO: 512-bit types should be allowed, but make sure that those
  // cases are handled in combineVectorSizedSetCCEquality().
  return MVT::INVALID_SIMPLE_VALUE_TYPE;
}
/// Return true if \p Val is the undef sentinel or is equal to \p CmpVal.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val == SM_SentinelUndef)
    return true;
  return Val == CmpVal;
}
/// Return true if \p Val is the undef sentinel or the zero sentinel.
static bool isUndefOrZero(int Val) {
  if (Val == SM_SentinelUndef)
    return true;
  return Val == SM_SentinelZero;
}
/// Return true if each of the \p Size elements of \p Mask starting at
/// position \p Pos (i.e. indices [Pos, Pos + Size)) is the undef sentinel.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
  for (int M : Mask.slice(Pos, Size))
    if (M != SM_SentinelUndef)
      return false;
  return true;
}
/// Return true if the first half of \p Mask is entirely undef, i.e. the mask
/// creates a vector whose lower half is undefined.
static bool isUndefLowerHalf(ArrayRef<int> Mask) {
  const unsigned HalfNumElts = Mask.size() / 2;
  return isUndefInRange(Mask, /*Pos=*/0, HalfNumElts);
}
/// Return true if the second half of \p Mask is entirely undef, i.e. the mask
/// creates a vector whose upper half is undefined.
static bool isUndefUpperHalf(ArrayRef<int> Mask) {
  const unsigned HalfNumElts = Mask.size() / 2;
  return isUndefInRange(Mask, /*Pos=*/HalfNumElts, /*Size=*/HalfNumElts);
}
/// Return true if \p Val lies in the half-open range [Low, Hi): \p Low is
/// included, \p Hi is excluded.
static bool isInRange(int Val, int Low, int Hi) {
  if (Val < Low)
    return false;
  return Val < Hi;
}
/// Return true if at least one element of \p Mask lies in the half-open
/// range [Low, Hi).
static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (int M : Mask)
    if (isInRange(M, Low, Hi))
      return true;
  return false;
}
/// Return true if \p Val is the undef sentinel or lies in the half-open
/// range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  if (Val == SM_SentinelUndef)
    return true;
  return isInRange(Val, Low, Hi);
}
/// Return true if every element of \p Mask is the undef sentinel or lies in
/// the half-open range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrInRange(M, Low, Hi))
      return false;
  return true;
}
/// Return true if \p Val is the undef sentinel, the zero sentinel, or lies in
/// the half-open range [Low, Hi).
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
  if (isUndefOrZero(Val))
    return true;
  return isInRange(Val, Low, Hi);
}
/// Return true if every element of \p Mask is the undef sentinel, the zero
/// sentinel, or lies in the half-open range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrZeroOrInRange(M, Low, Hi))
      return false;
  return true;
}
/// Return true if the \p Size elements of \p Mask starting at position
/// \p Pos match, element by element, the sequence
/// (Low, Low + Step, ..., Low + (Size - 1) * Step), where an undef element
/// matches any expected value.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
                                       unsigned Size, int Low, int Step = 1) {
  int Expected = Low;
  for (unsigned Idx = Pos, End = Pos + Size; Idx != End; ++Idx) {
    if (!isUndefOrEqual(Mask[Idx], Expected))
      return false;
    Expected += Step;
  }
  return true;
}
/// Return true if the \p Size elements of \p Mask starting at position
/// \p Pos match, element by element, the sequence
/// (Low, Low + Step, ..., Low + (Size - 1) * Step), where an undef or zero
/// sentinel element matches any expected value.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                             unsigned Size, int Low,
                                             int Step = 1) {
  int Expected = Low;
  for (unsigned Idx = Pos, End = Pos + Size; Idx != End; ++Idx) {
    const int M = Mask[Idx];
    if (!isUndefOrZero(M) && M != Expected)
      return false;
    Expected += Step;
  }
  return true;
}
/// Return true if each of the \p Size elements of \p Mask starting at
/// position \p Pos (i.e. indices [Pos, Pos + Size)) is the undef sentinel or
/// the zero sentinel.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                 unsigned Size) {
  for (int M : Mask.slice(Pos, Size))
    if (!isUndefOrZero(M))
      return false;
  return true;
}
/// Test whether a shuffle mask can be simplified by widening the shuffled
/// elements, i.e. by treating each adjacent pair of mask entries as a single
/// element that is twice as wide.
///
/// \param Mask        The mask to inspect; entries are consumed in pairs.
/// \param WidenedMask Receives the mask for the wider elements (one entry per
///                    input pair) on success; on failure its contents are
///                    unspecified.
/// \returns true if every pair could be widened.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    SmallVectorImpl<int> &WidenedMask) {
  WidenedMask.assign(Mask.size() / 2, 0);

  const int NumElts = Mask.size();
  for (int Pair = 0; 2 * Pair < NumElts; ++Pair) {
    const int Lo = Mask[2 * Pair];
    const int Hi = Mask[2 * Pair + 1];
    const bool LoUndef = Lo == SM_SentinelUndef;
    const bool HiUndef = Hi == SM_SentinelUndef;
    const bool LoZero = Lo == SM_SentinelZero;
    const bool HiZero = Hi == SM_SentinelZero;

    int Wide;
    if (LoUndef && HiUndef) {
      // A fully undef pair trivially widens to undef.
      Wide = SM_SentinelUndef;
    } else if (LoUndef && Hi >= 0 && (Hi % 2) == 1) {
      // Undef low entry next to a high entry that is the odd (upper) member
      // of an aligned pair: use the defined entry's pair.
      Wide = Hi / 2;
    } else if (HiUndef && Lo >= 0 && (Lo % 2) == 0) {
      // Undef high entry next to a low entry that is the even (lower) member
      // of an aligned pair: use the defined entry's pair.
      Wide = Lo / 2;
    } else if (LoZero || HiZero) {
      // When zeroing, the zero has to cover both lanes of the pair: each
      // entry must be zero or undef, otherwise the pair cannot be widened.
      if (!((LoZero || LoUndef) && (HiZero || HiUndef)))
        return false;
      Wide = SM_SentinelZero;
    } else if (!LoUndef && (Lo % 2) == 0 && (Lo + 1) == Hi) {
      // Both entries are adjacent and aligned with a pair.
      Wide = Lo / 2;
    } else {
      // Otherwise we can't safely widen the elements used in this shuffle.
      return false;
    }

    WidenedMask[Pair] = Wide;
  }

  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");
  return true;
}
/// Variant of canWidenShuffleElements() that can take zeroable elements into
/// account.
///
/// \param Mask        The shuffle mask to inspect.
/// \param Zeroable    One bit per mask element; set when the element is
///                    zeroable.
/// \param V2IsZero    When true, every non-undef element flagged in
///                    \p Zeroable is treated as a zero sentinel before
///                    widening; when false \p Zeroable is ignored.
/// \param WidenedMask Receives the widened mask on success.
/// \returns true if the (possibly adjusted) mask can be widened.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    const APInt &Zeroable,
                                    bool V2IsZero,
                                    SmallVectorImpl<int> &WidenedMask) {
  // Work on a copy of the mask that carries the zeroable information.
  // Undef elements are deliberately not marked as zeroable.
  SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
  if (V2IsZero) {
    assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
    unsigned Idx = 0;
    for (int &M : ZeroableMask) {
      if (M != SM_SentinelUndef && Zeroable[Idx])
        M = SM_SentinelZero;
      ++Idx;
    }
  }
  return canWidenShuffleElements(ZeroableMask, WidenedMask);
}
/// Convenience overload of canWidenShuffleElements() for callers that only
/// need the yes/no answer; the widened mask is discarded.
static bool canWidenShuffleElements(ArrayRef<int> Mask) {
  SmallVector<int, 32> Discarded;
  return canWidenShuffleElements(Mask, Discarded);
}
/// Return true if \p Elt is a constant integer zero or a floating point
/// constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
  if (isNullConstant(Elt))
    return true;
  return isNullFPConstant(Elt);
}
// Build a constant vector of type VT from the first VT.getVectorNumElements()
// entries of Values.
//  - When IsMask is set, every negative entry becomes an UNDEF element.
//  - When i64 is not a legal type and VT has i64 elements, the vector is
//    built as a vector of i32 with twice as many elements and bitcast to VT.
//    Each entry then yields an i32 constant holding the value followed by an
//    i32 constant 0 (or two UNDEFs for an undef entry); i.e. the upper half
//    is always 0, also for negative entries when IsMask is false.
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
                              const SDLoc &dl, bool IsMask = false) {
  unsigned NumElts = VT.getVectorNumElements();

  // i64 elements have to be split into two i32 halves in 32-bit mode.
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  const bool Split = !In64BitMode && VT.getVectorElementType() == MVT::i64;
  MVT ConstVecVT = Split ? MVT::getVectorVT(MVT::i32, NumElts * 2) : VT;
  MVT EltVT = ConstVecVT.getVectorElementType();

  SmallVector<SDValue, 32> Ops;
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    if (Values[Idx] < 0 && IsMask) {
      // Undef mask entry: one UNDEF per emitted element.
      Ops.push_back(DAG.getUNDEF(EltVT));
      if (Split)
        Ops.push_back(DAG.getUNDEF(EltVT));
      continue;
    }

    Ops.push_back(DAG.getConstant(Values[Idx], dl, EltVT));
    if (Split)
      Ops.push_back(DAG.getConstant(0, dl, EltVT));
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return Split ? DAG.getBitcast(VT, ConstsNode) : ConstsNode;
}
// Build a constant vector of type VT from raw element bit patterns.
//  - Bits holds one APInt per element; each must be as wide as VT's scalar
//    type (asserted).
//  - Undefs has one bit per entry of Bits (asserted); a set bit makes the
//    corresponding element UNDEF.
//  - When i64 is not a legal type and VT has i64 elements, every element is
//    emitted as two i32 constants (low 32 bits first, then the high 32 bits).
//  - f32 / f64 elements are emitted as FP constants with the given bits.
// The build vector is always bitcast to VT before it is returned.
static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");

  unsigned NumElts = VT.getVectorNumElements();

  // i64 elements have to be split into two i32 halves in 32-bit mode.
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  const bool Split = !In64BitMode && VT.getVectorElementType() == MVT::i64;
  MVT ConstVecVT = Split ? MVT::getVectorVT(MVT::i32, NumElts * 2) : VT;
  MVT EltVT = ConstVecVT.getVectorElementType();

  SmallVector<SDValue, 32> Ops;
  for (unsigned Idx = 0, End = Bits.size(); Idx != End; ++Idx) {
    if (Undefs[Idx]) {
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }

    const APInt &V = Bits[Idx];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");

    if (Split) {
      // Low half first, then high half.
      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
      continue;
    }
    if (EltVT == MVT::f32) {
      APFloat FV(APFloat::IEEEsingle(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
      continue;
    }
    if (EltVT == MVT::f64) {
      APFloat FV(APFloat::IEEEdouble(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
      continue;
    }
    Ops.push_back(DAG.getConstant(V, dl, EltVT));
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
}
/// Return an all-zeros vector of type \p VT.
///
/// \p VT must be a 128-, 256- or 512-bit vector or a vector of i1 (asserted).
/// SSE/AVX zero vectors are built as <N x i32> and bitcast to \p VT so that
/// they get CSE'd; when the integer type is not available a floating-point
/// +0.0 vector is used instead.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
          VT.getVectorElementType() == MVT::i1) &&
         "Unexpected vector type");

  // Without SSE2, 128-bit zero vectors are built as v4f32.
  if (!Subtarget.hasSSE2() && VT.is128BitVector())
    return DAG.getBitcast(VT, DAG.getConstantFP(+0.0, dl, MVT::v4f32));

  // Floating-point vectors are zeroed in their own type.
  if (VT.isFloatingPoint())
    return DAG.getBitcast(VT, DAG.getConstantFP(+0.0, dl, VT));

  // Mask vectors are zeroed in their own type as well.
  if (VT.getVectorElementType() == MVT::i1) {
    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
           "Unexpected vector type");
    return DAG.getBitcast(VT, DAG.getConstant(0, dl, VT));
  }

  // Remaining integer vectors share a canonical <N x i32> zero.
  unsigned Num32BitElts = VT.getSizeInBits() / 32;
  MVT CanonicalVT = MVT::getVectorVT(MVT::i32, Num32BitElts);
  return DAG.getBitcast(VT, DAG.getConstant(0, dl, CanonicalVT));
}
/// Extract the \p vectorWidth-bit chunk of \p Vec that contains element
/// \p IdxVal.
///
/// \p IdxVal does not have to be aligned: it is rounded down to the first
/// element of its chunk. A BUILD_VECTOR input is handled by emitting a
/// smaller BUILD_VECTOR from the corresponding operands; every other input
/// produces an EXTRACT_SUBVECTOR node.
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
                                const SDLoc &dl, unsigned vectorWidth) {
  EVT SrcVT = Vec.getValueType();
  EVT EltVT = SrcVT.getVectorElementType();

  // The result keeps the element type and has 1/Factor of the elements.
  unsigned Factor = SrcVT.getSizeInBits() / vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
                                  SrcVT.getVectorNumElements() / Factor);

  unsigned ChunkNumElts = vectorWidth / EltVT.getSizeInBits();
  assert(isPowerOf2_32(ChunkNumElts) && "Elements per chunk not power of 2");

  // Round the index down to the first element of the chunk; as the chunk
  // size is a power of 2 this is just a matter of clearing the low bits.
  IdxVal &= ~(ChunkNumElts - 1);

  if (Vec.getOpcode() != ISD::BUILD_VECTOR) {
    SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
  }

  // If the input is a buildvector just emit a smaller one.
  return DAG.getBuildVector(ResultVT, dl,
                            Vec->ops().slice(IdxVal, ChunkNumElts));
}
/// Generate a DAG to grab 128 bits from a 256-bit or 512-bit vector
/// (asserted).
///
/// This sets things up to match an AVX VEXTRACTF128 / VEXTRACTI128 or AVX-512
/// VEXTRACTF32x4 / VEXTRACTI32x4 instruction, or a simple subregister
/// reference. \p IdxVal is an element index inside the wanted 128 bits; it
/// need not be aligned to a 128-bit boundary, which makes lowering
/// EXTRACT_VECTOR_ELT operations easier.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
  const unsigned ChunkBits = 128;
  return extractSubVector(Vec, IdxVal, DAG, dl, ChunkBits);
}
/// Generate a DAG to grab the 256 bits containing element \p IdxVal from a
/// 512-bit vector (asserted).
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  const unsigned ChunkBits = 256;
  return extractSubVector(Vec, IdxVal, DAG, dl, ChunkBits);
}
/// Insert \p Vec into \p Result at the \p vectorWidth-bit chunk that contains
/// element \p IdxVal.
///
/// \p vectorWidth must be 128 or 256 (asserted). \p IdxVal does not have to
/// be aligned: it is rounded down to the first element of its chunk.
/// Inserting an undef \p Vec returns \p Result unchanged; otherwise an
/// INSERT_SUBVECTOR node of \p Result's type is produced.
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                               SelectionDAG &DAG, const SDLoc &dl,
                               unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");

  // Inserting UNDEF is Result
  if (Vec.isUndef())
    return Result;

  EVT SubVT = Vec.getValueType();
  EVT EltVT = SubVT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  unsigned ChunkNumElts = vectorWidth / EltVT.getSizeInBits();
  assert(isPowerOf2_32(ChunkNumElts) && "Elements per chunk not power of 2");

  // Round the index down to the first element of the chunk; as the chunk
  // size is a power of 2 this is just a matter of clearing the low bits.
  IdxVal &= ~(ChunkNumElts - 1);

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
/// Generate a DAG to put the 128-bit vector \p Vec (asserted) into the wider
/// vector \p Result.
///
/// This sets things up to match an AVX VINSERTF128 / VINSERTI128 or AVX-512
/// VINSERTF32x4 / VINSERTI32x4 instruction, or a simple superregister
/// reference. \p IdxVal is an element index inside the targeted 128 bits; it
/// need not be aligned to a 128-bit boundary, which makes lowering
/// INSERT_VECTOR_ELT operations easier.
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  const unsigned ChunkBits = 128;
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, ChunkBits);
}
/// Widen \p Vec to the larger vector type \p VT, which must have the same
/// scalar type (asserted).
///
/// \p Vec is inserted at element 0 of a base vector of type \p VT that is
/// all-zeros when \p ZeroNewElements is set and undef otherwise.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
                              const SDLoc &dl) {
  assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
         Vec.getValueType().getScalarType() == VT.getScalarType() &&
         "Unsupported vector widening type");

  SDValue Base;
  if (ZeroNewElements)
    Base = getZeroVector(VT, Subtarget, DAG, dl);
  else
    Base = DAG.getUNDEF(VT);

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Base, Vec, ZeroIdx);
}
/// Widen \p Vec to a vector of \p WideSizeInBits bits with the same scalar
/// type; the new elements are zero when \p ZeroNewElements is set and undef
/// otherwise.
///
/// \p WideSizeInBits must be larger than the size of \p Vec and a multiple of
/// its scalar size (asserted).
static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
                              const SDLoc &dl, unsigned WideSizeInBits) {
  assert(Vec.getValueSizeInBits() < WideSizeInBits &&
         (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
         "Unsupported vector widening type");

  // Derive the wide type from the scalar type and the requested bit width,
  // then defer to the type-based overload.
  unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
  MVT ScalarVT = Vec.getSimpleValueType().getScalarType();
  MVT WideVT = MVT::getVectorVT(ScalarVT, WideNumElts);
  return widenSubVector(WideVT, Vec, ZeroNewElements, Subtarget, DAG, dl);
}
// Collect the subvectors that are concatenated to form N.
//
// Two shapes are recognised:
//  - ISD::CONCAT_VECTORS: all operands are appended to Ops.
//  - An ISD::INSERT_SUBVECTOR with a constant index that writes the upper
//    half of a vector whose lower half was written by another
//    ISD::INSERT_SUBVECTOR (at index 0) of the same subvector type: the lower
//    and upper subvectors are appended to Ops, in that order.
// Ops must be empty on entry (asserted). Returns true when Ops was filled;
// the subvectors in Ops are then guaranteed to be the same type.
static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
  assert(Ops.empty() && "Expected an empty ops vector");

  if (N->getOpcode() == ISD::CONCAT_VECTORS) {
    Ops.append(N->op_begin(), N->op_end());
    return true;
  }

  if (N->getOpcode() != ISD::INSERT_SUBVECTOR ||
      !isa<ConstantSDNode>(N->getOperand(2)))
    return false;

  SDValue Src = N->getOperand(0);
  SDValue Sub = N->getOperand(1);
  const APInt &Idx = N->getConstantOperandAPInt(2);
  EVT VT = Src.getValueType();
  EVT SubVT = Sub.getValueType();

  // TODO - Handle more general insert_subvector chains.
  // The outer insert has to write exactly the upper half of the vector.
  if (!(VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) ||
      !(Idx == (VT.getVectorNumElements() / 2)))
    return false;

  // The vector being inserted into has to be an insert of the same subvector
  // type at index 0.
  if (Src.getOpcode() != ISD::INSERT_SUBVECTOR ||
      !(Src.getOperand(1).getValueType() == SubVT) ||
      !isNullConstant(Src.getOperand(2)))
    return false;

  Ops.push_back(Src.getOperand(1));
  Ops.push_back(Sub);
  return true;
}
// Helper that splits the operands of an operation to the legal target size
// and applies a function to each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
// 256-bit and on AVX512BW in 512-bit.
//  - VT is the type used for deciding if/how to split Ops; it is also the
//    type of the CONCAT_VECTORS that joins the parts. Ops elements do *not*
//    have to be of type VT.
//  - Builder is a function that will be applied on each split part:
//      SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
//  - CheckBWI selects which subtarget query enables 512-bit parts:
//    useBWIRegs() when true, useAVX512Regs() when false.
// When no split is needed, Builder is applied once to the unmodified Ops.
template <typename F>
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
                         const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
                         F Builder, bool CheckBWI = true) {
  assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");

  // Widest part the builder may be handed on this subtarget.
  unsigned MaxWidth = 128;
  if (CheckBWI ? Subtarget.useBWIRegs() : Subtarget.useAVX512Regs())
    MaxWidth = 512;
  else if (Subtarget.hasAVX2())
    MaxWidth = 256;

  unsigned NumParts = 1;
  if (VT.getSizeInBits() > MaxWidth) {
    NumParts = VT.getSizeInBits() / MaxWidth;
    assert((VT.getSizeInBits() % MaxWidth) == 0 && "Illegal vector size");
  }

  if (NumParts == 1)
    return Builder(DAG, DL, Ops);

  SmallVector<SDValue, 4> Parts;
  for (unsigned Part = 0; Part != NumParts; ++Part) {
    // Slice every operand into NumParts equal pieces and take piece 'Part'.
    SmallVector<SDValue, 2> PartOps;
    for (SDValue Op : Ops) {
      EVT OpVT = Op.getValueType();
      unsigned EltsPerPart = OpVT.getVectorNumElements() / NumParts;
      unsigned BitsPerPart = OpVT.getSizeInBits() / NumParts;
      PartOps.push_back(
          extractSubVector(Op, Part * EltsPerPart, DAG, DL, BitsPerPart));
    }
    Parts.push_back(Builder(DAG, DL, PartOps));
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Parts);
}
/// Insert i1-subvector to i1-vector.
///
/// \p Op supplies the three operands read below: the vector being inserted
/// into (operand 0), the subvector (operand 1) and the insertion index
/// (operand 2). The insertion is expressed with KSHIFTL/KSHIFTR, AND/OR and
/// INSERT_SUBVECTOR/EXTRACT_SUBVECTOR nodes on a (possibly widened) mask
/// type, and the result is extracted back to the type of \p Op.
///
/// \returns an empty SDValue when the index is not a constant, \p Op itself
/// when it is already legal (index 0 into an undef vector), the original
/// vector when the subvector is undef, and the newly built value otherwise.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
SDValue SubVec = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
// Only constant insertion indices are handled.
if (!isa<ConstantSDNode>(Idx))
return SDValue();
// Inserting undef is a nop. We can just return the original vector.
if (SubVec.isUndef())
return Vec;
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
return Op;
MVT OpVT = Op.getSimpleValueType();
unsigned NumElems = OpVT.getVectorNumElements();
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
// Extend to natively supported kshift.
// Vectors with fewer than 8 elements (or with exactly 8 elements when DQI is
// missing) are widened to v8i1 with DQI and to v16i1 without it.
MVT WideOpVT = OpVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
// Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
// if necessary.
if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
// May need to promote to a legal type.
Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
DAG.getConstant(0, dl, WideOpVT),
SubVec, Idx);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
MVT SubVecVT = SubVec.getSimpleValueType();
unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
assert(IdxVal + SubVecNumElems <= NumElems &&
IdxVal % SubVecVT.getSizeInBits() == 0 &&
"Unexpected index value in INSERT_SUBVECTOR");
SDValue Undef = DAG.getUNDEF(WideOpVT);
// Case: insertion at index 0 into a vector that is neither undef nor all
// zeros (both were handled above).
if (IdxVal == 0) {
// Zero lower bits of the Vec
// (shift right and then left again by the number of inserted elements).
SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
ZeroIdx);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
// Merge them together, SubVec should be zero extended.
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
DAG.getConstant(0, dl, WideOpVT),
SubVec, ZeroIdx);
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
// From here on IdxVal != 0. Place the subvector in the low elements of the
// wide type; the elements above it are undef.
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, SubVec, ZeroIdx);
// Case: the destination is undef, so the subvector only has to be shifted
// into position.
if (Vec.isUndef()) {
assert(IdxVal != 0 && "Unexpected index");
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
// Case: the destination is all zeros. Shift the subvector to the top of the
// wide register (dropping the undef elements above it) and then back down to
// its insert position, which shifts zeros in above it.
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
assert(IdxVal != 0 && "Unexpected index");
NumElems = WideOpVT.getVectorNumElements();
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
if (ShiftRight != 0)
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
// Simple case when we put subvector in the upper part
if (IdxVal + SubVecNumElems == NumElems) {
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
if (SubVecNumElems * 2 == NumElems) {
// Special case, use legal zero extending insert_subvector. This allows
// isel to optimize when bits are known zero.
Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
DAG.getConstant(0, dl, WideOpVT),
Vec, ZeroIdx);
} else {
// Otherwise use explicit shifts to zero the bits.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, Vec, ZeroIdx);
NumElems = WideOpVT.getVectorNumElements();
SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
}
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
// Inserting into the middle is more complicated.
NumElems = WideOpVT.getVectorNumElements();
// Widen the vector if needed.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
// Do an optimization for the most frequently used types.
// This path clears the destination bits with an AND against an integer
// constant that is bitcast to the mask type; it is skipped for v64i1 when
// the target is not 64-bit.
if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
Mask0.flipAllBits();
SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
// Reduce to original width if needed.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
// Clear the upper bits of the subvector and move it to its insert position.
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
// Isolate the bits below the insertion point.
unsigned LowShift = NumElems - IdxVal;
SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
DAG.getTargetConstant(LowShift, dl, MVT::i8));
Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
DAG.getTargetConstant(LowShift, dl, MVT::i8));
// Isolate the bits after the last inserted bit.
unsigned HighShift = IdxVal + SubVecNumElems;
SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
DAG.getTargetConstant(HighShift, dl, MVT::i8));
High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
DAG.getTargetConstant(HighShift, dl, MVT::i8));
// Now OR all 3 pieces together.
Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
// Reduce to original width if needed.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
/// Concatenate two vectors of the same type (asserted) into a vector with
/// twice as many elements: \p V1 becomes the lower half and \p V2 the upper
/// half. The result is built by inserting both halves into an undef vector.
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
                                const SDLoc &dl) {
  assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");

  EVT HalfVT = V1.getValueType();
  unsigned HalfNumElts = HalfVT.getVectorNumElements();
  unsigned HalfWidth = HalfVT.getSizeInBits();
  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), HalfVT.getScalarType(),
                                2 * HalfNumElts);

  SDValue Lower =
      insertSubVector(DAG.getUNDEF(WideVT), V1, 0, DAG, dl, HalfWidth);
  return insertSubVector(Lower, V2, HalfNumElts, DAG, dl, HalfWidth);
}
/// Return a vector of type \p VT with all bits set.
///
/// \p VT must be a 128-, 256- or 512-bit vector (asserted). The value is
/// always built as <4 x i32>, <8 x i32> or <16 x i32> and then bitcast to
/// \p VT, which ensures ones vectors get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Expected a 128/256/512-bit vector type");

  unsigned Num32BitElts = VT.getSizeInBits() / 32;
  MVT CanonicalVT = MVT::getVectorVT(MVT::i32, Num32BitElts);
  APInt AllOnes = APInt::getAllOnesValue(32);
  return DAG.getBitcast(VT, DAG.getConstant(AllOnes, dl, CanonicalVT));
}
// Map an *_EXTEND opcode to the matching *_EXTEND_VECTOR_INREG opcode.
// *_EXTEND_VECTOR_INREG opcodes are accepted too and map to themselves; any
// other opcode is unreachable.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
  if (Opcode == ISD::ANY_EXTEND || Opcode == ISD::ANY_EXTEND_VECTOR_INREG)
    return ISD::ANY_EXTEND_VECTOR_INREG;

  if (Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ZERO_EXTEND_VECTOR_INREG)
    return ISD::ZERO_EXTEND_VECTOR_INREG;

  if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
    return ISD::SIGN_EXTEND_VECTOR_INREG;

  llvm_unreachable("Unknown opcode");
}
/// Emit an ANY/SIGN/ZERO extension of vector \p In to vector type \p VT.
/// Inputs wider than 128 bits are first narrowed to the low subvector that is
/// actually needed (never less than 128 bits). If the element counts of the
/// (possibly narrowed) input and \p VT differ, the *_EXTEND_VECTOR_INREG form
/// of \p Opcode is used instead.
static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
                              SDValue In, SelectionDAG &DAG) {
  EVT InVT = In.getValueType();
  assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
  assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
          ISD::ZERO_EXTEND == Opcode) &&
         "Unknown extension opcode");

  // For 256-bit vectors, we only need the lower (128-bit) input half.
  // For 512-bit vectors, we only need the lower input half or quarter.
  if (InVT.getSizeInBits() > 128) {
    assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
           "Expected VTs to be the same size!");
    unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
    unsigned NeededBits = static_cast<unsigned>(VT.getSizeInBits()) / Scale;
    // Never extract less than a full 128-bit vector.
    unsigned ExtractBits = NeededBits < 128U ? 128U : NeededBits;
    In = extractSubVector(In, 0, DAG, DL, ExtractBits);
    InVT = In.getValueType();
  }

  // A differing element count means only the low input elements are extended.
  bool SameNumElts = VT.getVectorNumElements() == InVT.getVectorNumElements();
  unsigned ExtOpcode =
      SameNumElts ? Opcode : getOpcode_EXTEND_VECTOR_INREG(Opcode);
  return DAG.getNode(ExtOpcode, DL, VT, In);
}
// Try to recognise \p V (looking through bitcasts) as a bitwise NOT and return
// the value being inverted, or an empty SDValue if no pattern matches:
//   (xor X, -1)                            -> X
//   extract_subvector(xor X, -1)           -> extract_subvector(X)
//   concat_vectors(xor X, -1, xor Y, -1)   -> concat_vectors(X, Y)
// Note that the subvector/concat forms create new nodes in \p DAG.
static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
  V = peekThroughBitcasts(V);
  unsigned Opc = V.getOpcode();

  // Direct xor with an all-ones build vector.
  if (Opc == ISD::XOR && ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
    return V.getOperand(0);

  // Subvector of a NOT: only looked through for index 0 or a single-use
  // source.
  if (Opc == ISD::EXTRACT_SUBVECTOR) {
    SDValue Src = V.getOperand(0);
    SDValue Idx = V.getOperand(1);
    if (isNullConstant(Idx) || Src.hasOneUse()) {
      if (SDValue Not = IsNOT(Src, DAG)) {
        Not = DAG.getBitcast(Src.getValueType(), Not);
        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not),
                           V.getValueType(), Not, Idx);
      }
    }
  }

  // Concatenation where every piece is itself a NOT.
  SmallVector<SDValue, 2> CatOps;
  if (!collectConcatOps(V.getNode(), CatOps))
    return SDValue();

  for (SDValue &CatOp : CatOps) {
    SDValue NotCat = IsNOT(CatOp, DAG);
    if (!NotCat)
      return SDValue();
    CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
}
/// Build a vector_shuffle of \p V1 and \p V2 whose mask is the binary
/// "unpack low" mask produced by createUnpackShuffleMask for \p VT.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
                          SDValue V1, SDValue V2) {
  SmallVector<int, 8> UnpackLoMask;
  createUnpackShuffleMask(VT, UnpackLoMask, /* Lo = */ true,
                          /* Unary = */ false);
  return DAG.getVectorShuffle(VT, dl, V1, V2, UnpackLoMask);
}
/// Build a vector_shuffle of \p V1 and \p V2 whose mask is the binary
/// "unpack high" mask produced by createUnpackShuffleMask for \p VT.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
                          SDValue V1, SDValue V2) {
  SmallVector<int, 8> UnpackHiMask;
  createUnpackShuffleMask(VT, UnpackHiMask, /* Lo = */ false,
                          /* Unary = */ false);
  return DAG.getVectorShuffle(VT, dl, V1, V2, UnpackHiMask);
}
/// Shuffle the low element of \p V2 into position \p Idx of a zero vector
/// (\p IsZero) or an undef vector (otherwise) of the same type.
/// All other result elements come from that zero/undef vector, giving masks
/// such as 4,1,2,3 (Idx = 0) or 0,1,2,4 (Idx = 3) for a 4-element type.
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
                                           bool IsZero,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  MVT VT = V2.getSimpleValueType();
  SDLoc DL(V2);

  SDValue Base;
  if (IsZero)
    Base = getZeroVector(VT, Subtarget, DAG, DL);
  else
    Base = DAG.getUNDEF(VT);

  // Start from the identity mask over the base vector ...
  int NumElems = VT.getVectorNumElements();
  SmallVector<int, 16> MaskVec(NumElems);
  for (int i = 0; i != NumElems; ++i)
    MaskVec[i] = i;

  // ... and redirect the insertion slot to the low element of V2.
  if (Idx >= 0 && Idx < NumElems)
    MaskVec[Idx] = NumElems;

  return DAG.getVectorShuffle(VT, DL, Base, V2, MaskVec);
}
/// Return the IR constant read by \p Load, or nullptr.
/// Only normal loads whose base pointer (optionally behind an X86 Wrapper /
/// WrapperRIP node) is a non-machine constant pool entry at offset 0 qualify.
static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
  if (!Load || !ISD::isNormalLoad(Load))
    return nullptr;

  // Strip the address wrapper, if there is one.
  SDValue Addr = Load->getBasePtr();
  switch (Addr->getOpcode()) {
  case X86ISD::Wrapper:
  case X86ISD::WrapperRIP:
    Addr = Addr->getOperand(0);
    break;
  default:
    break;
  }

  if (auto *Pool = dyn_cast<ConstantPoolSDNode>(Addr))
    if (!Pool->isMachineConstantPoolEntry() && Pool->getOffset() == 0)
      return Pool->getConstVal();

  return nullptr;
}
/// SDValue convenience overload: looks through bitcasts on \p Op and, if the
/// result is a load, defers to the LoadSDNode overload (which also handles a
/// null pointer when \p Op is not a load).
static const Constant *getTargetConstantFromNode(SDValue Op) {
  SDValue Src = peekThroughBitcasts(Op);
  LoadSDNode *Load = dyn_cast<LoadSDNode>(Src);
  return getTargetConstantFromNode(Load);
}
/// Member entry point that forwards to the file-local
/// getTargetConstantFromNode helper. \p LD must not be null.
const Constant *
X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
  assert(LD && "Unexpected null LoadSDNode");
  const Constant *LoadedCst = getTargetConstantFromNode(LD);
  return LoadedCst;
}
/// Extract raw constant bits from a constant-like node, repacked into
/// elements of \p EltSizeInBits bits each.
///
/// \p Op is looked through bitcasts. The sources handled below are: UNDEF,
/// scalar integer/FP constants, BUILD_VECTORs of constants, constant pool
/// loads, VBROADCAST / VBROADCAST_LOAD of a constant pool scalar,
/// SUBV_BROADCAST, VZEXT_MOVL of a SCALAR_TO_VECTOR constant,
/// INSERT_SUBVECTOR, EXTRACT_SUBVECTOR and vector shuffles (the last three
/// only when \p EltSizeInBits equals the scalar size of the node's type).
///
/// \param Op                 Node to extract the constant bits from.
/// \param EltSizeInBits      Width of each returned element; the total size
///                           of \p Op must be a multiple of it.
/// \param UndefElts          [out] One bit per returned element, set when
///                           that element is undef.
/// \param EltBits            [out] Bits of each returned element. Must be
///                           empty on entry.
/// \param AllowWholeUndefs   If false, repacking fails when a returned
///                           element would be entirely undef.
/// \param AllowPartialUndefs If false, repacking fails when a returned
///                           element would have only some bits undef.
/// \returns true if the constant bits were extracted. On a false return
/// \p UndefElts / \p EltBits may have been partially written.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
APInt &UndefElts,
SmallVectorImpl<APInt> &EltBits,
bool AllowWholeUndefs = true,
bool AllowPartialUndefs = true) {
assert(EltBits.empty() && "Expected an empty EltBits vector");
Op = peekThroughBitcasts(Op);
EVT VT = Op.getValueType();
unsigned SizeInBits = VT.getSizeInBits();
assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
unsigned NumElts = SizeInBits / EltSizeInBits;
// Bitcast a source array of element bits to the target size.
// Writes the result into UndefElts/EltBits and returns false if the undef
// elements present are not permitted by AllowWholeUndefs/AllowPartialUndefs.
auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
unsigned NumSrcElts = UndefSrcElts.getBitWidth();
unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
"Constant bit sizes don't match");
// Don't split if we don't allow undef bits.
bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
if (UndefSrcElts.getBoolValue() && !AllowUndefs)
return false;
// If we're already the right size, don't bother bitcasting.
if (NumSrcElts == NumElts) {
UndefElts = UndefSrcElts;
EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
return true;
}
// Extract all the undef/constant element data and pack into single bitsets.
APInt UndefBits(SizeInBits, 0);
APInt MaskBits(SizeInBits, 0);
for (unsigned i = 0; i != NumSrcElts; ++i) {
unsigned BitOffset = i * SrcEltSizeInBits;
if (UndefSrcElts[i])
UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
MaskBits.insertBits(SrcEltBits[i], BitOffset);
}
// Split the undef/constant single bitset data into the target elements.
UndefElts = APInt(NumElts, 0);
EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
for (unsigned i = 0; i != NumElts; ++i) {
unsigned BitOffset = i * EltSizeInBits;
APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
// Only treat an element as UNDEF if all bits are UNDEF.
if (UndefEltBits.isAllOnesValue()) {
if (!AllowWholeUndefs)
return false;
UndefElts.setBit(i);
continue;
}
// If only some bits are UNDEF then treat them as zero (or bail if not
// supported).
if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
return false;
EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
}
return true;
};
// Collect constant bits and insert into mask/undef bit masks.
// An UndefValue sets bit UndefBitIndex of Undefs; a ConstantInt/ConstantFP
// stores its raw bits in Mask. Returns false for a null constant or any
// other kind of constant.
auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
unsigned UndefBitIndex) {
if (!Cst)
return false;
if (isa<UndefValue>(Cst)) {
Undefs.setBit(UndefBitIndex);
return true;
}
if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
Mask = CInt->getValue();
return true;
}
if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
Mask = CFP->getValueAPF().bitcastToAPInt();
return true;
}
return false;
};
// Handle UNDEFs.
if (Op.isUndef()) {
APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract scalar constant bits.
if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
APInt UndefSrcElts = APInt::getNullValue(1);
SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
return CastBitData(UndefSrcElts, SrcEltBits);
}
if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
APInt UndefSrcElts = APInt::getNullValue(1);
APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
SmallVector<APInt, 64> SrcEltBits(1, RawBits);
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from build vector.
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
const SDValue &Src = Op.getOperand(i);
if (Src.isUndef()) {
UndefSrcElts.setBit(i);
continue;
}
auto *Cst = cast<ConstantSDNode>(Src);
// The operand's constant may differ in width from the vector element, so
// resize it to the element width.
SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
}
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Same as above, for build vectors of floating point constants.
if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
const SDValue &Src = Op.getOperand(i);
if (Src.isUndef()) {
UndefSrcElts.setBit(i);
continue;
}
auto *Cst = cast<ConstantFPSDNode>(Src);
APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
}
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from constant pool vector.
if (auto *Cst = getTargetConstantFromNode(Op)) {
Type *CstTy = Cst->getType();
unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
// Only vector constants whose size is a multiple of the node size are
// handled.
if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
return false;
unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0; i != NumSrcElts; ++i)
if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
UndefSrcElts, i))
return false;
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from a broadcasted constant pool scalar.
// Only attempted when the requested element size is no wider than the
// broadcast result's scalar size.
if (Op.getOpcode() == X86ISD::VBROADCAST &&
EltSizeInBits <= VT.getScalarSizeInBits()) {
if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
// An undef scalar makes every broadcast element undef.
if (UndefSrcElts[0])
UndefSrcElts.setBits(0, NumSrcElts);
// Replicate the scalar across the remaining source elements.
SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
return CastBitData(UndefSrcElts, SrcEltBits);
}
}
}
// Extract constant bits from a broadcast load of a constant pool scalar.
if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
EltSizeInBits <= VT.getScalarSizeInBits()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
// The loaded scalar must have the same width as the result elements.
if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
return false;
// Strip an address wrapper and require a non-machine constant pool entry
// at offset 0.
SDValue Ptr = MemIntr->getBasePtr();
if (Ptr->getOpcode() == X86ISD::Wrapper ||
Ptr->getOpcode() == X86ISD::WrapperRIP)
Ptr = Ptr->getOperand(0);
auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
if (!CNode || CNode->isMachineConstantPoolEntry() ||
CNode->getOffset() != 0)
return false;
if (const Constant *C = CNode->getConstVal()) {
unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
// An undef scalar makes every broadcast element undef.
if (UndefSrcElts[0])
UndefSrcElts.setBits(0, NumSrcElts);
// Replicate the scalar across the remaining source elements.
SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
return CastBitData(UndefSrcElts, SrcEltBits);
}
}
}
// Extract constant bits from a subvector broadcast.
if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
SmallVector<APInt, 16> SubEltBits;
if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, SubEltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
// Repeat the subvector's undef mask and element bits until the full
// result width is covered.
UndefElts = APInt::getSplat(NumElts, UndefElts);
while (EltBits.size() < NumElts)
EltBits.append(SubEltBits.begin(), SubEltBits.end());
return true;
}
}
// Extract a rematerialized scalar constant insertion.
if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits;
auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
// Element 0 holds the constant; every other element is zero.
SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Insert constant bits from a base and sub vector sources.
if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
isa<ConstantSDNode>(Op.getOperand(2))) {
// TODO - support insert_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
APInt UndefSubElts;
SmallVector<APInt, 32> EltSubBits;
if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
UndefSubElts, EltSubBits,
AllowWholeUndefs, AllowPartialUndefs) &&
getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, EltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
// Overwrite the base vector's data with the subvector's data starting at
// the insertion index.
unsigned BaseIdx = Op.getConstantOperandVal(2);
UndefElts.insertBits(UndefSubElts, BaseIdx);
for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
EltBits[BaseIdx + i] = EltSubBits[i];
return true;
}
}
// Extract constant bits from a subvector's source.
if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
isa<ConstantSDNode>(Op.getOperand(1))) {
// TODO - support extract_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, EltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
EVT SrcVT = Op.getOperand(0).getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
unsigned NumSubElts = VT.getVectorNumElements();
unsigned BaseIdx = Op.getConstantOperandVal(1);
UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
// Trim the source elements after, then before, the extracted range. The
// tail is erased first so that BaseIdx still indexes the original layout.
if ((BaseIdx + NumSubElts) != NumSrcElts)
EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
if (BaseIdx != 0)
EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
return true;
}
}
// Extract constant bits from shuffle node sources.
if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
// TODO - support shuffle through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
ArrayRef<int> Mask = SVN->getMask();
// Undef mask elements are only accepted when both kinds of undef are
// allowed.
if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
llvm::any_of(Mask, [](int M) { return M < 0; }))
return false;
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
// Only extract the constant bits of an operand the mask actually
// references.
if (isAnyInRange(Mask, 0, NumElts) &&
!getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts0, EltBits0, AllowWholeUndefs,
AllowPartialUndefs))
return false;
if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
!getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
UndefElts1, EltBits1, AllowWholeUndefs,
AllowPartialUndefs))
return false;
UndefElts = APInt::getNullValue(NumElts);
// Pick each result element from the operand selected by the mask; an undef
// mask element yields an undef element with zero bits.
for (int i = 0; i != (int)NumElts; ++i) {
int M = Mask[i];
if (M < 0) {
UndefElts.setBit(i);
EltBits.push_back(APInt::getNullValue(EltSizeInBits));
} else if (M < (int)NumElts) {
if (UndefElts0[M])
UndefElts.setBit(i);
EltBits.push_back(EltBits0[M]);
} else {
if (UndefElts1[M - NumElts])
UndefElts.setBit(i);
EltBits.push_back(EltBits1[M - NumElts]);
}
}
return true;
}
return false;
}
namespace llvm {
namespace X86 {
/// Return true and set \p SplatVal if \p Op is a constant whose non-undef
/// elements (taken at the scalar size of \p Op's type) all hold the same
/// value. Wholly undef elements are ignored; a constant with no non-undef
/// element is not reported as a splat. \p SplatVal is left untouched on
/// failure.
bool isConstantSplat(SDValue Op, APInt &SplatVal) {
  APInt UndefElts;
  SmallVector<APInt, 16> EltBits;
  if (!getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
                                     UndefElts, EltBits, true, false))
    return false;

  // Track the most recent defined element; any defined element that differs
  // from it rules out a splat.
  int SplatIndex = -1;
  for (int i = 0, e = EltBits.size(); i != e; ++i) {
    if (UndefElts[i])
      continue;
    if (SplatIndex >= 0 && EltBits[i] != EltBits[SplatIndex])
      return false;
    SplatIndex = i;
  }

  // Every element was undef.
  if (SplatIndex < 0)
    return false;

  SplatVal = EltBits[SplatIndex];
  return true;
}
} // namespace X86
} // namespace llvm
/// Extract the raw shuffle mask indices from a constant mask node.
///
/// The constant bits of \p MaskNode are read as elements of
/// \p MaskEltSizeInBits bits each. Wholly undef elements are allowed and are
/// reported through \p UndefElts; partially undef elements make the
/// extraction fail. The zero-extended value of every element is appended to
/// \p RawMask.
///
/// \returns true if the mask constant could be extracted.
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                        unsigned MaskEltSizeInBits,
                                        SmallVectorImpl<uint64_t> &RawMask,
                                        APInt &UndefElts) {
  // Extract the raw target constant bits.
  SmallVector<APInt, 64> EltBits;
  if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
                                     EltBits, /* AllowWholeUndefs */ true,
                                     /* AllowPartialUndefs */ false))
    return false;

  // Insert the extracted elements into the mask. Iterate by const reference
  // so that each APInt is not copied just to read its value.
  for (const APInt &Elt : EltBits)
    RawMask.push_back(Elt.getZExtValue());
  return true;
}
/// Append to \p Mask (which must be empty) the shuffle mask equivalent to a
/// PACKSS/PACKUS truncation for \p VT. Within each 128-bit lane the even
/// elements of the first input come first, followed by the even elements of
/// the second input (or of the first input again when \p Unary is set).
/// Note: This ignores saturation, so inputs must be checked first.
static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
                                  bool Unary) {
  assert(Mask.empty() && "Expected an empty shuffle mask vector");
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits() / 128;
  unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
  // Second-input indices start at NumElts unless both inputs are the same.
  unsigned Offset = Unary ? 0 : NumElts;

  // Append every even element index of one lane, starting at Base.
  auto AppendEvenElts = [&](unsigned Base) {
    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
      Mask.push_back(Elt + Base);
  };

  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    unsigned LaneBase = Lane * NumEltsPerLane;
    AppendEvenElts(LaneBase);
    AppendEvenElts(LaneBase + Offset);
  }
}
// Split the demanded elements of a PACKSS/PACKUS result between its two
// operands. Within each 128-bit lane of the result, the first half of the
// elements maps to the LHS operand and the second half to the RHS operand;
// DemandedLHS/DemandedRHS each get half as many bits as DemandedElts.
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
                                APInt &DemandedLHS, APInt &DemandedRHS) {
  int NumLanes = VT.getSizeInBits() / 128;
  int NumElts = DemandedElts.getBitWidth();
  int NumInnerElts = NumElts / 2;
  int NumEltsPerLane = NumElts / NumLanes;
  int NumInnerEltsPerLane = NumInnerElts / NumLanes;

  DemandedLHS = APInt::getNullValue(NumInnerElts);
  DemandedRHS = APInt::getNullValue(NumInnerElts);

  // Map DemandedElts to the packed operands, one lane at a time.
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int OuterBase = Lane * NumEltsPerLane;
    int InnerBase = Lane * NumInnerEltsPerLane;
    for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
      int LoIdx = OuterBase + Elt;
      int HiIdx = LoIdx + NumInnerEltsPerLane;
      if (DemandedElts[LoIdx])
        DemandedLHS.setBit(InnerBase + Elt);
      if (DemandedElts[HiIdx])
        DemandedRHS.setBit(InnerBase + Elt);
    }
  }
}
// Split the demanded elements of a HADD/HSUB result between its two operands.
// Within each 128-bit lane, a result element in the first half demands an
// adjacent pair of LHS elements and one in the second half demands an adjacent
// pair of RHS elements. DemandedLHS/DemandedRHS have as many bits as
// DemandedElts.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
                                 APInt &DemandedLHS, APInt &DemandedRHS) {
  int NumLanes = VT.getSizeInBits() / 128;
  int NumElts = DemandedElts.getBitWidth();
  int NumEltsPerLane = NumElts / NumLanes;
  int HalfEltsPerLane = NumEltsPerLane / 2;

  DemandedLHS = APInt::getNullValue(NumElts);
  DemandedRHS = APInt::getNullValue(NumElts);

  // Map DemandedElts to the horizontal operands.
  for (int Idx = 0; Idx != NumElts; ++Idx) {
    if (!DemandedElts[Idx])
      continue;

    // Position inside the lane and the index of the lane's first element.
    int LocalIdx = Idx % NumEltsPerLane;
    int LaneBase = Idx - LocalIdx;

    bool FromLHS = LocalIdx < HalfEltsPerLane;
    APInt &Demanded = FromLHS ? DemandedLHS : DemandedRHS;
    int PairIdx = FromLHS ? LocalIdx : LocalIdx - HalfEltsPerLane;

    // Both source elements of the pair feeding this result are demanded.
    int PairBase = LaneBase + 2 * PairIdx;
    Demanded.setBit(PairBase + 0);
    Demanded.setBit(PairBase + 1);
  }
}
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
/// Sets \p IsUnary to true if only one source is used. Note that this will set
/// IsUnary for shuffles which use a single input multiple times, and in those
/// cases it will adjust the mask to only have indices within that single input.
/// It is an error to call this with non-empty Mask/Ops vectors.
///
/// \param N                 Target shuffle node to decode. Its opcode must be
///                          one of the cases handled in the switch below; any
///                          other opcode is unreachable.
/// \param VT                Vector type supplying the element count and
///                          element size used for decoding.
/// \param AllowSentinelZero If false, fail when the decoded mask contains an
///                          SM_SentinelZero entry.
/// \param Ops               [out] Shuffle source operands.
/// \param Mask              [out] Decoded shuffle mask.
/// \param IsUnary           [out] Whether only one source is used.
/// \returns false if the mask could not be decoded (e.g. a non-constant mask
/// operand); \p Ops and \p IsUnary may already have been written in that case.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
SmallVectorImpl<SDValue> &Ops,
SmallVectorImpl<int> &Mask, bool &IsUnary) {
unsigned NumElems = VT.getVectorNumElements();
unsigned MaskEltSize = VT.getScalarSizeInBits();
SmallVector<uint64_t, 32> RawMask;
APInt RawUndefs;
SDValue ImmN;
assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
IsUnary = false;
// Set when a two-input shuffle is given the same node for both inputs; the
// mask is then remapped onto the first input after decoding.
bool IsFakeUnary = false;
switch (N->getOpcode()) {
case X86ISD::BLENDI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUFP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeSHUFPMask(NumElems, MaskEltSize,
cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::INSERTPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::EXTRQI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
// Only decodable when the bit length and bit index are constants; otherwise
// Mask stays empty and the empty-mask check after the switch fails.
if (isa<ConstantSDNode>(N->getOperand(1)) &&
isa<ConstantSDNode>(N->getOperand(2))) {
int BitLen = N->getConstantOperandVal(1);
int BitIdx = N->getConstantOperandVal(2);
DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
IsUnary = true;
}
break;
case X86ISD::INSERTQI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
// As for EXTRQI, a non-constant length/index leaves Mask empty.
if (isa<ConstantSDNode>(N->getOperand(2)) &&
isa<ConstantSDNode>(N->getOperand(3))) {
int BitLen = N->getConstantOperandVal(2);
int BitIdx = N->getConstantOperandVal(3);
DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
}
break;
case X86ISD::UNPCKH:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::UNPCKL:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVHLPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeMOVHLPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVLHPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeMOVLHPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::PALIGNR:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
// The shuffle sources are recorded in swapped order: operand 1 first, then
// operand 0.
Ops.push_back(N->getOperand(1));
Ops.push_back(N->getOperand(0));
break;
case X86ISD::VSHLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = true;
break;
case X86ISD::VSRLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = true;
break;
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSHUFMask(NumElems, MaskEltSize,
cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::PSHUFHW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = true;
break;
case X86ISD::PSHUFLW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = true;
break;
case X86ISD::VZEXT_MOVL:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeZeroMoveLowMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::VBROADCAST: {
SDValue N0 = N->getOperand(0);
// See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
// add the pre-extracted value to the Ops vector.
if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N0.getOperand(0).getValueType() == VT &&
N0.getConstantOperandVal(1) == 0)
Ops.push_back(N0.getOperand(0));
// We only decode broadcasts of same-sized vectors, unless the broadcast
// came from an extract from the original width. If we found one, we
// pushed it to the Ops vector above.
if (N0.getValueType() == VT || !Ops.empty()) {
DecodeVectorBroadcast(NumElems, Mask);
IsUnary = true;
break;
}
return false;
}
case X86ISD::VPERMILPV: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
IsUnary = true;
SDValue MaskNode = N->getOperand(1);
// Variable mask: only decodable when the mask operand is a constant.
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::PSHUFB: {
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = true;
SDValue MaskNode = N->getOperand(1);
// The mask is always read as 8-bit elements.
if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
DecodePSHUFBMask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::VPERMI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
case X86ISD::MOVSD:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
// IsUnary is left false here, so both operands are always returned.
DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
break;
case X86ISD::VPERM2X128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUF128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize,
cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVSLDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVSLDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::MOVSHDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVSHDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::MOVDDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVDDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::VPERMIL2: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
SDValue MaskNode = N->getOperand(2);
SDValue CtrlNode = N->getOperand(3);
// Both the control immediate and the mask operand must be constants.
if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
unsigned CtrlImm = CtrlOp->getZExtValue();
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
Mask);
break;
}
}
return false;
}
case X86ISD::VPPERM: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
SDValue MaskNode = N->getOperand(2);
// The mask is always read as 8-bit elements.
if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
DecodeVPPERMMask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::VPERMV: {
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = true;
// Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
Ops.push_back(N->getOperand(1));
SDValue MaskNode = N->getOperand(0);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMVMask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::VPERMV3: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
// Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
Ops.push_back(N->getOperand(0));
Ops.push_back(N->getOperand(2));
SDValue MaskNode = N->getOperand(1);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
default: llvm_unreachable("unknown target shuffle node");
}
// Empty mask indicates the decode failed.
if (Mask.empty())
return false;
// Check if we're getting a shuffle mask with zero'd elements.
if (!AllowSentinelZero)
if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
return false;
// If we have a fake unary shuffle, the shuffle mask is spread across two
// inputs that are actually the same node. Re-map the mask to always point
// into the first input.
if (IsFakeUnary)
for (int &M : Mask)
if (M >= (int)Mask.size())
M -= Mask.size();
// If we didn't already add operands in the opcode-specific code, default to
// adding 1 or 2 operands starting at 0.
if (Ops.empty()) {
Ops.push_back(N->getOperand(0));
// Fake unary shuffles still return both (identical) operands.
if (!IsUnary || IsFakeUnary)
Ops.push_back(N->getOperand(1));
}
return true;
}
/// Work out which lanes of a shuffle can be lowered as undef or zero.
///
/// A lane is "zeroable" when it can be materialised as zero: its mask entry is
/// negative, it reads an undef element of its input, or it reads an element
/// that is known to be zero. x86 has many cheap ways of zeroing lanes, so
/// finding as many of them as possible simplifies the shuffle that remains.
///
/// On return bit i of \p KnownUndef / \p KnownZero is set when lane i of the
/// shuffle described by \p Mask over \p V1 and \p V2 was found to be
/// undef / zero. Only all-zeros build vectors and BUILD_VECTOR operands are
/// inspected.
static void computeZeroableShuffleElements(ArrayRef<int> Mask,
                                           SDValue V1, SDValue V2,
                                           APInt &KnownUndef, APInt &KnownZero) {
  const int NumLanes = Mask.size();
  KnownUndef = KnownZero = APInt::getNullValue(NumLanes);

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);
  const bool LHSAllZeros = ISD::isBuildVectorAllZeros(V1.getNode());
  const bool RHSAllZeros = ISD::isBuildVectorAllZeros(V2.getNode());

  const int VecBits = V1.getValueSizeInBits();
  const int LaneBits = VecBits / NumLanes;
  assert(!(VecBits % LaneBits) && "Illegal shuffle mask size");

  for (int Lane = 0; Lane < NumLanes; ++Lane) {
    int Idx = Mask[Lane];

    // Negative mask entries are recorded as undef.
    if (Idx < 0) {
      KnownUndef.setBit(Lane);
      continue;
    }

    // Reading from an all-zeros input is trivially zero.
    const bool ReadsLHS = Idx < NumLanes;
    if (ReadsLHS ? LHSAllZeros : RHSAllZeros) {
      KnownZero.setBit(Lane);
      continue;
    }

    // Pick the input being read and make the index relative to it.
    SDValue Src = ReadsLHS ? V1 : V2;
    Idx %= NumLanes;

    // Only BUILD_VECTOR operands can currently be searched for UNDEF/ZERO.
    if (Src.getOpcode() != ISD::BUILD_VECTOR)
      continue;

    const unsigned NumSrcOps = Src.getNumOperands();

    // Fewer (wider) source elements than lanes: the slice of the wider source
    // element that this lane maps onto must be UNDEF/ZERO.
    if ((NumLanes % NumSrcOps) == 0) {
      const int Ratio = NumLanes / NumSrcOps;
      SDValue Elt = Src.getOperand(Idx / Ratio);

      // True if the LaneBits-wide slice of Bits read by this lane is zero.
      auto SliceIsZero = [&](const APInt &Bits) {
        return Bits.extractBits(LaneBits, (Idx % Ratio) * LaneBits)
            .isNullValue();
      };

      if (Elt.isUndef())
        KnownUndef.setBit(Lane);

      if (X86::isZeroNode(Elt)) {
        KnownZero.setBit(Lane);
      } else if (auto *IntC = dyn_cast<ConstantSDNode>(Elt)) {
        if (SliceIsZero(IntC->getAPIntValue()))
          KnownZero.setBit(Lane);
      } else if (auto *FPC = dyn_cast<ConstantFPSDNode>(Elt)) {
        if (SliceIsZero(FPC->getValueAPF().bitcastToAPInt()))
          KnownZero.setBit(Lane);
      }
      continue;
    }

    // More (narrower) source elements than lanes: every source element
    // covered by this lane must be UNDEF, or every one must be ZERO.
    if ((NumSrcOps % NumLanes) == 0) {
      const int Ratio = NumSrcOps / NumLanes;
      bool EveryUndef = true;
      bool EveryZero = true;
      for (int Sub = 0; Sub < Ratio; ++Sub) {
        SDValue Elt = Src.getOperand((Idx * Ratio) + Sub);
        EveryUndef = EveryUndef && Elt.isUndef();
        EveryZero = EveryZero && X86::isZeroNode(Elt);
      }
      if (EveryUndef)
        KnownUndef.setBit(Lane);
      if (EveryZero)
        KnownZero.setBit(Lane);
    }
  }
}
/// Decode the mask and operands of a target shuffle node and record which
/// result elements are known to be undef or zero, either because the decoded
/// mask says so or because of what the referenced input element is.
/// Returns true if \p N is a target shuffle whose mask could be decoded; on
/// success \p Mask / \p Ops hold the decoded shuffle and \p KnownUndef /
/// \p KnownZero have one bit per mask element.
/// FIXME: Merge this with computeZeroableShuffleElements?
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
                                         SmallVectorImpl<SDValue> &Ops,
                                         APInt &KnownUndef, APInt &KnownZero) {
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  bool IsUnary;
  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
    return false;

  const int NumMaskElts = Mask.size();
  KnownUndef = KnownZero = APInt::getNullValue(NumMaskElts);

  // A unary shuffle reads its single operand through both "inputs".
  SDValue Src0 = peekThroughBitcasts(Ops[0]);
  SDValue Src1 = IsUnary ? Src0 : peekThroughBitcasts(Ops[1]);

  assert((VT.getSizeInBits() % NumMaskElts) == 0 &&
         "Illegal split of shuffle value type");
  const unsigned MaskEltBits = VT.getSizeInBits() / NumMaskElts;

  // Extract known constant input data, first input before second.
  APInt SrcUndefs[2];
  SmallVector<APInt, 32> SrcBits[2];
  bool SrcIsConstant[2];
  SrcIsConstant[0] = getTargetConstantBitsFromNode(
      Src0, MaskEltBits, SrcUndefs[0], SrcBits[0], true, false);
  SrcIsConstant[1] = getTargetConstantBitsFromNode(
      Src1, MaskEltBits, SrcUndefs[1], SrcBits[1], true, false);

  for (int Elt = 0; Elt < NumMaskElts; ++Elt) {
    int Idx = Mask[Elt];

    // The decoder already produced SM_SentinelZero / SM_SentinelUndef.
    if (Idx < 0) {
      assert(isUndefOrZero(Idx) && "Unknown shuffle sentinel value!");
      if (Idx == SM_SentinelUndef)
        KnownUndef.setBit(Elt);
      if (Idx == SM_SentinelZero)
        KnownZero.setBit(Elt);
      continue;
    }

    // Work out which input is referenced and the index within it.
    const unsigned InputNo = Idx / NumMaskElts;
    SDValue Src = Idx < NumMaskElts ? Src0 : Src1;
    Idx %= NumMaskElts;

    // Anything read from an UNDEF input is undef.
    if (Src.isUndef()) {
      KnownUndef.setBit(Elt);
      continue;
    }

    // SCALAR_TO_VECTOR - only the first element is defined, the rest are UNDEF.
    // TODO: We currently only set UNDEF for integer types - floats use the same
    // registers as vectors and many of the scalar folded loads rely on the
    // SCALAR_TO_VECTOR pattern.
    if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        (NumMaskElts % Src.getValueType().getVectorNumElements()) == 0) {
      const int Ratio =
          NumMaskElts / Src.getValueType().getVectorNumElements();
      const int SrcElt = Idx / Ratio;
      if (SrcElt == 0) {
        if (X86::isZeroNode(Src.getOperand(0)))
          KnownZero.setBit(Elt);
      } else if (!VT.isFloatingPoint()) {
        KnownUndef.setBit(Elt);
      }
      continue;
    }

    // Otherwise fall back to the input's constant bits, if there are any.
    if (!SrcIsConstant[InputNo])
      continue;
    if (SrcUndefs[InputNo][Idx])
      KnownUndef.setBit(Elt);
    else if (SrcBits[InputNo][Idx] == 0)
      KnownZero.setBit(Elt);
  }

  assert(VT.getVectorNumElements() == (unsigned)NumMaskElts &&
         "Different mask size from vector size!");
  return true;
}
// Overwrite shuffle mask elements with sentinels wherever the element is known
// to be undef or zero. Undef takes priority over zero; zero sentinels are only
// written when ResolveKnownZeros is set.
static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
                                              const APInt &KnownUndef,
                                              const APInt &KnownZero,
                                              bool ResolveKnownZeros = true) {
  const unsigned MaskSize = Mask.size();
  assert(KnownUndef.getBitWidth() == MaskSize &&
         KnownZero.getBitWidth() == MaskSize && "Shuffle mask size mismatch");

  for (unsigned Elt = 0; Elt != MaskSize; ++Elt) {
    if (KnownUndef[Elt]) {
      Mask[Elt] = SM_SentinelUndef;
      continue;
    }
    if (ResolveKnownZeros && KnownZero[Elt])
      Mask[Elt] = SM_SentinelZero;
  }
}
// Rebuild the known undef/zero bitmasks from the sentinel values present in a
// target shuffle mask. Both bitmasks are reset to the mask's size first.
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
                                              APInt &KnownUndef,
                                              APInt &KnownZero) {
  const unsigned MaskSize = Mask.size();
  KnownUndef = KnownZero = APInt::getNullValue(MaskSize);

  for (unsigned Elt = 0; Elt != MaskSize; ++Elt) {
    const int Idx = Mask[Elt];
    if (Idx == SM_SentinelUndef)
      KnownUndef.setBit(Elt);
    else if (Idx == SM_SentinelZero)
      KnownZero.setBit(Elt);
  }
}
// Forward declaration (for getFauxShuffleMask recursive check).
// getFauxShuffleMask calls this overload to decode the operands of the nodes
// it inspects, so the declaration has to precede it; the definition follows
// later in this file.
// TODO: Use DemandedElts variant.
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts);
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
// destination value type.
//
// N is the node to decode. DemandedElts must have one bit per element of N's
// value type (asserted below). Mask and Ops are cleared on entry and, on
// success, hold the decoded mask and its source operands; on failure they may
// be left partially filled. Depth is the current recursion depth: decoding
// gives up beyond SelectionDAG::MaxRecursionDepth and Depth + 1 is passed to
// the known-bits / sign-bits queries and to recursive shuffle decodes.
// ResolveKnownElts is forwarded to getTargetShuffleInputs in the
// INSERT_SUBVECTOR case.
// Returns true if N was decoded as a shuffle.
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts) {
if (Depth > SelectionDAG::MaxRecursionDepth)
return false;
Mask.clear();
Ops.clear();
MVT VT = N.getSimpleValueType();
unsigned NumElts = VT.getVectorNumElements();
unsigned NumSizeInBits = VT.getSizeInBits();
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
// Several cases below work at byte granularity, so reject types whose element
// or total size is not a whole number of bytes.
if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
return false;
assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
unsigned Opcode = N.getOpcode();
switch (Opcode) {
case ISD::VECTOR_SHUFFLE: {
// Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
// Copy the generic mask through unchanged when it passes the
// isUndefOrInRange check for [0, 2 * NumElts).
if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
Mask.append(ShuffleMask.begin(), ShuffleMask.end());
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(1));
return true;
}
return false;
}
case ISD::AND:
case X86ISD::ANDNP: {
// Attempt to decode as a per-byte mask.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
bool IsAndN = (X86ISD::ANDNP == Opcode);
// Constant byte value that forces the result byte to zero: 0 for AND, 255 for
// ANDNP. For ANDNP the constant is read from operand 0 and operand 1 is the
// value being masked; for AND it is the other way round.
uint64_t ZeroMask = IsAndN ? 255 : 0;
if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
return false;
for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
if (UndefElts[i]) {
Mask.push_back(SM_SentinelUndef);
continue;
}
const APInt &ByteBits = EltBits[i];
// A byte that is neither all-zeros nor all-ones cannot be expressed as a
// keep-or-zero shuffle element.
if (ByteBits != 0 && ByteBits != 255)
return false;
Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
}
Ops.push_back(IsAndN ? N1 : N0);
return true;
}
case ISD::OR: {
// Inspect each operand at the byte level. We can merge these into a
// blend shuffle mask if for each byte at least one is masked out (zero).
KnownBits Known0 =
DAG.computeKnownBits(N.getOperand(0), DemandedElts, Depth + 1);
KnownBits Known1 =
DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1);
// Only attempt the byte blend when neither operand has any known-one bits.
if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
bool IsByteMask = true;
unsigned NumSizeInBytes = NumSizeInBits / 8;
unsigned NumBytesPerElt = NumBitsPerElt / 8;
// One bit per byte of an element: ZeroMask marks bytes known zero in both
// operands, SelectMask marks bytes that must come from the second operand.
APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
// Byte i of each operand's known-zero mask.
unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue();
unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue();
if (LHS == 255 && RHS == 0)
SelectMask.setBit(i);
else if (LHS == 255 && RHS == 255)
ZeroMask.setBit(i);
else if (!(LHS == 0 && RHS == 255))
IsByteMask = false;
}
if (IsByteMask) {
// Emit a byte-level mask, repeating the same per-element byte pattern for
// every element of the vector.
for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) {
for (unsigned j = 0; j != NumBytesPerElt; ++j) {
// Bytes selected from the second operand are offset by the vector byte size.
unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0);
int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs));
Mask.push_back(Idx);
}
}
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(1));
return true;
}
}
// Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
// is a valid shuffle index.
SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
return false;
SmallVector<int, 64> SrcMask0, SrcMask1;
SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
// Both operands must themselves decode as shuffles (with known undef/zero
// elements resolved into sentinels).
if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
true) ||
!getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
true))
return false;
// Bring both masks to the size of the larger one before merging them.
size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
SmallVector<int, 64> Mask0, Mask1;
scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
for (size_t i = 0; i != MaskSize; ++i) {
if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
Mask.push_back(SM_SentinelUndef);
else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
Mask.push_back(SM_SentinelZero);
else if (Mask1[i] == SM_SentinelZero)
Mask.push_back(Mask0[i]);
else if (Mask0[i] == SM_SentinelZero)
// Indices into the second operand's inputs are placed after all of the
// first operand's inputs.
Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size()));
else
// Neither side is zero here, so the OR is not a plain element select.
return false;
}
Ops.append(SrcInputs0.begin(), SrcInputs0.end());
Ops.append(SrcInputs1.begin(), SrcInputs1.end());
return true;
}
case ISD::INSERT_SUBVECTOR: {
SDValue Src = N.getOperand(0);
SDValue Sub = N.getOperand(1);
EVT SubVT = Sub.getValueType();
unsigned NumSubElts = SubVT.getVectorNumElements();
// Requires a constant insertion index and N to be the only user of Sub.
if (!isa<ConstantSDNode>(N.getOperand(2)) ||
!N->isOnlyUserOf(Sub.getNode()))
return false;
uint64_t InsertIdx = N.getConstantOperandVal(2);
// Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Sub.getOperand(0).getValueType() == VT &&
isa<ConstantSDNode>(Sub.getOperand(1))) {
uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
// Start from the identity over Src, then point the inserted elements at the
// extracted range of the second operand.
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i)
Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
Ops.push_back(Src);
Ops.push_back(Sub.getOperand(0));
return true;
}
// Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
SmallVector<int, 64> SubMask;
SmallVector<SDValue, 2> SubInputs;
if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
SubMask, DAG, Depth + 1, ResolveKnownElts))
return false;
if (SubMask.size() != NumSubElts) {
assert(((SubMask.size() % NumSubElts) == 0 ||
(NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
if ((NumSubElts % SubMask.size()) == 0) {
// The decoded sub-mask is coarser than the subvector type: scale it up to
// NumSubElts elements.
int Scale = NumSubElts / SubMask.size();
SmallVector<int,64> ScaledSubMask;
scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask);
SubMask = ScaledSubMask;
} else {
// The decoded sub-mask is finer than the subvector type: work at the
// sub-mask's granularity instead, so the final mask has NumElts * Scale
// elements.
int Scale = SubMask.size() / NumSubElts;
NumSubElts = SubMask.size();
NumElts *= Scale;
InsertIdx *= Scale;
}
}
Ops.push_back(Src);
// Widen every input of the sub-shuffle to the full vector width by inserting
// it at index 0 of an undef vector with the same scalar type.
for (SDValue &SubInput : SubInputs) {
EVT SubSVT = SubInput.getValueType().getScalarType();
EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
NumSizeInBits / SubSVT.getSizeInBits());
Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
DAG.getUNDEF(AltVT), SubInput,
DAG.getIntPtrConstant(0, SDLoc(N))));
}
// Identity over Src, then overwrite the inserted range with the remapped
// sub-mask.
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i) {
int M = SubMask[i];
if (0 <= M) {
// Ops[0] is Src, so sub-input k is operand 1 + k of the decoded shuffle.
int InputIdx = M / NumSubElts;
M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
}
Mask[i + InsertIdx] = M;
}
return true;
}
case ISD::SCALAR_TO_VECTOR: {
// Match against a scalar_to_vector of an extract from a vector,
// for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
SDValue N0 = N.getOperand(0);
SDValue SrcExtract;
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
N0.getOperand(0).getValueType() == VT) ||
(N0.getOpcode() == X86ISD::PEXTRW &&
N0.getOperand(0).getValueType() == MVT::v8i16) ||
(N0.getOpcode() == X86ISD::PEXTRB &&
N0.getOperand(0).getValueType() == MVT::v16i8)) {
SrcExtract = N0;
}
if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
return false;
SDValue SrcVec = SrcExtract.getOperand(0);
EVT SrcVT = SrcVec.getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
// Number of source-sized elements of zero padding that follow the extracted
// element inside one destination element.
unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
if (NumSrcElts <= SrcIdx)
return false;
// Mask (in source elements): the extracted element, its zero padding, then
// undef for the remainder of the vector.
Ops.push_back(SrcVec);
Mask.push_back(SrcIdx);
Mask.append(NumZeros, SM_SentinelZero);
Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
return true;
}
case X86ISD::PINSRB:
case X86ISD::PINSRW: {
SDValue InVec = N.getOperand(0);
SDValue InScl = N.getOperand(1);
SDValue InIndex = N.getOperand(2);
// The insertion index must be a constant that is within the vector.
if (!isa<ConstantSDNode>(InIndex) ||
cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
return false;
uint64_t InIdx = N.getConstantOperandVal(2);
// Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
if (X86::isZeroNode(InScl)) {
Ops.push_back(InVec);
for (unsigned i = 0; i != NumElts; ++i)
Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
return true;
}
// Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
// TODO: Expand this to support INSERT_VECTOR_ELT/etc.
unsigned ExOp =
(X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
if (InScl.getOpcode() != ExOp)
return false;
SDValue ExVec = InScl.getOperand(0);
SDValue ExIndex = InScl.getOperand(1);
if (!isa<ConstantSDNode>(ExIndex) ||
cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
return false;
uint64_t ExIdx = InScl.getConstantOperandVal(1);
// Identity over InVec except for the inserted element, which reads element
// ExIdx of the second operand.
Ops.push_back(InVec);
Ops.push_back(ExVec);
for (unsigned i = 0; i != NumElts; ++i)
Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
return true;
}
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
"Unexpected input value type");
APInt EltsLHS, EltsRHS;
getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
// If we know input saturation won't happen we can treat this
// as a truncation shuffle.
if (Opcode == X86ISD::PACKSS) {
// Each non-undef input needs more than NumBitsPerElt sign bits in its
// demanded elements.
if ((!N0.isUndef() &&
DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
(!N1.isUndef() &&
DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
return false;
} else {
// Each non-undef input must have the upper NumBitsPerElt bits of its
// (2 * NumBitsPerElt)-bit demanded elements known to be zero.
APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
if ((!N0.isUndef() &&
!DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
(!N1.isUndef() &&
!DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
return false;
}
bool IsUnary = (N0 == N1);
Ops.push_back(N0);
if (!IsUnary)
Ops.push_back(N1);
createPackShuffleMask(VT, Mask, IsUnary);
return true;
}
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
uint64_t ShiftVal = N.getConstantOperandVal(1);
// Out of range bit shifts are guaranteed to be zero.
if (NumBitsPerElt <= ShiftVal) {
Mask.append(NumElts, SM_SentinelZero);
return true;
}
// We can only decode 'whole byte' bit shifts as shuffles.
// (Breaking out of the switch reaches the final 'return false'.)
if ((ShiftVal % 8) != 0)
break;
uint64_t ByteShift = ShiftVal / 8;
unsigned NumBytes = NumSizeInBits / 8;
unsigned NumBytesPerElt = NumBitsPerElt / 8;
Ops.push_back(N.getOperand(0));
// Clear mask to all zeros and insert the shifted byte indices.
Mask.append(NumBytes, SM_SentinelZero);
if (X86ISD::VSHLI == Opcode) {
// Left shift: within each element, byte j reads byte j - ByteShift.
for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j] = i + j - ByteShift;
} else {
// Right shift: within each element, byte j - ByteShift reads byte j.
for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j - ByteShift] = i + j;
}
return true;
}
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
// Only vector sources are decoded here.
if (!SrcVT.isVector())
return false;
if (NumSizeInBits != SrcVT.getSizeInBits()) {
assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
"Illegal broadcast type");
// Widen the source to the result width by inserting it at index 0 of an
// undef vector.
SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
NumSizeInBits / SrcVT.getScalarSizeInBits());
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
DAG.getUNDEF(SrcVT), Src,
DAG.getIntPtrConstant(0, SDLoc(N)));
}
// Every result element reads element 0 of the source.
Ops.push_back(Src);
Mask.append(NumElts, 0);
return true;
}
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND_VECTOR_INREG:
case ISD::ANY_EXTEND_VECTOR_INREG: {
SDValue Src = N.getOperand(0);
EVT SrcVT = Src.getValueType();
// Extended source must be a simple vector.
// It must also be a multiple of 128 bits wide with byte-sized elements.
if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
(SrcVT.getScalarSizeInBits() % 8) != 0)
return false;
unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
bool IsAnyExtend =
(ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
Mask);
if (NumSizeInBits != SrcVT.getSizeInBits()) {
assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
"Illegal zero-extension type");
// Widen the source to the result width by inserting it at index 0 of an
// undef vector.
SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(),
NumSizeInBits / NumSrcBitsPerElt);
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
DAG.getUNDEF(SrcVT), Src,
DAG.getIntPtrConstant(0, SDLoc(N)));
}
Ops.push_back(Src);
return true;
}
}
// Unhandled opcode, or a handled case that broke out of the switch.
return false;
}
/// Drops shuffle inputs that are undef, unreferenced, or duplicates of an
/// earlier input, renumbering the mask indices so that they refer to the
/// compacted input list. \p Inputs is replaced by the surviving inputs.
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
                                              SmallVectorImpl<int> &Mask) {
  const int Width = Mask.size();
  SmallVector<SDValue, 16> Kept;

  for (const SDValue &Input : Inputs) {
    // Index range [Begin, End) that currently refers to this input; it is the
    // slot the input would take if it were kept.
    const int Begin = Kept.size() * Width;
    const int End = Begin + Width;
    auto RefersToInput = [Begin, End](int Idx) {
      return (Begin <= Idx) && (Idx < End);
    };

    // Strip UNDEF input usage.
    if (Input.isUndef())
      for (int &M : Mask)
        if (RefersToInput(M))
          M = SM_SentinelUndef;

    // Unused input: drop it and slide every later index down by one slot.
    if (none_of(Mask, RefersToInput)) {
      for (int &M : Mask)
        if (Begin <= M)
          M -= Width;
      continue;
    }

    // Repeated input: find the first kept input equal to this one, if any.
    int RepeatOf = -1;
    for (int K = 0, NumKept = Kept.size(); K != NumKept; ++K) {
      if (Kept[K] == Input) {
        RepeatOf = K;
        break;
      }
    }
    if (RepeatOf >= 0) {
      // Redirect references to the earlier copy and slide later indices down.
      for (int &M : Mask) {
        if (M < Begin)
          continue;
        M = (M < End) ? ((M - Begin) + (RepeatOf * Width)) : (M - Width);
      }
      continue;
    }

    Kept.push_back(Input);
  }

  Inputs = Kept;
}
/// Decodes \p Op as a shuffle, first as a real target shuffle via
/// getTargetShuffleAndZeroables and otherwise as a "faux" shuffle via
/// getFauxShuffleMask, filling in the inputs, mask and known undef/zero
/// elements. For real target shuffles the SM_SentinelUndef / SM_SentinelZero
/// values are written into the mask only when \p ResolveKnownElts is set.
/// Returns true if the target shuffle mask was decoded.
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
                                   SmallVectorImpl<SDValue> &Inputs,
                                   SmallVectorImpl<int> &Mask,
                                   APInt &KnownUndef, APInt &KnownZero,
                                   SelectionDAG &DAG, unsigned Depth,
                                   bool ResolveKnownElts) {
  EVT OpVT = Op.getValueType();
  if (!(OpVT.isSimple() && OpVT.isVector()))
    return false;

  // Real target shuffle: the zeroable info comes from the decode itself.
  if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
    if (ResolveKnownElts)
      resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
    return true;
  }

  // Otherwise try the shuffle-like ops; their zeroable info is recovered from
  // the sentinels in the decoded mask.
  if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
                          ResolveKnownElts))
    return false;

  resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
  return true;
}
// Convenience overload: decode Op as a shuffle with every element demanded,
// discarding the known undef/zero bitmasks.
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
                                   SmallVectorImpl<int> &Mask,
                                   SelectionDAG &DAG, unsigned Depth = 0,
                                   bool ResolveKnownElts = true) {
  EVT OpVT = Op.getValueType();
  if (!(OpVT.isSimple() && OpVT.isVector()))
    return false;

  const unsigned NumOpElts = Op.getValueType().getVectorNumElements();
  APInt AllElts = APInt::getAllOnesValue(NumOpElts);
  APInt UnusedUndef, UnusedZero;
  return getTargetShuffleInputs(Op, AllElts, Inputs, Mask, UnusedUndef,
                                UnusedZero, DAG, Depth, ResolveKnownElts);
}
/// Finds the scalar value that ends up in element \p Index of the vector
/// produced by \p N, looking through shuffles, subvector insert/extract and
/// same-element-count bitcasts. Returns an empty SDValue if the element cannot
/// be traced or the search depth limit is reached.
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
                                   unsigned Depth) {
  // Limit search depth.
  if (Depth == 6)
    return SDValue();

  SDValue Val(N, 0);
  EVT VT = Val.getValueType();
  const unsigned Opc = Val.getOpcode();

  // Generic ISD::VECTOR_SHUFFLE: follow the mask into the selected operand.
  if (const auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N)) {
    const int SrcElt = Shuf->getMaskElt(Index);
    if (SrcElt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    const unsigned NumElts = VT.getVectorNumElements();
    const unsigned OpNo = (SrcElt < (int)NumElts) ? 0 : 1;
    SDValue Src = Shuf->getOperand(OpNo);
    return getShuffleScalarElt(Src.getNode(), SrcElt % NumElts, DAG,
                               Depth + 1);
  }

  // Target specific shuffles: decode the mask, then follow it.
  if (isTargetShuffle(Opc)) {
    MVT ShufVT = Val.getSimpleValueType();
    MVT ScalarVT = ShufVT.getVectorElementType();
    const int NumElts = (int)ShufVT.getVectorNumElements();

    SmallVector<int, 16> DecodedMask;
    SmallVector<SDValue, 16> DecodedOps;
    bool IsUnary;
    if (!getTargetShuffleMask(N, ShufVT, true, DecodedOps, DecodedMask,
                              IsUnary))
      return SDValue();

    const int SrcElt = DecodedMask[Index];
    if (SrcElt == SM_SentinelZero) {
      if (ScalarVT.isInteger())
        return DAG.getConstant(0, SDLoc(N), ScalarVT);
      return DAG.getConstantFP(+0.0, SDLoc(N), ScalarVT);
    }
    if (SrcElt == SM_SentinelUndef)
      return DAG.getUNDEF(ScalarVT);

    assert(0 <= SrcElt && SrcElt < (2*NumElts) && "Shuffle index out of range");
    SDValue Src = DecodedOps[(SrcElt < NumElts) ? 0 : 1];
    return getShuffleScalarElt(Src.getNode(), SrcElt % NumElts, DAG,
                               Depth + 1);
  }

  // insert_subvector with a constant index: pick the subvector when Index
  // falls inside the inserted range, otherwise the base vector.
  if (Opc == ISD::INSERT_SUBVECTOR && isa<ConstantSDNode>(N->getOperand(2))) {
    SDValue Base = N->getOperand(0);
    SDValue Sub = N->getOperand(1);
    const unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
    const uint64_t InsertAt = N->getConstantOperandVal(2);

    const bool InSub = InsertAt <= Index && Index < (InsertAt + NumSubElts);
    if (InSub)
      return getShuffleScalarElt(Sub.getNode(), Index - InsertAt, DAG,
                                 Depth + 1);
    return getShuffleScalarElt(Base.getNode(), Index, DAG, Depth + 1);
  }

  // extract_subvector with a constant index: offset into the source vector.
  if (Opc == ISD::EXTRACT_SUBVECTOR && isa<ConstantSDNode>(N->getOperand(1))) {
    SDValue Src = N->getOperand(0);
    const uint64_t ExtractAt = N->getConstantOperandVal(1);
    return getShuffleScalarElt(Src.getNode(), Index + ExtractAt, DAG,
                               Depth + 1);
  }

  // Look through a bitcast, but only between vectors that have the same
  // number of elements.
  if (Opc == ISD::BITCAST) {
    Val = Val.getOperand(0);
    EVT SrcVT = Val.getValueType();
    const unsigned NumElts = VT.getVectorNumElements();
    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElts)
      return SDValue();
  }

  // Nodes that actually hold scalar elements.
  switch (Val.getOpcode()) {
  case ISD::SCALAR_TO_VECTOR:
    if (Index == 0)
      return Val.getOperand(0);
    return DAG.getUNDEF(VT.getVectorElementType());
  case ISD::BUILD_VECTOR:
    return Val.getOperand(Index);
  default:
    return SDValue();
  }
}
// Build a vector by inserting each element flagged in NonZeros one at a time
// (PINSRB/PINSRW/PINSRD). Bit i of NonZeros marks operand i of the build
// vector as needing an insertion; NumZero is non-zero when the build vector
// also contains zero elements. Returns an empty SDValue if NonZeros selects no
// element.
static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
                                        unsigned NumNonZero, unsigned NumZero,
                                        SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
         "Illegal vector insertion");

  SDLoc dl(Op);
  SDValue Result;
  bool PendingFirstInsert = true;

  for (unsigned Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx) {
    if ((NonZeros & (1 << Idx)) == 0)
      continue;

    if (PendingFirstInsert) {
      PendingFirstInsert = false;

      // With no zero elements and the first insertion at index 0 the vector
      // can be started with SCALAR_TO_VECTOR. Otherwise start from a zero
      // vector, which also breaks any register dependency.
      if (!NumZero && Idx == 0) {
        assert(0 == Idx && "Expected insertion into zero-index");
        SDValue Scalar = DAG.getAnyExtOrTrunc(Op.getOperand(Idx), dl, MVT::i32);
        SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Scalar);
        Result = DAG.getBitcast(VT, Vec);
        continue;
      }
      Result = getZeroVector(VT, Subtarget, DAG, dl);
    }

    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                         Op.getOperand(Idx), DAG.getIntPtrConstant(Idx, dl));
  }

  return Result;
}
/// Custom lowering for a v16i8 build_vector. With SSE4.1 every flagged byte is
/// inserted directly; without it, adjacent byte pairs are merged into 16-bit
/// values and inserted into a v8i16, which is only attempted when at most 8
/// bytes are non-zero.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // SSE4.1 - use PINSRB to insert each byte directly.
  if (Subtarget.hasSSE41())
    return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
                                    Subtarget);

  if (NumNonZero > 8)
    return SDValue();

  SDLoc dl(Op);
  SDValue Vec;

  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
  for (unsigned Lo = 0; Lo < 16; Lo += 2) {
    const unsigned Hi = Lo + 1;
    const bool LoNonZero = (NonZeros & (1 << Lo)) != 0;
    const bool HiNonZero = (NonZeros & (1 << Hi)) != 0;
    if (!(LoNonZero || HiNonZero))
      continue;

    // FIXME: Investigate combining the first 4 bytes as a i32 instead.
    SDValue Pair;
    if (LoNonZero) {
      SDValue LoByte = Op.getOperand(Lo);
      if (NumZero || HiNonZero)
        Pair = DAG.getZExtOrTrunc(LoByte, dl, MVT::i32);
      else
        Pair = DAG.getAnyExtOrTrunc(LoByte, dl, MVT::i32);
    }

    if (HiNonZero) {
      SDValue HiByte = Op.getOperand(Hi);
      if (Lo == 0 && NumZero)
        HiByte = DAG.getZExtOrTrunc(HiByte, dl, MVT::i32);
      else
        HiByte = DAG.getAnyExtOrTrunc(HiByte, dl, MVT::i32);
      HiByte = DAG.getNode(ISD::SHL, dl, MVT::i32, HiByte,
                           DAG.getConstant(8, dl, MVT::i8));
      if (LoNonZero)
        Pair = DAG.getNode(ISD::OR, dl, MVT::i32, HiByte, Pair);
      else
        Pair = HiByte;
    }

    // The first pair starts the vector: at index 0 use SCALAR_TO_VECTOR,
    // anywhere else insert into a zero vector to break any register
    // dependency.
    if (!Vec) {
      if (Lo == 0) {
        Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Pair);
        Vec = DAG.getBitcast(MVT::v8i16, Vec);
        continue;
      }
      Vec = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
    }

    Pair = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Pair);
    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, Vec, Pair,
                      DAG.getIntPtrConstant(Lo / 2, dl));
  }

  return DAG.getBitcast(MVT::v16i8, Vec);
}
/// Custom lowering for a v8i16 build_vector: insert each flagged 16-bit
/// element individually. Without SSE4.1 this is only attempted when at most 4
/// elements are non-zero.
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  const bool TooManyInserts = NumNonZero > 4 && !Subtarget.hasSSE41();
  if (TooManyInserts)
    return SDValue();

  // Use PINSRW to insert each element directly.
  return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
                                  Subtarget);
}
/// Custom lower build_vector of v4i32 or v4f32.
///
/// Three lowerings are tried in order: a MOVDDUP of a two-element pair that is
/// repeated, a shuffle that blends one source vector with a zero/undef vector,
/// and (SSE4.1 only) an INSERTPS. Returns an empty SDValue if none applies.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// If this is a splat of a pair of elements, use MOVDDUP (unless the target
// has XOP; in that case defer lowering to potentially use VPERMIL2PS).
// Because we're creating a less complicated build vector here, we may enable
// further folding of the MOVDDUP via shuffle transforms.
if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
Op.getOperand(0) == Op.getOperand(2) &&
Op.getOperand(1) == Op.getOperand(3) &&
Op.getOperand(0) != Op.getOperand(1)) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
// Create a new build vector with the first 2 elements followed by undef
// padding, bitcast to v2f64, duplicate, and bitcast back.
SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
return DAG.getBitcast(VT, Dup);
}
// Find all zeroable elements.
// Undefs tracks the undef operands; Zeroable additionally includes operands
// that are zero nodes.
std::bitset<4> Zeroable, Undefs;
for (int i = 0; i < 4; ++i) {
SDValue Elt = Op.getOperand(i);
Undefs[i] = Elt.isUndef();
Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
}
assert(Zeroable.size() - Zeroable.count() > 1 &&
"We expect at least two non-zero elements!");
// We only know how to deal with build_vector nodes where elements are either
// zeroable or extract_vector_elt with constant index.
SDValue FirstNonZero;
unsigned FirstNonZeroIdx;
for (unsigned i = 0; i < 4; ++i) {
if (Zeroable[i])
continue;
SDValue Elt = Op.getOperand(i);
if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Elt.getOperand(1)))
return SDValue();
// Make sure that this node is extracting from a 128-bit vector.
MVT VT = Elt.getOperand(0).getSimpleValueType();
if (!VT.is128BitVector())
return SDValue();
// Remember the first non-zeroable element and its position.
if (!FirstNonZero.getNode()) {
FirstNonZero = Elt;
FirstNonZeroIdx = i;
}
}
assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
// V1 is the vector that the first non-zeroable element is extracted from; VT
// is that vector's type.
SDValue V1 = FirstNonZero.getOperand(0);
MVT VT = V1.getSimpleValueType();
// See if this build_vector can be lowered as a blend with zero.
// The loop stops at the first non-zeroable element that is not an in-place
// extract from V1; Elt, EltIdx and EltMaskIdx then describe that element and
// are reused by the INSERTPS lowering below.
SDValue Elt;
unsigned EltMaskIdx, EltIdx;
int Mask[4];
for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
if (Zeroable[EltIdx]) {
// The zero vector will be on the right hand side.
Mask[EltIdx] = EltIdx+4;
continue;
}
Elt = Op->getOperand(EltIdx);
// By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
EltMaskIdx = Elt.getConstantOperandVal(1);
if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
break;
Mask[EltIdx] = EltIdx;
}
// EltIdx == 4 means the loop completed: every element is either zeroable or
// already in its final position within V1.
if (EltIdx == 4) {
// Let the shuffle legalizer deal with blend operations.
// If every zeroable element is actually undef, blend with undef instead of
// a zero vector.
SDValue VZeroOrUndef = (Zeroable == Undefs)
? DAG.getUNDEF(VT)
: getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
// NOTE(review): VT was taken from V1 above and V1 has not been reassigned
// since, so this comparison looks like it is always false - confirm.
if (V1.getSimpleValueType() != VT)
V1 = DAG.getBitcast(VT, V1);
return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
}
// See if we can lower this build_vector to a INSERTPS.
if (!Subtarget.hasSSE41())
return SDValue();
// V2 is the vector that the element which stopped the blend loop is extracted
// from; it supplies the inserted element.
SDValue V2 = Elt.getOperand(0);
// If that element is also the first non-zeroable one, V1 is not fixed yet:
// clear it so the loop below takes it from the next non-zeroable element.
if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
V1 = SDValue();
// Every remaining non-zeroable element must be an in-place extract from V1.
bool CanFold = true;
for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
if (Zeroable[i])
continue;
SDValue Current = Op->getOperand(i);
SDValue SrcVector = Current->getOperand(0);
if (!V1.getNode())
V1 = SrcVector;
CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
}
if (!CanFold)
return SDValue();
assert(V1.getNode() && "Expected at least two non-zero elements!");
// The INSERTPS node is built on v4f32, so bitcast both sources if needed.
if (V1.getSimpleValueType() != MVT::v4f32)
V1 = DAG.getBitcast(MVT::v4f32, V1);
if (V2.getSimpleValueType() != MVT::v4f32)
V2 = DAG.getBitcast(MVT::v4f32, V2);
// Ok, we can emit an INSERTPS instruction.
// Immediate layout as assembled here: bits 7:6 hold the element index within
// V2 (EltMaskIdx), bits 5:4 the destination index (EltIdx), and the low bits
// the mask of zeroable elements.
unsigned ZMask = Zeroable.to_ulong();
unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
SDLoc DL(Op);
SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
DAG.getIntPtrConstant(InsertPSMask, DL, true));
return DAG.getBitcast(VT, Result);
}
/// Builds a whole-vector byte shift (VSHLDQ when \p isLeft, VSRLDQ otherwise)
/// of the 128-bit value \p SrcOp by \p NumBits bits, which must be a multiple
/// of 8. The shift is performed on a v16i8 bitcast of the source and the
/// result is bitcast back to \p VT.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
                         SelectionDAG &DAG, const TargetLowering &TLI,
                         const SDLoc &dl) {
  assert(VT.is128BitVector() && "Unknown type for VShift");

  MVT ByteVT = MVT::v16i8;
  unsigned ShiftOpc = X86ISD::VSRLDQ;
  if (isLeft)
    ShiftOpc = X86ISD::VSHLDQ;

  SDValue Bytes = DAG.getBitcast(ByteVT, SrcOp);
  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
  SDValue ByteAmt = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
  SDValue Shifted = DAG.getNode(ShiftOpc, dl, ByteVT, Bytes, ByteAmt);
  return DAG.getBitcast(VT, Shifted);
}
/// Try to replace a scalar i32/f32 load from a stack slot with a load of the
/// whole vector containing it, followed by a splat shuffle of the loaded lane.
///
/// \param SrcOp The scalar value to splat. Only a simple, normal load whose
///              address is a frame index (optionally plus a non-negative
///              constant) is handled.
/// \param VT    The requested vector type; its size in bytes is used as the
///              required alignment of the widened load.
/// \param dl    Debug location for the new load and shuffle.
/// \param DAG   The DAG in which the new nodes are created.
/// \returns A splat VECTOR_SHUFFLE of the widened load (element type taken
///          from the scalar load, element count from \p VT), or an empty
///          SDValue when the load cannot be widened.
///
/// This may raise the alignment of a non-fixed stack object. Every check that
/// can reject the transform runs before that happens, so a rejected lowering
/// leaves the frame layout untouched.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
                                      SelectionDAG &DAG) {
  // Check if the scalar load can be widened into a vector load. And if
  // the address is "base + cst" see if the cst can be "absorbed" into
  // the shuffle mask.
  auto *LD = dyn_cast<LoadSDNode>(SrcOp);
  if (!LD || !ISD::isNormalLoad(LD) || !LD->isSimple())
    return SDValue();

  EVT PVT = LD->getValueType(0);
  if (PVT != MVT::i32 && PVT != MVT::f32)
    return SDValue();

  // Only "frame index" and "frame index + constant" addresses are handled.
  SDValue Ptr = LD->getBasePtr();
  int FI = -1;
  int64_t Offset = 0;
  if (auto *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
    FI = FINode->getIndex();
  } else if (DAG.isBaseWithConstantOffset(Ptr) &&
             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
    FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
    Offset = Ptr.getConstantOperandVal(1);
    Ptr = Ptr.getOperand(0);
  } else {
    return SDValue();
  }

  // FIXME: 256-bit vector instructions don't require a strict alignment,
  // improve this code to support it better.
  unsigned RequiredAlign = VT.getSizeInBits() / 8;

  // (Offset % 16 or 32) must be multiple of 4. Then address is then
  // Ptr + (Offset & ~15).
  // These checks are pure, so do them before touching the frame info below.
  if (Offset < 0)
    return SDValue();
  if ((Offset % RequiredAlign) & 3)
    return SDValue();

  // Make sure the stack object alignment is at least 16 or 32.
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
    // Can't change the alignment. FIXME: It's possible to compute
    // the exact stack offset and reference FI + adjust offset instead.
    // If someone *really* cares about this. That's the way to implement it.
    if (MFI.isFixedObjectIndex(FI))
      return SDValue();
    MFI.setObjectAlignment(FI, RequiredAlign);
  }

  // Round the offset down to a multiple of the required alignment; the
  // remainder, in 4-byte units, selects the lane to splat.
  int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
  if (StartOffset) {
    SDLoc DL(Ptr);
    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                      DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
  }
  int EltNo = (Offset - StartOffset) >> 2;

  unsigned NumElems = VT.getVectorNumElements();
  EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
  SDValue Chain = LD->getChain();
  // NOTE(review): the pointer info is LD's own info advanced by StartOffset;
  // confirm this still describes Ptr + StartOffset when LD itself was loaded
  // from a non-zero offset.
  SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
                           LD->getPointerInfo().getWithOffset(StartOffset));

  // Broadcast lane EltNo of the widened load to every result lane.
  SmallVector<int, 8> Mask(NumElems, EltNo);
  return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
}
// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
//
// Looks through BITCAST, TRUNCATE, SCALAR_TO_VECTOR, SRL by a constant
// multiple of 8 bits, and EXTRACT_VECTOR_ELT with a constant index and
// byte-sized elements. On success \p Ld is the simple, non-extending load that
// was found and \p ByteOffset is the byte offset accumulated on the way back
// up; returns false if no such load is reached.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
  // Base case: a plain non-extending load terminates the search.
  if (ISD::isNON_EXTLoad(Elt.getNode())) {
    auto *Candidate = cast<LoadSDNode>(Elt);
    if (!Candidate->isSimple())
      return false;
    Ld = Candidate;
    ByteOffset = 0;
    return true;
  }

  const unsigned Opc = Elt.getOpcode();

  // These wrappers add no offset; just keep looking at the operand.
  if (Opc == ISD::BITCAST || Opc == ISD::TRUNCATE ||
      Opc == ISD::SCALAR_TO_VECTOR)
    return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);

  // A right shift by a whole number of bytes moves the offset forward.
  if (Opc == ISD::SRL) {
    if (!isa<ConstantSDNode>(Elt.getOperand(1)))
      return false;
    uint64_t ShiftBits = Elt.getConstantOperandVal(1);
    if ((ShiftBits % 8) != 0 ||
        !findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset))
      return false;
    ByteOffset += ShiftBits / 8;
    return true;
  }

  // Extracting lane Idx skips Idx whole elements of the source vector.
  if (Opc == ISD::EXTRACT_VECTOR_ELT) {
    if (!isa<ConstantSDNode>(Elt.getOperand(1)))
      return false;
    SDValue Vec = Elt.getOperand(0);
    unsigned VecEltBits = Vec.getScalarValueSizeInBits();
    unsigned ResultBits = Elt.getScalarValueSizeInBits();
    if (ResultBits != VecEltBits || (VecEltBits % 8) != 0 ||
        !findEltLoadSrc(Vec, Ld, ByteOffset))
      return false;
    uint64_t Lane = Elt.getConstantOperandVal(1);
    ByteOffset += Lane * (VecEltBits / 8);
    return true;
  }

  return false;
}
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
///
/// \param VT              Type of the value to produce.
/// \param Elts            Initializing elements; each is classified as a
///                        load, a zero or an undef.
/// \param DL              Debug location for created nodes.
/// \param DAG             The DAG in which new nodes are created.
/// \param Subtarget       Queried for AVX/AVX2 availability.
/// \param isAfterLegalize When true, a full-width load is only formed if
///                        ISD::LOAD is legal for \p VT, and no zero-clearing
///                        shuffle is created.
/// \returns The replacement value, or an empty SDValue if no pattern matched.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
bool isAfterLegalize) {
// Only whole-byte scalar sizes are supported.
if ((VT.getScalarSizeInBits() % 8) != 0)
return SDValue();
unsigned NumElems = Elts.size();
int LastLoadedElt = -1;
// One bit per element: exactly one of these three masks is set for each
// element once the classification loop below completes.
APInt LoadMask = APInt::getNullValue(NumElems);
APInt ZeroMask = APInt::getNullValue(NumElems);
APInt UndefMask = APInt::getNullValue(NumElems);
// For loaded elements: the source load and the element's byte offset in it.
SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
// For each element in the initializer, see if we've found a load, zero or an
// undef.
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = peekThroughBitcasts(Elts[i]);
if (!Elt.getNode())
return SDValue();
if (Elt.isUndef()) {
UndefMask.setBit(i);
continue;
}
if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
ZeroMask.setBit(i);
continue;
}
// Each loaded element must be the correct fractional portion of the
// requested vector load.
unsigned EltSizeInBits = Elt.getValueSizeInBits();
if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
return SDValue();
if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
return SDValue();
// The element must lie entirely within the value produced by its load.
unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
return SDValue();
LoadMask.setBit(i);
LastLoadedElt = i;
}
assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
LoadMask.countPopulation()) == NumElems &&
"Incomplete element masks");
// Handle Special Cases - all undef or undef/zero.
if (UndefMask.countPopulation() == NumElems)
return DAG.getUNDEF(VT);
// FIXME: Should we return this as a BUILD_VECTOR instead?
if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
return VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Index of the lowest set bit in LoadMask, i.e. the first loaded element.
int FirstLoadedElt = LoadMask.countTrailingZeros();
SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
EVT EltBaseVT = EltBase.getValueType();
assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
"Register/Memory size mismatch");
LoadSDNode *LDBase = Loads[FirstLoadedElt];
assert(LDBase && "Did not find base load for merging consecutive loads");
unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
unsigned BaseSizeInBytes = BaseSizeInBits / 8;
// Number of bits spanned from the first to the last loaded element.
int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
// TODO: Support offsetting the base load.
if (ByteOffsets[FirstLoadedElt] != 0)
return SDValue();
// Check to see if the element's load is consecutive to the base load
// or offset from a previous (already checked) load.
auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
LoadSDNode *Ld = Loads[EltIdx];
int64_t ByteOffset = ByteOffsets[EltIdx];
// A non-zero offset that is a whole number of base elements: accept if the
// element that many positions earlier is offset 0 of the same load.
if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
}
// Otherwise ask the DAG whether Ld sits the expected number of elements
// after the base load.
return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
EltIdx - FirstLoadedElt);
};
// Consecutive loads can contain UNDEFS but not ZERO elements.
// Consecutive loads with UNDEFs and ZEROs elements require a
// an additional shuffle stage to clear the ZERO elements.
bool IsConsecutiveLoad = true;
bool IsConsecutiveLoadWithZeros = true;
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
if (LoadMask[i]) {
if (!CheckConsecutiveLoad(LDBase, i)) {
IsConsecutiveLoad = false;
IsConsecutiveLoadWithZeros = false;
break;
}
} else if (ZeroMask[i]) {
IsConsecutiveLoad = false;
}
}
// Create a load of type VT from LDBase's chain, pointer, pointer info,
// alignment and flags, then call makeEquivalentMemoryOrdering for every
// original load that was found.
auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
auto MMOFlags = LDBase->getMemOperand()->getFlags();
assert(LDBase->isSimple() &&
"Cannot merge volatile or atomic loads.");
SDValue NewLd =
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
for (auto *LD : Loads)
if (LD)
DAG.makeEquivalentMemoryOrdering(LD, NewLd);
return NewLd;
};
// Check if the base load is entirely dereferenceable.
bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
// LOAD - all consecutive load/undefs (must start/end with a load or be
// entirely dereferenceable). If we have found an entire vector of loads and
// undefs, then return a large load of the entire vector width starting at the
// base pointer. If the vector contains zeros, then attempt to shuffle those
// elements.
if (FirstLoadedElt == 0 &&
(LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&
(IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
// Don't create 256-bit non-temporal aligned loads without AVX2 as these
// will lower to regular temporal loads and use the cache.
if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
VT.is256BitVector() && !Subtarget.hasInt256())
return SDValue();
// A single element already is the whole value; just retype it.
if (NumElems == 1)
return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
// No zero elements: the plain wide load is the result.
if (!ZeroMask)
return CreateLoad(VT, LDBase);
// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
// vector and a zero vector to clear out the zero elements.
if (!isAfterLegalize && VT.isVector()) {
unsigned NumMaskElts = VT.getVectorNumElements();
if ((NumMaskElts % NumElems) == 0) {
// Each entry of Elts covers Scale consecutive lanes of VT.
unsigned Scale = NumMaskElts / NumElems;
SmallVector<int, 4> ClearMask(NumMaskElts, -1);
for (unsigned i = 0; i < NumElems; ++i) {
if (UndefMask[i])
continue;
// Zero elements select from the second shuffle operand (the zero
// vector); loaded elements select from the first (the load).
int Offset = ZeroMask[i] ? NumMaskElts : 0;
for (unsigned j = 0; j != Scale; ++j)
ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
}
SDValue V = CreateLoad(VT, LDBase);
SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT);
return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
}
}
}
// If the upper half of a ymm/zmm load is undef then just load the lower half.
if (VT.is256BitVector() || VT.is512BitVector()) {
unsigned HalfNumElems = NumElems / 2;
if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
EVT HalfVT =
EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
// Recurse on the lower half of the elements and insert the result at
// index 0 of an undef vector of the full type.
SDValue HalfLD =
EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
DAG, Subtarget, isAfterLegalize);
if (HalfLD)
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
HalfLD, DAG.getIntPtrConstant(0, DL));
}
}
// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
(LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
// Scalar type with the width of the loaded span, and the vector of such
// scalars that has the same total size as VT.
MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
: MVT::getIntegerVT(LoadSizeInBits);
MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
if (TLI.isTypeLegal(VecVT)) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
LDBase->getPointerInfo(),
LDBase->getAlignment(),
MachineMemOperand::MOLoad);
for (auto *LD : Loads)
if (LD)
DAG.makeEquivalentMemoryOrdering(LD, ResNode);
return DAG.getBitcast(VT, ResNode);
}
}
// BROADCAST - match the smallest possible repetition pattern, load that
// scalar/subvector element and then broadcast to the entire vector.
if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
// Try repetition lengths 1, 2, 4, ... elements, smallest first.
for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
unsigned RepeatSize = SubElems * BaseSizeInBits;
unsigned ScalarSize = std::min(RepeatSize, 64u);
if (!Subtarget.hasAVX2() && ScalarSize < 32)
continue;
bool Match = true;
// RepeatedLoads[k] collects the element seen at positions congruent to k
// modulo SubElems; all such loaded elements must be identical.
SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
for (unsigned i = 0; i != NumElems && Match; ++i) {
if (!LoadMask[i])
continue;
SDValue Elt = peekThroughBitcasts(Elts[i]);
if (RepeatedLoads[i % SubElems].isUndef())
RepeatedLoads[i % SubElems] = Elt;
else
Match &= (RepeatedLoads[i % SubElems] == Elt);
}
// We must have loads at both ends of the repetition.
Match &= !RepeatedLoads.front().isUndef();
Match &= !RepeatedLoads.back().isUndef();
if (!Match)
continue;
// Type of one repetition: a scalar of ScalarSize bits, widened to a vector
// of such scalars when the repetition is larger than ScalarSize.
EVT RepeatVT =
VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
: EVT::getFloatingPointVT(ScalarSize);
if (RepeatSize > ScalarSize)
RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
RepeatSize / ScalarSize);
EVT BroadcastVT =
EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
VT.getSizeInBits() / ScalarSize);
if (TLI.isTypeLegal(BroadcastVT)) {
// Recurse to form the load of a single repetition, then broadcast it.
if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
: X86ISD::VBROADCAST;
SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
return DAG.getBitcast(VT, Broadcast);
}
}
}
}
return SDValue();
}
// Combine a vector op (shuffle etc.) that is equal to build_vector load1,
// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
// are consecutive, non-overlapping, and in the right order.
//
// Each scalar element of \p N is recovered with getShuffleScalarElt; if any
// element cannot be recovered the combine is abandoned. Otherwise the elements
// are handed to EltsFromConsecutiveLoads.
static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL,
                                         SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget,
                                         bool isAfterLegalize) {
  const unsigned NumVecElts = VT.getVectorNumElements();

  SmallVector<SDValue, 64> Elts;
  for (unsigned Lane = 0; Lane != NumVecElts; ++Lane) {
    SDValue Scalar = getShuffleScalarElt(N, Lane, DAG, 0);
    if (!Scalar)
      return SDValue();
    Elts.push_back(Scalar);
  }

  assert(Elts.size() == VT.getVectorNumElements());
  return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
                                  isAfterLegalize);
}
/// Build an IR constant vector from the low \p SplatBitSize bits of
/// \p SplatValue.
///
/// The bits are cut into SplatBitSize / ScalarSize pieces of \p VT's scalar
/// width, element i taking the bits starting at position i * ScalarSize. Each
/// piece becomes a ConstantFP (32- or 64-bit only) when \p VT is a
/// floating-point type and an integer constant otherwise.
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
                                   unsigned SplatBitSize, LLVMContext &C) {
  const unsigned EltBits = VT.getScalarSizeInBits();
  const unsigned EltCount = SplatBitSize / EltBits;
  const bool IsFP = VT.isFloatingPoint();

  SmallVector<Constant *, 32> Pieces;
  for (unsigned Idx = 0; Idx != EltCount; ++Idx) {
    APInt Bits = SplatValue.extractBits(EltBits, EltBits * Idx);

    if (!IsFP) {
      Pieces.push_back(
          Constant::getIntegerValue(Type::getIntNTy(C, EltBits), Bits));
      continue;
    }

    assert((EltBits == 32 || EltBits == 64) &&
           "Unsupported floating point scalar size");
    const auto &Semantics =
        EltBits == 32 ? APFloat::IEEEsingle() : APFloat::IEEEdouble();
    Pieces.push_back(ConstantFP::get(C, APFloat(Semantics, Bits)));
  }
  return ConstantVector::get(ArrayRef<Constant *>(Pieces));
}
/// Decide, from the first user of \p N that settles the question, whether
/// \p N feeds a shuffle-like use.
///
/// Users are visited in use-list order. A VPERMV/VPERMV3 user that takes \p N
/// as its index operand answers false; any other target shuffle answers true;
/// a BITCAST user defers to the same query on the bitcast; and any other user
/// answers true when \p N has exactly one use. Returns false if no user
/// decides.
static bool isFoldableUseOfShuffle(SDNode *N) {
  const bool HasSingleUse = N->hasOneUse();

  for (SDNode *User : N->uses()) {
    const unsigned UserOpc = User->getOpcode();

    // VPERMV/VPERMV3 shuffles can never fold their index operands.
    const bool IsPermvIndex =
        UserOpc == X86ISD::VPERMV && User->getOperand(0).getNode() == N;
    if (IsPermvIndex)
      return false;
    const bool IsPermv3Index =
        UserOpc == X86ISD::VPERMV3 && User->getOperand(1).getNode() == N;
    if (IsPermv3Index)
      return false;

    if (isTargetShuffle(UserOpc))
      return true;

    // Ignore bitcasts: what matters is how the bitcast itself is used.
    if (UserOpc == ISD::BITCAST)
      return isFoldableUseOfShuffle(User);

    if (HasSingleUse)
      return true;
  }
  return false;
}
// Check if the build vector is a splat of a zero extended value: operand 0
// repeated at a fixed power-of-two stride greater than one, with every
// operand in between being a zero constant or undef.
// If so, return the value that is extended (operand 0).
// For example: (a,0,0,0,a,0,0,0,a,0,0,0,a,0,0,0) returns a.
// NumElt - on success, set to the number of zero extended identical values.
// EltType - on success, set to the integer type of the value including the
// zero extend.
// Neither output is written when an empty SDValue is returned.
static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
                                   unsigned &NumElt, MVT &EltType) {
  const unsigned NumOps = Op->getNumOperands();
  SDValue Splatted = Op->getOperand(0);

  auto IsZeroOrUndef = [](SDValue V) {
    return V.isUndef() || isNullConstant(V);
  };

  // Find the distance to the first repetition of operand 0; everything before
  // it must be zero or undef.
  unsigned Stride = NumOps;
  for (unsigned Idx = 1; Idx < NumOps; ++Idx) {
    SDValue Cur = Op->getOperand(Idx);
    if (Cur == Splatted) {
      Stride = Idx;
      break;
    }
    if (!IsZeroOrUndef(Cur))
      return SDValue();
  }
  if (Stride == 1 || !isPowerOf2_32(Stride))
    return SDValue();

  // The rest of the vector must repeat the same layout.
  for (unsigned Idx = Stride; Idx < NumOps; ++Idx) {
    SDValue Cur = Op->getOperand(Idx);
    const bool Matches =
        (Idx % Stride == 0) ? (Cur == Splatted) : IsZeroOrUndef(Cur);
    if (!Matches)
      return SDValue();
  }

  const unsigned NarrowBits = Op->getSimpleValueType(0).getScalarSizeInBits();
  EltType = MVT::getIntegerVT(NarrowBits * Stride);
  NumElt = NumOps / Stride;
  return Splatted;
}
/// Attempt to use the vbroadcast instruction to generate a splat value
/// from a splat BUILD_VECTOR which uses:
/// a. A single scalar load, or a constant.
/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
///
/// \param BVOp      The BUILD_VECTOR node to lower; its type must be a 128-,
///                  256- or 512-bit vector.
/// \param Subtarget Queried for AVX, AVX2, CDI, VLX and 32-bit mode.
/// \param DAG       The DAG in which new nodes are created.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// VBROADCAST requires AVX.
// TODO: Splats could be generated for non-AVX CPUs using SSE
// instructions, but there's less potential gain for only 128-bit vectors.
if (!Subtarget.hasAVX())
return SDValue();
MVT VT = BVOp->getSimpleValueType(0);
SDLoc dl(BVOp);
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Unsupported vector type for broadcast.");
// Ld is the splatted operand (empty if the node is not a splat);
// UndefElements is filled in by getSplatValue.
BitVector UndefElements;
SDValue Ld = BVOp->getSplatValue(&UndefElements);
// Attempt to use VBROADCASTM
// From this pattern:
// a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
// b. t1 = (build_vector t0 t0)
//
// Create (VBROADCASTM v2i1 X)
if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
MVT EltType = VT.getScalarType();
unsigned NumElts = VT.getVectorNumElements();
SDValue BOperand;
// On success this overwrites NumElts and EltType with the count and type of
// the zero-extended elements.
SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
(Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
// BOperand is the value underneath the bitcast (and zero-extend, if any).
if (ZeroExtended)
BOperand = ZeroExtended.getOperand(0);
else
BOperand = Ld.getOperand(0).getOperand(0);
MVT MaskVT = BOperand.getSimpleValueType();
if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
(EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
SDValue Brdcst =
DAG.getNode(X86ISD::VBROADCASTM, dl,
MVT::getVectorVT(EltType, NumElts), BOperand);
return DAG.getBitcast(VT, Brdcst);
}
}
}
unsigned NumElts = VT.getVectorNumElements();
unsigned NumUndefElts = UndefElements.count();
// Not a splat, or a splat with at most one defined element.
if (!Ld || (NumElts - NumUndefElts) <= 1) {
APInt SplatValue, Undef;
unsigned SplatBitSize;
bool HasUndef;
// Check if this is a repeated constant pattern suitable for broadcasting.
if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
SplatBitSize > VT.getScalarSizeInBits() &&
SplatBitSize < VT.getSizeInBits()) {
// Avoid replacing with broadcast when it's a use of a shuffle
// instruction to preserve the present custom lowering of shuffles.
if (isFoldableUseOfShuffle(BVOp))
return SDValue();
// replace BUILD_VECTOR with broadcast of the repeated constants.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
LLVMContext *Ctx = DAG.getContext();
MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
// NOTE: always true here; the function already returned above when AVX is
// unavailable.
if (Subtarget.hasAVX()) {
if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
!(SplatBitSize == 64 && Subtarget.is32Bit())) {
// Splatted value can fit in one INTEGER constant in constant pool.
// Load the constant and broadcast it.
MVT CVT = MVT::getIntegerVT(SplatBitSize);
Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
MVT::getVectorVT(CVT, Repeat), Ld);
return DAG.getBitcast(VT, Brdcst);
} else if (SplatBitSize == 32 || SplatBitSize == 64) {
// Splatted value can fit in one FLOAT constant in constant pool.
// Load the constant and broadcast it.
// AVX have support for 32 and 64 bit broadcast for floats only.
// No 64bit integer in 32bit subtarget.
MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
// Lower the splat via APFloat directly, to avoid any conversion.
Constant *C =
SplatBitSize == 32
? ConstantFP::get(*Ctx,
APFloat(APFloat::IEEEsingle(), SplatValue))
: ConstantFP::get(*Ctx,
APFloat(APFloat::IEEEdouble(), SplatValue));
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
MVT::getVectorVT(CVT, Repeat), Ld);
return DAG.getBitcast(VT, Brdcst);
} else if (SplatBitSize > 64) {
// Load the vector of constants and broadcast it.
MVT CVT = VT.getScalarType();
Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
*Ctx);
SDValue VCP = DAG.getConstantPool(VecC, PVT);
unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
Ld = DAG.getLoad(
MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
return DAG.getBitcast(VT, Brdcst);
}
}
}
// If we are moving a scalar into a vector (Ld must be set and all elements
// but 1 are undef) and that operation is not obviously supported by
// vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
// That's better than general shuffling and may eliminate a load to GPR and
// move from scalar to vector register.
if (!Ld || NumElts - NumUndefElts != 1)
return SDValue();
unsigned ScalarSize = Ld.getValueSizeInBits();
// Give up when element 0 is the defined one and the scalar is 32 or 64 bits
// wide.
if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
return SDValue();
}
// From here on Ld is non-null: every path above with a null Ld has returned.
bool ConstSplatVal =
(Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
// Make sure that all of the users of a non-constant load are from the
// BUILD_VECTOR node.
if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
return SDValue();
unsigned ScalarSize = Ld.getValueSizeInBits();
bool IsGE256 = (VT.getSizeInBits() >= 256);
// When optimizing for size, generate up to 5 extra bytes for a broadcast
// instruction to save 8 or more bytes of constant pool data.
// TODO: If multiple splats are generated to load the same constant,
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
bool OptForSize = DAG.shouldOptForSize();
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
// On Sandybridge (no AVX2), it is still better to load a constant vector
// from the constant pool and not to broadcast it from a scalar.
// But override that restriction when optimizing for size.
// TODO: Check if splatting is recommended for other AVX-capable CPUs.
if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
EVT CVT = Ld.getValueType();
assert(!CVT.isVector() && "Must not broadcast a vector type");
// Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
// For size optimization, also splat v2f64 and v2i64, and for size opt
// with AVX2, also splat i8 and i16.
// With pattern matching, the VBROADCAST node may become a VMOVDDUP.
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
(OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
// Fetch the IR constant behind the SDNode so it can be placed in the
// constant pool.
const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
C = CF->getConstantFPValue();
assert(C && "Invalid constant type");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue CP =
DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
}
bool IsLoad = ISD::isNormalLoad(Ld.getNode());
// Handle AVX2 in-register broadcasts.
if (!IsLoad && Subtarget.hasInt256() &&
(ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The scalar source must be a normal load.
if (!IsLoad)
return SDValue();
// Broadcast a loaded 32-bit scalar, or a 64-bit scalar into a vector of at
// least 256 bits (any vector width with VLX).
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
(Subtarget.hasVLX() && ScalarSize == 64))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The integer check is needed for the 64-bit into 128-bit so it doesn't match
// double since there is no vbroadcastsd xmm
if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
// Unsupported broadcast.
return SDValue();
}
/// For an EXTRACT_VECTOR_ELT with a constant index return the real
/// underlying vector and index.
///
/// When \p ExtractedFromVec is a vector shuffle whose mask entry for the
/// extracted lane is undef or refers to the shuffle's first operand,
/// \p ExtractedFromVec is replaced by that operand and the mask entry is
/// returned. Otherwise \p ExtractedFromVec is left alone and the original
/// constant index from \p ExtIdx is returned.
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
                                         SDValue ExtIdx) {
  const int ExtractIdx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();

  auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(ExtractedFromVec);
  if (!Shuffle)
    return ExtractIdx;

  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
  // lowered this:
  //   (extract_vector_elt (v8f32 %1), Constant<6>)
  // to:
  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
  //                           (extract_subvector (v8f32 %0), Constant<4>),
  //                           undef)
  //                       Constant<0>)
  // In this case the vector is the extract_subvector expression and the index
  // is 2, as specified by the shuffle.
  SDValue FirstInput = Shuffle->getOperand(0);
  MVT FirstInputVT = FirstInput.getSimpleValueType();
  assert(FirstInputVT.getVectorElementType() ==
         ExtractedFromVec.getSimpleValueType().getVectorElementType());

  const int MaskIdx = Shuffle->getMaskElt(ExtractIdx);
  if (!isUndefOrInRange(MaskIdx, 0, FirstInputVT.getVectorNumElements()))
    return ExtractIdx;

  ExtractedFromVec = FirstInput;
  return MaskIdx;
}
/// Lower a build vector whose operands are mostly constant-index extracts
/// from at most two vectors of the result type into one vector shuffle plus
/// INSERT_VECTOR_ELT nodes for the remaining operands.
///
/// Returns an empty SDValue if INSERT_VECTOR_ELT is neither legal nor custom
/// for the type, if an extract has a non-constant index or a source of a
/// different type, if more than two source vectors are involved, if too many
/// operands would need inserting, or if no operand is an extract at all.
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Skip if insert_vec_elt is not supported.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
    return SDValue();

  SDLoc DL(Op);
  const unsigned NumElems = Op.getNumOperands();

  SDValue VecIn1;
  SDValue VecIn2;
  SmallVector<unsigned, 4> InsertIndices;
  SmallVector<int, 8> Mask(NumElems, -1);

  for (unsigned Lane = 0; Lane != NumElems; ++Lane) {
    SDValue Scalar = Op.getOperand(Lane);
    const unsigned Opc = Scalar.getOpcode();

    // Undef lanes keep their -1 mask entry.
    if (Opc == ISD::UNDEF)
      continue;

    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if two elements are already queued for insertion.
      if (InsertIndices.size() > 1)
        return SDValue();
      InsertIndices.push_back(Lane);
      continue;
    }

    SDValue ExtractedFromVec = Scalar.getOperand(0);
    SDValue ExtIdx = Scalar.getOperand(1);

    // Quit if non-constant index.
    if (!isa<ConstantSDNode>(ExtIdx))
      return SDValue();

    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);

    // Quit if extracted from vector of different type.
    if (ExtractedFromVec.getValueType() != VT)
      return SDValue();

    // Bind the source to the first or second shuffle input, claiming a free
    // slot if needed; mask entries for the second input are biased by
    // NumElems.
    if (!VecIn1.getNode() || VecIn1 == ExtractedFromVec) {
      VecIn1 = ExtractedFromVec;
      Mask[Lane] = Idx;
    } else if (!VecIn2.getNode() || VecIn2 == ExtractedFromVec) {
      VecIn2 = ExtractedFromVec;
      Mask[Lane] = Idx + NumElems;
    } else {
      // Quit if more than 2 vectors to shuffle
      return SDValue();
    }
  }

  if (!VecIn1.getNode())
    return SDValue();

  if (!VecIn2.getNode())
    VecIn2 = DAG.getUNDEF(VT);
  SDValue Result = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);

  // Patch in the operands that were not extracts.
  for (unsigned Lane : InsertIndices)
    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result,
                         Op.getOperand(Lane), DAG.getIntPtrConstant(Lane, DL));

  return Result;
}
/// Pack a constant i1 build vector into an integer constant.
///
/// Bit i of the result is the low bit of operand i; undef operands contribute
/// zero. The integer type is as wide as the vector, but at least 8 bits.
static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
         Op.getScalarValueSizeInBits() == 1 &&
         "Can not convert non-constant vector");

  uint64_t Packed = 0;
  const unsigned NumLanes = Op.getNumOperands();
  for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
    SDValue Elt = Op.getOperand(Lane);
    if (Elt.isUndef())
      continue;
    const uint64_t LaneBit = cast<ConstantSDNode>(Elt)->getZExtValue() & 0x1;
    Packed |= LaneBit << Lane;
  }

  SDLoc dl(Op);
  const int IntBits = std::max((int)Op.getValueSizeInBits(), 8);
  MVT IntVT = MVT::getIntegerVT(IntBits);
  return DAG.getConstant(Packed, dl, IntVT);
}
// Lower BUILD_VECTOR operation for vXi1 mask vector types.
//
// All-zeros and all-ones vectors are returned unchanged. A vector whose
// defined operands are all the same value becomes a select between the
// constant-1 and constant-0 vectors. Otherwise the constant operands are
// packed into an integer immediate that is bitcast to the mask type, and each
// non-constant operand is then inserted with INSERT_VECTOR_ELT.
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  assert((VT.getVectorElementType() == MVT::i1) &&
         "Unexpected type in LowerBUILD_VECTORvXi1!");

  SDLoc dl(Op);
  SDNode *BV = Op.getNode();
  if (ISD::isBuildVectorAllZeros(BV) || ISD::isBuildVectorAllOnes(BV))
    return Op;

  // Classify the operands: gather constant bits, remember which lanes are
  // variable, and track whether every defined operand is the same value.
  uint64_t Immediate = 0;
  SmallVector<unsigned, 16> VariableLanes;
  bool AllSame = true;
  bool SawConstant = false;
  int FirstDefined = -1;
  const unsigned NumOps = Op.getNumOperands();
  for (unsigned Lane = 0; Lane < NumOps; ++Lane) {
    SDValue In = Op.getOperand(Lane);
    if (In.isUndef())
      continue;

    if (auto *C = dyn_cast<ConstantSDNode>(In)) {
      Immediate |= (C->getZExtValue() & 0x1) << Lane;
      SawConstant = true;
    } else {
      VariableLanes.push_back(Lane);
    }

    if (FirstDefined < 0)
      FirstDefined = Lane;
    else if (In != Op.getOperand(FirstDefined))
      AllSame = false;
  }

  // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
  if (AllSame) {
    // The build_vector allows the scalar element to be larger than the vector
    // element type. We need to mask it to use as a condition unless we know
    // the upper bits are zero.
    // FIXME: Use computeKnownBits instead of checking specific opcode?
    SDValue Cond = Op.getOperand(FirstDefined);
    assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
    if (Cond.getOpcode() != ISD::SETCC)
      Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
                         DAG.getConstant(1, dl, MVT::i8));
    return DAG.getSelect(dl, VT, Cond, DAG.getConstant(1, dl, VT),
                         DAG.getConstant(0, dl, VT));
  }

  // Seed the result with the constant lanes (or undef when there are none).
  SDValue DstVec;
  if (!SawConstant) {
    DstVec = DAG.getUNDEF(VT);
  } else if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
    // Build the 64-bit mask from two 32-bit halves on non-64-bit subtargets.
    SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
    SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
    ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
    ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
    DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
  } else {
    // The immediate is at least 8 bits wide; narrower mask types are taken
    // as the low subvector of a v8i1.
    MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
    SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
    MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
    DstVec = DAG.getBitcast(VecVT, Imm);
    DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
                         DAG.getIntPtrConstant(0, dl));
  }

  // insert elements one by one
  for (unsigned InsertIdx : VariableLanes)
    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
                         Op.getOperand(InsertIdx),
                         DAG.getIntPtrConstant(InsertIdx, dl));
  return DstVec;
}
/// This is a helper function of LowerToHorizontalOp().
/// This function checks that the build_vector \p N in input implements a
/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
/// may not match the layout of an x86 256-bit horizontal instruction.
/// In other words, if this returns true, then some extraction/insertion will
/// be required to produce a valid horizontal instruction.
///
/// Parameter \p Opcode defines the kind of horizontal operation to match.
/// For example, if \p Opcode is equal to ISD::ADD, then this function
/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
///
/// This function only analyzes elements of \p N whose indices are
/// in range [BaseIdx, LastIdx).
///
/// \p V0 and \p V1 are outputs. Both are reset to UNDEF (of the build_vector
/// type) on entry. The first non-undef element found in the lower half of the
/// analyzed range sets \p V0 to the vector its operands extract from; the
/// first non-undef element in the upper half sets \p V1 the same way. They
/// are written even on paths that end up returning false.
///
/// \returns true if every non-undef element in the range matched. A range
/// made only of undef elements also returns true, with \p V0 and \p V1 left
/// as UNDEF.
///
/// TODO: This function was originally used to match both real and fake partial
/// horizontal operations, but the index-matching logic is incorrect for that.
/// See the corrected implementation in isHopBuildVector(). Can we reduce this
/// code because it is only used for partial h-op matching now?
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
SelectionDAG &DAG,
unsigned BaseIdx, unsigned LastIdx,
SDValue &V0, SDValue &V1) {
EVT VT = N->getValueType(0);
assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
"Invalid Vector in input!");
// Only ADD/FADD may have their two extract operands in swapped order.
bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
bool CanFold = true;
// Extract index that the next element's first operand is expected to use.
// It advances by 2 for every element (undef or not) and restarts at BaseIdx
// when the scan reaches the midpoint of the range.
unsigned ExpectedVExtractIdx = BaseIdx;
unsigned NumElts = LastIdx - BaseIdx;
V0 = DAG.getUNDEF(VT);
V1 = DAG.getUNDEF(VT);
// Check if N implements a horizontal binop.
for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
SDValue Op = N->getOperand(i + BaseIdx);
// Skip UNDEFs.
if (Op->isUndef()) {
// Update the expected vector extract index.
if (i * 2 == NumElts)
ExpectedVExtractIdx = BaseIdx;
ExpectedVExtractIdx += 2;
continue;
}
// The element must be the requested binop and have exactly one use.
CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
if (!CanFold)
break;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Try to match the following pattern:
// (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op0.getOperand(0) == Op1.getOperand(0) &&
isa<ConstantSDNode>(Op0.getOperand(1)) &&
isa<ConstantSDNode>(Op1.getOperand(1)));
if (!CanFold)
break;
unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
// Elements in the lower half of the range read from V0, elements in the
// upper half read from V1. The first defined element of each half picks the
// vector, which must have the same type as the build_vector.
if (i * 2 < NumElts) {
if (V0.isUndef()) {
V0 = Op0.getOperand(0);
if (V0.getValueType() != VT)
return false;
}
} else {
if (V1.isUndef()) {
V1 = Op0.getOperand(0);
if (V1.getValueType() != VT)
return false;
}
// First element of the upper half: extract indices start over.
if (i * 2 == NumElts)
ExpectedVExtractIdx = BaseIdx;
}
// Both extracts must read the adjacent pair (Expected, Expected + 1) from
// the vector selected for this half.
SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
if (I0 == ExpectedVExtractIdx)
CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
else if (IsCommutable && I1 == ExpectedVExtractIdx) {
// Try to match the following dag sequence:
// (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
} else
CanFold = false;
ExpectedVExtractIdx += 2;
}
return CanFold;
}
/// Build a 256-bit value out of two 128-bit horizontal binops joined by a
/// concat_vectors.
///
/// This is a helper function of LowerToHorizontalOp().
/// \p V0 and \p V1 must be 256-bit vectors of the same type. Each of them is
/// split into its lower and upper 128-bit half, and \p X86Opcode is applied
/// to two pairs of those halves. \p Mode selects the pairing:
///
///   Mode == true                  Mode == false
///     LO = HOP V0_LO, V0_HI         LO = HOP V0_LO, V1_LO
///     HI = HOP V1_LO, V1_HI         HI = HOP V0_HI, V1_HI
///
/// If \p isUndefLO is set, the lower 128 bits of the result are UNDEF; if
/// \p isUndefHI is set, the upper 128 bits are UNDEF. A half is also left
/// UNDEF when its inputs are undef: with \p Mode set, when the whole of V0
/// (for LO) or V1 (for HI) is undef; otherwise when both 128-bit inputs of
/// that half are undef.
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
                                     const SDLoc &DL, SelectionDAG &DAG,
                                     unsigned X86Opcode, bool Mode,
                                     bool isUndefLO, bool isUndefHI) {
  const MVT WideVT = V0.getSimpleValueType();
  assert(WideVT.is256BitVector() && WideVT == V1.getSimpleValueType() &&
         "Invalid nodes in input!");

  // Split both inputs at the 128-bit boundary.
  const unsigned UpperStart = WideVT.getVectorNumElements() / 2;
  SDValue LoOfV0 = extract128BitVector(V0, 0, DAG, DL);
  SDValue HiOfV0 = extract128BitVector(V0, UpperStart, DAG, DL);
  SDValue LoOfV1 = extract128BitVector(V1, 0, DAG, DL);
  SDValue HiOfV1 = extract128BitVector(V1, UpperStart, DAG, DL);

  const MVT HalfVT = LoOfV0.getSimpleValueType();
  SDValue ResLo = DAG.getUNDEF(HalfVT);
  SDValue ResHi = DAG.getUNDEF(HalfVT);

  // Choose the operands of each horizontal binop, and whether that binop has
  // any defined input at all.
  SDValue LoLHS, LoRHS, HiLHS, HiRHS;
  bool LoHasInput, HiHasInput;
  if (Mode) {
    LoLHS = LoOfV0;
    LoRHS = HiOfV0;
    HiLHS = LoOfV1;
    HiRHS = HiOfV1;
    LoHasInput = !V0->isUndef();
    HiHasInput = !V1->isUndef();
  } else {
    LoLHS = LoOfV0;
    LoRHS = LoOfV1;
    HiLHS = HiOfV0;
    HiRHS = HiOfV1;
    LoHasInput = !(LoOfV0->isUndef() && LoOfV1->isUndef());
    HiHasInput = !(HiOfV0->isUndef() && HiOfV1->isUndef());
  }

  // Don't emit a horizontal binop if the result is expected to be UNDEF.
  if (!isUndefLO && LoHasInput)
    ResLo = DAG.getNode(X86Opcode, DL, HalfVT, LoLHS, LoRHS);
  if (!isUndefHI && HiHasInput)
    ResHi = DAG.getNode(X86Opcode, DL, HalfVT, HiLHS, HiRHS);

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, ResLo, ResHi);
}
/// Returns true iff \p BV builds a vector with the result equivalent to
/// the result of ADDSUB/SUBADD operation.
/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
/// \p Opnd0 and \p Opnd1.
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
SDValue &Opnd0, SDValue &Opnd1,
unsigned &NumExtracts,
bool &IsSubAdd) {
MVT VT = BV->getSimpleValueType(0);
if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
return false;
unsigned NumElts = VT.getVectorNumElements();
SDValue InVec0 = DAG.getUNDEF(VT);
SDValue InVec1 = DAG.getUNDEF(VT);
NumExtracts = 0;
// Odd-numbered elements in the input build vector are obtained from
// adding/subtracting two integer/float elements.
// Even-numbered elements in the input build vector are obtained from
// subtracting/adding two integer/float elements.
unsigned Opc[2] = {0, 0};
for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Op = BV->getOperand(i);
// Skip 'undef' values.
unsigned Opcode = Op.getOpcode();
if (Opcode == ISD::UNDEF)
continue;
// Early exit if we found an unexpected opcode.
if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
return false;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Try to match the following pattern:
// (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
// Early exit if we cannot match that sequence.
if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Op0.getOperand(1)) ||
!isa<ConstantSDNode>(Op1.getOperand(1)) ||
Op0.getOperand(1) != Op1.getOperand(1))
return false;
unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
if (I0 != i)
return false;
// We found a valid add/sub node, make sure its the same opcode as previous
// elements for this parity.
if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
return false;
Opc[i % 2] = Opcode;
// Update InVec0 and InVec1.
if (InVec0.isUndef()) {
InVec0 = Op0.getOperand(0);
if (InVec0.getSimpleValueType() != VT)
return false;
}
if (InVec1.isUndef()) {
InVec1 = Op1.getOperand(0);
if (InVec1.getSimpleValueType() != VT)
return false;
}
// Make sure that operands in input to each add/sub node always
// come from a same pair of vectors.
if (InVec0 != Op0.getOperand(0)) {
if (Opcode == ISD::FSUB)
return false;
// FADD is commutable. Try to commute the operands
// and then test again.
std::swap(Op0, Op1);
if (InVec0 != Op0.getOperand(0))
return false;
}
if (InVec1 != Op1.getOperand(0))
return false;
// Increment the number of extractions done.
++NumExtracts;
}
// Ensure we have found an opcode for both parities and that they are
// different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
// inputs are undef.
if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
InVec0.isUndef() || InVec1.isUndef())
return false;
IsSubAdd = Opc[0] == ISD::FADD;
Opnd0 = InVec0;
Opnd1 = InVec1;
return true;
}
/// Decide whether a multiply feeding an idiom already recognized as
/// ADDSUB/SUBADD(\p Opnd0, \p Opnd1) can be fused with it into
/// FMADDSUB/FMSUBADD(x, y, \p Opnd1).
///
/// If (and only if) true is returned, the operands of FMADDSUB/FMSUBADD are
/// written to \p Opnd0, \p Opnd1 and \p Opnd2: the two factors of the multiply
/// followed by the former \p Opnd1.
///
/// The caller has matched, but not yet replaced, some SDNode that could become
/// an X86ISD::ADDSUB of \p Opnd0 and \p Opnd1. Because the replacement has not
/// happened yet, \p Opnd0 is still used by the matched pattern itself; the
/// caller passes the number of uses it expects as \p ExpectedUses, and fusion
/// is rejected if the multiply has any other number of uses.
///
/// For example, this function may be called for the following IR:
///    %AB = fmul fast <2 x double> %A, %B
///    %Sub = fsub fast <2 x double> %AB, %C
///    %Add = fadd fast <2 x double> %AB, %C
///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
///                            <2 x i32> <i32 0, i32 3>
/// %Addsub could be replaced by
///    %Addsub = X86ISD::ADDSUB %AB, %C
/// which in turn can be replaced with
///    %Addsub = FMADDSUB %A, %B, %C.
///
/// This check runs before the ADDSUB replacement because that replacement is
/// not always legal: e.g. 512-bit ADDSUB is not available, while 512-bit
/// FMADDSUB is.
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG,
                                 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
                                 unsigned ExpectedUses) {
  // The first operand has to be a multiply ...
  if (Opnd0.getOpcode() != ISD::FMUL)
    return false;
  // ... used exactly as often as the caller expects ...
  if (!Opnd0->hasNUsesOfValue(ExpectedUses, 0))
    return false;
  // ... on a subtarget that has some form of FMA.
  if (!Subtarget.hasAnyFMA())
    return false;

  // FIXME: These checks must match the similar ones in
  // DAGCombiner::visitFADDForFMACombine. It would be good to have one
  // function that would answer if it is Ok to fuse MUL + ADD to FMADD
  // or MUL + ADDSUB to FMADDSUB.
  const TargetOptions &Options = DAG.getTarget().Options;
  if (Options.AllowFPOpFusion != FPOpFusion::Fast && !Options.UnsafeFPMath)
    return false;

  // (A * B) +- C  -->  operands A, B, C.
  Opnd2 = Opnd1;
  Opnd1 = Opnd0.getOperand(1);
  Opnd0 = Opnd0.getOperand(0);
  return true;
}
/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
/// 'fmsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
/// X86ISD::FMSUBADD node respectively. Returns an empty SDValue when the
/// build_vector does not match or the matched form has no supported node.
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  SDValue LHS, RHS;
  unsigned NumExtracts;
  bool IsSubAdd;
  const bool Matched =
      isAddSubOrSubAdd(BV, Subtarget, DAG, LHS, RHS, NumExtracts, IsSubAdd);
  if (!Matched)
    return SDValue();

  const MVT VT = BV->getSimpleValueType(0);
  SDLoc DL(BV);

  // Prefer the fused form: if LHS is a multiply that may be fused, emit
  // FMADDSUB/FMSUBADD over its two factors and RHS.
  SDValue Addend;
  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, LHS, RHS, Addend, NumExtracts))
    return DAG.getNode(IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB, DL, VT,
                       LHS, RHS, Addend);

  // Without fusion only ADDSUB is supported, and not for 512-bit types:
  // there are no known X86 targets with 512-bit ADDSUB instructions! The
  // 512-bit ADDSUB idiom was recognized above only so that it could be turned
  // into FMADDSUB.
  if (IsSubAdd || VT.is512BitVector())
    return SDValue();

  return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
}
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
unsigned &HOpcode, SDValue &V0, SDValue &V1) {
// Initialize outputs to known values.
MVT VT = BV->getSimpleValueType(0);
HOpcode = ISD::DELETED_NODE;
V0 = DAG.getUNDEF(VT);
V1 = DAG.getUNDEF(VT);
// x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
// half of the result is calculated independently from the 128-bit halves of
// the inputs, so that makes the index-checking logic below more complicated.
unsigned NumElts = VT.getVectorNumElements();
unsigned GenericOpcode = ISD::DELETED_NODE;
unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
for (unsigned i = 0; i != Num128BitChunks; ++i) {
for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
// Ignore undef elements.
SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
if (Op.isUndef())
continue;
// If there's an opcode mismatch, we're done.
if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
return false;
// Initialize horizontal opcode.
if (HOpcode == ISD::DELETED_NODE) {
GenericOpcode = Op.getOpcode();
switch (GenericOpcode) {
case ISD::ADD: HOpcode = X86ISD::HADD; break;
case ISD::SUB: HOpcode = X86ISD::HSUB; break;
case ISD::FADD: HOpcode = X86ISD::FHADD; break;
case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
default: return false;
}
}
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op0.getOperand(0) != Op1.getOperand(0) ||
!isa<ConstantSDNode>(Op0.getOperand(1)) ||
!isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
return false;
// The source vector is chosen based on which 64-bit half of the
// destination vector is being calculated.
if (j < NumEltsIn64Bits) {
if (V0.isUndef())
V0 = Op0.getOperand(0);
} else {
if (V1.isUndef())
V1 = Op0.getOperand(0);
}
SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
if (SourceVec != Op0.getOperand(0))
return false;
// op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
unsigned ExpectedIndex = i * NumEltsIn128Bits +
(j % NumEltsIn64Bits) * 2;
if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
continue;
// If this is not a commutative op, this does not match.
if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
return false;
// Addition is commutative, so try swapping the extract indexes.
// op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
continue;
// Extract indexes do not match horizontal requirement.
return false;
}
}
// We matched. Opcode and operands are returned by reference as arguments.
return true;
}
/// Emit the horizontal op \p HOpcode for the build_vector \p BV using the
/// source vectors \p V0 and \p V1 (as found by isHopBuildVector()).
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
                                    SelectionDAG &DAG, unsigned HOpcode,
                                    SDValue V0, SDValue V1) {
  const SDLoc DL(BV);
  const MVT VT = BV->getSimpleValueType(0);
  const unsigned Width = VT.getSizeInBits();

  // If either input vector is not the same size as the build vector,
  // extract/insert the low bits to the correct size.
  // This is free (examples: zmm --> xmm, xmm --> ymm).
  auto FitToWidth = [&](SDValue V) -> SDValue {
    if (V.getValueSizeInBits() > Width)
      return extractSubVector(V, 0, DAG, DL, Width);
    if (V.getValueSizeInBits() < Width)
      return insertSubVector(DAG.getUNDEF(VT), V, 0, DAG, DL, Width);
    return V;
  };
  V0 = FitToWidth(V0);
  V1 = FitToWidth(V1);

  // The upper half of the result is unused when every build_vector operand
  // in that half is undef.
  const unsigned NumElts = VT.getVectorNumElements();
  bool UpperHalfUnused = true;
  for (unsigned Idx = NumElts / 2; Idx != NumElts; ++Idx) {
    if (!BV->getOperand(Idx).isUndef()) {
      UpperHalfUnused = false;
      break;
    }
  }

  // Anything but a 256-bit op with an unused upper xmm is emitted directly.
  if (!VT.is256BitVector() || !UpperHalfUnused)
    return DAG.getNode(HOpcode, DL, VT, V0, V1);

  // If we don't need the upper xmm, then perform as a xmm hop and place the
  // result in the low half of an otherwise undef 256-bit vector.
  const MVT HalfVT = VT.getHalfNumVectorElementsVT();
  V0 = extractSubVector(V0, 0, DAG, DL, 128);
  V1 = extractSubVector(V1, 0, DAG, DL, 128);
  SDValue NarrowHop = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
  return insertSubVector(DAG.getUNDEF(VT), NarrowHop, 0, DAG, DL, 256);
}
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
///
/// Three strategies are tried in order:
///  1. A direct match of a native horizontal op (isHopBuildVector), for the
///     types the subtarget supports natively.
///  2. For 256-bit integer vectors with AVX: match each half of the
///     build_vector separately and emit two 128-bit horizontal ops.
///  3. For any of the four 256-bit types with AVX: match the build_vector as
///     a whole and emit two 128-bit horizontal ops, one per source vector.
///
/// \returns the lowered value, or an empty SDValue if nothing matched or the
/// result would not be worthwhile.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// We need at least 2 non-undef elements to make this worthwhile by default.
unsigned NumNonUndefs =
count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
if (NumNonUndefs < 2)
return SDValue();
// There are 4 sets of horizontal math operations distinguished by type:
// int/FP at 128-bit/256-bit. Each type was introduced with a different
// subtarget feature. Try to match those "native" patterns first.
MVT VT = BV->getSimpleValueType(0);
if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
unsigned HOpcode;
SDValue V0, V1;
if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
}
// Try harder to match 256-bit ops by using extract/concat.
if (!Subtarget.hasAVX() || !VT.is256BitVector())
return SDValue();
// Count the number of UNDEF operands in the build_vector in input.
unsigned NumElts = VT.getVectorNumElements();
unsigned Half = NumElts / 2;
unsigned NumUndefsLO = 0;
unsigned NumUndefsHI = 0;
for (unsigned i = 0, e = Half; i != e; ++i)
if (BV->getOperand(i)->isUndef())
NumUndefsLO++;
for (unsigned i = Half, e = NumElts; i != e; ++i)
if (BV->getOperand(i)->isUndef())
NumUndefsHI++;
SDLoc DL(BV);
SDValue InVec0, InVec1;
if (VT == MVT::v8i32 || VT == MVT::v16i16) {
// Match the lower and the upper half of the build_vector independently.
// InVec0/InVec1 are the sources found for the lower half, InVec2/InVec3 the
// ones found for the upper half. The halves are compatible when, for each
// pair, at least one side is undef or both sides are the same vector.
// Note that the SUB attempt reuses (and overwrites) the same variables after
// a failed ADD attempt.
SDValue InVec2, InVec3;
unsigned X86Opcode;
bool CanFold = true;
if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
InVec3) &&
((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
X86Opcode = X86ISD::HADD;
else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
InVec1) &&
isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
InVec3) &&
((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
X86Opcode = X86ISD::HSUB;
else
CanFold = false;
// If neither attempt matched, control falls through to the whole-vector
// matching below, which also covers these integer types.
if (CanFold) {
// Do not try to expand this build_vector into a pair of horizontal
// add/sub if we can emit a pair of scalar add/sub.
if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
return SDValue();
// Convert this build_vector into a pair of horizontal binops followed by
// a concat vector. We must adjust the outputs from the partial horizontal
// matching calls above to account for undefined vector halves.
SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
bool isUndefLO = NumUndefsLO == Half;
bool isUndefHI = NumUndefsHI == Half;
// Mode == false: LO = HOP V0_LO, V1_LO and HI = HOP V0_HI, V1_HI.
return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
isUndefHI);
}
}
if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
VT == MVT::v16i16) {
// Match the build_vector as one range [0, NumElts): its lower half must
// read from InVec0 and its upper half from InVec1. The four opcodes are
// tried in turn; each attempt resets InVec0/InVec1.
unsigned X86Opcode;
if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
X86Opcode = X86ISD::HADD;
else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
InVec1))
X86Opcode = X86ISD::HSUB;
else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
InVec1))
X86Opcode = X86ISD::FHADD;
else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
InVec1))
X86Opcode = X86ISD::FHSUB;
else
return SDValue();
// Don't try to expand this build_vector into a pair of horizontal add/sub
// if we can simply emit a pair of scalar add/sub.
if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
return SDValue();
// Convert this build_vector into two horizontal add/sub followed by
// a concat vector.
bool isUndefLO = NumUndefsLO == Half;
bool isUndefHI = NumUndefsHI == Half;
// Mode == true: LO = HOP InVec0_LO, InVec0_HI and
// HI = HOP InVec1_LO, InVec1_HI.
return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
isUndefLO, isUndefHI);
}
return SDValue();
}
/// If every element of a BUILD_VECTOR applies the same bit operation
/// (shift or and/xor/or) with a constant right-hand operand, rewrite it as
/// that operation applied to two BUILD_VECTORs: one of the left-hand operands
/// and one of the constants.
/// NOTE: Its not in our interest to start make a general purpose vectorizer
/// from this, but enough scalar bit operations are created from the later
/// legalization + scalarization stages to need basic support.
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  const MVT VT = Op->getSimpleValueType(0);
  const unsigned NumElems = VT.getVectorNumElements();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // All elements have to share the opcode of the first one.
  // TODO: Should we allow UNDEFS and if so how many?
  const unsigned Opcode = Op->getOperand(0).getOpcode();
  for (unsigned Idx = 1; Idx < NumElems; ++Idx) {
    if (Op->getOperand(Idx).getOpcode() != Opcode)
      return SDValue();
  }

  // Classify the opcode; anything that is neither a shift nor a logic op is
  // not handled.
  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
  const bool IsShift =
      Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA;
  const bool IsLogic =
      Opcode == ISD::AND || Opcode == ISD::XOR || Opcode == ISD::OR;
  if (!IsShift && !IsLogic)
    return SDValue();

  if (IsLogic) {
    // Don't do this if the buildvector is a splat - we'd replace one
    // constant with an entire vector.
    if (Op->getSplatValue())
      return SDValue();
    if (!TLI.isOperationLegalOrPromote(Opcode, VT))
      return SDValue();
  }

  // Split each scalar op into its two operands.
  SmallVector<SDValue, 4> LHSElts, RHSElts;
  for (SDValue Scalar : Op->ops()) {
    SDValue ScalarLHS = Scalar.getOperand(0);
    SDValue ScalarRHS = Scalar.getOperand(1);

    // We expect the canonicalized RHS operand to be the constant.
    if (!isa<ConstantSDNode>(ScalarRHS))
      return SDValue();

    // A constant of a different width is only acceptable as a shift amount,
    // which is then extended/truncated to the element type.
    if (ScalarRHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
      if (!IsShift)
        return SDValue();
      ScalarRHS = DAG.getZExtOrTrunc(ScalarRHS, DL, VT.getScalarType());
    }

    LHSElts.push_back(ScalarLHS);
    RHSElts.push_back(ScalarRHS);
  }

  // Limit to shifts by uniform immediates.
  // TODO: Only accept vXi8/vXi64 special cases?
  // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
  if (IsShift) {
    for (const SDValue &Amt : RHSElts) {
      if (Amt != RHSElts[0])
        return SDValue();
    }
  }

  SDValue VecLHS = DAG.getBuildVector(VT, DL, LHSElts);
  SDValue VecRHS = DAG.getBuildVector(VT, DL, RHSElts);
  return DAG.getNode(Opcode, DL, VT, VecLHS, VecRHS);
}
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
///
/// Returns \p Op itself when it can be matched as-is, a replacement
/// all-ones vector when one has to be built, and an empty SDValue when \p Op
/// is not one of the handled constants.
static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  const MVT VT = Op.getSimpleValueType();
  const auto *Node = Op.getNode();

  // Vectors containing all zeros can be matched by pxor and xorps.
  if (ISD::isBuildVectorAllZeros(Node))
    return Op;

  // The only other handled constant is all-ones, which needs SSE2.
  if (!Subtarget.hasSSE2() || !ISD::isBuildVectorAllOnes(Node))
    return SDValue();

  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
  // vpcmpeqd on 256-bit vectors. The i32-element types are kept unchanged;
  // every other type gets a freshly built ones vector.
  switch (VT.SimpleTy) {
  case MVT::v4i32:
  case MVT::v8i32:
  case MVT::v16i32:
    return Op;
  default:
    return getOnesVector(VT, DAG, DL);
  }
}
/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
/// from a vector of source values and a vector of extraction indices.
/// The vectors might be manipulated to match the type of the permute op.
///
/// \param VT          Type of the permuted result.
/// \param SrcVec      Vector the elements are selected from. It may be
///                    narrower than \p VT (it is then widened to \p VT), or a
///                    whole multiple wider (the permute is then performed at
///                    the wider size and the low part is extracted).
/// \param IndicesVec  Vector of selection indices; it must have at least as
///                    many elements as \p VT. Extra elements are dropped and
///                    the elements are zero-extended/truncated to the integer
///                    element width of \p VT.
/// \returns the permuted value as type \p VT, or an empty SDValue when the
/// source size cannot be handled or the subtarget offers no suitable permute
/// for the type.
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Type the permute node is emitted in. It differs from VT when a permute
// with a different element type (e.g. PSHUFB on bytes) implements the
// operation.
MVT ShuffleVT = VT;
EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
unsigned NumElts = VT.getVectorNumElements();
unsigned SizeInBits = VT.getSizeInBits();
// Adjust IndicesVec to match VT size.
assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
"Illegal variable permute mask size");
if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
NumElts * VT.getScalarSizeInBits());
IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
// Handle SrcVec that don't match VT type.
if (SrcVec.getValueSizeInBits() != SizeInBits) {
if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
// Handle larger SrcVec by treating it as a larger permute.
unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
Subtarget, DAG, SDLoc(IndicesVec));
// Recurse at the wider type and keep only the low SizeInBits of the result.
return extractSubVector(
createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
DAG, DL, SizeInBits);
} else if (SrcVec.getValueSizeInBits() < SizeInBits) {
// Widen smaller SrcVec to match VT.
SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
} else
// Larger source whose size is not a whole multiple of VT: give up.
return SDValue();
}
// Rewrites a vector of indices for wide elements into indices for elements
// that are Scale times narrower, without changing the vector's type: every
// index is multiplied by Scale and replicated into each of its Scale
// sub-elements, with sub-element number k additionally offset by k.
auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
EVT SrcVT = Idx.getValueType();
unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
uint64_t IndexScale = 0;
uint64_t IndexOffset = 0;
// If we're scaling a smaller permute op, then we need to repeat the
// indices, scaling and offsetting them as well.
// e.g. v4i32 -> v16i8 (Scale = 4)
// IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
// IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
for (uint64_t i = 0; i != Scale; ++i) {
IndexScale |= Scale << (i * NumDstBits);
IndexOffset |= i << (i * NumDstBits);
}
Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
return Idx;
};
// Pick the permute for this type and subtarget. A case either sets Opcode
// (and possibly ShuffleVT) for the common emission code at the end, returns
// a complete lowering itself, or leaves Opcode at 0 to signal failure.
unsigned Opcode = 0;
switch (VT.SimpleTy) {
default:
break;
case MVT::v16i8:
if (Subtarget.hasSSSE3())
Opcode = X86ISD::PSHUFB;
break;
case MVT::v8i16:
if (Subtarget.hasVLX() && Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasSSSE3()) {
Opcode = X86ISD::PSHUFB;
ShuffleVT = MVT::v16i8;
}
break;
case MVT::v4f32:
case MVT::v4i32:
if (Subtarget.hasAVX()) {
Opcode = X86ISD::VPERMILPV;
ShuffleVT = MVT::v4f32;
} else if (Subtarget.hasSSSE3()) {
Opcode = X86ISD::PSHUFB;
ShuffleVT = MVT::v16i8;
}
break;
case MVT::v2f64:
case MVT::v2i64:
if (Subtarget.hasAVX()) {
// VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
Opcode = X86ISD::VPERMILPV;
ShuffleVT = MVT::v2f64;
} else if (Subtarget.hasSSE41()) {
// SSE41 can compare v2i64 - select between indices 0 and 1.
// Lanes whose index equals zero take the splat of element 0, all other
// lanes take the splat of element 1.
return DAG.getSelectCC(
DL, IndicesVec,
getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
ISD::CondCode::SETEQ);
}
break;
case MVT::v32i8:
if (Subtarget.hasVLX() && Subtarget.hasVBMI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasXOP()) {
// Two VPPERM nodes, each given both 128-bit halves of the source and one
// 128-bit half of the indices; the results are concatenated.
SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
return DAG.getNode(
ISD::CONCAT_VECTORS, DL, VT,
DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
} else if (Subtarget.hasAVX()) {
// Build vectors holding the low (resp. high) 128-bit half of the source
// in both halves, so either one can be permuted by the same indices.
SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Permute Lo and Hi and then select based on index range.
// This works as SHUFB uses bits[3:0] to permute elements and we don't
// care about the bit[7] as its just an index vector.
// Ops is {LoLo, HiHi, Indices}: indices greater than 15 take the HiHi
// permute, the others the LoLo permute.
SDValue Idx = Ops[2];
EVT VT = Idx.getValueType();
return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
ISD::CondCode::SETGT);
};
SDValue Ops[] = {LoLo, HiHi, IndicesVec};
return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
PSHUFBBuilder);
}
break;
case MVT::v16i16:
if (Subtarget.hasVLX() && Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasAVX()) {
// Scale to v32i8 and perform as v32i8.
IndicesVec = ScaleIndices(IndicesVec, 2);
return DAG.getBitcast(
VT, createVariablePermute(
MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
}
break;
case MVT::v8f32:
case MVT::v8i32:
if (Subtarget.hasAVX2())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasAVX()) {
// Work in v8f32. LoLo/HiHi repeat the low/high four elements of the
// source in both 128-bit halves.
SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
{0, 1, 2, 3, 0, 1, 2, 3});
SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
{4, 5, 6, 7, 4, 5, 6, 7});
if (Subtarget.hasXOP())
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
// Permute Lo and Hi and then select based on index range.
// This works as VPERMILPS only uses index bits[0:1] to permute elements.
SDValue Res = DAG.getSelectCC(
DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
ISD::CondCode::SETGT);
return DAG.getBitcast(VT, Res);
}
break;
case MVT::v4i64:
case MVT::v4f64:
if (Subtarget.hasAVX512()) {
if (!Subtarget.hasVLX()) {
// Without VLX, widen source and indices to 8 elements, permute at that
// width and extract the low 256 bits of the result.
MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
SDLoc(SrcVec));
IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
DAG, SDLoc(IndicesVec));
SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
DAG, Subtarget);
return extract256BitVector(Res, 0, DAG, DL);
}
Opcode = X86ISD::VPERMV;
} else if (Subtarget.hasAVX()) {
// Work in v4f64. LoLo/HiHi repeat the low/high two elements of the source
// in both 128-bit halves.
SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
SDValue LoLo =
DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
SDValue HiHi =
DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
// VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
if (Subtarget.hasXOP())
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
// Permute Lo and Hi and then select based on index range.
// This works as VPERMILPD only uses index bit[1] to permute elements.
// The indices were doubled above, so the comparison is against 2.
SDValue Res = DAG.getSelectCC(
DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
ISD::CondCode::SETGT);
return DAG.getBitcast(VT, Res);
}
break;
case MVT::v64i8:
if (Subtarget.hasVBMI())
Opcode = X86ISD::VPERMV;
break;
case MVT::v32i16:
if (Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
break;
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8f64:
case MVT::v8i64:
if (Subtarget.hasAVX512())
Opcode = X86ISD::VPERMV;
break;
}
// No permute available for this type on this subtarget.
if (!Opcode)
return SDValue();
assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
(VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
"Illegal variable permute shuffle type");
// If the permute works on narrower elements than VT, rewrite the indices to
// address those narrower elements.
uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
if (Scale > 1)
IndicesVec = ScaleIndices(IndicesVec, Scale);
EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
// VPERMV takes the indices as its first operand; the other permutes take
// the source first.
SDValue Res = Opcode == X86ISD::VPERMV
? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
: DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
return DAG.getBitcast(VT, Res);
}
// Tries to lower a BUILD_VECTOR whose operands are all extract-of-extract
// chains, which together describe a permutation of one vector by the indices
// held in another, non-constant vector.
// (build_vector (extract_elt V, (extract_elt I, 0)),
//               (extract_elt V, (extract_elt I, 1)),
//                    ...
// ->
// (vpermv I, V)
//
// Returns an empty SDValue if the operands do not have that form or no
// variable permute could be created.
//
// TODO: Handle undefs
// TODO: Utilize pshufb and zero mask blending to support more efficient
// construction of vectors with constant-0 elements.
static SDValue
LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  SDValue SrcVec, IndicesVec;

  // Operand number Lane of the build_vector has to look like
  // (extract_elt SrcVec, (extract_elt IndicesVec, Lane)), with the same
  // SrcVec and IndicesVec for every operand.
  const unsigned NumLanes = V.getNumOperands();
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    SDValue Elt = V.getOperand(Lane);
    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // The first operand defines the source vector; all later operands must
    // extract from that same vector.
    SDValue EltSrc = Elt.getOperand(0);
    if (SrcVec && SrcVec != EltSrc)
      return SDValue();
    SrcVec = EltSrc;

    // Peek through extends.
    SDValue LaneIndex = Elt->getOperand(1);
    const unsigned IndexOpcode = LaneIndex.getOpcode();
    if (IndexOpcode == ISD::ZERO_EXTEND || IndexOpcode == ISD::SIGN_EXTEND)
      LaneIndex = LaneIndex.getOperand(0);
    if (LaneIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Likewise, the first index extract defines the indices vector and all
    // later ones must extract from it.
    SDValue LaneIndexSrc = LaneIndex.getOperand(0);
    if (IndicesVec && IndicesVec != LaneIndexSrc)
      return SDValue();
    IndicesVec = LaneIndexSrc;

    // The index must be read from constant position Lane of the indices
    // vector.
    auto *PermIdx = dyn_cast<ConstantSDNode>(LaneIndex.getOperand(1));
    if (!PermIdx || PermIdx->getAPIntValue() != Lane)
      return SDValue();
  }

  SDLoc DL(V);
  const MVT VT = V.getSimpleValueType();
  return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
}
/// Custom lowering of a BUILD_VECTOR node.
///
/// A sequence of strategies is tried in order and the first one that produces
/// a value is returned. An empty SDValue is returned on the paths that, per the
/// comments below, are better served by a constant-pool load or by the
/// legalizer's default expansion.
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = Op.getNumOperands();
// Generate vectors for predicate vectors.
if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
return VectorConstant;
// Pattern-based lowerings, tried before any per-element analysis.
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
return AddSub;
if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
return HorizontalOp;
if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
return Broadcast;
if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
return BitOp;
// Operand statistics gathered by the loop below (undef operands are skipped):
//   NumZero / NumNonZero - operands for which X86::isZeroNode is / is not true.
//   NonZeros       - bit i is set when operand i was counted in NumNonZero.
//   Values         - the set of distinct defined operands.
//   NumConstants   - NumElems minus the number of defined operands that are
//                    neither ConstantSDNode nor ConstantFPSDNode, so undef
//                    operands are counted as constants.
//   IsAllConstants - cleared as soon as one such non-constant operand is seen.
unsigned EVTBits = EltVT.getSizeInBits();
unsigned NumZero = 0;
unsigned NumNonZero = 0;
uint64_t NonZeros = 0;
bool IsAllConstants = true;
SmallSet<SDValue, 8> Values;
unsigned NumConstants = NumElems;
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
if (Elt.isUndef())
continue;
Values.insert(Elt);
if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
IsAllConstants = false;
NumConstants--;
}
if (X86::isZeroNode(Elt))
NumZero++;
else {
assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
NonZeros |= ((uint64_t)1 << i);
NumNonZero++;
}
}
// All undef vector. Return an UNDEF. All zero vectors were handled above.
if (NumNonZero == 0)
return DAG.getUNDEF(VT);
// If we are inserting one variable into a vector of non-zero constants, try
// to avoid loading each constant element as a scalar. Load the constants as a
// vector and then insert the variable scalar element. If insertion is not
// supported, fall back to a shuffle to get the scalar blended with the
// constants. Insertion into a zero vector is handled as a special-case
// somewhere below here.
if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
(isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
// Create an all-constant vector. The variable element in the old
// build vector is replaced by undef in the constant vector. Save the
// variable scalar element and its index for use in the insertelement.
LLVMContext &Context = *DAG.getContext();
Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
SDValue VarElt;
SDValue InsIndex;
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
if (auto *C = dyn_cast<ConstantSDNode>(Elt))
ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
else if (!Elt.isUndef()) {
assert(!VarElt.getNode() && !InsIndex.getNode() &&
"Expected one variable element in this vector");
VarElt = Elt;
InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
}
}
Constant *CV = ConstantVector::get(ConstVecOps);
SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
// The constants we just created may not be legal (eg, floating point). We
// must lower the vector right here because we can not guarantee that we'll
// legalize it before loading it. This is also why we could not just create
// a new build vector here. If the build vector contains illegal constants,
// it could get split back up into a series of insert elements.
// TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
MachineFunction &MF = DAG.getMachineFunction();
MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
// Insert directly only when the variable element lands in the low 128 bits.
unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
if (InsertC < NumEltsInLow128Bits)
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
// There's no good way to insert into the high elements of a >128-bit
// vector, so use shuffles to avoid an extract/insert sequence.
assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
// Identity mask, except that lane InsertC reads index NumElts, i.e. element 0
// of the second shuffle input (S2V).
SmallVector<int, 8> ShuffleMask;
unsigned NumElts = VT.getVectorNumElements();
for (unsigned i = 0; i != NumElts; ++i)
ShuffleMask.push_back(i == InsertC ? NumElts : i);
SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
}
// Special case for single non-zero, non-undef, element.
if (NumNonZero == 1) {
// Index of the only bit set in NonZeros.
unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
// If we have a constant or non-constant insertion into the low element of
// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
// the rest of the elements. This will be matched as movd/movq/movss/movsd
// depending on what the source datatype is.
if (Idx == 0) {
// No zero operands: every other element is undef.
if (NumZero == 0)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
(EltVT == MVT::i64 && Subtarget.is64Bit())) {
assert((VT.is128BitVector() || VT.is256BitVector() ||
VT.is512BitVector()) &&
"Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
// We can't directly insert an i8 or i16 into a vector, so zero extend
// it to i32 first.
if (EltVT == MVT::i16 || EltVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
return DAG.getBitcast(VT, Item);
}
}
// Is it a vector logical left shift?
if (NumElems == 2 && Idx == 1 &&
X86::isZeroNode(Op.getOperand(0)) &&
!X86::isZeroNode(Op.getOperand(1))) {
unsigned NumBits = VT.getSizeInBits();
return getVShift(true, VT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
VT, Op.getOperand(1)),
NumBits/2, DAG, *this, dl);
}
if (IsAllConstants) // Otherwise, it's better to do a constpool load.
return SDValue();
// Otherwise, if this is a vector with i32 or f32 elements, and the element
// is a non-constant being inserted into an element other than the low one,
// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
// movd/movss) to move this into the low element, then shuffle it into
// place.
if (EVTBits == 32) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
}
}
// Splat is obviously ok. Let legalizer expand it to a shuffle.
if (Values.size() == 1) {
if (EVTBits == 32) {
// Instead of a shuffle like this:
// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
// Check if it's possible to issue this instead.
// shuffle (vload ptr)), undef, <1, 1, 1, 1>
unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
if (Op.getNode()->isOnlyUserOf(Item.getNode()))
return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
}
return SDValue();
}
// A vector full of immediates; various special cases are already
// handled, so this is best done with a single constant-pool load.
if (IsAllConstants)
return SDValue();
if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
return V;
// See if we can use a vector load to get all of the elements.
{
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
if (SDValue LD =
EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
return LD;
}
// If this is a splat of pairs of 32-bit elements, we can use a narrower
// build_vector and broadcast it.
// TODO: We could probably generalize this more.
if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
// The candidate pair is operands 0 and 1; the upper two lanes of the narrow
// 4-element build_vector are left undef.
SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
// Make sure all the even/odd operands match.
for (unsigned i = 2; i != NumElems; ++i)
if (Ops[i % 2] != Op.getOperand(i))
return false;
return true;
};
if (CanSplat(Op, NumElems, Ops)) {
MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
// Create a new build vector and cast to v2i64/v2f64.
SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
DAG.getBuildVector(NarrowVT, dl, Ops));
// Broadcast from v2i64/v2f64 and cast to final VT.
MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
NewBV));
}
}
// For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
if (VT.getSizeInBits() > 128) {
MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
// Build both the lower and upper subvector.
SDValue Lower =
DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
SDValue Upper = DAG.getBuildVector(
HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
// Recreate the wider vector with the lower and upper part.
return concatSubVectors(Lower, Upper, DAG, dl);
}
// Let legalizer expand 2-wide build_vectors.
if (EVTBits == 64) {
if (NumNonZero == 1) {
// One half is zero or undef.
unsigned Idx = countTrailingZeros(NonZeros);
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Op.getOperand(Idx));
return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
}
return SDValue();
}
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16)
if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
DAG, Subtarget))
return V;
if (EVTBits == 16 && NumElems == 8)
if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
DAG, Subtarget))
return V;
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
if (EVTBits == 32 && NumElems == 4)
if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
return V;
// If element VT is == 32 bits, turn it into a number of shuffles.
if (NumElems == 4 && NumZero > 0) {
// Widen every operand to a full vector: a zero vector for operands not
// marked in NonZeros, SCALAR_TO_VECTOR of the operand otherwise.
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1ULL << i));
if (isZero)
Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
else
Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
}
// Combine the operands pairwise: Ops[0] is built from operands 0/1 and
// Ops[1] from operands 2/3, selected by the pair's two NonZeros bits.
for (unsigned i = 0; i < 2; ++i) {
switch ((NonZeros >> (i*2)) & 0x3) {
default: llvm_unreachable("Unexpected NonZero count");
case 0:
Ops[i] = Ops[i*2]; // Must be a zero vector.
break;
case 1:
Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
break;
case 2:
Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
break;
case 3:
Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
break;
}
}
// Reverse1/Reverse2 are set when only the odd operand of the respective pair
// is marked in NonZeros (pair bits == 0b10); the final mask then swaps the
// two lanes taken from that half.
bool Reverse1 = (NonZeros & 0x3) == 2;
bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
int MaskVec[] = {
Reverse1 ? 1 : 0,
Reverse1 ? 0 : 1,
static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
static_cast<int>(Reverse2 ? NumElems : NumElems+1)
};
return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
}
assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
// Check for a build vector from mostly shuffle plus few inserting.
if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
return Sh;
// For SSE 4.1, use insertps to put the high elements into the low element.
if (Subtarget.hasSSE41()) {
// Start from operand 0 (or undef) and insert each remaining defined operand
// with INSERT_VECTOR_ELT; undef operands are skipped.
SDValue Result;
if (!Op.getOperand(0).isUndef())
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
else
Result = DAG.getUNDEF(VT);
for (unsigned i = 1; i < NumElems; ++i) {
if (Op.getOperand(i).isUndef()) continue;
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
return Result;
}
// Otherwise, expand into a number of unpckl*, start by extending each of
// our (non-undef) elements to the full vector width with the element in the
// bottom slot of the vector (which generates no code for SSE).
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < NumElems; ++i) {
if (!Op.getOperand(i).isUndef())
Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
else
Ops[i] = DAG.getUNDEF(VT);
}
// Next, we iteratively mix elements, e.g. for v4f32:
// Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
// : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
// Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
// Generate scaled UNPCKL shuffle mask.
// The mask takes the low Scale elements of the first input, then the low
// Scale elements of the second input, and leaves the rest undef.
SmallVector<int, 16> Mask;
for(unsigned i = 0; i != Scale; ++i)
Mask.push_back(i);
for (unsigned i = 0; i != Scale; ++i)
Mask.push_back(NumElems+i);
Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
// Each pass halves the number of live entries: Ops[i] becomes the shuffle of
// Ops[2*i] and Ops[2*i+1].
for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
}
return Ops[0];
}
// Lower a CONCAT_VECTORS whose result is 256 or 512 bits wide.
//
// 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
// from two other 128-bit ones. Undef operands are ignored. When more than two
// operands are neither undef nor an all-zeros build vector, the node is split
// into two half-width CONCAT_VECTORS that are concatenated again. Otherwise
// the result is assembled with INSERT_SUBVECTOR on top of a zero vector (if
// any operand was all zeros) or an undef vector.
// TODO: Detect subvector broadcast here instead of DAG combine?
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();

  assert((ResVT.is256BitVector() ||
          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");

  const unsigned NumOperands = Op.getNumOperands();

  // Classify the operands. Bit k of NonZeroMask marks operand k as being
  // neither undef nor an all-zeros build vector.
  bool SawZeroOperand = false;
  unsigned NonZeroCount = 0;
  unsigned NonZeroMask = 0;
  for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
    SDValue SubVec = Op.getOperand(Idx);
    if (SubVec.isUndef())
      continue;
    if (ISD::isBuildVectorAllZeros(SubVec.getNode())) {
      SawZeroOperand = true;
      continue;
    }
    assert(Idx < sizeof(NonZeroMask) * CHAR_BIT); // Ensure the shift is in range.
    NonZeroMask |= 1 << Idx;
    ++NonZeroCount;
  }

  // With more than two interesting operands, concatenate each half on its own
  // and then join the two halves.
  if (NonZeroCount > 2) {
    MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
    ArrayRef<SDUse> Ops = Op->ops();
    const unsigned HalfOperands = NumOperands / 2;
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(0, HalfOperands));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(HalfOperands));
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  // Otherwise start from zero/undef and insert every marked operand at its
  // element offset.
  SDValue Vec;
  if (SawZeroOperand)
    Vec = getZeroVector(ResVT, Subtarget, DAG, dl);
  else
    Vec = DAG.getUNDEF(ResVT);

  MVT SubVT = Op.getOperand(0).getSimpleValueType();
  unsigned NumSubElems = SubVT.getVectorNumElements();
  for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
    if (NonZeroMask & (1 << Idx))
      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
                        Op.getOperand(Idx),
                        DAG.getIntPtrConstant(Idx * NumSubElems, dl));
  }
  return Vec;
}
// Lower a CONCAT_VECTORS node into INSERT_SUBVECTOR, KSHIFTL or narrower
// CONCAT_VECTORS nodes, depending on which operands are undef, all-zeros build
// vectors, or anything else. Returns the lowered value, or Op itself on the
// path that treats the node as already legal.
// NOTE(review): the vXi1 suffix suggests this is only reached for i1 (mask)
// vector results - confirm against the caller.
// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG & DAG) {
SDLoc dl(Op);
MVT ResVT = Op.getSimpleValueType();
unsigned NumOperands = Op.getNumOperands();
assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
"Unexpected number of operands in CONCAT_VECTORS");
// Bit i of Zeros is set when operand i is an all-zeros build vector; bit i of
// NonZeros is set for any other defined operand. Undef operands set neither.
uint64_t Zeros = 0;
uint64_t NonZeros = 0;
for (unsigned i = 0; i != NumOperands; ++i) {
SDValue SubVec = Op.getOperand(i);
if (SubVec.isUndef())
continue;
assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
Zeros |= (uint64_t)1 << i;
else
NonZeros |= (uint64_t)1 << i;
}
unsigned NumElems = ResVT.getVectorNumElements();
// If we are inserting non-zero vector and there are zeros in LSBs and undef
// in the MSBs we need to emit a KSHIFTL. The generic lowering to
// insert_subvector will give us two kshifts.
// The condition requires exactly one non-zero operand, placed above every
// zero operand, and not in the last operand position.
if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
Log2_64(NonZeros) != NumOperands - 1) {
// Shift in a different type when the result has fewer than 8 elements, or
// exactly 8 without DQI: v8i1 with DQI, v16i1 otherwise.
MVT ShiftVT = ResVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
unsigned Idx = Log2_64(NonZeros);
SDValue SubVec = Op.getOperand(Idx);
unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
// Place the subvector at element 0 of an undef ShiftVT vector, shift it left
// to its final element offset, then extract the ResVT-sized low part.
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
DAG.getUNDEF(ShiftVT), SubVec,
DAG.getIntPtrConstant(0, dl));
Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
DAG.getIntPtrConstant(0, dl));
}
// If there are zero or one non-zeros we can handle this very simply.
if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
// Base vector: zero if any operand was all zeros, undef otherwise.
SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
if (!NonZeros)
return Vec;
unsigned Idx = Log2_64(NonZeros);
SDValue SubVec = Op.getOperand(Idx);
unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
}
// Two or more non-zero operands out of more than two operands: concatenate
// each half separately and join the halves.
if (NumOperands > 2) {
MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
ArrayRef<SDUse> Ops = Op->ops();
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(0, NumOperands/2));
SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(NumOperands/2));
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}
// Only the two-operand case with both operands non-zero remains.
assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
if (ResVT.getVectorNumElements() >= 16)
return Op; // The operation is legal with KUNPCK
// Narrower results: insert operand 0 at element 0 and operand 1 at the
// midpoint of an undef vector.
SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
DAG.getUNDEF(ResVT), Op.getOperand(0),
DAG.getIntPtrConstant(0, dl));
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
DAG.getIntPtrConstant(NumElems/2, dl));
}
// Entry point for CONCAT_VECTORS lowering: results with i1 elements are
// routed to LowerCONCAT_VECTORSvXi1, everything else to
// LowerAVXCONCAT_VECTORS.
static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT ResVT = Op.getSimpleValueType();

  const bool HasI1Elements = ResVT.getVectorElementType() == MVT::i1;
  if (HasI1Elements)
    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);

  // AVX can use the vinsertf128 instruction to create 256-bit vectors
  // from two other 128-bit ones.
  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
  assert((ResVT.is256BitVector() && Op.getNumOperands() == 2) ||
         (ResVT.is512BitVector() && (Op.getNumOperands() == 2 ||
                                     Op.getNumOperands() == 4)));

  return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
}
//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns while staying within
// a framework that allows reasonably efficient handling of all vector shuffle
// patterns.
//===----------------------------------------------------------------------===//
/// Return true if applying \p Mask would leave every element where it is.
///
/// \p Mask is assumed to be a single-input shuffle mask of the kind used by
/// the X86 shuffle instructions (not a fully general ShuffleVectorSDNode
/// mask). An entry is a no-op when it is undef (negative) or when it selects
/// its own position.
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
  const int NumElts = Mask.size();
  for (int Pos = 0; Pos < NumElts; ++Pos) {
    const int M = Mask[Pos];
    assert(M >= -1 && "Out of bound mask element!");
    if (M < 0)
      continue; // Undef never forces a shuffle.
    if (M != Pos)
      return false;
  }
  return true;
}
/// Return true if any defined element of \p Mask is taken from a different
/// lane than the one it is written to, where a lane is \p LaneSizeInBits wide
/// and each element is \p ScalarSizeInBits wide.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these. Indices are reduced modulo the mask size
/// first, so elements of a second input are compared by their position within
/// that input.
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
                                      unsigned ScalarSizeInBits,
                                      ArrayRef<int> Mask) {
  assert(LaneSizeInBits && ScalarSizeInBits &&
         (LaneSizeInBits % ScalarSizeInBits) == 0 &&
         "Illegal shuffle lane size");
  const int EltsPerLane = LaneSizeInBits / ScalarSizeInBits;
  const int NumElts = Mask.size();
  for (int Dst = 0; Dst < NumElts; ++Dst) {
    const int Src = Mask[Dst];
    if (Src < 0)
      continue; // Negative entries cannot cross a lane.
    const int SrcLane = (Src % NumElts) / EltsPerLane;
    const int DstLane = Dst / EltsPerLane;
    if (SrcLane != DstLane)
      return true;
  }
  return false;
}
/// Return true if \p Mask moves any element across a 128-bit lane boundary,
/// using the scalar size of \p VT as the element width.
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
  const unsigned LaneSizeInBits = 128;
  return isLaneCrossingShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
                                   Mask);
}
/// Return true if \p Mask performs the same lane-relative shuffle in every
/// lane of \p LaneSizeInBits bits.
///
/// A repeated mask is necessarily not lane-crossing, but it may still blend in
/// elements from the same lane of a second input vector.
///
/// \p RepeatedMask is overwritten with the per-lane pattern (one entry per
/// element of a lane, -1 where every lane left the slot undef). Entries that
/// come from the second vector are remapped to [LaneSize, 2*LaneSize), which
/// makes the result usable with existing single-lane shuffle helpers. On a
/// false return \p RepeatedMask may be only partially filled in.
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                  ArrayRef<int> Mask,
                                  SmallVectorImpl<int> &RepeatedMask) {
  const auto EltsPerLane = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(EltsPerLane, -1);

  const int NumElts = Mask.size();
  for (int Dst = 0; Dst < NumElts; ++Dst) {
    const int Src = Mask[Dst];
    assert(Src == SM_SentinelUndef || Src >= 0);
    if (Src < 0)
      continue;

    // An entry that crosses lanes cannot be modelled as a repeated mask.
    if ((Src % NumElts) / EltsPerLane != Dst / EltsPerLane)
      return false;

    // Reduce the index to its position inside the lane; second-vector
    // indices are rebased to start at EltsPerLane instead of NumElts.
    int LocalSrc = Src % EltsPerLane;
    if (Src >= NumElts)
      LocalSrc += EltsPerLane;

    // The first defined entry seen for a slot fixes the pattern; every later
    // lane has to agree with it.
    int &Slot = RepeatedMask[Dst % EltsPerLane];
    if (Slot < 0)
      Slot = LocalSrc;
    else if (Slot != LocalSrc)
      return false;
  }
  return true;
}
/// Return true if \p Mask repeats the same pattern in every 128-bit lane; the
/// per-lane pattern is written to \p RepeatedMask.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  const unsigned LaneSizeInBits = 128;
  return isRepeatedShuffleMask(LaneSizeInBits, VT, Mask, RepeatedMask);
}
/// Convenience overload for callers that only need the yes/no answer: the
/// per-lane pattern is computed into a scratch vector and discarded.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
  SmallVector<int, 32> Scratch;
  const unsigned LaneSizeInBits = 128;
  return isRepeatedShuffleMask(LaneSizeInBits, VT, Mask, Scratch);
}
/// Return true if \p Mask repeats the same pattern in every 256-bit lane; the
/// per-lane pattern is written to \p RepeatedMask.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  const unsigned LaneSizeInBits = 256;
  return isRepeatedShuffleMask(LaneSizeInBits, VT, Mask, RepeatedMask);
}
/// Target-shuffle variant of isRepeatedShuffleMask: return true if \p Mask
/// performs the same lane-relative shuffle in every \p LaneSizeInBits lane.
///
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero: a zero entry
/// is only compatible with a slot that is still undef or already zero, and it
/// is recorded as SM_SentinelZero in \p RepeatedMask. Slots that no lane
/// defines stay SM_SentinelUndef; second-vector indices are remapped to
/// [LaneSize, 2*LaneSize).
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                        ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &RepeatedMask) {
  const int EltsPerLane = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(EltsPerLane, SM_SentinelUndef);

  const int NumElts = Mask.size();
  for (int Dst = 0; Dst < NumElts; ++Dst) {
    const int Src = Mask[Dst];
    assert(isUndefOrZero(Src) || (Src >= 0));
    if (Src == SM_SentinelUndef)
      continue;

    int &Slot = RepeatedMask[Dst % EltsPerLane];

    // A zero entry may only land on a slot that is still undef or zero.
    if (Src == SM_SentinelZero) {
      if (!isUndefOrZero(Slot))
        return false;
      Slot = SM_SentinelZero;
      continue;
    }

    // An entry that crosses lanes cannot be modelled as a repeated mask.
    if ((Src % NumElts) / EltsPerLane != Dst / EltsPerLane)
      return false;

    // Reduce the index to its position inside the lane; second-vector
    // indices are rebased to start at EltsPerLane instead of NumElts.
    int LocalSrc = Src % EltsPerLane;
    if (Src >= NumElts)
      LocalSrc += EltsPerLane;

    // The first defined entry seen for a slot fixes the pattern; every later
    // lane has to agree with it.
    if (Slot == SM_SentinelUndef)
      Slot = LocalSrc;
    else if (Slot != LocalSrc)
      return false;
  }
  return true;
}
/// Checks whether a shuffle mask is equivalent to an explicit expected mask.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the expected mask, and
/// each element of the mask is either -1 (signifying undef), equal to the
/// expected value, or - when the inputs involved are BUILD_VECTOR nodes -
/// refers to a build_vector operand identical to the one the expected value
/// refers to.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
                                ArrayRef<int> ExpectedMask) {
  if (Mask.size() != ExpectedMask.size())
    return false;

  const int NumElts = Mask.size();

  // If the values are build vectors, we can look through them to find
  // equivalent inputs that make the shuffles equivalent.
  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

  for (int Pos = 0; Pos < NumElts; ++Pos) {
    const int M = Mask[Pos];
    const int Expected = ExpectedMask[Pos];
    assert(M >= -1 && "Out of bound mask element!");

    // Undef and exact matches need no further checks.
    if (M < 0 || M == Expected)
      continue;

    // The indices differ; this is still fine if both name the same scalar
    // operand of the underlying build vectors.
    auto *MaskBV = M < NumElts ? BV1 : BV2;
    auto *ExpectedBV = Expected < NumElts ? BV1 : BV2;
    if (!MaskBV || !ExpectedBV)
      return false;
    if (MaskBV->getOperand(M % NumElts) !=
        ExpectedBV->getOperand(Expected % NumElts))
      return false;
  }
  return true;
}
/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width, and \p Mask must only contain
/// undef, zero or indices in [0, 2*Size); otherwise the result is false.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match,
/// or - when \p V1 / \p V2 are BUILD_VECTOR nodes with exactly Size operands -
/// must refer to identical build_vector operands.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
                                      ArrayRef<int> ExpectedMask,
                                      SDValue V1 = SDValue(),
                                      SDValue V2 = SDValue()) {
  const int NumElts = Mask.size();
  if (NumElts != static_cast<int>(ExpectedMask.size()))
    return false;
  assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * NumElts) &&
         "Illegal target shuffle mask");

  // Check for out-of-range target shuffle mask indices.
  if (!isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts))
    return false;

  // If the values are build vectors, we can look through them to find
  // equivalent inputs that make the shuffles equivalent. Build vectors whose
  // operand count differs from the mask width are ignored.
  auto *BV1 = dyn_cast_or_null<BuildVectorSDNode>(V1);
  auto *BV2 = dyn_cast_or_null<BuildVectorSDNode>(V2);
  if (BV1 && NumElts != static_cast<int>(BV1->getNumOperands()))
    BV1 = nullptr;
  if (BV2 && NumElts != static_cast<int>(BV2->getNumOperands()))
    BV2 = nullptr;

  for (int Pos = 0; Pos < NumElts; ++Pos) {
    const int M = Mask[Pos];
    const int Expected = ExpectedMask[Pos];
    if (M == SM_SentinelUndef || M == Expected)
      continue;

    // TODO - handle SM_Sentinel equivalences.
    if (M < 0 || Expected < 0)
      return false;

    // Two different real indices: accept them only if they name the same
    // scalar operand of the underlying build vectors.
    auto *MaskBV = M < NumElts ? BV1 : BV2;
    auto *ExpectedBV = Expected < NumElts ? BV1 : BV2;
    if (!MaskBV || !ExpectedBV)
      return false;
    if (MaskBV->getOperand(M % NumElts) ==
        ExpectedBV->getOperand(Expected % NumElts))
      continue;
    return false;
  }
  return true;
}
// Attempt to turn a VSELECT condition into an equivalent two-input shuffle
// mask. Succeeds only when Cond is a build vector of constants; Mask is then
// resized to the number of condition elements and lane i is set to i (take the
// first operand) or i + Size (take the second operand). Returns false and
// leaves Mask untouched otherwise.
static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
                                         SDValue Cond) {
  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
    return false;

  unsigned Size = Cond.getValueType().getVectorNumElements();
  Mask.resize(Size, SM_SentinelUndef);

  for (int Lane = 0; Lane != (int)Size; ++Lane) {
    SDValue CondElt = Cond.getOperand(Lane);
    // Arbitrarily choose from the 2nd operand if the select condition element
    // is undef.
    // TODO: Can we do better by matching patterns such as even/odd?
    const bool TakeSecond = CondElt.isUndef() || isNullConstant(CondElt);
    Mask[Lane] = Lane;
    if (TakeSecond)
      Mask[Lane] += Size;
  }
  return true;
}
// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions: VT must be v8i32 or v8f32, and Mask must be equivalent to the
// binary low or high unpack mask generated for v8i16.
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
  const bool IsV8x32 = VT == MVT::v8i32 || VT == MVT::v8f32;
  if (!IsV8x32)
    return false;

  SmallVector<int, 8> LoMask;
  createUnpackShuffleMask(MVT::v8i16, LoMask, /* Lo = */ true,
                          /* Unary = */ false);

  SmallVector<int, 8> HiMask;
  createUnpackShuffleMask(MVT::v8i16, HiMask, /* Lo = */ false,
                          /* Unary = */ false);

  if (isTargetShuffleEquivalent(Mask, LoMask))
    return true;
  return isTargetShuffleEquivalent(Mask, HiMask);
}
// Return true if Mask, or its commuted form, is equivalent to one of the four
// unpack masks (low/high x binary/unary) of a 128-bit integer vector type that
// has as many elements as Mask.
static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
  // Create 128-bit vector type based on mask size.
  const unsigned NumElts = Mask.size();
  MVT EltVT = MVT::getIntegerVT(128 / NumElts);
  MVT VT = MVT::getVectorVT(EltVT, NumElts);

  // We can't assume a canonical shuffle mask, so try the commuted version too.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);

  // Match any of unary/binary or low/high.
  for (unsigned Lo = 0; Lo != 2; ++Lo) {
    for (unsigned Unary = 0; Unary != 2; ++Unary) {
      SmallVector<int, 16> UnpackMask;
      createUnpackShuffleMask(VT, UnpackMask, Lo, Unary);
      if (isTargetShuffleEquivalent(Mask, UnpackMask))
        return true;
      if (isTargetShuffleEquivalent(CommutedMask, UnpackMask))
        return true;
    }
  }
  return false;
}
/// Return true if the lower and upper halves of \p Mask are element-for-element
/// identical. Any splat mask qualifies, for example. The comparison is exact:
/// an element that is undef in only one half makes the halves differ.
static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
  assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
  const unsigned HalfSize = Mask.size() / 2;
  for (unsigned Lo = 0, Hi = HalfSize; Lo != HalfSize; ++Lo, ++Hi)
    if (Mask[Lo] != Mask[Hi])
      return false;
  return true;
}
/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// Encodes \p Mask, which must have exactly four entries in [-1, 3], using the
/// ubiquitous x86 scheme for shuffling 4 lanes: the selector for lane i
/// occupies bits [2*i+1 : 2*i]. It can be used with most of the PSHUF
/// instructions for example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane, so an undef
/// entry in lane i is encoded as i.
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");

  unsigned Imm = 0;
  for (int Lane = 0; Lane != 4; ++Lane) {
    assert(Mask[Lane] >= -1 && Mask[Lane] < 4 && "Out of bound mask element!");
    const int Selector = Mask[Lane] < 0 ? Lane : Mask[Lane];
    Imm |= Selector << (2 * Lane);
  }
  return Imm;
}
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
                                          SelectionDAG &DAG) {
  // Encode the 4-lane mask as an immediate and wrap it in an i8 target
  // constant node.
  const unsigned Imm = getV4X86ShuffleImm(Mask);
  return DAG.getTargetConstant(Imm, DL, MVT::i8);
}
// Check for a shuffle whose result has the form
//   0* a[0] 0* a[1] ... 0* a[n], n >= 0,
// i.e. the elements of a[] appear in ascending order with any number of
// zeros in between. Bit i of Zeroable corresponds to Mask[i]; see
// computeZeroableShuffleElements.
//
// Returns false if any mask element is undef. Otherwise returns true when
// the elements not flagged in Zeroable form a run of consecutive indices
// that starts either at 0 or at the element count of VectorType. When at
// least one such element exists, IsZeroSideLeft is set to true if the run
// starts at the element count and to false if it starts at 0; otherwise it
// is left untouched.
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
                                     ArrayRef<int> Mask, const EVT &VectorType,
                                     bool &IsZeroSideLeft) {
  // Index the next non-zeroable element must have; negative until the first
  // one has been seen.
  int Expected = -1;
  const int NumMaskElts = Mask.size();
  for (int Idx = 0; Idx != NumMaskElts; ++Idx) {
    const int M = Mask[Idx];
    assert(M >= -1 && "Out of bound mask element!");
    // Undef elements are not accepted.
    if (M < 0)
      return false;
    if (!Zeroable[Idx]) {
      // The first non-zeroable element decides where the run starts.
      if (Expected < 0) {
        if (M != 0)
          Expected = VectorType.getVectorNumElements();
        else
          Expected = 0;
        IsZeroSideLeft = Expected != 0;
      }
      // Every non-zeroable element must continue the run.
      if (M != Expected)
        return false;
      ++Expected;
    }
  }
  return true;
}
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
///
/// A byte-level control vector is built from \p Mask: undef elements become
/// undef bytes, zeroable elements become 0x80 and everything else becomes the
/// in-lane byte index. Returns an empty SDValue if the non-zeroable elements
/// use both inputs or reference a different 128-bit lane than the one they
/// are written to.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                      ArrayRef<int> Mask, SDValue V1,
                                      SDValue V2, const APInt &Zeroable,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
         (Subtarget.hasBWI() && VT.is512BitVector()));

  const int NumMaskElts = Mask.size();
  const int EltsPerLane = 128 / VT.getScalarSizeInBits();
  const int NumBytes = VT.getSizeInBits() / 8;
  const int BytesPerElt = VT.getScalarSizeInBits() / 8;

  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
  // Sign bit set in i8 mask means zero element.
  SDValue ZeroByte = DAG.getConstant(0x80, DL, MVT::i8);

  // The single input the non-zeroable elements read from; null until found.
  SDValue Input;
  for (int ByteIdx = 0; ByteIdx != NumBytes; ++ByteIdx) {
    const int EltIdx = ByteIdx / BytesPerElt;
    int M = Mask[EltIdx];
    if (M < 0) {
      PSHUFBMask[ByteIdx] = DAG.getUNDEF(MVT::i8);
      continue;
    }
    if (Zeroable[EltIdx]) {
      PSHUFBMask[ByteIdx] = ZeroByte;
      continue;
    }

    // Only one of V1/V2 may feed the shuffle.
    SDValue Src = (M >= NumMaskElts ? V2 : V1);
    if (Input && Input != Src)
      return SDValue();
    Input = Src;
    M %= NumMaskElts;

    // PSHUFB can't cross lanes, ensure this doesn't happen.
    if ((M / EltsPerLane) != (EltIdx / EltsPerLane))
      return SDValue();

    // Convert the in-lane element index to the index of this byte.
    const int ByteSel =
        (M % EltsPerLane) * BytesPerElt + (ByteIdx % BytesPerElt);
    PSHUFBMask[ByteIdx] = DAG.getConstant(ByteSel, DL, MVT::i8);
  }
  assert(Input && "Failed to find a source input");

  MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
  return DAG.getBitcast(
      VT, DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, DAG.getBitcast(ByteVT, Input),
                      DAG.getBuildVector(ByteVT, DL, PSHUFBMask)));
}
// Forward declaration: lowerShuffleToEXPAND below calls getMaskNode, so it
// has to be declared before that use.
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl);
// Lower a shuffle to X86ISD::EXPAND (VEXPAND) when its non-zeroable elements
// appear in order, as checked by isNonZeroElementsInOrder. The expand mask
// has a bit set for every element that is not zeroable. Returns an empty
// SDValue if the mask does not have that shape.
static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
                                    const APInt &Zeroable,
                                    ArrayRef<int> Mask, SDValue &V1,
                                    SDValue &V2, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  bool ZerosOnLeft = true;
  const bool InOrder = isNonZeroElementsInOrder(
      Zeroable, Mask, V1.getValueType(), ZerosOnLeft);
  if (!InOrder)
    return SDValue();

  unsigned NumElts = VT.getVectorNumElements();
  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
         "Unexpected number of vector elements");

  // Materialise the mask as an integer constant of at least 8 bits, then
  // turn it into an i1 vector with one bit per element.
  unsigned KeepBits = (~Zeroable).getZExtValue();
  MVT MaskIntVT = MVT::getIntegerVT(std::max((int)NumElts, 8));
  SDValue MaskConst = DAG.getConstant(KeepBits, DL, MaskIntVT);
  MVT MaskVecVT = MVT::getVectorVT(MVT::i1, NumElts);
  SDValue VMask = getMaskNode(MaskConst, MaskVecVT, Subtarget, DAG, DL);

  SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
  // The source to expand is V2 when the run starts at V2's first element,
  // otherwise V1.
  SDValue Source = ZerosOnLeft ? V2 : V1;
  return DAG.getNode(X86ISD::EXPAND, DL, VT, Source, Zeros, VMask);
}
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
                                  unsigned &UnpackOpcode, bool IsUnary,
                                  ArrayRef<int> TargetMask, const SDLoc &DL,
                                  SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  // Try to match TargetMask as an UNPCKL/UNPCKH pattern. On success,
  // UnpackOpcode is set and V1/V2 are rewritten to the operands the unpack
  // node should use; on failure V1/V2/UnpackOpcode are left untouched.
  const int NumElts = VT.getVectorNumElements();

  // Classify the even mask positions (first unpack operand) and the odd ones
  // (second unpack operand): entirely undef, or entirely undef-or-zero?
  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
  for (int Pair = 0; Pair != NumElts; Pair += 2) {
    const int Even = TargetMask[Pair];
    const int Odd = TargetMask[Pair + 1];
    if (Even != SM_SentinelUndef)
      Undef1 = false;
    if (Odd != SM_SentinelUndef)
      Undef2 = false;
    if (!isUndefOrZero(Even))
      Zero1 = false;
    if (!isUndefOrZero(Odd))
      Zero2 = false;
  }
  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
         "Zeroable shuffle detected");

  // Operand rewrite shared by the direct lo/hi matches. V2 is updated first
  // because, for unary shuffles, it is derived from the incoming V1.
  auto UseMatchedOperands = [&]() {
    if (Undef2)
      V2 = DAG.getUNDEF(VT);
    else if (IsUnary)
      V2 = V1;
    if (Undef1)
      V1 = DAG.getUNDEF(VT);
  };

  // Attempt to match the target mask against the unpack lo/hi mask patterns.
  SmallVector<int, 64> LoPattern, HiPattern;
  createUnpackShuffleMask(VT, LoPattern, /* Lo = */ true, IsUnary);
  if (isTargetShuffleEquivalent(TargetMask, LoPattern)) {
    UnpackOpcode = X86ISD::UNPCKL;
    UseMatchedOperands();
    return true;
  }

  createUnpackShuffleMask(VT, HiPattern, /* Lo = */ false, IsUnary);
  if (isTargetShuffleEquivalent(TargetMask, HiPattern)) {
    UnpackOpcode = X86ISD::UNPCKH;
    UseMatchedOperands();
    return true;
  }

  // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
  if (IsUnary && (Zero1 || Zero2)) {
    // Don't bother if we can blend instead.
    const bool HasBlend =
        Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64;
    if (HasBlend &&
        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
      return false;

    bool MatchLo = true, MatchHi = true;
    for (int Idx = 0; (Idx != NumElts) && (MatchLo || MatchHi); ++Idx) {
      const int M = TargetMask[Idx];
      // Ignore if the input is known to be zero or the index is undef.
      const bool OperandIsZero = (Idx & 1) ? Zero2 : Zero1;
      if (OperandIsZero || M == SM_SentinelUndef)
        continue;
      MatchLo &= (M == LoPattern[Idx]);
      MatchHi &= (M == HiPattern[Idx]);
    }

    if (MatchLo || MatchHi) {
      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
      // As above, V2 is computed before V1 is overwritten.
      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      return true;
    }
  }

  // If a binary shuffle, commute and try again.
  if (!IsUnary) {
    ShuffleVectorSDNode::commuteMask(LoPattern);
    if (isTargetShuffleEquivalent(TargetMask, LoPattern)) {
      UnpackOpcode = X86ISD::UNPCKL;
      std::swap(V1, V2);
      return true;
    }

    ShuffleVectorSDNode::commuteMask(HiPattern);
    if (isTargetShuffleEquivalent(TargetMask, HiPattern)) {
      UnpackOpcode = X86ISD::UNPCKH;
      std::swap(V1, V2);
      return true;
    }
  }

  return false;
}
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
//
// Emits an UNPCKL/UNPCKH node when Mask is equivalent to the binary unpack
// low/high pattern for VT, either with the operands as given or swapped.
// Returns an empty SDValue when none of the four patterns match.
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
                                     ArrayRef<int> Mask, SDValue V1, SDValue V2,
                                     SelectionDAG &DAG) {
  // Operands in their given order.
  SmallVector<int, 8> LoPattern;
  createUnpackShuffleMask(VT, LoPattern, /* Lo = */ true, /* Unary = */ false);
  const bool IsLo = isShuffleEquivalent(V1, V2, Mask, LoPattern);
  if (IsLo)
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

  SmallVector<int, 8> HiPattern;
  createUnpackShuffleMask(VT, HiPattern, /* Lo = */ false, /* Unary = */ false);
  const bool IsHi = isShuffleEquivalent(V1, V2, Mask, HiPattern);
  if (IsHi)
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

  // Commute and try again.
  ShuffleVectorSDNode::commuteMask(LoPattern);
  const bool IsSwappedLo = isShuffleEquivalent(V1, V2, Mask, LoPattern);
  if (IsSwappedLo)
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

  ShuffleVectorSDNode::commuteMask(HiPattern);
  const bool IsSwappedHi = isShuffleEquivalent(V1, V2, Mask, HiPattern);
  if (IsSwappedHi)
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

  return SDValue();
}
static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
                                int Delta) {
  const int NumMaskElts = (int)Mask.size();
  const int PrefixLen = NumMaskElts / Delta;
  // Mask indices of the truncated vector start at 0, or at the mask size when
  // the operands were swapped.
  const int TruncBase = SwappedOps ? NumMaskElts : 0;

  // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
  const bool PrefixMatches =
      isSequentialOrUndefInRange(Mask, 0, PrefixLen, TruncBase, Delta);

  // The rest of the mask should not refer to the truncated vector's elements.
  return PrefixMatches &&
         !isAnyInRange(Mask.slice(PrefixLen, NumMaskElts - PrefixLen),
                       TruncBase, TruncBase + NumMaskElts);
}
// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
//
// An example is the following:
//
// t0: ch = EntryToken
// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
// t25: v4i32 = truncate t2
// t41: v8i16 = bitcast t25
// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
// t18: v2i64 = bitcast t51
//
// Without avx512vl, this is lowered to:
//
// vpmovqd %zmm0, %ymm0
// vpshufb {{.*#+}} xmm0 =
// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
//
// But when avx512vl is available, one can just use a single vpmovdw
// instruction.
//
// Only v16i8 and v8i16 results are handled, and one of the two operands has
// to be an all-zeros build vector. Returns an empty SDValue otherwise.
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
                                     MVT VT, SDValue V1, SDValue V2,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if ((VT != MVT::v16i8 && VT != MVT::v8i16) ||
      Mask.size() != VT.getVectorNumElements())
    return SDValue();

  // Arrange for the all-zeros operand to be V2, remembering whether the
  // operands had to be exchanged to get there.
  const bool V2IsZeros = ISD::isBuildVectorAllZeros(V2.getNode());
  if (!V2IsZeros && !ISD::isBuildVectorAllZeros(V1.getNode()))
    return SDValue();
  const bool SwappedOps = !V2IsZeros;
  if (SwappedOps)
    std::swap(V1, V2);

  // Look for:
  //
  // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
  // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
  //
  // and similar ones.
  if (V1.getOpcode() != ISD::BITCAST ||
      V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
    return SDValue();

  SDValue Src = V1.getOperand(0).getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();

  // The vptrunc** instructions truncating 128 bit and 256 bit vectors
  // are only available with avx512vl.
  if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
    return SDValue();

  // Down Convert Word to Byte is only available with avx512bw. The case with
  // 256-bit output doesn't contain a shuffle and is therefore not handled here.
  const bool IsWordToByte =
      SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8;
  if (IsWordToByte && !Subtarget.hasBWI())
    return SDValue();

  // The first half/quarter of the mask should refer to every second/fourth
  // element of the vector truncated and bitcasted.
  const bool MaskMatches = matchShuffleAsVPMOV(Mask, SwappedOps, 2) ||
                           matchShuffleAsVPMOV(Mask, SwappedOps, 4);
  if (!MaskMatches)
    return SDValue();

  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
}
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
//
// Tries to match TargetMask as the binary, then the unary, pack pattern for
// VT. On success V1/V2 are replaced by the inputs bitcast to the
// double-width element type, SrcVT receives that type and PackOpcode the
// chosen opcode.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
                                 unsigned &PackOpcode, ArrayRef<int> TargetMask,
                                 SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned BitSize = VT.getScalarSizeInBits();
  // The pack source has half as many elements, each twice as wide.
  MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
  MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);

  // Decide whether the pair (N1, N2) can be packed, and with which opcode.
  auto MatchPACK = [&](SDValue N1, SDValue N2) {
    SDValue Wide1 = DAG.getBitcast(PackVT, N1);
    SDValue Wide2 = DAG.getBitcast(PackVT, N2);

    // Publish the result through the output parameters.
    auto Accept = [&](unsigned Opcode) {
      V1 = Wide1;
      V2 = Wide2;
      SrcVT = PackVT;
      PackOpcode = Opcode;
      return true;
    };

    // PACKUS: the upper half of every wide element must be known zero.
    if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
      APInt UpperHalf = APInt::getHighBitsSet(BitSize * 2, BitSize);
      if ((N1.isUndef() || DAG.MaskedValueIsZero(Wide1, UpperHalf)) &&
          (N2.isUndef() || DAG.MaskedValueIsZero(Wide2, UpperHalf)))
        return Accept(X86ISD::PACKUS);
    }

    // PACKSS: every wide element must have more than BitSize sign bits.
    if ((N1.isUndef() || DAG.ComputeNumSignBits(Wide1) > BitSize) &&
        (N2.isUndef() || DAG.ComputeNumSignBits(Wide2) > BitSize))
      return Accept(X86ISD::PACKSS);

    return false;
  };

  // Try binary shuffle.
  SmallVector<int, 32> BinaryPattern;
  createPackShuffleMask(VT, BinaryPattern, false);
  if (isTargetShuffleEquivalent(TargetMask, BinaryPattern, V1, V2) &&
      MatchPACK(V1, V2))
    return true;

  // Try unary shuffle.
  SmallVector<int, 32> UnaryPattern;
  createPackShuffleMask(VT, UnaryPattern, true);
  if (isTargetShuffleEquivalent(TargetMask, UnaryPattern, V1) &&
      MatchPACK(V1, V1))
    return true;

  return false;
}
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
                                    SDValue V1, SDValue V2, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  // Emit the pack node chosen by matchShuffleWithPACK, or an empty SDValue if
  // the mask does not match a pack pattern.
  MVT PackVT;
  unsigned PackOpcode;
  const bool Matched = matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode,
                                            Mask, DAG, Subtarget);
  if (!Matched)
    return SDValue();

  SDValue Lhs = DAG.getBitcast(PackVT, V1);
  SDValue Rhs = DAG.getBitcast(PackVT, V2);
  return DAG.getNode(PackOpcode, DL, VT, Lhs, Rhs);
}
/// Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable: every non-zeroable element must stay in
/// its own position and come from the same input, which is then ANDed with a
/// constant that is all-ones in those positions and zero elsewhere. Returns
/// an empty SDValue if the mask is not of that form.
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
                                     SDValue V2, ArrayRef<int> Mask,
                                     const APInt &Zeroable,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  MVT EltVT = VT.getVectorElementType();
  MVT MaskVT = VT;
  // Use f64 if i64 isn't legal.
  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
    EltVT = MVT::f64;
    MaskVT = MVT::getVectorVT(EltVT, Mask.size());
  }

  // Build the per-element zero / all-ones constants. For floating point
  // elements the AND itself is performed on the same-width integer vector.
  MVT LogicVT = VT;
  SDValue Zero, AllOnes;
  const bool IsFPElt = EltVT == MVT::f32 || EltVT == MVT::f64;
  if (!IsFPElt) {
    Zero = DAG.getConstant(0, DL, EltVT);
    AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  } else {
    Zero = DAG.getConstantFP(0.0, DL, EltVT);
    AllOnes = DAG.getConstantFP(
        APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT);
    LogicVT =
        MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
  }

  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
  // The one input allowed through the mask; null until discovered.
  SDValue Kept;
  for (int Idx = 0, Size = Mask.size(); Idx < Size; ++Idx) {
    if (Zeroable[Idx])
      continue;
    if (Mask[Idx] % Size != Idx)
      return SDValue(); // Not a blend.
    SDValue Src = Mask[Idx] < Size ? V1 : V2;
    if (Kept && Kept != Src)
      return SDValue(); // Can only let one input through the mask.
    Kept = Src;
    VMaskOps[Idx] = AllOnes;
  }
  if (!Kept)
    return SDValue(); // No non-zeroable elements!

  SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
  VMask = DAG.getBitcast(LogicVT, VMask);
  Kept = DAG.getBitcast(LogicVT, Kept);
  SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, Kept, VMask);
  return DAG.getBitcast(VT, And);
}
/// Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
///
/// The result is (V1 & M) | ANDNP(M, V2), where M is all-ones in the
/// positions whose mask index is below the mask size and zero in the others.
/// Returns an empty SDValue if any defined element moves to another position.
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                      SDValue V2, ArrayRef<int> Mask,
                                      SelectionDAG &DAG) {
  assert(VT.isInteger() && "Only supports integer vector types!");
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);

  SmallVector<SDValue, 16> SelectOps;
  for (int Idx = 0, Size = Mask.size(); Idx < Size; ++Idx) {
    const int M = Mask[Idx];
    const bool StaysInPlace = M < 0 || M == Idx || M == Idx + Size;
    if (!StaysInPlace)
      return SDValue(); // Shuffled input!
    SelectOps.push_back(M < Size ? AllOnes : Zero);
  }

  SDValue V1Mask = DAG.getBuildVector(VT, DL, SelectOps);
  SDValue FromV1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
  SDValue FromV2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
  return DAG.getNode(ISD::OR, DL, VT, FromV1, FromV2);
}
// Forward declaration: lowerShuffleAsBlend below calls getVectorMaskingNode,
// so it has to be declared before that use.
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG);
static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
                                MutableArrayRef<int> Mask,
                                const APInt &Zeroable, bool &ForceV1Zero,
                                bool &ForceV2Zero, uint64_t &BlendMask) {
  // Check whether Mask is a per-element blend of V1 and V2 and compute the
  // blend immediate: bit i of BlendMask is set when element i is taken from
  // V2. Zeroable elements that are not already in place may be redirected to
  // an input that is undef or an all-zeros build vector; Mask is updated in
  // place and the matching Force*Zero flag is raised. The three outputs are
  // reset on entry; on failure they and Mask may be partially updated.
  const bool V1IsZeroOrUndef =
      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
  const bool V2IsZeroOrUndef =
      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

  BlendMask = 0;
  ForceV1Zero = false;
  ForceV2Zero = false;
  assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");

  // Attempt to generate the binary blend mask. If an input is zero then
  // we can use any lane.
  for (int Idx = 0, Size = Mask.size(); Idx < Size; ++Idx) {
    const int M = Mask[Idx];
    // Undef elements and elements already in place in V1 need no bit.
    if (M == SM_SentinelUndef || M == Idx)
      continue;
    // Element in place in V2.
    if (M == Idx + Size) {
      BlendMask |= 1ull << Idx;
      continue;
    }
    // Anything else is only acceptable when the element is zeroable and one
    // of the inputs can supply the zero.
    if (!Zeroable[Idx])
      return false;
    if (V1IsZeroOrUndef) {
      ForceV1Zero = true;
      Mask[Idx] = Idx;
      continue;
    }
    if (V2IsZeroOrUndef) {
      ForceV2Zero = true;
      BlendMask |= 1ull << Idx;
      Mask[Idx] = Idx + Size;
      continue;
    }
    return false;
  }
  return true;
}
static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
                                            int Scale) {
  // Widen each of the low Size bits of BlendMask into a run of Scale
  // identical bits; bit Idx of the input controls output bits
  // [Idx * Scale, (Idx + 1) * Scale).
  uint64_t Widened = 0;
  for (int Idx = 0; Idx != Size; ++Idx) {
    const bool BitSet = (BlendMask >> Idx) & 1;
    if (!BitSet)
      continue;
    const uint64_t Run = (1ull << Scale) - 1;
    Widened |= Run << (Idx * Scale);
  }
  return Widened;
}
/// Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
///
/// Returns an empty SDValue when matchShuffleAsBlend rejects the mask.
/// Depending on \p VT the blend is emitted as an X86ISD::BLENDI node, an AND
/// with a constant mask, a masked move via getVectorMaskingNode, or a byte
/// vector select.
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Original,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
// Work on a private copy of the mask: matchShuffleAsBlend rewrites the
// entries of zeroable elements it redirects to a zero input.
SmallVector<int, 64> Mask(Original.begin(), Original.end());
if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
BlendMask))
return SDValue();
// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
if (ForceV1Zero)
V1 = getZeroVector(VT, Subtarget, DAG, DL);
if (ForceV2Zero)
V2 = getZeroVector(VT, Subtarget, DAG, DL);
// Select the blend form by vector type. The first groups of cases only add
// subtarget asserts and then fall through to the shared BLENDI emission.
switch (VT.SimpleTy) {
case MVT::v4i64:
case MVT::v8i32:
assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
LLVM_FALLTHROUGH;
case MVT::v4f64:
case MVT::v8f32:
assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
LLVM_FALLTHROUGH;
case MVT::v2f64:
case MVT::v2i64:
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
DAG.getTargetConstant(BlendMask, DL, MVT::i8));
case MVT::v16i16: {
assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// We can lower these with PBLENDW which is mirrored across 128-bit lanes.
assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
// Rebuild the immediate from the 8-element per-lane mask: a repeated index
// of 8 or more selects V2.
BlendMask = 0;
for (int i = 0; i < 8; ++i)
if (RepeatedMask[i] >= 8)
BlendMask |= 1ull << i;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getTargetConstant(BlendMask, DL, MVT::i8));
}
// Use PBLENDW for lower/upper lanes and then blend lanes.
// TODO - we should allow 2 PBLENDW here and leave shuffle combine to
// merge to VSELECT where useful.
uint64_t LoMask = BlendMask & 0xFF;
uint64_t HiMask = (BlendMask >> 8) & 0xFF;
if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getTargetConstant(LoMask, DL, MVT::i8));
SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getTargetConstant(HiMask, DL, MVT::i8));
// Elements 0-7 come from Lo and elements 8-15 from Hi.
return DAG.getVectorShuffle(
MVT::v16i16, DL, Lo, Hi,
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
}
// Otherwise fall through and handle v16i16 like the byte-blend types.
LLVM_FALLTHROUGH;
}
case MVT::v32i8:
assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
LLVM_FALLTHROUGH;
case MVT::v16i8: {
assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Masked;
// With both BWI and VLX, blend through getVectorMaskingNode, passing
// BlendMask as an integer constant of at least 8 bits.
if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
// Scale the blend by the number of bytes per element.
int Scale = VT.getScalarSizeInBits() / 8;
// This form of blend is always done on bytes. Compute the byte vector
// type.
MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
// x86 allows load folding with blendvb from the 2nd source operand. But
// we are still using LLVM select here (see comment below), so that's V1.
// If V2 can be load-folded and V1 cannot be load-folded, then commute to
// allow that load-folding possibility.
if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(V1, V2);
}
// Compute the VSELECT mask. Note that VSELECT is really confusing in the
// mix of LLVM's code generator and the x86 backend. We tell the code
// generator that boolean values in the elements of an x86 vector register
// are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
// mapping a select to operand #1, and 'false' mapping to operand #2. The
// reality in x86 is that vector masks (pre-AVX-512) use only the high bit
// of the element (the remaining are ignored) and 0 in that high bit would
// mean operand #1 while 1 in the high bit would mean operand #2. So while
// the LLVM model for boolean values in vector elements gets the relevant
// bit set, it is set backwards and over constrained relative to x86's
// actual model.
// Each mask element expands to Scale identical condition bytes.
SmallVector<SDValue, 32> VSELECTMask;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
for (int j = 0; j < Scale; ++j)
VSELECTMask.push_back(
Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
: DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
MVT::i8));
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
return DAG.getBitcast(
VT,
DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
V1, V2));
}
case MVT::v16f32:
case MVT::v8f64:
case MVT::v8i64:
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8: {
// Attempt to lower to a bitmask if we can. Only if not optimizing for size.
bool OptForSize = DAG.shouldOptForSize();
if (!OptForSize) {
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Masked;
}
// Otherwise load an immediate into a GPR, cast to k-register, and use a
// masked move.
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
default:
llvm_unreachable("Not a supported integer vector type!");
}
}
/// Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation. It fails (returning
/// an empty SDValue) when the same position of both inputs is needed, since
/// a blend can keep only one of them. With \p ImmBlends set, shuffles of
/// 8-bit elements are additionally rejected unless the blend mask passes
/// canWidenShuffleElements.
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             SelectionDAG &DAG,
                                             bool ImmBlends = false) {
  // We build up the blend mask while checking whether a blend is a viable way
  // to reduce the shuffle.
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

  for (int Idx = 0, Size = Mask.size(); Idx < Size; ++Idx) {
    const int M = Mask[Idx];
    if (M < 0)
      continue;
    assert(M < Size * 2 && "Shuffle input is out of bounds.");

    // The blend keeps every element at its original position in its input.
    const int Slot = M % Size;
    int &BlendElt = BlendMask[Slot];
    if (BlendElt < 0)
      BlendElt = M;
    else if (BlendElt != M)
      return SDValue(); // Can't blend in the needed input!

    PermuteMask[Idx] = Slot;
  }

  // If only immediate blends, then bail if the blend mask can't be widened to
  // i16.
  unsigned EltSize = VT.getScalarSizeInBits();
  if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
    return SDValue();

  SDValue Blended = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
  return DAG.getVectorShuffle(VT, DL, Blended, DAG.getUNDEF(VT), PermuteMask);
}
/// Try to lower as an unpack of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can unpack elements from two inputs and
/// then reduce the shuffle to a single-input (wider) permutation. Returns an
/// empty SDValue when the mask cannot be expressed that way.
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             SelectionDAG &DAG) {
  const int NumElts = Mask.size();
  const int NumLanes = VT.getSizeInBits() / 128;
  const int EltsPerLane = NumElts / NumLanes;
  const int EltsPerHalfLane = EltsPerLane / 2;

  bool MatchLo = true, MatchHi = true;
  // Ops[0] feeds the even result positions, Ops[1] the odd ones.
  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};

  // Determine UNPCKL/UNPCKH type and operand order.
  for (int Lane = 0; Lane != NumElts; Lane += EltsPerLane) {
    for (int Elt = 0; Elt != EltsPerLane; ++Elt) {
      const int M = Mask[Lane + Elt];
      if (M < 0)
        continue;

      // All even (resp. odd) positions must read from one and the same input.
      SDValue Src = M < NumElts ? V1 : V2;
      SDValue &Op = Ops[Elt & 1];
      if (!Op.isUndef() && Op != Src)
        return SDValue();
      Op = Src;

      // Track whether every referenced element lies in the low or in the
      // high half of its own 128-bit lane (in either input).
      const int LaneLo = Lane;
      const int LaneMid = Lane + EltsPerHalfLane;
      const int LaneHi = Lane + EltsPerLane;
      MatchLo &= isUndefOrInRange(M, LaneLo, LaneMid) ||
                 isUndefOrInRange(M, NumElts + LaneLo, NumElts + LaneMid);
      MatchHi &= isUndefOrInRange(M, LaneMid, LaneHi) ||
                 isUndefOrInRange(M, NumElts + LaneMid, NumElts + LaneHi);
      if (!MatchLo && !MatchHi)
        return SDValue();
    }
  }
  assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");

  // Now check that each pair of elts come from the same unpack pair
  // and set the permute mask based on each pair.
  // TODO - Investigate cases where we permute individual elements.
  SmallVector<int, 32> PermuteMask(NumElts, -1);
  for (int Lane = 0; Lane != NumElts; Lane += EltsPerLane) {
    for (int Elt = 0; Elt != EltsPerLane; Elt += 2) {
      const int Even = Mask[Lane + Elt + 0];
      const int Odd = Mask[Lane + Elt + 1];
      const bool HasEven = 0 <= Even;
      const bool HasOdd = 0 <= Odd;
      if (HasEven && HasOdd &&
          (Even % EltsPerHalfLane) != (Odd % EltsPerHalfLane))
        return SDValue();
      if (HasEven)
        PermuteMask[Lane + Elt + 0] = Lane + (2 * (Even % EltsPerHalfLane));
      if (HasOdd)
        PermuteMask[Lane + Elt + 1] = Lane + (2 * (Odd % EltsPerHalfLane)) + 1;
    }
  }

  unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
  SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
  return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
}
/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
/// permuting the elements of the result in place.
///
/// Returns an empty SDValue when the subtarget lacks the byte-rotate feature
/// required for \p VT, when \p Mask crosses 128-bit lanes, when \p Mask does
/// not reference both inputs, when a vector wider than 128 bits already has
/// one input fully in place, or when the in-lane index ranges used from the
/// two inputs overlap so that no single rotation exposes both.
static SDValue lowerShuffleAsByteRotateAndPermute(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
      (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
      (VT.is512BitVector() && !Subtarget.hasBWI()))
    return SDValue();

  // We don't currently support lane crossing permutes.
  if (is128BitLaneCrossingShuffleMask(VT, Mask))
    return SDValue();

  int Scale = VT.getScalarSizeInBits() / 8;
  int NumLanes = VT.getSizeInBits() / 128;
  int NumElts = VT.getVectorNumElements();
  int NumEltsPerLane = NumElts / NumLanes;

  // Determine range of mask elts.
  // Blend1/Blend2 stay true while every element taken from V1/V2 is already
  // at its own position; Range1/Range2 hold the smallest and largest in-lane
  // index referenced in V1/V2. A range nothing was recorded for keeps its
  // initial (INT_MAX, INT_MIN) value.
  bool Blend1 = true;
  bool Blend2 = true;
  std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
  std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
  for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
    for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
      int M = Mask[Lane + Elt];
      if (M < 0)
        continue;
      if (M < NumElts) {
        Blend1 &= (M == (Lane + Elt));
        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
        M = M % NumEltsPerLane;
        Range1.first = std::min(Range1.first, M);
        Range1.second = std::max(Range1.second, M);
      } else {
        M -= NumElts;
        Blend2 &= (M == (Lane + Elt));
        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
        M = M % NumEltsPerLane;
        Range2.first = std::min(Range2.first, M);
        Range2.second = std::max(Range2.second, M);
      }
    }
  }

  // Bail if we don't need both elements.
  // TODO - it might be worth doing this for unary shuffles if the permute
  // can be widened.
  // An untouched range is still (INT_MAX, INT_MIN), which satisfies the two
  // outer bounds checks on its own; requiring first <= second is what detects
  // an input that is not referenced at all. Without it the rotation amount
  // below would be INT_MAX.
  auto IsPopulatedRange = [&](const std::pair<int, int> &Range) {
    return 0 <= Range.first && Range.first <= Range.second &&
           Range.second < NumEltsPerLane;
  };
  if (!IsPopulatedRange(Range1) || !IsPopulatedRange(Range2))
    return SDValue();

  if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
    return SDValue();

  // Rotate the 2 ops so we can access both ranges, then permute the result.
  auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
    // The rotation amount is given in elements; PALIGNR takes bytes.
    SDValue Rotate = DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
                        DAG.getBitcast(ByteVT, Lo),
                        DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
    // Map every original mask index to its position in the rotated vector.
    SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
    for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
      for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
        int M = Mask[Lane + Elt];
        if (M < 0)
          continue;
        if (M < NumElts)
          PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
        else
          PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
      }
    }
    return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
  };

  // Check if the ranges are small enough to rotate from either direction.
  if (Range2.second < Range1.first)
    return RotateAndPermute(V1, V2, Range1.first, 0);
  if (Range1.second < Range2.first)
    return RotateAndPermute(V2, V1, Range2.first, NumElts);
  return SDValue();
}
/// Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends. It always produces a result: when none of the preferred strategies
/// apply, each input is shuffled on its own and the two results are blended.
static SDValue lowerShuffleAsDecomposedShuffleBlend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  // Shuffle the input elements into the desired positions in V1 and V2 and
  // blend them together.
  SmallVector<int, 32> V1Mask(Mask.size(), -1);
  SmallVector<int, 32> V2Mask(Mask.size(), -1);
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  for (int Idx = 0, Size = Mask.size(); Idx < Size; ++Idx) {
    const int M = Mask[Idx];
    if (M < 0)
      continue;
    if (M < Size) {
      V1Mask[Idx] = M;
      BlendMask[Idx] = Idx;
    } else {
      V2Mask[Idx] = M - Size;
      BlendMask[Idx] = Idx + Size;
    }
  }

  // Try to lower with the simpler initial blend/unpack/rotate strategies unless
  // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
  // the shuffle may be able to fold with a load or other benefit. However, when
  // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
  // pre-shuffle first is a better strategy.
  if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
    // Only prefer immediate blends to unpack/rotate.
    SDValue ImmBlendPerm =
        lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true);
    if (ImmBlendPerm)
      return ImmBlendPerm;

    SDValue UnpackPerm =
        lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG);
    if (UnpackPerm)
      return UnpackPerm;

    SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
        DL, VT, V1, V2, Mask, Subtarget, DAG);
    if (RotatePerm)
      return RotatePerm;

    // Unpack/rotate failed - try again with variable blends.
    SDValue VarBlendPerm =
        lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG);
    if (VarBlendPerm)
      return VarBlendPerm;
  }

  SDValue ShuffledV1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
  SDValue ShuffledV2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
  return DAG.getVectorShuffle(VT, DL, ShuffledV1, ShuffledV2, BlendMask);
}
/// Try to lower a vector shuffle as a rotation.
///
/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
///
/// Returns the rotation amount in elements, or -1 if \p Mask is not a
/// rotation. On success \p V1 and \p V2 are overwritten with the low and high
/// inputs of the rotation; on failure they are left unchanged.
static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
  const int NumElts = Mask.size();

  // We need to detect various ways of spelling a rotation:
  // [11, 12, 13, 14, 15, 0, 1, 2]
  // [-1, 12, 13, 14, -1, -1, 1, -1]
  // [-1, -1, -1, -1, -1, -1, 1, 2]
  // [ 3, 4, 5, 6, 7, 8, 9, 10]
  // [-1, 4, 5, 6, -1, -1, 9, -1]
  // [-1, 4, 5, 6, -1, -1, -1, -1]
  int Rotation = 0;
  SDValue Lo, Hi;
  for (int Idx = 0; Idx != NumElts; ++Idx) {
    const int M = Mask[Idx];
    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;

    // Determine where a rotated vector would have started.
    const int StartIdx = Idx - (M % NumElts);
    // The identity rotation isn't interesting, stop.
    if (StartIdx == 0)
      return -1;

    // If we found the tail of a vector the rotation must be the missing
    // front. If we found the head of a vector, it must be how much of the
    // head.
    const bool FoundTail = StartIdx < 0;
    const int Candidate = FoundTail ? -StartIdx : NumElts - StartIdx;

    // Every defined element has to agree on a single rotation amount.
    if (Rotation != 0 && Rotation != Candidate)
      return -1;
    Rotation = Candidate;

    // Compute which value this mask is pointing at.
    SDValue Pointee = M < NumElts ? V1 : V2;

    // Compute which of the two target values this index should be assigned
    // to. This reflects whether the high elements are remaining or the low
    // elements are remaining.
    SDValue &Slot = FoundTail ? Hi : Lo;

    // Either set up this value if we've not encountered it before, or check
    // that it remains consistent. A mismatch may still be a rotation, but one
    // that pulls from the inputs in some unsupported interleaving.
    if (Slot && Slot != Pointee)
      return -1;
    Slot = Pointee;
  }

  // Check that we successfully analyzed the mask, and normalize the results.
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  // If only one side was seen, use it for both.
  if (!Lo)
    Lo = Hi;
  else if (!Hi)
    Hi = Lo;

  V1 = Lo;
  V2 = Hi;

  return Rotation;
}
/// Try to match a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine only
/// decides whether the mask is valid to lower in that form; it does not
/// judge whether lowering as PALIGNR or PSRLDQ/PSLLDQ/POR is profitable.
/// It matches shuffle vectors that look like:
///
///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
///
/// Returns the rotation amount in bytes, or -1 if the mask does not match.
/// \p V1 and \p V2 are updated by matchShuffleAsRotate when it matches.
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
                                    ArrayRef<int> Mask) {
  // Don't accept any shuffles with zero elements.
  for (int M : Mask)
    if (M == SM_SentinelZero)
      return -1;

  // PALIGNR works on 128-bit lanes, so every lane must use the same mask.
  SmallVector<int, 16> LaneMask;
  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask))
    return -1;

  const int EltRotation = matchShuffleAsRotate(V1, V2, LaneMask);
  if (EltRotation <= 0)
    return -1;

  // PALIGNR rotates bytes, so convert the element rotation into a byte
  // rotation using the number of bytes each element occupies in a 16-byte
  // lane.
  const int BytesPerElt = 16 / static_cast<int>(LaneMask.size());
  return EltRotation * BytesPerElt;
}
/// Lower a vector shuffle as a byte rotation, if the mask allows it.
///
/// Uses matchShuffleAsByteRotate to recognise the mask. With SSSE3 the result
/// is a single PALIGNR on the inputs bitcast to a byte vector; otherwise a
/// VSHLDQ/VSRLDQ/OR sequence on v16i8 is emitted. Returns an empty SDValue if
/// the mask is not a byte rotation.
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
                                        SDValue V2, ArrayRef<int> Mask,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

  // The matcher rewrites Lo/Hi into the operands of the rotation.
  SDValue Lo = V1, Hi = V2;
  const int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
  if (ByteRotation <= 0)
    return SDValue();

  // Cast the inputs to i8 vector of correct length to match PALIGNR or
  // PSLLDQ/PSRLDQ.
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
  Lo = DAG.getBitcast(ByteVT, Lo);
  Hi = DAG.getBitcast(ByteVT, Hi);

  // SSSE3 targets can use the palignr instruction.
  if (Subtarget.hasSSSE3()) {
    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
           "512-bit PALIGNR requires BWI instructions");
    SDValue RotateAmt = DAG.getTargetConstant(ByteRotation, DL, MVT::i8);
    SDValue Rotated =
        DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, RotateAmt);
    return DAG.getBitcast(VT, Rotated);
  }

  assert(VT.is128BitVector() &&
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 &&
         "SSE2 rotate lowering only needed for v16i8!");

  // Default SSE2 implementation: shift the low operand left by the bytes
  // that are not rotated out, shift the high operand right by the rotation,
  // and OR the two halves together.
  SDValue LoAmt = DAG.getTargetConstant(16 - ByteRotation, DL, MVT::i8);
  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, LoAmt);
  SDValue HiAmt = DAG.getTargetConstant(ByteRotation, DL, MVT::i8);
  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, HiAmt);
  SDValue Combined = DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift);
  return DAG.getBitcast(VT, Combined);
}
/// Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
///
/// Returns the VALIGN node, or an empty SDValue if the mask is not a rotation.
static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
                                    SDValue V2, ArrayRef<int> Mask,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
         "Only 32-bit and 64-bit elements are supported!");

  // 128/256-bit vectors are only supported with VLX.
  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
         && "VLX required for 128/256-bit vectors");

  // V1/V2 are local copies here; on a match the matcher rewrites them in
  // place into the low/high operands of the rotation.
  const int Rotation = matchShuffleAsRotate(V1, V2, Mask);
  if (Rotation > 0) {
    SDValue RotateAmt = DAG.getTargetConstant(Rotation, DL, MVT::i8);
    return DAG.getNode(X86ISD::VALIGN, DL, VT, V1, V2, RotateAmt);
  }
  return SDValue();
}
/// Try to lower a vector shuffle as a byte shift sequence.
///
/// Matches 128-bit shuffles whose result has zeroable elements at the low
/// and/or high end and, in between, a sequential run of elements taken from a
/// single input. Such a shuffle is emitted as two or three VSHLDQ/VSRLDQ
/// whole-vector byte shifts on v16i8. Returns an empty SDValue if the mask
/// does not have that shape, or if both ends need zeroing and the subtarget
/// has SSSE3.
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
  assert(VT.is128BitVector() && "Only 128-bit vectors supported");

  // We need a shuffle that has zeros at one/both ends and a sequential
  // shuffle from one source within.
  const unsigned ZeroLo = Zeroable.countTrailingOnes();
  const unsigned ZeroHi = Zeroable.countLeadingOnes();
  if (ZeroLo == 0 && ZeroHi == 0)
    return SDValue();

  const unsigned NumElts = Mask.size();
  const unsigned Len = NumElts - (ZeroLo + ZeroHi);
  if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
    return SDValue();

  // Number of bytes per vector element; shifts below are counted in bytes.
  const unsigned Scale = VT.getScalarSizeInBits() / 8;

  // The kept run must come entirely from V1 or entirely from V2.
  ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
  const bool RunInV1 = isUndefOrInRange(StubMask, 0, NumElts);
  if (!RunInV1 && !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
    return SDValue();

  SDValue Res = DAG.getBitcast(
      MVT::v16i8, Mask[ZeroLo] < static_cast<int>(NumElts) ? V1 : V2);

  // If we have PSHUFB and zeros at both ends, don't lower here. Without
  // PSHUFB it's worth avoiding an AND constant mask by performing 3 byte
  // shifts. Shuffle combining can kick in above that.
  // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
  const bool ZeroBothEnds = ZeroLo != 0 && ZeroHi != 0;
  if (ZeroBothEnds && Subtarget.hasSSSE3())
    return SDValue();

  // Apply one whole-vector byte shift of EltAmt elements to Res.
  auto ByteShift = [&](unsigned ShiftOpc, unsigned EltAmt) {
    Res = DAG.getNode(ShiftOpc, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * EltAmt, DL, MVT::i8));
  };
  // Index, within the source vector, of the first / last element of the run.
  auto FirstSrcElt = [&] { return Mask[ZeroLo] % NumElts; };
  auto LastSrcElt = [&] { return Mask[ZeroLo + Len - 1] % NumElts; };

  // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
  // inner sequential set of elements, possibly offset:
  // 01234567 --> zzzzzz01 --> 1zzzzzzz
  // 01234567 --> 4567zzzz --> zzzzz456
  // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
  if (ZeroLo == 0) {
    // Zeros only at the top: push the run to the top, then pull it down.
    ByteShift(X86ISD::VSHLDQ, (NumElts - 1) - LastSrcElt());
    ByteShift(X86ISD::VSRLDQ, ZeroHi);
  } else if (ZeroHi == 0) {
    // Zeros only at the bottom: pull the run to the bottom, then push it up.
    ByteShift(X86ISD::VSRLDQ, FirstSrcElt());
    ByteShift(X86ISD::VSHLDQ, ZeroLo);
  } else {
    // Zeros at both ends: push to the top, pull to the bottom, then push the
    // run up into its final position.
    const unsigned UpShift = (NumElts - 1) - LastSrcElt();
    ByteShift(X86ISD::VSHLDQ, UpShift);
    ByteShift(X86ISD::VSRLDQ, UpShift + FirstSrcElt());
    ByteShift(X86ISD::VSHLDQ, ZeroLo);
  }

  return DAG.getBitcast(VT, Res);
}
/// Try to match a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSLL : (little-endian) left bit shift.
/// [ zz, 0, zz,  2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [  1, zz,  3, zz]
/// [ -1, -1,  7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz,  0,  1,  2,  3,  4,  5,  6]
/// [ zz, zz, -1, -1,  2,  3,  4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1,  1]
/// PSRLDQ : (little-endian) right byte shift
/// [  5, 6,  7, zz, zz, zz, zz, zz]
/// [ -1, 5,  6,  7, zz, zz, zz, zz]
/// [  1, 2, -1, -1, -1, -1, zz, zz]
///
/// \p MaskOffset is added to the expected source indices, which selects the
/// input the mask is matched against. On a match, \p ShiftVT and \p Opcode
/// are set to the type and opcode to shift with, and the shift amount is
/// returned (in bits for VSHLI/VSRLI, in bytes for VSHLDQ/VSRLDQ). Returns
/// -1 when nothing matches.
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                               unsigned ScalarSizeInBits, ArrayRef<int> Mask,
                               int MaskOffset, const APInt &Zeroable,
                               const X86Subtarget &Subtarget) {
  const int Size = Mask.size();
  const unsigned SizeInBits = Size * ScalarSizeInBits;

  // Returns true if, in every group of Scale elements, the Shift elements
  // that the shift would fill in (the low ones for a left shift, the high
  // ones for a right shift) are all zeroable.
  auto ShiftedInAreZeroable = [&](int Shift, int Scale, bool Left) {
    const int FirstShiftedIn = Left ? 0 : (Scale - Shift);
    for (int Base = 0; Base < Size; Base += Scale)
      for (int Off = 0; Off < Shift; ++Off)
        if (!Zeroable[Base + Off + FirstShiftedIn])
          return false;
    return true;
  };

  // Checks that the surviving elements of every group are a sequential (or
  // undef) run from the chosen input. On success records Opcode/ShiftVT and
  // returns the shift amount; returns -1 otherwise.
  auto TryMatch = [&](int Shift, int Scale, bool Left) {
    const unsigned Len = Scale - Shift;
    for (int Base = 0; Base != Size; Base += Scale) {
      const unsigned Pos = Left ? Base + Shift : Base;
      const unsigned Low = Left ? Base : Base + Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }

    // Groups wider than 64 bits can only be shifted with the whole-lane byte
    // shifts, whose amount is expressed in bytes rather than bits.
    const int ShiftEltBits = ScalarSizeInBits * Scale;
    const bool ByteShift = ShiftEltBits > 64;
    const int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

    // We need to round trip through the appropriate type for the shift.
    if (ByteShift) {
      Opcode = Left ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
      ShiftVT = MVT::getVectorVT(MVT::i8, SizeInBits / 8);
    } else {
      Opcode = Left ? X86ISD::VSHLI : X86ISD::VSRLI;
      ShiftVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSizeInBits * Scale),
                                 Size / Scale);
    }
    return ShiftAmt;
  };

  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
  // keep doubling the size of the integer elements up to that. We can
  // then shift the elements of the integer vector by whole multiples of
  // their width within the elements of the larger integer vector. Test each
  // multiple to see if we can find a match with the moved element indices
  // and that the shifted in elements are all zeroable.
  // The 128-bit group width (byte shifts) is not tried for 512-bit vectors
  // without BWI.
  const bool SkipByteShifts = SizeInBits == 512 && !Subtarget.hasBWI();
  const unsigned MaxWidth = SkipByteShifts ? 64 : 128;
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2) {
    for (int Shift = 1; Shift != Scale; ++Shift) {
      for (bool Left : {true, false}) {
        if (!ShiftedInAreZeroable(Shift, Scale, Left))
          continue;
        const int ShiftAmt = TryMatch(Shift, Scale, Left);
        if (ShiftAmt > 0)
          return ShiftAmt;
      }
    }
  }

  // no match
  return -1;
}
/// Lower a vector shuffle as a logical shift of one of its inputs.
///
/// Uses matchShuffleAsShift, first against \p V1 and then against \p V2, and
/// emits the matched shift on the input bitcast to the matched shift type.
/// Returns an empty SDValue if neither input matches.
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
                                   SDValue V2, ArrayRef<int> Mask,
                                   const APInt &Zeroable,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  const unsigned EltBits = VT.getScalarSizeInBits();
  MVT ShiftVT;
  unsigned Opcode;

  // Try to match shuffle against V1 shift.
  SDValue Src = V1;
  int ShiftAmt =
      matchShuffleAsShift(ShiftVT, Opcode, EltBits, Mask, 0, Zeroable,
                          Subtarget);
  if (ShiftAmt < 0) {
    // If V1 failed, try to match shuffle against V2 shift. V2's elements are
    // numbered from Size upwards in the mask.
    Src = V2;
    ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, EltBits, Mask, Size,
                                   Zeroable, Subtarget);
    if (ShiftAmt < 0)
      return SDValue();
  }

  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");

  // Shift in the matched type, then cast the result back to the shuffle type.
  SDValue Casted = DAG.getBitcast(ShiftVT, Src);
  SDValue Amt = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
  SDValue Shifted = DAG.getNode(Opcode, DL, ShiftVT, Casted, Amt);
  return DAG.getBitcast(VT, Shifted);
}
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
//
// On a match, returns true, stores the source vector in V1 and writes the
// extraction length and start index, converted to bits and masked to 6 bits,
// to BitLen and BitIdx. Returns false, leaving the outputs untouched, when
// the mask does not have this form.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
                                ArrayRef<int> Mask, uint64_t &BitLen,
                                uint64_t &BitIdx, const APInt &Zeroable) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

  // Upper half must be undefined.
  if (!isUndefUpperHalf(Mask))
    return false;

  // Determine the extraction length from the part of the
  // lower half that isn't zeroable: trim zeroable elements off its top.
  int Len = HalfSize;
  while (Len > 0 && Zeroable[Len - 1])
    --Len;
  assert(Len > 0 && "Zeroable shuffle mask");

  // Attempt to match first Len sequential elements from the lower half.
  SDValue Src;
  int Idx = -1;
  for (int Pos = 0; Pos < Len; ++Pos) {
    const int MaskElt = Mask[Pos];
    if (MaskElt == SM_SentinelUndef)
      continue;

    SDValue &Operand = (MaskElt < Size ? V1 : V2);
    const int SrcElt = MaskElt % Size;

    // The extracted elements must start at a valid index and all mask
    // elements must be in the lower half.
    if (Pos > SrcElt || SrcElt >= HalfSize)
      return false;

    // Once a source and start index are known, every later element has to
    // agree with them.
    const int StartIdx = SrcElt - Pos;
    if (Idx >= 0 && (Src != Operand || Idx != StartIdx))
      return false;
    Src = Operand;
    Idx = StartIdx;
  }

  if (!Src || Idx < 0)
    return false;

  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
  const auto EltBits = VT.getScalarSizeInBits();
  BitLen = (Len * EltBits) & 0x3f;
  BitIdx = (Idx * EltBits) & 0x3f;
  V1 = Src;
  return true;
}
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask, uint64_t &BitLen,
uint64_t &BitIdx) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
// Upper half must be undefined.
if (!isUndefUpperHalf(Mask))
return false;
for (int Idx = 0; Idx != HalfSize; ++Idx) {
SDValue Base;
// Attempt to match first source from mask before insertion point.
if (isUndefInRange(Mask, 0, Idx)) {
/* EMPTY */
} else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
Base = V1;
} else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
Base = V2;
} else {
continue;
}
// Extend the extraction length looking to match both the insertion of
// the second source and the remaining elements of the first.
for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
SDValue Insert;
int Len = Hi - Idx;
// Match insertion.
if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
Insert = V1;
} else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
Insert = V2;
} else {
continue;
}
// Match the remaining elements of the lower half.
if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
/* EMPTY */
} else if ((!Base || (Base == V1)) &&
isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
Base = V1;
} else if ((!Base || (Base == V2)) &&
isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
Size + Hi)) {
Base = V2;
} else {
continue