| //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops |
| // and generates target-independent LLVM-IR. |
| // The vectorizer uses the TargetTransformInfo analysis to estimate the costs |
| // of instructions in order to estimate the profitability of vectorization. |
| // |
| // The loop vectorizer combines consecutive loop iterations into a single |
| // 'wide' iteration. After this transformation the index is incremented |
| // by the SIMD vector width, and not by one. |
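| //
| // For illustration only (a sketch in C-like pseudocode; the pass itself
| // operates on LLVM-IR): a scalar loop such as
| //
| //   for (int i = 0; i < n; ++i)
| //     C[i] = A[i] + B[i];
| //
| // is conceptually turned, for a vectorization factor of 4, into
| //
| //   for (int i = 0; i + 3 < n; i += 4)
| //     C[i..i+3] = A[i..i+3] + B[i..i+3];  // one 'wide' SIMD iteration
| //   // leftover iterations run in a scalar epilogue loop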
| // |
| // This pass has four parts: |
| // 1. The main loop pass that drives the different parts. |
| // 2. LoopVectorizationLegality - A unit that checks for the legality |
| // of the vectorization. |
| // 3. InnerLoopVectorizer - A unit that performs the actual |
| // widening of instructions. |
| // 4. LoopVectorizationCostModel - A unit that checks for the profitability |
| // of vectorization. It decides on the optimal vector width, which |
| // can be one, if vectorization is not profitable. |
| // |
| // There is a development effort going on to migrate loop vectorizer to the |
| // VPlan infrastructure and to introduce outer loop vectorization support (see |
| // docs/Proposal/VectorizationPlan.rst and |
| // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this |
| // purpose, we temporarily introduced the VPlan-native vectorization path: an |
| // alternative vectorization path that is natively implemented on top of the |
| // VPlan infrastructure. See EnableVPlanNativePath for enabling. |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // The reduction-variable vectorization is based on the paper: |
| // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. |
| // |
| // Variable uniformity checks are inspired by: |
| // Karrenberg, R. and Hack, S. Whole Function Vectorization. |
| // |
| // The interleaved access vectorization is based on the paper: |
| // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved |
| // Data for SIMD |
| // |
| // Other ideas/concepts are from: |
| // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. |
| // |
| // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of |
| // Vectorizing Compilers. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "llvm/Transforms/Vectorize/LoopVectorize.h" |
| #include "LoopVectorizationPlanner.h" |
| #include "VPRecipeBuilder.h" |
| #include "VPlan.h" |
| #include "VPlanHCFGBuilder.h" |
| #include "VPlanPredicator.h" |
| #include "VPlanTransforms.h" |
| #include "llvm/ADT/APInt.h" |
| #include "llvm/ADT/ArrayRef.h" |
| #include "llvm/ADT/DenseMap.h" |
| #include "llvm/ADT/DenseMapInfo.h" |
| #include "llvm/ADT/Hashing.h" |
| #include "llvm/ADT/MapVector.h" |
| #include "llvm/ADT/None.h" |
| #include "llvm/ADT/Optional.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/SetVector.h" |
| #include "llvm/ADT/SmallPtrSet.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/ADT/Twine.h" |
| #include "llvm/ADT/iterator_range.h" |
| #include "llvm/Analysis/AssumptionCache.h" |
| #include "llvm/Analysis/BasicAliasAnalysis.h" |
| #include "llvm/Analysis/BlockFrequencyInfo.h" |
| #include "llvm/Analysis/CFG.h" |
| #include "llvm/Analysis/CodeMetrics.h" |
| #include "llvm/Analysis/DemandedBits.h" |
| #include "llvm/Analysis/GlobalsModRef.h" |
| #include "llvm/Analysis/LoopAccessAnalysis.h" |
| #include "llvm/Analysis/LoopAnalysisManager.h" |
| #include "llvm/Analysis/LoopInfo.h" |
| #include "llvm/Analysis/LoopIterator.h" |
| #include "llvm/Analysis/MemorySSA.h" |
| #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| #include "llvm/Analysis/ProfileSummaryInfo.h" |
| #include "llvm/Analysis/ScalarEvolution.h" |
| #include "llvm/Analysis/ScalarEvolutionExpander.h" |
| #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| #include "llvm/Analysis/TargetLibraryInfo.h" |
| #include "llvm/Analysis/TargetTransformInfo.h" |
| #include "llvm/Analysis/VectorUtils.h" |
| #include "llvm/IR/Attributes.h" |
| #include "llvm/IR/BasicBlock.h" |
| #include "llvm/IR/CFG.h" |
| #include "llvm/IR/Constant.h" |
| #include "llvm/IR/Constants.h" |
| #include "llvm/IR/DataLayout.h" |
| #include "llvm/IR/DebugInfoMetadata.h" |
| #include "llvm/IR/DebugLoc.h" |
| #include "llvm/IR/DerivedTypes.h" |
| #include "llvm/IR/DiagnosticInfo.h" |
| #include "llvm/IR/Dominators.h" |
| #include "llvm/IR/Function.h" |
| #include "llvm/IR/IRBuilder.h" |
| #include "llvm/IR/InstrTypes.h" |
| #include "llvm/IR/Instruction.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/IntrinsicInst.h" |
| #include "llvm/IR/Intrinsics.h" |
| #include "llvm/IR/LLVMContext.h" |
| #include "llvm/IR/Metadata.h" |
| #include "llvm/IR/Module.h" |
| #include "llvm/IR/Operator.h" |
| #include "llvm/IR/Type.h" |
| #include "llvm/IR/Use.h" |
| #include "llvm/IR/User.h" |
| #include "llvm/IR/Value.h" |
| #include "llvm/IR/ValueHandle.h" |
| #include "llvm/IR/Verifier.h" |
| #include "llvm/InitializePasses.h" |
| #include "llvm/Pass.h" |
| #include "llvm/Support/Casting.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Compiler.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/ErrorHandling.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| #include "llvm/Transforms/Utils/LoopSimplify.h" |
| #include "llvm/Transforms/Utils/LoopUtils.h" |
| #include "llvm/Transforms/Utils/LoopVersioning.h" |
| #include "llvm/Transforms/Utils/SizeOpts.h" |
| #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
| #include <algorithm> |
| #include <cassert> |
| #include <cstdint> |
| #include <cstdlib> |
| #include <functional> |
| #include <iterator> |
| #include <limits> |
| #include <memory> |
| #include <string> |
| #include <tuple> |
| #include <utility> |
| |
| using namespace llvm; |
| |
| #define LV_NAME "loop-vectorize" |
| #define DEBUG_TYPE LV_NAME |
| |
| /// @{ |
| /// Metadata attribute names |
| static const char *const LLVMLoopVectorizeFollowupAll = |
| "llvm.loop.vectorize.followup_all"; |
| static const char *const LLVMLoopVectorizeFollowupVectorized = |
| "llvm.loop.vectorize.followup_vectorized"; |
| static const char *const LLVMLoopVectorizeFollowupEpilogue = |
| "llvm.loop.vectorize.followup_epilogue"; |
| /// @} |
| |
| STATISTIC(LoopsVectorized, "Number of loops vectorized"); |
| STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); |
| |
| /// Loops with a known constant trip count below this number are vectorized only |
| /// if no scalar iteration overheads are incurred. |
| static cl::opt<unsigned> TinyTripCountVectorThreshold( |
| "vectorizer-min-trip-count", cl::init(16), cl::Hidden, |
| cl::desc("Loops with a constant trip count that is smaller than this " |
| "value are vectorized only if no scalar iteration overheads " |
| "are incurred.")); |
| |
| // Indicates that an epilogue is undesired and predication is preferred. |
| // This means that the vectorizer will try to fold the loop-tail (epilogue) |
| // into the loop and predicate the loop body accordingly. |
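| // For example (illustrative numbers only): with a vector width of 4 and a
| // trip count of 10, folding the tail executes 3 masked vector iterations,
| // with the lanes beyond the trip count disabled by the predicate, instead of
| // 2 unmasked vector iterations followed by a 2-iteration scalar epilogue.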
| static cl::opt<bool> PreferPredicateOverEpilog( |
| "prefer-predicate-over-epilog", cl::init(false), cl::Hidden, |
| cl::desc("Indicate that an epilogue is undesired, predication should be " |
| "used instead.")); |
| |
| static cl::opt<bool> MaximizeBandwidth( |
| "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, |
| cl::desc("Maximize bandwidth when selecting vectorization factor which " |
| "will be determined by the smallest type in loop.")); |
| |
| static cl::opt<bool> EnableInterleavedMemAccesses( |
| "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, |
| cl::desc("Enable vectorization on interleaved memory accesses in a loop")); |
| |
| /// An interleave-group may need masking if it resides in a block that needs |
| /// predication, or in order to mask away gaps. |
| static cl::opt<bool> EnableMaskedInterleavedMemAccesses( |
| "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, |
| cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); |
| |
| static cl::opt<unsigned> TinyTripCountInterleaveThreshold( |
| "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, |
| cl::desc("We don't interleave loops with an estimated constant trip count " |
| "below this number")); |
| |
| static cl::opt<unsigned> ForceTargetNumScalarRegs( |
| "force-target-num-scalar-regs", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's number of scalar registers.")); |
| |
| static cl::opt<unsigned> ForceTargetNumVectorRegs( |
| "force-target-num-vector-regs", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's number of vector registers.")); |
| |
| static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( |
| "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's max interleave factor for " |
| "scalar loops.")); |
| |
| static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( |
| "force-target-max-vector-interleave", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's max interleave factor for " |
| "vectorized loops.")); |
| |
| static cl::opt<unsigned> ForceTargetInstructionCost( |
| "force-target-instruction-cost", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's expected cost for " |
| "an instruction to a single constant value. Mostly " |
| "useful for getting consistent testing.")); |
| |
| static cl::opt<unsigned> SmallLoopCost( |
| "small-loop-cost", cl::init(20), cl::Hidden, |
| cl::desc( |
| "The cost of a loop that is considered 'small' by the interleaver.")); |
| |
| static cl::opt<bool> LoopVectorizeWithBlockFrequency( |
| "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, |
| cl::desc("Enable the use of the block frequency analysis to access PGO " |
| "heuristics minimizing code growth in cold regions and being more " |
| "aggressive in hot regions.")); |
| |
| // Runtime interleave loops for load/store throughput. |
| static cl::opt<bool> EnableLoadStoreRuntimeInterleave( |
| "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, |
| cl::desc( |
| "Enable runtime interleaving until load/store ports are saturated")); |
| |
| /// The number of stores in a loop that are allowed to need predication. |
| static cl::opt<unsigned> NumberOfStoresToPredicate( |
| "vectorize-num-stores-pred", cl::init(1), cl::Hidden, |
| cl::desc("Max number of stores to be predicated behind an if.")); |
| |
| static cl::opt<bool> EnableIndVarRegisterHeur( |
| "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, |
| cl::desc("Count the induction variable only once when interleaving")); |
| |
| static cl::opt<bool> EnableCondStoresVectorization( |
| "enable-cond-stores-vec", cl::init(true), cl::Hidden, |
| cl::desc("Enable if predication of stores during vectorization.")); |
| |
| static cl::opt<unsigned> MaxNestedScalarReductionIC( |
| "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, |
| cl::desc("The maximum interleave count to use when interleaving a scalar " |
| "reduction in a nested loop.")); |
| |
| cl::opt<bool> EnableVPlanNativePath( |
| "enable-vplan-native-path", cl::init(false), cl::Hidden, |
| cl::desc("Enable VPlan-native vectorization path with " |
| "support for outer loop vectorization.")); |
| |
| // FIXME: Remove this switch once we have divergence analysis. Currently we |
| // assume divergent non-backedge branches when this switch is true. |
| cl::opt<bool> EnableVPlanPredication( |
| "enable-vplan-predication", cl::init(false), cl::Hidden, |
| cl::desc("Enable VPlan-native vectorization path predicator with " |
| "support for outer loop vectorization.")); |
| |
| // This flag enables the stress testing of the VPlan H-CFG construction in the |
| // VPlan-native vectorization path. It must be used in conjunction with |
| // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the |
| // verification of the H-CFGs built. |
| static cl::opt<bool> VPlanBuildStressTest( |
| "vplan-build-stress-test", cl::init(false), cl::Hidden, |
| cl::desc( |
| "Build VPlan for every supported loop nest in the function and bail " |
| "out right after the build (stress test the VPlan H-CFG construction " |
| "in the VPlan-native vectorization path).")); |
| |
| cl::opt<bool> llvm::EnableLoopInterleaving( |
| "interleave-loops", cl::init(true), cl::Hidden, |
| cl::desc("Enable loop interleaving in Loop vectorization passes")); |
| cl::opt<bool> llvm::EnableLoopVectorization( |
| "vectorize-loops", cl::init(true), cl::Hidden, |
| cl::desc("Run the Loop vectorization passes")); |
| |
| /// A helper function for converting Scalar types to vector types. |
| /// If the incoming type is void, we return void. If the VF is 1, we return |
| /// the scalar type. |
| static Type *ToVectorTy(Type *Scalar, unsigned VF) { |
| if (Scalar->isVoidTy() || VF == 1) |
| return Scalar; |
| return VectorType::get(Scalar, VF); |
| } |
| |
| /// A helper function that returns the type of the loaded or stored value. |
| static Type *getMemInstValueType(Value *I) { |
| assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && |
| "Expected Load or Store instruction"); |
| if (auto *LI = dyn_cast<LoadInst>(I)) |
| return LI->getType(); |
| return cast<StoreInst>(I)->getValueOperand()->getType(); |
| } |
| |
| /// A helper function that returns true if the given type is irregular. The |
| /// type is irregular if its allocated size doesn't equal the store size of an |
| /// element of the corresponding vector type at the given vectorization factor. |
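| ///
| /// For example, assuming a typical x86 data layout, x86_fp80 stores 80 bits
| /// but is allocated as 96 or 128 bits, so it is irregular even at VF = 1.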
| static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { |
| // Determine if an array of VF elements of type Ty is "bitcast compatible" |
| // with a <VF x Ty> vector. |
| if (VF > 1) { |
| auto *VectorTy = VectorType::get(Ty, VF); |
| return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy); |
| } |
| |
| // If the vectorization factor is one, we just check if an array of type Ty |
| // requires padding between elements. |
| return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); |
| } |
| |
| /// A helper function that returns the reciprocal of the block probability of |
| /// predicated blocks. If we return X, we are assuming the predicated block |
| /// will execute once for every X iterations of the loop header. |
| /// |
| /// TODO: We should use actual block probability here, if available. Currently, |
| /// we always assume predicated blocks have a 50% chance of executing. |
| static unsigned getReciprocalPredBlockProb() { return 2; } |
| |
| /// A helper function that adds a 'fast' flag to floating-point operations. |
| static Value *addFastMathFlag(Value *V) { |
| if (isa<FPMathOperator>(V)) |
| cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast()); |
| return V; |
| } |
| |
| static Value *addFastMathFlag(Value *V, FastMathFlags FMF) { |
| if (isa<FPMathOperator>(V)) |
| cast<Instruction>(V)->setFastMathFlags(FMF); |
| return V; |
| } |
| |
| /// A helper function that returns an integer or floating-point constant with |
| /// value C. |
| static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { |
| return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) |
| : ConstantFP::get(Ty, C); |
| } |
| |
| /// Returns "best known" trip count for the specified loop \p L as defined by |
| /// the following procedure: |
| /// 1) Returns exact trip count if it is known. |
| /// 2) Returns expected trip count according to profile data if any. |
| /// 3) Returns upper bound estimate if it is known. |
| /// 4) Returns None if all of the above failed. |
| static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { |
| // Check if exact trip count is known. |
| if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) |
| return ExpectedTC; |
| |
| // Check if there is an expected trip count available from profile data. |
| if (LoopVectorizeWithBlockFrequency) |
| if (auto EstimatedTC = getLoopEstimatedTripCount(L)) |
| return EstimatedTC; |
| |
| // Check if upper bound estimate is known. |
| if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) |
| return ExpectedTC; |
| |
| return None; |
| } |
| |
| namespace llvm { |
| |
| /// InnerLoopVectorizer vectorizes loops which contain only one basic |
| /// block to a specified vectorization factor (VF). |
| /// This class performs the widening of scalars into vectors, or into multiple |
| /// scalar copies. This class also implements the following features: |
| /// * It inserts an epilogue loop for handling loops that don't have iteration |
| /// counts that are known to be a multiple of the vectorization factor. |
| /// * It handles the code generation for reduction variables. |
| /// * Scalarization (implementation using scalars) of un-vectorizable |
| /// instructions. |
| /// InnerLoopVectorizer does not perform any vectorization-legality |
| /// checks, and relies on the caller to check for the different legality |
| /// aspects. The InnerLoopVectorizer relies on the |
| /// LoopVectorizationLegality class to provide information about the induction |
| /// and reduction variables that were found for a given vectorization factor. |
| class InnerLoopVectorizer { |
| public: |
| InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
| LoopInfo *LI, DominatorTree *DT, |
| const TargetLibraryInfo *TLI, |
| const TargetTransformInfo *TTI, AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, unsigned VecWidth, |
| unsigned UnrollFactor, LoopVectorizationLegality *LVL, |
| LoopVectorizationCostModel *CM) |
| : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), |
| AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), |
| Builder(PSE.getSE()->getContext()), |
| VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {} |
| virtual ~InnerLoopVectorizer() = default; |
| |
| /// Create a new empty loop. Unlink the old loop and connect the new one. |
| /// Return the pre-header block of the new loop. |
| BasicBlock *createVectorizedLoopSkeleton(); |
| |
| /// Widen a single instruction within the innermost loop. |
| void widenInstruction(Instruction &I); |
| |
| /// Fix the vectorized code, taking care of header PHIs, live-outs, and more. |
| void fixVectorizedLoop(); |
| |
| // Return true if any runtime check is added. |
| bool areSafetyChecksAdded() { return AddedSafetyChecks; } |
| |
| /// A type for vectorized values in the new loop. Each value from the |
| /// original loop, when vectorized, is represented by UF vector values in the |
| /// new unrolled loop, where UF is the unroll factor. |
| using VectorParts = SmallVector<Value *, 2>; |
| |
| /// Vectorize a single GetElementPtrInst based on information gathered and |
| /// decisions taken during planning. |
| void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF, |
| bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant); |
| |
| /// Vectorize a single PHINode in a block. This method handles the induction |
| /// variable canonicalization. It supports both VF = 1 for unrolled loops and |
| /// arbitrary length vectors. |
| void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); |
| |
| /// A helper function to scalarize a single Instruction in the innermost loop. |
| /// Generates a sequence of scalar instances for each lane between \p MinLane |
| /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, |
| /// inclusive. |
| void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance, |
| bool IfPredicateInstr); |
| |
| /// Widen an integer or floating-point induction variable \p IV. If \p Trunc |
| /// is provided, the integer induction variable will first be truncated to |
| /// the corresponding type. |
| void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); |
| |
| /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a |
| /// vector or scalar value on-demand if one is not yet available. When |
| /// vectorizing a loop, we visit the definition of an instruction before its |
| /// uses. When visiting the definition, we either vectorize or scalarize the |
| /// instruction, creating an entry for it in the corresponding map. (In some |
| /// cases, such as induction variables, we will create both vector and scalar |
| /// entries.) Then, as we encounter uses of the definition, we derive values |
| /// for each scalar or vector use unless such a value is already available. |
| /// For example, if we scalarize a definition and one of its uses is vector, |
| /// we build the required vector on-demand with an insertelement sequence |
| /// when visiting the use. Otherwise, if the use is scalar, we can use the |
| /// existing scalar definition. |
| /// |
| /// Return a value in the new loop corresponding to \p V from the original |
| /// loop at unroll index \p Part. If the value has already been vectorized, |
| /// the corresponding vector entry in VectorLoopValueMap is returned. If, |
| /// however, the value has a scalar entry in VectorLoopValueMap, we construct |
| /// a new vector value on-demand by inserting the scalar values into a vector |
| /// with an insertelement sequence. If the value has been neither vectorized |
| /// nor scalarized, it must be loop invariant, so we simply broadcast the |
| /// value into a vector. |
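| ///
| /// As a hypothetical illustration: if %a = add i32 %x, 1 was scalarized with
| /// UF = 1 and VF = 4, a later vector use of %a is satisfied by packing the
| /// four scalar copies into a <4 x i32> value with an insertelement sequence;
| /// the packed value is then cached in VectorLoopValueMap for further uses.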
| Value *getOrCreateVectorValue(Value *V, unsigned Part); |
| |
| /// Return a value in the new loop corresponding to \p V from the original |
| /// loop at unroll and vector indices \p Instance. If the value has been |
| /// vectorized but not scalarized, the necessary extractelement instruction |
| /// will be generated. |
| Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance); |
| |
| /// Construct the vector value of a scalarized value \p V one lane at a time. |
| void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); |
| |
| /// Try to vectorize the interleaved access group that \p Instr belongs to |
| /// with the base address given in \p Addr, optionally masking the vector |
| /// operations if \p BlockInMask is non-null. Use \p State to translate given |
| /// VPValues to IR values in the vectorized loop. |
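| ///
| /// For illustration (a sketch, not a specific test case): two strided loads
| /// of A[2*i] and A[2*i+1] forming a factor-2 interleave group are emitted as
| /// a single wide load of 2*VF elements followed by shufflevectors that
| /// de-interleave the even and odd lanes.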
| void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State, |
| VPValue *Addr, VPValue *BlockInMask = nullptr); |
| |
| /// Vectorize Load and Store instructions with the base address given in \p |
| /// Addr, optionally masking the vector operations if \p BlockInMask is |
| /// non-null. Use \p State to translate given VPValues to IR values in the |
| /// vectorized loop. |
| void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, |
| VPValue *Addr, |
| VPValue *BlockInMask = nullptr); |
| |
| /// Set the debug location in the builder using the debug location in |
| /// the instruction. |
| void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); |
| |
| /// Fix the non-induction PHIs in the OrigPHIsToFix vector. |
| void fixNonInductionPHIs(void); |
| |
| protected: |
| friend class LoopVectorizationPlanner; |
| |
| /// A small list of PHINodes. |
| using PhiVector = SmallVector<PHINode *, 4>; |
| |
| /// A type for scalarized values in the new loop. Each value from the |
| /// original loop, when scalarized, is represented by UF x VF scalar values |
| /// in the new unrolled loop, where UF is the unroll factor and VF is the |
| /// vectorization factor. |
| using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; |
| |
| /// Set up the values of the IVs correctly when exiting the vector loop. |
| void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, |
| Value *CountRoundDown, Value *EndValue, |
| BasicBlock *MiddleBlock); |
| |
| /// Create a new induction variable inside L. |
| PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, |
| Value *Step, Instruction *DL); |
| |
| /// Handle all cross-iteration phis in the header. |
| void fixCrossIterationPHIs(); |
| |
| /// Fix a first-order recurrence. This is the second phase of vectorizing |
| /// this phi node. |
| void fixFirstOrderRecurrence(PHINode *Phi); |
| |
| /// Fix a reduction cross-iteration phi. This is the second phase of |
| /// vectorizing this phi node. |
| void fixReduction(PHINode *Phi); |
| |
| /// Clear NSW/NUW flags from reduction instructions if necessary. |
| void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); |
| |
| /// The loop exit block may have single-value PHI nodes with some |
| /// incoming value. While vectorizing we only handle real values |
| /// that were defined inside the loop, and we should have one value for |
| /// each predecessor of its parent basic block. See PR14725. |
| void fixLCSSAPHIs(); |
| |
| /// Iteratively sink the scalarized operands of a predicated instruction into |
| /// the block that was created for it. |
| void sinkScalarOperands(Instruction *PredInst); |
| |
| /// Shrinks vector element sizes to the smallest bitwidth they can be legally |
| /// represented as. |
| void truncateToMinimalBitwidths(); |
| |
| /// Create a broadcast instruction. This method generates a broadcast |
| /// instruction (shuffle) for loop invariant values and for the induction |
| /// value. If this is the induction variable then we extend it to N, N+1, ... |
| /// This is needed because each iteration in the loop corresponds to a SIMD |
| /// element. |
| virtual Value *getBroadcastInstrs(Value *V); |
| |
| /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) |
| /// to each vector element of Val. The sequence starts at StartIndex. |
| /// \p Opcode is relevant for FP induction variable. |
| virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, |
| Instruction::BinaryOps Opcode = |
| Instruction::BinaryOpsEnd); |
| |
| /// Compute scalar induction steps. \p ScalarIV is the scalar induction |
| /// variable on which to base the steps, \p Step is the size of the step, and |
| /// \p EntryVal is the value from the original loop that maps to the steps. |
| /// Note that \p EntryVal doesn't have to be an induction variable - it |
| /// can also be a truncate instruction. |
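| ///
| /// For example (illustrative values only): with UF = 1 and VF = 4, the scalar
| /// steps for one unrolled part are ScalarIV + 0 * Step, ScalarIV + 1 * Step,
| /// ScalarIV + 2 * Step and ScalarIV + 3 * Step, i.e. one value per lane.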
| void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, |
| const InductionDescriptor &ID); |
| |
| /// Create a vector induction phi node based on an existing scalar one. \p |
| /// EntryVal is the value from the original loop that maps to the vector phi |
| /// node, and \p Step is the loop-invariant step. If \p EntryVal is a |
| /// truncate instruction, instead of widening the original IV, we widen a |
| /// version of the IV truncated to \p EntryVal's type. |
| void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, |
| Value *Step, Instruction *EntryVal); |
| |
| /// Returns true if an instruction \p I should be scalarized instead of |
| /// vectorized for the chosen vectorization factor. |
| bool shouldScalarizeInstruction(Instruction *I) const; |
| |
| /// Returns true if we should generate a scalar version of \p IV. |
| bool needsScalarInduction(Instruction *IV) const; |
| |
| /// If there is a cast involved in the induction variable \p ID, which should |
| /// be ignored in the vectorized loop body, this function records the |
| /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the |
| /// cast. We have already proved that the casted Phi is equal to the uncasted |
| /// Phi in the vectorized loop (under a runtime guard), and therefore |
| /// there is no need to vectorize the cast - the same value can be used in the |
| /// vector loop for both the Phi and the cast. |
| /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified. |
| /// Otherwise, \p VectorLoopValue is a widened/vectorized value. |
| /// |
| /// \p EntryVal is the value from the original loop that maps to the vector |
| /// phi node and is used to distinguish what is the IV currently being |
| /// processed - original one (if \p EntryVal is a phi corresponding to the |
| /// original IV) or the "newly-created" one based on the proof mentioned above |
| /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the |
| /// latter case \p EntryVal is a TruncInst and we must not record anything for |
| /// that IV, but it's error-prone to expect callers of this routine to care |
| /// about that, hence this explicit parameter. |
| void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID, |
| const Instruction *EntryVal, |
| Value *VectorLoopValue, |
| unsigned Part, |
| unsigned Lane = UINT_MAX); |
| |
| /// Generate a shuffle sequence that will reverse the vector Vec. |
| virtual Value *reverseVector(Value *Vec); |
| |
| /// Returns (and creates if needed) the original loop trip count. |
| Value *getOrCreateTripCount(Loop *NewLoop); |
| |
| /// Returns (and creates if needed) the trip count of the widened loop. |
| Value *getOrCreateVectorTripCount(Loop *NewLoop); |
| |
| /// Returns a bitcasted value to the requested vector type. |
| /// Also handles bitcasts of vector<float> <-> vector<pointer> types. |
| Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, |
| const DataLayout &DL); |
| |
| /// Emit a bypass check to see if the vector trip count is zero, including if |
| /// it overflows. |
| void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); |
| |
| /// Emit a bypass check to see if all of the SCEV assumptions we've |
| /// had to make are correct. |
| void emitSCEVChecks(Loop *L, BasicBlock *Bypass); |
| |
| /// Emit bypass checks to check any memory assumptions we may have made. |
| void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); |
| |
| /// Compute the transformed value of Index at offset StartValue using step |
| /// StepValue. |
| /// For integer induction, returns StartValue + Index * StepValue. |
| /// For pointer induction, returns StartValue[Index * StepValue]. |
| /// FIXME: The newly created binary instructions should contain nsw/nuw |
| /// flags, which can be found from the original scalar operations. |
| Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, |
| const DataLayout &DL, |
| const InductionDescriptor &ID) const; |
| |
| /// Add additional metadata to \p To that was not present on \p Orig. |
| /// |
| /// Currently this is used to add the noalias annotations based on the |
| /// inserted memchecks. Use this for instructions that are *cloned* into the |
| /// vector loop. |
| void addNewMetadata(Instruction *To, const Instruction *Orig); |
| |
| /// Add metadata from one instruction to another. |
| /// |
| /// This includes both the original MDs from \p From and additional ones (\see |
| /// addNewMetadata). Use this for *newly created* instructions in the vector |
| /// loop. |
| void addMetadata(Instruction *To, Instruction *From); |
| |
| /// Similar to the previous function but it adds the metadata to a |
| /// vector of instructions. |
| void addMetadata(ArrayRef<Value *> To, Instruction *From); |
| |
| /// The original loop. |
| Loop *OrigLoop; |
| |
| /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies |
| /// dynamic knowledge to simplify SCEV expressions and converts them to a |
| /// more usable form. |
| PredicatedScalarEvolution &PSE; |
| |
| /// Loop Info. |
| LoopInfo *LI; |
| |
| /// Dominator Tree. |
| DominatorTree *DT; |
| |
| /// Alias Analysis. |
| AliasAnalysis *AA; |
| |
| /// Target Library Info. |
| const TargetLibraryInfo *TLI; |
| |
| /// Target Transform Info. |
| const TargetTransformInfo *TTI; |
| |
| /// Assumption Cache. |
| AssumptionCache *AC; |
| |
| /// Interface to emit optimization remarks. |
| OptimizationRemarkEmitter *ORE; |
| |
| /// LoopVersioning. It's only set up (non-null) if memchecks were |
| /// used. |
| /// |
| /// This is currently only used to add no-alias metadata based on the |
| /// memchecks. The actual versioning is performed manually. |
| std::unique_ptr<LoopVersioning> LVer; |
| |
| /// The vectorization SIMD factor to use. Each vector will have this many |
| /// vector elements. |
| unsigned VF; |
| |
| /// The vectorization unroll factor to use. Each scalar is vectorized to this |
| /// many different vector instructions. |
| unsigned UF; |
| |
| /// The builder that we use |
| IRBuilder<> Builder; |
| |
| // --- Vectorization state --- |
| |
| /// The vector-loop preheader. |
| BasicBlock *LoopVectorPreHeader; |
| |
| /// The scalar-loop preheader. |
| BasicBlock *LoopScalarPreHeader; |
| |
| /// Middle Block between the vector and the scalar. |
| BasicBlock *LoopMiddleBlock; |
| |
| /// The ExitBlock of the scalar loop. |
| BasicBlock *LoopExitBlock; |
| |
| /// The vector loop body. |
| BasicBlock *LoopVectorBody; |
| |
| /// The scalar loop body. |
| BasicBlock *LoopScalarBody; |
| |
| /// A list of all bypass blocks. The first block is the entry of the loop. |
| SmallVector<BasicBlock *, 4> LoopBypassBlocks; |
| |
| /// The new Induction variable which was added to the new block. |
| PHINode *Induction = nullptr; |
| |
| /// The induction variable of the old basic block. |
| PHINode *OldInduction = nullptr; |
| |
| /// Maps values from the original loop to their corresponding values in the |
| /// vectorized loop. A key value can map to either vector values, scalar |
| /// values or both kinds of values, depending on whether the key was |
| /// vectorized and scalarized. |
| VectorizerValueMap VectorLoopValueMap; |
| |
| /// Store instructions that were predicated. |
| SmallVector<Instruction *, 4> PredicatedInstructions; |
| |
| /// Trip count of the original loop. |
| Value *TripCount = nullptr; |
| |
| /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) |
| Value *VectorTripCount = nullptr; |
| |
| /// The legality analysis. |
| LoopVectorizationLegality *Legal; |
| |
| /// The profitability analysis. |
| LoopVectorizationCostModel *Cost; |
| |
| // Record whether runtime checks are added. |
| bool AddedSafetyChecks = false; |
| |
| // Holds the end values for each induction variable. We save the end values |
| // so we can later fix-up the external users of the induction variables. |
| DenseMap<PHINode *, Value *> IVEndValues; |
| |
| // Vector of original scalar PHIs whose corresponding widened PHIs need to be |
| // fixed up at the end of vector code generation. |
| SmallVector<PHINode *, 8> OrigPHIsToFix; |
| }; |
| |
| class InnerLoopUnroller : public InnerLoopVectorizer { |
| public: |
| InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
| LoopInfo *LI, DominatorTree *DT, |
| const TargetLibraryInfo *TLI, |
| const TargetTransformInfo *TTI, AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, |
| LoopVectorizationLegality *LVL, |
| LoopVectorizationCostModel *CM) |
| : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1, |
| UnrollFactor, LVL, CM) {} |
| |
| private: |
| Value *getBroadcastInstrs(Value *V) override; |
| Value *getStepVector(Value *Val, int StartIdx, Value *Step, |
| Instruction::BinaryOps Opcode = |
| Instruction::BinaryOpsEnd) override; |
| Value *reverseVector(Value *Vec) override; |
| }; |
| |
| } // end namespace llvm |
| |
| /// Look for a meaningful debug location on the instruction or its |
| /// operands. |
| static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { |
| if (!I) |
| return I; |
| |
| DebugLoc Empty; |
| if (I->getDebugLoc() != Empty) |
| return I; |
| |
| for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { |
| if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) |
| if (OpInst->getDebugLoc() != Empty) |
| return OpInst; |
| } |
| |
| return I; |
| } |
| |
| void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { |
| if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { |
| const DILocation *DIL = Inst->getDebugLoc(); |
| if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && |
| !isa<DbgInfoIntrinsic>(Inst)) { |
| auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF); |
| if (NewDIL) |
| B.SetCurrentDebugLocation(NewDIL.getValue()); |
| else |
| LLVM_DEBUG(dbgs() |
| << "Failed to create new discriminator: " |
| << DIL->getFilename() << " Line: " << DIL->getLine()); |
| } else |
| B.SetCurrentDebugLocation(DIL); |
| } else |
| B.SetCurrentDebugLocation(DebugLoc()); |
| } |
| |
| /// Write a record \p DebugMsg about vectorization failure to the debug |
| /// output stream. If \p I is passed, it is an instruction that prevents |
| /// vectorization. |
| #ifndef NDEBUG |
| static void debugVectorizationFailure(const StringRef DebugMsg, |
| Instruction *I) { |
| dbgs() << "LV: Not vectorizing: " << DebugMsg; |
| if (I != nullptr) |
| dbgs() << " " << *I; |
| else |
| dbgs() << '.'; |
| dbgs() << '\n'; |
| } |
| #endif |
| |
| /// Create an analysis remark that explains why vectorization failed |
| /// |
| /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p |
| /// RemarkName is the identifier for the remark. If \p I is passed it is an |
| /// instruction that prevents vectorization. Otherwise \p TheLoop is used for |
| /// the location of the remark. \return the remark object that can be |
| /// streamed to. |
| static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, |
| StringRef RemarkName, Loop *TheLoop, Instruction *I) { |
| Value *CodeRegion = TheLoop->getHeader(); |
| DebugLoc DL = TheLoop->getStartLoc(); |
| |
| if (I) { |
| CodeRegion = I->getParent(); |
| // If there is no debug location attached to the instruction, fall back to |
| // using the loop's. |
| if (I->getDebugLoc()) |
| DL = I->getDebugLoc(); |
| } |
| |
| OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); |
| R << "loop not vectorized: "; |
| return R; |
| } |
| |
| namespace llvm { |
| |
| void reportVectorizationFailure(const StringRef DebugMsg, |
| const StringRef OREMsg, const StringRef ORETag, |
| OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { |
| LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); |
| LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); |
| ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), |
| ORETag, TheLoop, I) << OREMsg); |
| } |
| |
| } // end namespace llvm |
| |
| #ifndef NDEBUG |
| /// \return string containing a file name and a line # for the given loop. |
| static std::string getDebugLocString(const Loop *L) { |
| std::string Result; |
| if (L) { |
| raw_string_ostream OS(Result); |
| if (const DebugLoc LoopDbgLoc = L->getStartLoc()) |
| LoopDbgLoc.print(OS); |
| else |
| // Just print the module name. |
| OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); |
| OS.flush(); |
| } |
| return Result; |
| } |
| #endif |
| |
| void InnerLoopVectorizer::addNewMetadata(Instruction *To, |
| const Instruction *Orig) { |
| // If the loop was versioned with memchecks, add the corresponding no-alias |
| // metadata. |
| if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) |
| LVer->annotateInstWithNoAlias(To, Orig); |
| } |
| |
| void InnerLoopVectorizer::addMetadata(Instruction *To, |
| Instruction *From) { |
| propagateMetadata(To, From); |
| addNewMetadata(To, From); |
| } |
| |
| void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, |
| Instruction *From) { |
| for (Value *V : To) { |
| if (Instruction *I = dyn_cast<Instruction>(V)) |
| addMetadata(I, From); |
| } |
| } |
| |
| namespace llvm { |
| |
| // Loop vectorization cost-model hints that indicate how the scalar epilogue |
| // loop should be lowered. |
| enum ScalarEpilogueLowering { |
| |
| // The default: allowing scalar epilogues. |
| CM_ScalarEpilogueAllowed, |
| |
| // Vectorization with OptForSize: don't allow epilogues. |
| CM_ScalarEpilogueNotAllowedOptSize, |
| |
| // A special case of vectorization with OptForSize: loops with a very small |
| // trip count are considered for vectorization under OptForSize, thereby |
| // making sure the cost of their loop body is dominant, free of runtime |
| // guards and scalar iteration overheads. |
| CM_ScalarEpilogueNotAllowedLowTripLoop, |
| |
| // Loop hint predicate indicating an epilogue is undesired. |
| CM_ScalarEpilogueNotNeededUsePredicate |
| }; |
| |
| /// LoopVectorizationCostModel - estimates the expected speedups due to |
| /// vectorization. |
| /// In many cases vectorization is not profitable. This can happen for a |
| /// number of reasons. In this class we mainly attempt to predict the |
| /// expected speedup/slowdowns due to the supported instruction set. We use the |
| /// TargetTransformInfo to query the different backends for the cost of |
| /// different operations. |
| class LoopVectorizationCostModel { |
| public: |
| LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, |
| PredicatedScalarEvolution &PSE, LoopInfo *LI, |
| LoopVectorizationLegality *Legal, |
| const TargetTransformInfo &TTI, |
| const TargetLibraryInfo *TLI, DemandedBits *DB, |
| AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, const Function *F, |
| const LoopVectorizeHints *Hints, |
| InterleavedAccessInfo &IAI) |
| : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), |
| TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), |
| Hints(Hints), InterleaveInfo(IAI) {} |
| |
| /// \return An upper bound for the vectorization factor, or None if |
| /// vectorization and interleaving should be avoided up front. |
| Optional<unsigned> computeMaxVF(); |
| |
| /// \return True if runtime checks are required for vectorization, and false |
| /// otherwise. |
| bool runtimeChecksRequired(); |
| |
| /// \return The most profitable vectorization factor and the cost of that VF. |
| /// This method checks every power of two up to MaxVF. If UserVF is not zero, |
| /// then that vectorization factor will be selected if vectorization is |
| /// possible. |
| VectorizationFactor selectVectorizationFactor(unsigned MaxVF); |
| |
| /// Setup cost-based decisions for user vectorization factor. |
| void selectUserVectorizationFactor(unsigned UserVF) { |
| collectUniformsAndScalars(UserVF); |
| collectInstsToScalarize(UserVF); |
| } |
| |
| /// \return The size (in bits) of the smallest and widest types in the code |
| /// that needs to be vectorized. We ignore values that remain scalar such as |
| /// 64 bit loop indices. |
| std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); |
| |
| /// \return The desired interleave count. |
| /// If interleave count has been specified by metadata it will be returned. |
| /// Otherwise, the interleave count is computed and returned. VF and LoopCost |
| /// are the selected vectorization factor and the cost of the selected VF. |
| unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost); |
| |
| /// A memory access instruction may be vectorized in more than one way. |
| /// The form of the instruction after vectorization depends on its cost. |
| /// This function makes cost-based decisions for Load/Store instructions |
| /// and collects them in a map. This decision map is used for building |
| /// the lists of loop-uniform and loop-scalar instructions. |
| /// The calculated cost is saved with the widening decision in order to |
| /// avoid redundant calculations. |
| void setCostBasedWideningDecision(unsigned VF); |
| |
| /// A struct that represents some properties of the register usage |
| /// of a loop. |
| struct RegisterUsage { |
| /// Holds the number of loop invariant values that are used in the loop. |
| /// The key is ClassID of target-provided register class. |
| SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; |
| /// Holds the maximum number of concurrent live intervals in the loop. |
| /// The key is ClassID of target-provided register class. |
| SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; |
| }; |
| |
| /// \return Information about the register usage of the loop for the given |
| /// vectorization factors. |
| SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs); |
| |
| /// Collect values we want to ignore in the cost model. |
| void collectValuesToIgnore(); |
| |
| /// \returns The smallest bitwidth each instruction can be represented with. |
| /// The vector equivalents of these instructions should be truncated to this |
| /// type. |
| const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { |
| return MinBWs; |
| } |
| |
| /// \returns True if it is more profitable to scalarize instruction \p I for |
| /// vectorization factor \p VF. |
| bool isProfitableToScalarize(Instruction *I, unsigned VF) const { |
| assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1."); |
| |
| // Cost model is not run in the VPlan-native path - return conservative |
| // result until this changes. |
| if (EnableVPlanNativePath) |
| return false; |
| |
| auto Scalars = InstsToScalarize.find(VF); |
| assert(Scalars != InstsToScalarize.end() && |
| "VF not yet analyzed for scalarization profitability"); |
| return Scalars->second.find(I) != Scalars->second.end(); |
| } |
| |
| /// Returns true if \p I is known to be uniform after vectorization. |
| bool isUniformAfterVectorization(Instruction *I, unsigned VF) const { |
| if (VF == 1) |
| return true; |
| |
| // Cost model is not run in the VPlan-native path - return conservative |
| // result until this changes. |
| if (EnableVPlanNativePath) |
| return false; |
| |
| auto UniformsPerVF = Uniforms.find(VF); |
| assert(UniformsPerVF != Uniforms.end() && |
| "VF not yet analyzed for uniformity"); |
| return UniformsPerVF->second.find(I) != UniformsPerVF->second.end(); |
| } |
| |
| /// Returns true if \p I is known to be scalar after vectorization. |
| bool isScalarAfterVectorization(Instruction *I, unsigned VF) const { |
| if (VF == 1) |
| return true; |
| |
| // Cost model is not run in the VPlan-native path - return conservative |
| // result until this changes. |
| if (EnableVPlanNativePath) |
| return false; |
| |
| auto ScalarsPerVF = Scalars.find(VF); |
| assert(ScalarsPerVF != Scalars.end() && |
| "Scalar values are not calculated for VF"); |
| return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end(); |
| } |
| |
| /// \returns True if instruction \p I can be truncated to a smaller bitwidth |
| /// for vectorization factor \p VF. |
| bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const { |
| return VF > 1 && MinBWs.find(I) != MinBWs.end() && |
| !isProfitableToScalarize(I, VF) && |
| !isScalarAfterVectorization(I, VF); |
| } |
| |
| /// Decision that was taken during cost calculation for memory instruction. |
| enum InstWidening { |
| CM_Unknown, |
| CM_Widen, // For consecutive accesses with stride +1. |
| CM_Widen_Reverse, // For consecutive accesses with stride -1. |
| CM_Interleave, |
| CM_GatherScatter, |
| CM_Scalarize |
| }; |
| |
| /// Save vectorization decision \p W and \p Cost taken by the cost model for |
| /// instruction \p I and vector width \p VF. |
| void setWideningDecision(Instruction *I, unsigned VF, InstWidening W, |
| unsigned Cost) { |
| assert(VF >= 2 && "Expected VF >=2"); |
| WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); |
| } |
| |
| /// Save vectorization decision \p W and \p Cost taken by the cost model for |
| /// interleaving group \p Grp and vector width \p VF. |
| void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF, |
| InstWidening W, unsigned Cost) { |
| assert(VF >= 2 && "Expected VF >=2"); |
| /// Broadcast this decision to all instructions inside the group. |
| /// But the cost will be assigned to one instruction only. |
| for (unsigned i = 0; i < Grp->getFactor(); ++i) { |
| if (auto *I = Grp->getMember(i)) { |
| if (Grp->getInsertPos() == I) |
| WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); |
| else |
| WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); |
| } |
| } |
| } |
| |
| /// Return the cost model decision for the given instruction \p I and vector |
| /// width \p VF. Return CM_Unknown if this instruction did not pass |
| /// through the cost modeling. |
| InstWidening getWideningDecision(Instruction *I, unsigned VF) { |
| assert(VF >= 2 && "Expected VF >=2"); |
| |
| // Cost model is not run in the VPlan-native path - return conservative |
| // result until this changes. |
| if (EnableVPlanNativePath) |
| return CM_GatherScatter; |
| |
| std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); |
| auto Itr = WideningDecisions.find(InstOnVF); |
| if (Itr == WideningDecisions.end()) |
| return CM_Unknown; |
| return Itr->second.first; |
| } |
| |
| /// Return the vectorization cost for the given instruction \p I and vector |
| /// width \p VF. |
| unsigned getWideningCost(Instruction *I, unsigned VF) { |
| assert(VF >= 2 && "Expected VF >=2"); |
| std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); |
| assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && |
| "The cost is not calculated"); |
| return WideningDecisions[InstOnVF].second; |
| } |
| |
| /// Return True if instruction \p I is an optimizable truncate whose operand |
| /// is an induction variable. Such a truncate will be removed by adding a new |
| /// induction variable with the destination type. |
| bool isOptimizableIVTruncate(Instruction *I, unsigned VF) { |
| // If the instruction is not a truncate, return false. |
| auto *Trunc = dyn_cast<TruncInst>(I); |
| if (!Trunc) |
| return false; |
| |
| // Get the source and destination types of the truncate. |
| Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); |
| Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); |
| |
| // If the truncate is free for the given types, return false. Replacing a |
| // free truncate with an induction variable would add an induction variable |
| // update instruction to each iteration of the loop. We exclude from this |
| // check the primary induction variable since it will need an update |
| // instruction regardless. |
| Value *Op = Trunc->getOperand(0); |
| if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) |
| return false; |
| |
| // If the truncated value is not an induction variable, return false. |
| return Legal->isInductionPhi(Op); |
| } |
| |
| /// Collects the instructions to scalarize for each predicated instruction in |
| /// the loop. |
| void collectInstsToScalarize(unsigned VF); |
| |
| /// Collect Uniform and Scalar values for the given \p VF. |
| /// The sets depend on CM decision for Load/Store instructions |
| /// that may be vectorized as interleave, gather-scatter or scalarized. |
| void collectUniformsAndScalars(unsigned VF) { |
| // Do the analysis once. |
| if (VF == 1 || Uniforms.find(VF) != Uniforms.end()) |
| return; |
| setCostBasedWideningDecision(VF); |
| collectLoopUniforms(VF); |
| collectLoopScalars(VF); |
| } |
| |
| /// Returns true if the target machine supports masked store operation |
| /// for the given \p DataType and kind of access to \p Ptr. |
| bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) { |
| return Legal->isConsecutivePtr(Ptr) && |
| TTI.isLegalMaskedStore(DataType, Alignment); |
| } |
| |
| /// Returns true if the target machine supports masked load operation |
| /// for the given \p DataType and kind of access to \p Ptr. |
| bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) { |
| return Legal->isConsecutivePtr(Ptr) && |
| TTI.isLegalMaskedLoad(DataType, Alignment); |
| } |
| |
| /// Returns true if the target machine supports masked scatter operation |
| /// for the given \p DataType. |
| bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) { |
| return TTI.isLegalMaskedScatter(DataType, Alignment); |
| } |
| |
| /// Returns true if the target machine supports masked gather operation |
| /// for the given \p DataType. |
| bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) { |
| return TTI.isLegalMaskedGather(DataType, Alignment); |
| } |
| |
| /// Returns true if the target machine can represent \p V as a masked gather |
| /// or scatter operation. |
| bool isLegalGatherOrScatter(Value *V) { |
| bool LI = isa<LoadInst>(V); |
| bool SI = isa<StoreInst>(V); |
| if (!LI && !SI) |
| return false; |
| auto *Ty = getMemInstValueType(V); |
| MaybeAlign Align = getLoadStoreAlignment(V); |
| return (LI && isLegalMaskedGather(Ty, Align)) || |
| (SI && isLegalMaskedScatter(Ty, Align)); |
| } |
| |
| /// Returns true if \p I is an instruction that will be scalarized with |
| /// predication. Such instructions include conditional stores and |
| /// instructions that may divide by zero. |
| /// If a non-zero VF has been calculated, we check if I will be scalarized |
| /// with predication for that VF. |
| bool isScalarWithPredication(Instruction *I, unsigned VF = 1); |
| |
| // Returns true if \p I is an instruction that will be predicated either |
| // through scalar predication or masked load/store or masked gather/scatter. |
| // Superset of instructions that return true for isScalarWithPredication. |
| bool isPredicatedInst(Instruction *I) { |
| if (!blockNeedsPredication(I->getParent())) |
| return false; |
| // Loads and stores that need some form of masked operation are predicated |
| // instructions. |
| if (isa<LoadInst>(I) || isa<StoreInst>(I)) |
| return Legal->isMaskRequired(I); |
| return isScalarWithPredication(I); |
| } |
| |
| /// Returns true if \p I is a memory instruction with consecutive memory |
| /// access that can be widened. |
| bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); |
| |
| /// Returns true if \p I is a memory instruction in an interleaved-group |
| /// of memory accesses that can be vectorized with wide vector loads/stores |
| /// and shuffles. |
| bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1); |
| |
| /// Check if \p Instr belongs to any interleaved access group. |
| bool isAccessInterleaved(Instruction *Instr) { |
| return InterleaveInfo.isInterleaved(Instr); |
| } |
| |
| /// Get the interleaved access group that \p Instr belongs to. |
| const InterleaveGroup<Instruction> * |
| getInterleavedAccessGroup(Instruction *Instr) { |
| return InterleaveInfo.getInterleaveGroup(Instr); |
| } |
| |
| /// Returns true if an interleaved group requires a scalar iteration |
| /// to handle accesses with gaps, and there is nothing preventing us from |
| /// creating a scalar epilogue. |
| bool requiresScalarEpilogue() const { |
| return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue(); |
| } |
| |
| /// Returns true if a scalar epilogue is not allowed due to optsize or a |
| /// loop hint annotation. |
| bool isScalarEpilogueAllowed() const { |
| return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; |
| } |
| |
| /// Returns true if all loop blocks should be masked to fold tail loop. |
| bool foldTailByMasking() const { return FoldTailByMasking; } |
| |
| bool blockNeedsPredication(BasicBlock *BB) { |
| return foldTailByMasking() || Legal->blockNeedsPredication(BB); |
| } |
| |
| /// Estimate cost of an intrinsic call instruction CI if it were vectorized |
| /// with factor VF. Return the cost of the instruction, including |
| /// scalarization overhead if it's needed. |
| unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF); |
| |
| /// Estimate cost of a call instruction CI if it were vectorized with factor |
| /// VF. Return the cost of the instruction, including scalarization overhead |
| /// if it's needed. The flag NeedToScalarize shows if the call needs to be |
| /// scalarized, i.e., either no vector version is available or it is too |
| /// expensive. |
| unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); |
| |
| private: |
| unsigned NumPredStores = 0; |
| |
| /// \return An upper bound for the vectorization factor, larger than zero. |
| /// One is returned if vectorization should best be avoided due to cost. |
| unsigned computeFeasibleMaxVF(unsigned ConstTripCount); |
| |
| /// The vectorization cost is a combination of the cost itself and a boolean |
| /// indicating whether any of the contributing operations will actually |
| /// operate on vector values after type legalization in the backend. If this |
| /// latter value is false, then all operations will be scalarized (i.e. no |
| /// vectorization has actually taken place). |
| using VectorizationCostTy = std::pair<unsigned, bool>; |
| |
| /// Returns the expected execution cost. The unit of the cost does |
| /// not matter because we use the 'cost' units to compare different |
| /// vector widths. The cost that is returned is *not* normalized by |
| /// the vectorization factor. |
| VectorizationCostTy expectedCost(unsigned VF); |
| |
| /// Returns the execution time cost of an instruction for a given vector |
| /// width. Vector width of one means scalar. |
| VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF); |
| |
| /// The cost-computation logic from getInstructionCost which provides |
| /// the vector type as an output parameter. |
| unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy); |
| |
| /// Calculate vectorization cost of memory instruction \p I. |
| unsigned getMemoryInstructionCost(Instruction *I, unsigned VF); |
| |
| /// The cost computation for scalarized memory instruction. |
| unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF); |
| |
| /// The cost computation for interleaving group of memory instructions. |
| unsigned getInterleaveGroupCost(Instruction *I, unsigned VF); |
| |
| /// The cost computation for Gather/Scatter instruction. |
| unsigned getGatherScatterCost(Instruction *I, unsigned VF); |
| |
| /// The cost computation for widening instruction \p I with consecutive |
| /// memory access. |
| unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF); |
| |
| /// The cost calculation for Load/Store instruction \p I with uniform pointer - |
| /// Load: scalar load + broadcast. |
| /// Store: scalar store + (loop invariant value stored? 0 : extract of last |
| /// element) |
| unsigned getUniformMemOpCost(Instruction *I, unsigned VF); |
| |
| /// Estimate the overhead of scalarizing an instruction. This is a |
| /// convenience wrapper for the type-based getScalarizationOverhead API. |
| unsigned getScalarizationOverhead(Instruction *I, unsigned VF); |
| |
| /// Returns whether the instruction is a load or store and will be emitted |
| /// as a vector operation. |
| bool isConsecutiveLoadOrStore(Instruction *I); |
| |
| /// Returns true if an artificially high cost for emulated masked memrefs |
| /// should be used. |
| bool useEmulatedMaskMemRefHack(Instruction *I); |
| |
| /// Map of scalar integer values to the smallest bitwidth they can be legally |
| /// represented as. The vector equivalents of these values should be truncated |
| /// to this type. |
| MapVector<Instruction *, uint64_t> MinBWs; |
| |
| /// A type representing the costs for instructions if they were to be |
| /// scalarized rather than vectorized. The entries are Instruction-Cost |
| /// pairs. |
| using ScalarCostsTy = DenseMap<Instruction *, unsigned>; |
| |
| /// A set containing all BasicBlocks that are known to be present after |
| /// vectorization as predicated blocks. |
| SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; |
| |
| /// Records whether it is allowed to have the original scalar loop execute at |
| /// least once. This may be needed as a fallback loop in case runtime |
| /// aliasing/dependence checks fail, or to handle the tail/remainder |
| /// iterations when the trip count is unknown or is not a multiple of the VF, |
| /// or as a peel-loop to handle gaps in interleave-groups. |
| /// Under optsize and when the trip count is very small we don't allow any |
| /// iterations to execute in the scalar loop. |
| ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
| |
| /// All blocks of the loop are to be masked in order to fold the tail of the |
| /// scalar iterations. |
| bool FoldTailByMasking = false; |
| |
| /// A map holding scalar costs for different vectorization factors. The |
| /// presence of a cost for an instruction in the mapping indicates that the |
| /// instruction will be scalarized when vectorizing with the associated |
| /// vectorization factor. The entries are VF-ScalarCostTy pairs. |
| DenseMap<unsigned, ScalarCostsTy> InstsToScalarize; |
| |
| /// Holds the instructions known to be uniform after vectorization. |
| /// The data is collected per VF. |
| DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms; |
| |
| /// Holds the instructions known to be scalar after vectorization. |
| /// The data is collected per VF. |
| DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; |
| |
| /// Holds the instructions (address computations) that are forced to be |
| /// scalarized. |
| DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; |
| |
| /// Returns the expected difference in cost from scalarizing the expression |
| /// feeding a predicated instruction \p PredInst. The instructions to |
| /// scalarize and their scalar costs are collected in \p ScalarCosts. A |
| /// non-negative return value implies the expression will be scalarized. |
| /// Currently, only single-use chains are considered for scalarization. |
| int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, |
| unsigned VF); |
| |
| /// Collect the instructions that are uniform after vectorization. An |
| /// instruction is uniform if we represent it with a single scalar value in |
| /// the vectorized loop corresponding to each vector iteration. Examples of |
| /// uniform instructions include pointer operands of consecutive or |
| /// interleaved memory accesses. Note that although uniformity implies an |
| /// instruction will be scalar, the reverse is not true. In general, a |
| /// scalarized instruction will be represented by VF scalar values in the |
| /// vectorized loop, each corresponding to an iteration of the original |
| /// scalar loop. |
| void collectLoopUniforms(unsigned VF); |
| |
| /// Collect the instructions that are scalar after vectorization. An |
| /// instruction is scalar if it is known to be uniform or will be scalarized |
| /// during vectorization. Non-uniform scalarized instructions will be |
| /// represented by VF values in the vectorized loop, each corresponding to an |
| /// iteration of the original scalar loop. |
| void collectLoopScalars(unsigned VF); |
| |
| /// Keeps cost model vectorization decision and cost for instructions. |
| /// Right now it is used for memory instructions only. |
| using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, |
| std::pair<InstWidening, unsigned>>; |
| |
| DecisionList WideningDecisions; |
| |
| /// Returns true if \p V is expected to be vectorized and it needs to be |
| /// extracted. |
| bool needsExtract(Value *V, unsigned VF) const { |
| Instruction *I = dyn_cast<Instruction>(V); |
| if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) |
| return false; |
| |
| // Assume we can vectorize V (and hence we need extraction) if the |
| // scalars are not computed yet. This can happen, because it is called |
| // via getScalarizationOverhead from setCostBasedWideningDecision, before |
| // the scalars are collected. That should be a safe assumption in most |
| // cases, because we check if the operands have vectorizable types |
| // beforehand in LoopVectorizationLegality. |
| return Scalars.find(VF) == Scalars.end() || |
| !isScalarAfterVectorization(I, VF); |
| } |
| |
| /// Returns a range containing only operands needing to be extracted. |
| SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, |
| unsigned VF) { |
| return SmallVector<Value *, 4>(make_filter_range( |
| Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); |
| } |
| |
| public: |
| /// The loop that we evaluate. |
| Loop *TheLoop; |
| |
| /// Predicated scalar evolution analysis. |
| PredicatedScalarEvolution &PSE; |
| |
| /// Loop Info analysis. |
| LoopInfo *LI; |
| |
| /// Vectorization legality. |
| LoopVectorizationLegality *Legal; |
| |
| /// Vector target information. |
| const TargetTransformInfo &TTI; |
| |
| /// Target Library Info. |
| const TargetLibraryInfo *TLI; |
| |
| /// Demanded bits analysis. |
| DemandedBits *DB; |
| |
| /// Assumption cache. |
| AssumptionCache *AC; |
| |
| /// Interface to emit optimization remarks. |
| OptimizationRemarkEmitter *ORE; |
| |
| const Function *TheFunction; |
| |
| /// Loop Vectorize Hint. |
| const LoopVectorizeHints *Hints; |
| |
| /// The interleave access information contains groups of interleaved accesses |
| /// with the same stride and close to each other. |
| InterleavedAccessInfo &InterleaveInfo; |
| |
| /// Values to ignore in the cost model. |
| SmallPtrSet<const Value *, 16> ValuesToIgnore; |
| |
| /// Values to ignore in the cost model when VF > 1. |
| SmallPtrSet<const Value *, 16> VecValuesToIgnore; |
| }; |
| |
| } // end namespace llvm |
| |
| // Return true if \p OuterLp is an outer loop annotated with hints for explicit |
| // vectorization. The loop needs to be annotated with #pragma omp simd |
| // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the |
| // vector length information is not provided, vectorization is not considered |
| // explicit. Interleave hints are not allowed either. These limitations will be |
| // relaxed in the future. |
| // Please note that we are currently forced to abuse the pragma 'clang |
| // vectorize' semantics. This pragma provides *auto-vectorization hints* |
| // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' |
| // provides *explicit vectorization hints* (LV can bypass legal checks and |
| // assume that vectorization is legal). However, both hints are implemented |
| // using the same metadata (llvm.loop.vectorize, processed by |
| // LoopVectorizeHints). This will be fixed in the future when the native IR |
| // representation for pragma 'omp simd' is introduced. |
| static bool isExplicitVecOuterLoop(Loop *OuterLp, |
| OptimizationRemarkEmitter *ORE) { |
| assert(!OuterLp->empty() && "This is not an outer loop"); |
| LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); |
| |
| // Only outer loops with an explicit vectorization hint are supported. |
| // Unannotated outer loops are ignored. |
| if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) |
| return false; |
| |
| Function *Fn = OuterLp->getHeader()->getParent(); |
| if (!Hints.allowVectorization(Fn, OuterLp, |
| true /*VectorizeOnlyWhenForced*/)) { |
| LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); |
| return false; |
| } |
| |
| if (Hints.getInterleave() > 1) { |
| // TODO: Interleave support is future work. |
| LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " |
| "outer loops.\n"); |
| Hints.emitRemarkWithHints(); |
| return false; |
| } |
| |
| return true; |
| } |
| |
| static void collectSupportedLoops(Loop &L, LoopInfo *LI, |
| OptimizationRemarkEmitter *ORE, |
| SmallVectorImpl<Loop *> &V) { |
| // Collect inner loops and outer loops without irreducible control flow. For |
| // now, only collect outer loops that have explicit vectorization hints. If we |
| // are stress testing the VPlan H-CFG construction, we collect the outermost |
| // loop of every loop nest. |
| if (L.empty() || VPlanBuildStressTest || |
| (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { |
| LoopBlocksRPO RPOT(&L); |
| RPOT.perform(LI); |
| if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { |
| V.push_back(&L); |
| // TODO: Collect inner loops inside marked outer loops in case |
| // vectorization fails for the outer loop. Do not invoke |
| // 'containsIrreducibleCFG' again for inner loops when the outer loop is |
| // already known to be reducible. We can use an inherited attribute for |
| // that. |
| return; |
| } |
| } |
| for (Loop *InnerL : L) |
| collectSupportedLoops(*InnerL, LI, ORE, V); |
| } |
| |
| namespace { |
| |
| /// The LoopVectorize Pass. |
| struct LoopVectorize : public FunctionPass { |
| /// Pass identification, replacement for typeid |
| static char ID; |
| |
| LoopVectorizePass Impl; |
| |
| explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, |
| bool VectorizeOnlyWhenForced = false) |
| : FunctionPass(ID) { |
| Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; |
| Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; |
| initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); |
| } |
| |
| bool runOnFunction(Function &F) override { |
| if (skipFunction(F)) |
| return false; |
| |
| auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
| auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
| auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
| auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
| auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); |
| auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); |
| auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; |
| auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
| auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); |
| auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); |
| auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); |
| auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); |
| auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); |
| |
| std::function<const LoopAccessInfo &(Loop &)> GetLAA = |
| [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; |
| |
| return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, |
| GetLAA, *ORE, PSI); |
| } |
| |
| void getAnalysisUsage(AnalysisUsage &AU) const override { |
| AU.addRequired<AssumptionCacheTracker>(); |
| AU.addRequired<BlockFrequencyInfoWrapperPass>(); |
| AU.addRequired<DominatorTreeWrapperPass>(); |
| AU.addRequired<LoopInfoWrapperPass>(); |
| AU.addRequired<ScalarEvolutionWrapperPass>(); |
| AU.addRequired<TargetTransformInfoWrapperPass>(); |
| AU.addRequired<AAResultsWrapperPass>(); |
| AU.addRequired<LoopAccessLegacyAnalysis>(); |
| AU.addRequired<DemandedBitsWrapperPass>(); |
| AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); |
| |
| // We currently do not preserve loopinfo/dominator analyses with outer loop |
| // vectorization. Until this is addressed, mark these analyses as preserved |
| // only for the non-VPlan-native path. |
| // TODO: Preserve Loop and Dominator analyses for VPlan-native path. |
| if (!EnableVPlanNativePath) { |
| AU.addPreserved<LoopInfoWrapperPass>(); |
| AU.addPreserved<DominatorTreeWrapperPass>(); |
| } |
| |
| AU.addPreserved<BasicAAWrapperPass>(); |
| AU.addPreserved<GlobalsAAWrapperPass>(); |
| AU.addRequired<ProfileSummaryInfoWrapperPass>(); |
| } |
| }; |
| |
| } // end anonymous namespace |
| |
| //===----------------------------------------------------------------------===// |
| // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and |
| // LoopVectorizationCostModel and LoopVectorizationPlanner. |
| //===----------------------------------------------------------------------===// |
| |
| Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { |
| // We need to place the broadcast of invariant variables outside the loop, |
| // but only if it's proven safe to do so. Otherwise, the broadcast will be |
| // inside the vector loop body. |
| Instruction *Instr = dyn_cast<Instruction>(V); |
| bool SafeToHoist = OrigLoop->isLoopInvariant(V) && |
| (!Instr || |
| DT->dominates(Instr->getParent(), LoopVectorPreHeader)); |
| // Place the code for broadcasting invariant variables in the new preheader. |
| IRBuilder<>::InsertPointGuard Guard(Builder); |
| if (SafeToHoist) |
| Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
| |
| // Broadcast the scalar into all locations in the vector. |
| Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); |
| |
| return Shuf; |
| } |
| |
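| // As an illustrative sketch (not tied to a particular target): for VF = 4, |
| // UF = 2 and an integer induction stepping by 1, the vector phi created |
| // below starts at <Start, Start+1, Start+2, Start+3>, each unroll part adds |
| // the splat <4, 4, 4, 4> to the previous part, and the value after the last |
| // add feeds back into the phi from the latch. |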
| void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( |
| const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { |
| assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && |
| "Expected either an induction phi-node or a truncate of it!"); |
| Value *Start = II.getStartValue(); |
| |
| // Construct the initial value of the vector IV in the vector loop preheader |
| auto CurrIP = Builder.saveIP(); |
| Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
| if (isa<TruncInst>(EntryVal)) { |
| assert(Start->getType()->isIntegerTy() && |
| "Truncation requires an integer type"); |
| auto *TruncType = cast<IntegerType>(EntryVal->getType()); |
| Step = Builder.CreateTrunc(Step, TruncType); |
| Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); |
| } |
| Value *SplatStart = Builder.CreateVectorSplat(VF, Start); |
| Value *SteppedStart = |
| getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); |
| |
| // We create vector phi nodes for both integer and floating-point induction |
| // variables. Here, we determine the kind of arithmetic we will perform. |
| Instruction::BinaryOps AddOp; |
| Instruction::BinaryOps MulOp; |
| if (Step->getType()->isIntegerTy()) { |
| AddOp = Instruction::Add; |
| MulOp = Instruction::Mul; |
| } else { |
| AddOp = II.getInductionOpcode(); |
| MulOp = Instruction::FMul; |
| } |
| |
| // Multiply the vectorization factor by the step using integer or |
| // floating-point arithmetic as appropriate. |
| Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); |
| Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); |
| |
| // Create a vector splat to use in the induction update. |
| // |
| // FIXME: If the step is non-constant, we create the vector splat with |
| // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't |
| // handle a constant vector splat. |
| Value *SplatVF = isa<Constant>(Mul) |
| ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) |
| : Builder.CreateVectorSplat(VF, Mul); |
| Builder.restoreIP(CurrIP); |
| |
| // We may need to add the step a number of times, depending on the unroll |
| // factor. The last of those goes into the PHI. |
| PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", |
| &*LoopVectorBody->getFirstInsertionPt()); |
| VecInd->setDebugLoc(EntryVal->getDebugLoc()); |
| Instruction *LastInduction = VecInd; |
| for (unsigned Part = 0; Part < UF; ++Part) { |
| VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); |
| |
| if (isa<TruncInst>(EntryVal)) |
| addMetadata(LastInduction, EntryVal); |
| recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); |
| |
| LastInduction = cast<Instruction>(addFastMathFlag( |
| Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); |
| LastInduction->setDebugLoc(EntryVal->getDebugLoc()); |
| } |
| |
| // Move the last step to the end of the latch block. This ensures consistent |
| // placement of all induction updates. |
| auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); |
| auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); |
| auto *ICmp = cast<Instruction>(Br->getCondition()); |
| LastInduction->moveBefore(ICmp); |
| LastInduction->setName("vec.ind.next"); |
| |
| VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); |
| VecInd->addIncoming(LastInduction, LoopVectorLatch); |
| } |
| |
| bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { |
| return Cost->isScalarAfterVectorization(I, VF) || |
| Cost->isProfitableToScalarize(I, VF); |
| } |
| |
| bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { |
| if (shouldScalarizeInstruction(IV)) |
| return true; |
| auto isScalarInst = [&](User *U) -> bool { |
| auto *I = cast<Instruction>(U); |
| return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); |
| }; |
| return llvm::any_of(IV->users(), isScalarInst); |
| } |
| |
| void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( |
| const InductionDescriptor &ID, const Instruction *EntryVal, |
| Value *VectorLoopVal, unsigned Part, unsigned Lane) { |
| assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && |
| "Expected either an induction phi-node or a truncate of it!"); |
| |
| // This induction variable is not the phi from the original loop but the |
| // newly-created IV based on the proof that the casted Phi is equal to the |
| // uncasted Phi in the vectorized loop (possibly under a runtime guard). It |
| // re-uses the same InductionDescriptor that the original IV uses, but we |
| // don't have to do any recording in this case - that is done when the |
| // original IV is processed. |
| if (isa<TruncInst>(EntryVal)) |
| return; |
| |
| const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); |
| if (Casts.empty()) |
| return; |
| // Only the first Cast instruction in the Casts vector is of interest. |
| // The rest of the Casts (if any exist) have no uses outside the |
| // induction update chain itself. |
| Instruction *CastInst = *Casts.begin(); |
| if (Lane < UINT_MAX) |
| VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); |
| else |
| VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); |
| } |
| |
| void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { |
| assert((IV->getType()->isIntegerTy() || IV != OldInduction) && |
| "Primary induction variable must have an integer type"); |
| |
| auto II = Legal->getInductionVars()->find(IV); |
| assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); |
| |
| auto ID = II->second; |
| assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); |
| |
| // The scalar value to broadcast. This will be derived from the canonical |
| // induction variable. |
| Value *ScalarIV = nullptr; |
| |
| // The value from the original loop to which we are mapping the new induction |
| // variable. |
| Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; |
| |
| // True if we have vectorized the induction variable. |
| auto VectorizedIV = false; |
| |
| // Determine if we want a scalar version of the induction variable. This is |
| // true if the induction variable itself is not widened, or if it has at |
| // least one user in the loop that is not widened. |
| auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); |
| |
| // Generate code for the induction step. Note that induction steps are |
| // required to be loop-invariant. |
| assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && |
| "Induction step should be loop invariant"); |
| auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); |
| Value *Step = nullptr; |
| if (PSE.getSE()->isSCEVable(IV->getType())) { |
| SCEVExpander Exp(*PSE.getSE(), DL, "induction"); |
| Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), |
| LoopVectorPreHeader->getTerminator()); |
| } else { |
| Step = cast<SCEVUnknown>(ID.getStep())->getValue(); |
| } |
| |
| // Try to create a new independent vector induction variable. If we can't |
| // create the phi node, we will splat the scalar induction variable in each |
| // loop iteration. |
| if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { |
| createVectorIntOrFpInductionPHI(ID, Step, EntryVal); |
| VectorizedIV = true; |
| } |
| |
| // If we haven't yet vectorized the induction variable, or if we will create |
| // a scalar one, we need to define the scalar induction variable and step |
| // values. If we were given a truncation type, truncate the canonical |
| // induction variable and step. Otherwise, derive these values from the |
| // induction descriptor. |
| if (!VectorizedIV || NeedsScalarIV) { |
| ScalarIV = Induction; |
| if (IV != OldInduction) { |
| ScalarIV = IV->getType()->isIntegerTy() |
| ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) |
| : Builder.CreateCast(Instruction::SIToFP, Induction, |
| IV->getType()); |
| ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); |
| ScalarIV->setName("offset.idx"); |
| } |
| if (Trunc) { |
| auto *TruncType = cast<IntegerType>(Trunc->getType()); |
| assert(Step->getType()->isIntegerTy() && |
| "Truncation requires an integer step"); |
| ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); |
| Step = Builder.CreateTrunc(Step, TruncType); |
| } |
| } |
| |
| // If we haven't yet vectorized the induction variable, splat the scalar |
| // induction variable, and build the necessary step vectors. |
| // TODO: Don't do it unless the vectorized IV is really required. |
| if (!VectorizedIV) { |
| Value *Broadcasted = getBroadcastInstrs(ScalarIV); |
| for (unsigned Part = 0; Part < UF; ++Part) { |
| Value *EntryPart = |
| getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); |
| VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); |
| if (Trunc) |
| addMetadata(EntryPart, Trunc); |
| recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); |
| } |
| } |
| |
| // If an induction variable is only used for counting loop iterations or |
| // calculating addresses, it doesn't need to be widened. Create scalar steps |
| // that can be used by instructions we will later scalarize. Note that the |
| // addition of the scalar steps will not increase the number of instructions |
| // in the loop in the common case prior to InstCombine. We will be trading |
| // one vector extract for each scalar step. |
| if (NeedsScalarIV) |
| buildScalarSteps(ScalarIV, Step, EntryVal, ID); |
| } |
| |
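| // As an illustrative sketch: for VF = 4, an integer Step of 2 and |
| // StartIdx = 0, getStepVector effectively computes Val + <0, 2, 4, 6>, by |
| // splatting Step, multiplying it by <0, 1, 2, 3> and adding the result to |
| // Val. |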
| Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, |
| Instruction::BinaryOps BinOp) { |
| // Create and check the types. |
| assert(Val->getType()->isVectorTy() && "Must be a vector"); |
| int VLen = Val->getType()->getVectorNumElements(); |
| |
| Type *STy = Val->getType()->getScalarType(); |
| assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && |
| "Induction Step must be an integer or FP"); |
| assert(Step->getType() == STy && "Step has wrong type"); |
| |
| SmallVector<Constant *, 8> Indices; |
| |
| if (STy->isIntegerTy()) { |
| // Create a vector of VLen consecutive numbers starting at StartIdx. |
| for (int i = 0; i < VLen; ++i) |
| Indices.push_back(ConstantInt::get(STy, StartIdx + i)); |
| |
| // Add the consecutive indices to the vector value. |
| Constant *Cv = ConstantVector::get(Indices); |
| assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); |
| Step = Builder.CreateVectorSplat(VLen, Step); |
| assert(Step->getType() == Val->getType() && "Invalid step vec"); |
| // FIXME: The newly created binary instructions should contain nsw/nuw flags, |
| // which can be found from the original scalar operations. |
| Step = Builder.CreateMul(Cv, Step); |
| return Builder.CreateAdd(Val, Step, "induction"); |
| } |
| |
| // Floating point induction. |
| assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && |
| "Binary Opcode should be specified for FP induction"); |
| // Create a vector of VLen consecutive numbers starting at StartIdx. |
| for (int i = 0; i < VLen; ++i) |
| Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); |
| |
| // Add the consecutive indices to the vector value. |
| Constant *Cv = ConstantVector::get(Indices); |
| |
| Step = Builder.CreateVectorSplat(VLen, Step); |
| |
| // Floating point operations had to be 'fast' to enable the induction. |
| FastMathFlags Flags; |
| Flags.setFast(); |
| |
| Value *MulOp = Builder.CreateFMul(Cv, Step); |
| if (isa<Instruction>(MulOp)) |
| // Have to check: MulOp may have been folded to a constant. |
| cast<Instruction>(MulOp)->setFastMathFlags(Flags); |
| |
| Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); |
| if (isa<Instruction>(BOp)) |
| cast<Instruction>(BOp)->setFastMathFlags(Flags); |
| return BOp; |
| } |
| |
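| // As an illustrative sketch: with UF = 2, VF = 4, a unit Step and a uniform |
| // EntryVal, only lane 0 of each part is generated below, i.e. ScalarIV + 0 |
| // and ScalarIV + 4; a non-uniform EntryVal gets all VF * UF scalar steps. |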
| void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, |
| Instruction *EntryVal, |
| const InductionDescriptor &ID) { |
| // We shouldn't have to build scalar steps if we aren't vectorizing. |
| assert(VF > 1 && "VF should be greater than one"); |
| |
| // Get the value type and ensure it and the step have the same integer type. |
| Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); |
| assert(ScalarIVTy == Step->getType() && |
| "Val and Step should have the same type"); |
| |
| // We build scalar steps for both integer and floating-point induction |
| // variables. Here, we determine the kind of arithmetic we will perform. |
| Instruction::BinaryOps AddOp; |
| Instruction::BinaryOps MulOp; |
| if (ScalarIVTy->isIntegerTy()) { |
| AddOp = Instruction::Add; |
| MulOp = Instruction::Mul; |
| } else { |
| AddOp = ID.getInductionOpcode(); |
| MulOp = Instruction::FMul; |
| } |
| |
| // Determine the number of scalars we need to generate for each unroll |
| // iteration. If EntryVal is uniform, we only need to generate the first |
| // lane. Otherwise, we generate all VF values. |
| unsigned Lanes = |
| Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 |
| : VF; |
| // Compute the scalar steps and save the results in VectorLoopValueMap. |
| for (unsigned Part = 0; Part < UF; ++Part) { |
| for (unsigned Lane = 0; Lane < Lanes; ++Lane) { |
| auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); |
| auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); |
| auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); |
| VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); |
| recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); |
| } |
| } |
| } |
| |
| Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { |
| assert(V != Induction && "The new induction variable should not be used."); |
| assert(!V->getType()->isVectorTy() && "Can't widen a vector"); |
| assert(!V->getType()->isVoidTy() && "Type does not produce a value"); |
| |
| // If we have a stride that is replaced by one, do it here. Defer this for |
| // the VPlan-native path until we start running Legal checks in that path. |
| if (!EnableVPlanNativePath && Legal->hasStride(V)) |
| V = ConstantInt::get(V->getType(), 1); |
| |
| // If we have a vector mapped to this value, return it. |
| if (VectorLoopValueMap.hasVectorValue(V, Part)) |
| return VectorLoopValueMap.getVectorValue(V, Part); |
| |
| // If the value has not been vectorized, check if it has been scalarized |
| // instead. If it has been scalarized, and we actually need the value in |
| // vector form, we will construct the vector values on demand. |
| if (VectorLoopValueMap.hasAnyScalarValue(V)) { |
| Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); |
| |
| // If we've scalarized a value, that value should be an instruction. |
| auto *I = cast<Instruction>(V); |
| |
| // If we aren't vectorizing, we can just copy the scalar map values over to |
| // the vector map. |
| if (VF == 1) { |
| VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); |
| return ScalarValue; |
| } |
| |
| // Get the last scalar instruction we generated for V and Part. If the value |
| // is known to be uniform after vectorization, this corresponds to lane zero |
| // of the Part unroll iteration. Otherwise, the last instruction is the one |
| // we created for the last vector lane of the Part unroll iteration. |
| unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; |
| auto *LastInst = cast<Instruction>( |
| VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); |
| |
| // Set the insert point after the last scalarized instruction. This ensures |
| // the insertelement sequence will directly follow the scalar definitions. |
| auto OldIP = Builder.saveIP(); |
| auto NewIP = std::next(BasicBlock::iterator(LastInst)); |
| Builder.SetInsertPoint(&*NewIP); |
| |
| // However, if we are vectorizing, we need to construct the vector values. |
| // If the value is known to be uniform after vectorization, we can just |
| // broadcast the scalar value corresponding to lane zero for each unroll |
| // iteration. Otherwise, we construct the vector values using insertelement |
| // instructions. Since the resulting vectors are stored in |
| // VectorLoopValueMap, we will only generate the insertelements once. |
| Value *VectorValue = nullptr; |
| if (Cost->isUniformAfterVectorization(I, VF)) { |
| VectorValue = getBroadcastInstrs(ScalarValue); |
| VectorLoopValueMap.setVectorValue(V, Part, VectorValue); |
| } else { |
| // Initialize packing with insertelements to start from undef. |
| Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); |
| VectorLoopValueMap.setVectorValue(V, Part, Undef); |
| for (unsigned Lane = 0; Lane < VF; ++Lane) |
| packScalarIntoVectorValue(V, {Part, Lane}); |
| VectorValue = VectorLoopValueMap.getVectorValue(V, Part); |
| } |
| Builder.restoreIP(OldIP); |
| return VectorValue; |
| } |
| |
| // If this scalar is unknown, assume that it is a constant or that it is |
| // loop invariant. Broadcast V and save the value for future uses. |
| Value *B = getBroadcastInstrs(V); |
| VectorLoopValueMap.setVectorValue(V, Part, B); |
| return B; |
| } |
| |
| Value * |
| InnerLoopVectorizer::getOrCreateScalarValue(Value *V, |
| const VPIteration &Instance) { |
| // If the value is not an instruction contained in the loop, it should |
| // already be scalar. |
| if (OrigLoop->isLoopInvariant(V)) |
| return V; |
| |
| assert((Instance.Lane == 0 || |
| !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) && |
| "Uniform values only have lane zero"); |
| |
| // If the value from the original loop has not been vectorized, it is |
| // represented by UF x VF scalar values in the new loop. Return the requested |
| // scalar value. |
| if (VectorLoopValueMap.hasScalarValue(V, Instance)) |
| return VectorLoopValueMap.getScalarValue(V, Instance); |
| |
| // If the value has not been scalarized, get its entry in VectorLoopValueMap |
| // for the given unroll part. If this entry is not a vector type (i.e., the |
| // vectorization factor is one), there is no need to generate an |
| // extractelement instruction. |
| auto *U = getOrCreateVectorValue(V, Instance.Part); |
| if (!U->getType()->isVectorTy()) { |
| assert(VF == 1 && "Value not scalarized has non-vector type"); |
| return U; |
| } |
| |
| // Otherwise, the value from the original loop has been vectorized and is |
| // represented by UF vector values. Extract and return the requested scalar |
| // value from the appropriate vector lane. |
| return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); |
| } |
| |
| void InnerLoopVectorizer::packScalarIntoVectorValue( |
| Value *V, const VPIteration &Instance) { |
| assert(V != Induction && "The new induction variable should not be used."); |
| assert(!V->getType()->isVectorTy() && "Can't pack a vector"); |
| assert(!V->getType()->isVoidTy() && "Type does not produce a value"); |
| |
| Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); |
| Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); |
| VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, |
| Builder.getInt32(Instance.Lane)); |
| VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); |
| } |
| |
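| // As an illustrative sketch: for VF = 4 this produces a shuffle with mask |
| // <3, 2, 1, 0>, i.e. the lanes of Vec in reverse order. |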
| Value *InnerLoopVectorizer::reverseVector(Value *Vec) { |
| assert(Vec->getType()->isVectorTy() && "Invalid type"); |
| SmallVector<Constant *, 8> ShuffleMask; |
| for (unsigned i = 0; i < VF; ++i) |
| ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); |
| |
| return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), |
| ConstantVector::get(ShuffleMask), |
| "reverse"); |
| } |
| |
| // Return whether we allow using masked interleave-groups (for dealing with |
| // strided loads/stores that reside in predicated blocks, or for dealing |
| // with gaps). |
| static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { |
| // If an override option has been passed in for interleaved accesses, use it. |
| if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) |
| return EnableMaskedInterleavedMemAccesses; |
| |
| return TTI.enableMaskedInterleavedAccessVectorization(); |
| } |
| |
| // Try to vectorize the interleave group that \p Instr belongs to. |
| // |
| // E.g. Translate following interleaved load group (factor = 3): |
| // for (i = 0; i < N; i+=3) { |
| // R = Pic[i]; // Member of index 0 |
| // G = Pic[i+1]; // Member of index 1 |
| // B = Pic[i+2]; // Member of index 2 |
| // ... // do something to R, G, B |
| // } |
| // To: |
| // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B |
| // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements |
| // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements |
| // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements |
| // |
| // Or translate following interleaved store group (factor = 3): |
| // for (i = 0; i < N; i+=3) { |
| // ... do something to R, G, B |
| // Pic[i] = R; // Member of index 0 |
| // Pic[i+1] = G; // Member of index 1 |
| // Pic[i+2] = B; // Member of index 2 |
| // } |
| // To: |
| // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> |
| // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> |
| // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, |
| // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements |
| // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B |
| void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, |
| VPTransformState &State, |
| VPValue *Addr, |
| VPValue *BlockInMask) { |
| const InterleaveGroup<Instruction> *Group = |
| Cost->getInterleavedAccessGroup(Instr); |
| assert(Group && "Fail to get an interleaved access group."); |
| |
| // Skip if current instruction is not the insert position. |
| if (Instr != Group->getInsertPos()) |
| return; |
| |
| const DataLayout &DL = Instr->getModule()->getDataLayout(); |
| |
| // Prepare for the vector type of the interleaved load/store. |
| Type *ScalarTy = getMemInstValueType(Instr); |
| unsigned InterleaveFactor = Group->getFactor(); |
| Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); |
| |
| // Prepare for the new pointers. |
| SmallVector<Value *, 2> AddrParts; |
| unsigned Index = Group->getIndex(Instr); |
| |
| // TODO: extend the masked interleaved-group support to reversed access. |
| assert((!BlockInMask || !Group->isReverse()) && |
| "Reversed masked interleave-group not supported."); |
| |
| // If the group is reverse, adjust the index to refer to the last vector lane |
| // instead of the first. We adjust the index from the first vector lane, |
| // rather than directly getting the pointer for lane VF - 1, because the |
| // pointer operand of the interleaved access is supposed to be uniform. For |
| // uniform instructions, we're only required to generate a value for the |
| // first vector lane in each unroll iteration. |
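| // As an illustrative sketch: for VF = 4 and a factor-3 group, the index is |
| // increased by (4 - 1) * 3 = 9 elements, moving the reference from the |
| // first to the last vector lane. |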
| if (Group->isReverse()) |
| Index += (VF - 1) * Group->getFactor(); |
| |
| for (unsigned Part = 0; Part < UF; Part++) { |
| Value *AddrPart = State.get(Addr, {Part, 0}); |
| setDebugLocFromInst(Builder, AddrPart); |
| |
| // Note that the current instruction could be any member of the group; we |
| // need to adjust the address down to the member of index 0. |
| // |
| // E.g. a = A[i+1]; // Member of index 1 (Current instruction) |
| // b = A[i]; // Member of index 0 |
| // Current pointer is pointed to A[i+1], adjust it to A[i]. |
| // |
| // E.g. A[i+1] = a; // Member of index 1 |
| // A[i] = b; // Member of index 0 |
| // A[i+2] = c; // Member of index 2 (Current instruction) |
| // Current pointer is pointed to A[i+2], adjust it to A[i]. |
| |
| bool InBounds = false; |
| if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) |
| InBounds = gep->isInBounds(); |
| AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); |
| cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); |
| |
| // Cast to the vector pointer type. |
| unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); |
| Type *PtrTy = VecTy->getPointerTo(AddressSpace); |
| AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); |
| } |
| |
| setDebugLocFromInst(Builder, Instr); |
| Value *UndefVec = UndefValue::get(VecTy); |
| |
| Value *MaskForGaps = nullptr; |
| if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { |
| MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); |
| assert(MaskForGaps && "Mask for Gaps is required but it is null"); |
| } |
| |
| // Vectorize the interleaved load group. |
| if (isa<LoadInst>(Instr)) { |
| // For each unroll part, create a wide load for the group. |
| SmallVector<Value *, 2> NewLoads; |
| for (unsigned Part = 0; Part < UF; Part++) { |
| Instruction *NewLoad; |
| if (BlockInMask || MaskForGaps) { |
| assert(useMaskedInterleavedAccesses(*TTI) && |
| "masked interleaved groups are not allowed."); |
| Value *GroupMask = MaskForGaps; |
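| // As an illustrative sketch: for VF = 4 and a factor-3 group, a block mask |
| // <m0, m1, m2, m3> is replicated below to |
| // <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>, so all members of one |
| // original iteration share the same mask bit. |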
| if (BlockInMask) { |
| Value *BlockInMaskPart = State.get(BlockInMask, Part); |
| auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); |
| auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); |
| Value *ShuffledMask = Builder.CreateShuffleVector( |
| BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); |
| GroupMask = MaskForGaps |
| ? Builder.CreateBinOp(Instruction::And, ShuffledMask, |
| MaskForGaps) |
| : ShuffledMask; |
| } |
| NewLoad = |
| Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(), |
| GroupMask, UndefVec, "wide.masked.vec"); |
| } else |
| NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], |
| Group->getAlignment(), "wide.vec"); |
| Group->addMetadata(NewLoad); |
| NewLoads.push_back(NewLoad); |
| } |
| |
| // For each member in the group, shuffle out the appropriate data from the |
| // wide loads. |
| for (unsigned I = 0; I < InterleaveFactor; ++I) { |
| Instruction *Member = Group->getMember(I); |
| |
| // Skip the gaps in the group. |
| if (!Member) |
| continue; |
| |
| Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); |
| for (unsigned Part = 0; Part < UF; Part++) { |
| Value *StridedVec = Builder.CreateShuffleVector( |
| NewLoads[Part], UndefVec, StrideMask, "strided.vec"); |
| |
| // If this member has a different type, cast the result to that type. |
| if (Member->getType() != ScalarTy) { |
| VectorType *OtherVTy = VectorType::get(Member->getType(), VF); |
| StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); |
| } |
| |
| if (Group->isReverse()) |
| StridedVec = reverseVector(StridedVec); |
| |
| VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); |
| } |
| } |
| return; |
| } |
| |
| // The sub vector type for current instruction. |
| VectorType *SubVT = VectorType::get(ScalarTy, VF); |
| |
| // Vectorize the interleaved store group. |
| for (unsigned Part = 0; Part < UF; Part++) { |
| // Collect the stored vector from each member. |
| SmallVector<Value *, 4> StoredVecs; |
| for (unsigned i = 0; i < InterleaveFactor; i++) { |
| // An interleaved store group doesn't allow gaps, so each index has a member. |
| Instruction *Member = Group->getMember(i); |
| assert(Member && "Fail to get a member from an interleaved store group"); |
| |
| Value *StoredVec = getOrCreateVectorValue( |
| cast<StoreInst>(Member)->getValueOperand(), Part); |
| if (Group->isReverse()) |
| StoredVec = reverseVector(StoredVec); |
| |
| // If this member has a different type, cast it to a unified type. |
| if (StoredVec->getType() != SubVT) |
| StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); |
| |
| StoredVecs.push_back(StoredVec); |
| } |
| |
| // Concatenate all vectors into a wide vector. |
| Value *WideVec = concatenateVectors(Builder, StoredVecs); |
| |
| // Interleave the elements in the wide vector. |
| Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); |
| Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, |
| "interleaved.vec"); |
| |
| Instruction *NewStoreInstr; |
| if (BlockInMask) { |
| Value *BlockInMaskPart = State.get(BlockInMask, Part); |
| auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); |
| auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); |
| Value *ShuffledMask = Builder.CreateShuffleVector( |
| BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); |
| NewStoreInstr = Builder.CreateMaskedStore( |
| IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask); |
| } else |
| NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part], |
| Group->getAlignment()); |
| |
| Group->addMetadata(NewStoreInstr); |
| } |
| } |
| |
| void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, |
| VPTransformState &State, |
| VPValue *Addr, |
| VPValue *BlockInMask) { |
| // Attempt to issue a wide load. |
| LoadInst *LI = dyn_cast<LoadInst>(Instr); |
| StoreInst *SI = dyn_cast<StoreInst>(Instr); |
| |
| assert((LI || SI) && "Invalid Load/Store instruction"); |
| |
| LoopVectorizationCostModel::InstWidening Decision = |
| Cost->getWideningDecision(Instr, VF); |
| assert(Decision != LoopVectorizationCostModel::CM_Unknown && |
| "CM decision should be taken at this point"); |
| if (Decision == LoopVectorizationCostModel::CM_Interleave) |
| return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask); |
| |
| Type *ScalarDataTy = getMemInstValueType(Instr); |
| Type *DataTy = VectorType::get(ScalarDataTy, VF); |
| // An alignment of 0 means target ABI alignment. We need to use the scalar's |
| // target ABI alignment in such a case. |
| const DataLayout &DL = Instr->getModule()->getDataLayout(); |
| const Align Alignment = |
| DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); |
| |
| // Determine if the pointer operand of the access is either consecutive or |
| // reverse consecutive. |
| bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); |
| bool ConsecutiveStride = |
| Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); |
| bool CreateGatherScatter = |
| (Decision == LoopVectorizationCostModel::CM_GatherScatter); |
| |
| // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector |
| // gather/scatter. Otherwise Decision should have been to Scalarize. |
| assert((ConsecutiveStride || CreateGatherScatter) && |
| "The instruction should be scalarized"); |
| (void)ConsecutiveStride; |
| |
| VectorParts BlockInMaskParts(UF); |
| bool isMaskRequired = BlockInMask; |
| if (isMaskRequired) |
| for (unsigned Part = 0; Part < UF; ++Part) |
| BlockInMaskParts[Part] = State.get(BlockInMask, Part); |
| |
| const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { |
| // Calculate the pointer for the specific unroll-part. |
| GetElementPtrInst *PartPtr = nullptr; |
| |
| bool InBounds = false; |
| if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) |
| InBounds = gep->isInBounds(); |
| |
| if (Reverse) { |
| // If the address is consecutive but reversed, then the |
| // wide store needs to start at the last vector element. |
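| // As an illustrative sketch: for VF = 4 and Part = 1, the pointer below |
| // becomes Ptr - 4 - 3 = Ptr - 7, so the wide access covers the elements at |
| // offsets -7 .. -4 from Ptr, which are the (reversed) lanes of this part. |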
| PartPtr = cast<GetElementPtrInst>( |
| Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); |
| PartPtr->setIsInBounds(InBounds); |
| PartPtr = cast<GetElementPtrInst>( |
| Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); |
| PartPtr->setIsInBounds(InBounds); |
| if (isMaskRequired) // Reverse of a null all-one mask is a null mask. |
| BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); |
| } else { |
| PartPtr = cast<GetElementPtrInst>( |
| Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); |
| PartPtr->setIsInBounds(InBounds); |
| } |
| |
| unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); |
| return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); |
| }; |
| |
| // Handle Stores: |
| if (SI) { |
| setDebugLocFromInst(Builder, SI); |
| |
| for (unsigned Part = 0; Part < UF; ++Part) { |
| Instruction *NewSI = nullptr; |
| Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); |
| if (CreateGatherScatter) { |
| Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; |
| Value *VectorGep = State.get(Addr, Part); |
| NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, |
| Alignment.value(), MaskPart); |
| } else { |
| if (Reverse) { |
| // If we store to reverse consecutive memory locations, then we need |
| // to reverse the order of elements in the stored value. |
| StoredVal = reverseVector(StoredVal); |
| // We don't want to update the value in the map as it might be used in |
| // another expression. So don't call resetVectorValue(StoredVal). |
| } |
| auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); |
| if (isMaskRequired) |
| NewSI = Builder.CreateMaskedStore( |
| StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]); |
| else |
| NewSI = |
| Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value()); |
| } |
| addMetadata(NewSI, SI); |
| } |
| return; |
| } |
| |
| // Handle loads. |
| assert(LI && "Must have a load instruction"); |
| setDebugLocFromInst(Builder, LI); |
| for (unsigned Part = 0; Part < UF; ++Part) { |
| Value *NewLI; |
| if (CreateGatherScatter) { |
| Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; |
| Value *VectorGep = State.get(Addr, Part); |
| NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart, |
| nullptr, "wide.masked.gather"); |
| addMetadata(NewLI, LI); |
| } else { |
| auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); |
| if (isMaskRequired) |
| NewLI = Builder.CreateMaskedLoad( |
| VecPtr, Alignment.value(), BlockInMaskParts[Part], |
| UndefValue::get(DataTy), "wide.masked.load"); |
| else |
| NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(), |
| "wide.load"); |
| |
| // Add metadata to the load, but setVectorValue to the reverse shuffle. |
| addMetadata(NewLI, LI); |
| if (Reverse) |
| NewLI = reverseVector(NewLI); |
| } |
| VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); |
| } |
| } |
| |
| void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, |
| const VPIteration &Instance, |
| bool IfPredicateInstr) { |
| assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); |
| |
| setDebugLocFromInst(Builder, Instr); |
| |
| // Does this instruction return a value ? |
| bool IsVoidRetTy = Instr->getType()->isVoidTy(); |
| |
| Instruction *Cloned = Instr->clone(); |
| if (!IsVoidRetTy) |
| Cloned->setName(Instr->getName() + ".cloned"); |
| |
| // Replace the operands of the cloned instructions with their scalar |
| // equivalents in the new loop. |
| for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { |
| auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); |
| Cloned->setOperand(op, NewOp); |
| } |
| addNewMetadata(Cloned, Instr); |
| |
| // Place the cloned scalar in the new loop. |
| Builder.Insert(Cloned); |
| |
| // Add the cloned scalar to the scalar map entry. |
| VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); |
| |
| // If we just cloned a new assumption, add it the assumption cache. |
| if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) |
| if (II->getIntrinsicID() == Intrinsic::assume) |
| AC->registerAssumption(II); |
| |
| // End if-block. |
| if (IfPredicateInstr) |
| PredicatedInstructions.push_back(Cloned); |
| } |
| |
| PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, |
| Value *End, Value *Step, |
| Instruction *DL) { |
| BasicBlock *Header = L->getHeader(); |
| BasicBlock *Latch = L->getLoopLatch(); |
| // As we're just creating this loop, it's possible no latch exists |
| // yet. If so, use the header as this will be a single block loop. |
| if (!Latch) |
| Latch = Header; |
| |
| IRBuilder<> Builder(&*Header->getFirstInsertionPt()); |
| Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); |
| setDebugLocFromInst(Builder, OldInst); |
| auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); |
| |
| Builder.SetInsertPoint(Latch->getTerminator()); |
| setDebugLocFromInst(Builder, OldInst); |
| |
| // Create i+1 and fill the PHINode. |
| Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); |
| Induction->addIncoming(Start, L->getLoopPreheader()); |
| Induction->addIncoming(Next, Latch); |
| // Create the compare. |
| Value *ICmp = Builder.CreateICmpEQ(Next, End); |
| Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); |
| |
| // Now we have two terminators. Remove the old one from the block. |
| Latch->getTerminator()->eraseFromParent(); |
| |
| return Induction; |
| } |
| |
| Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { |
| if (TripCount) |
| return TripCount; |
| |
| assert(L && "Create Trip Count for null loop."); |
| IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); |
| // Find the loop boundaries. |
| ScalarEvolution *SE = PSE.getSE(); |
| const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); |
| assert(BackedgeTakenCount != SE->getCouldNotCompute() && |
| "Invalid loop count"); |
| |
| Type *IdxTy = Legal->getWidestInductionType(); |
| assert(IdxTy && "No type for induction"); |
| |
| // The exit count might have the type of i64 while the phi is i32. This can |
| // happen if we have an induction variable that is sign extended before the |
| // compare. The only way that we get a backedge taken count is that the |
| // induction variable was signed and as such will not overflow. In such a case |
| // truncation is legal. |
| if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > |
| IdxTy->getPrimitiveSizeInBits()) |
| BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); |
| BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); |
| |
| // Get the total trip count from the count by adding 1. |
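| // For example, a loop whose backedge-taken count is 99 executes 100 |
| // iterations. |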
| const SCEV *ExitCount = SE->getAddExpr( |
| BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); |
| |
| const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); |
| |
| // Expand the trip count and place the new instructions in the preheader. |
| // Notice that the pre-header does not change, only the loop body. |
| SCEVExpander Exp(*SE, DL, "induction"); |
| |
| // Count holds the overall loop count (N). |
| TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), |
| L->getLoopPreheader()->getTerminator()); |
| |
| if (TripCount->getType()->isPointerTy()) |
| TripCount = |
| CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", |
| L->getLoopPreheader()->getTerminator()); |
| |
| return TripCount; |
| } |
| |
| Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { |
| if (VectorTripCount) |
| return VectorTripCount; |
| |
| Value *TC = getOrCreateTripCount(L); |
| IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); |
| |
| Type *Ty = TC->getType(); |
| Constant *Step = ConstantInt::get(Ty, VF * UF); |
| |
| // If the tail is to be folded by masking, round the number of iterations N |
| // up to a multiple of Step instead of rounding down. This is done by first |
| // adding Step-1 and then rounding down. Note that it's ok if this addition |
| // overflows: the vector induction variable will eventually wrap to zero given |
| // that it starts at zero and its Step is a power of two; the loop will then |
| // exit, with the last early-exit vector comparison also producing all-true. |
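| // As an illustrative sketch: with a trip count of 10 and VF * UF = 8, TC |
| // becomes 17, the remainder computed below is 1, and the vector trip count |
| // is 16, i.e. the 10 original iterations are covered by two masked vector |
| // iterations. |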
| if (Cost->foldTailByMasking()) { |
| assert(isPowerOf2_32(VF * UF) && |
| "VF*UF must be a power of 2 when folding tail by masking"); |
| TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); |
| } |
| |
| // Now we need to generate the expression for the part of the loop that the |
| // vectorized body will execute. This is equal to N - (N % Step) if scalar |
| // iterations are not required for correctness, or N - Step, otherwise. Step |
| // is equal to the vectorization factor (number of SIMD elements) times the |
| // unroll factor (number of SIMD instructions). |
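| // As an illustrative sketch: with N = 100, VF = 4 and UF = 2, Step is 8, |
| // N % Step is 4, and the vector loop covers 96 iterations while the scalar |
| // epilogue handles the remaining 4. |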
| Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); |
| |
| // If there is a non-reversed interleaved group that may speculatively access |
| // memory out-of-bounds, we need to ensure that there will be at least one |
| // iteration of the scalar epilogue loop. Thus, if the step evenly divides |
| // the trip count, we set the remainder to be equal to the step. If the step |
| // does not evenly divide the trip count, no adjustment is necessary since |
| // there will already be scalar iterations. Note that the minimum iterations |
| // check ensures that N >= Step. |
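| // For example (illustrative values): with N = 16 and Step = 8 the remainder |
| // would be 0, so it is bumped to Step, leaving 8 iterations for the scalar |
| // epilogue loop. |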
| if (VF > 1 && Cost->requiresScalarEpilogue()) { |
| auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); |
| R = Builder.CreateSelect(IsZero, Step, R); |
| } |
| |
| VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); |
| |
| return VectorTripCount; |
| } |
| |
| Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, |
| const DataLayout &DL) { |
| // Verify that V is a vector type with same number of elements as DstVTy. |
| unsigned VF = DstVTy->getNumElements(); |
| VectorType *SrcVecTy = cast<VectorType>(V->getType()); |
| assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); |
| Type *SrcElemTy = SrcVecTy->getElementType(); |
| Type *DstElemTy = DstVTy->getElementType(); |
| assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && |
| "Vector elements must have same size"); |
| |
| // Do a direct cast if element types are castable. |
| if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { |
| return Builder.CreateBitOrPointerCast(V, DstVTy); |
| } |
| // V cannot be directly cast to the desired vector type. This may happen when |
| // V is a floating point vector but DstVTy is a vector of pointers, or |
| // vice-versa. Handle this with a two-step cast through an intermediate |
| // integer vector type, i.e. Ptr <-> Int <-> Float. |
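| // For example (illustrative, assuming 32-bit pointers): casting <4 x float> |
| // to <4 x i8*> becomes <4 x float> -> <4 x i32> (bitcast) followed by |
| // <4 x i32> -> <4 x i8*> (inttoptr). |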
| assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && |
| "Only one type should be a pointer type"); |
| assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && |
| "Only one type should be a floating point type"); |
| Type *IntTy = |
| IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); |
| VectorType *VecIntTy = VectorType::get(IntTy, VF); |
| Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); |
| return Builder.CreateBitOrPointerCast(CastVal, DstVTy); |
| } |
| |
| void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, |
| BasicBlock *Bypass) { |
| Value *Count = getOrCreateTripCount(L); |
| // Reuse existing vector loop preheader for TC checks. |
| // Note that new preheader block is generated for vector loop. |
| BasicBlock *const TCCheckBlock = LoopVectorPreHeader; |
| IRBuilder<> Builder(TCCheckBlock->getTerminator()); |
| |
| // Generate code to check if the loop's trip count is less than VF * UF, or |
| // equal to it in case a scalar epilogue is required; this implies that the |
| // vector trip count is zero. This check also covers the case where adding one |
| // to the backedge-taken count overflowed leading to an incorrect trip count |
| // of zero. In this case we will also jump to the scalar loop. |
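| // For example (illustrative values): when the tail is not folded and VF = 4, |
| // UF = 2, the vector loop is bypassed if the trip count is below 8 (or at |
| // most 8 when a scalar epilogue is required). |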
| auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE |
| : ICmpInst::ICMP_ULT; |
| |
| // If tail is to be folded, vector loop takes care of all iterations. |
| Value *CheckMinIters = Builder.getFalse(); |
| if (!Cost->foldTailByMasking()) |
| CheckMinIters = Builder.CreateICmp( |
| P, Count, ConstantInt::get(Count->getType(), VF * UF), |
| "min.iters.check"); |
| |
| // Create new preheader for vector loop. |
| LoopVectorPreHeader = |
| SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, |
| "vector.ph"); |
| |
| assert(DT->properlyDominates(DT->getNode(TCCheckBlock), |
| DT->getNode(Bypass)->getIDom()) && |
| "TC check is expected to dominate Bypass"); |
| |
| // Update dominator for Bypass & LoopExit. |
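| // TCCheckBlock conditionally branches to Bypass below and dominates the |
| // vector loop, so it becomes the immediate dominator of both Bypass and the |
| // loop exit. |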
| DT->changeImmediateDominator(Bypass, TCCheckBlock); |
| DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); |
| |
| ReplaceInstWithInst( |
| TCCheckBlock->getTerminator(), |
| BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); |
| LoopBypassBlocks.push_back(TCCheckBlock); |
| } |
| |
| void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { |
| // Reuse existing vector loop preheader for SCEV checks. |
| // Note that new preheader block is generated for vector loop. |
| BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; |
| |
| // Generate the code to check that the SCEV assumptions we made actually hold. |
| // We want the new basic block to start at the first instruction in a |
| // sequence of instructions that form a check. |
| SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), |
| "scev.check"); |
| Value *SCEVCheck = Exp.expandCodeForPredicate( |
| &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); |
| |
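| // If the expanded check folds to a constant zero, the predicates are known |
| // to hold at compile time and no runtime SCEV check block is needed. |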
| if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) |
| if (C->isZero()) |
| return; |
| |
| assert(!SCEVCheckBlock->getParent()->hasOptSize() && |
| "Cannot SCEV check stride or overflow when optimizing for size"); |
| |
| SCEVCheckBlock->setName("vector.scevcheck"); |
| // Create new preheader for vector loop. |
| LoopVectorPreHeader = |
| SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, |
| nullptr, "vector.ph"); |
| |
| // Update dominator only if this is first RT check. |
| if (LoopBypassBlocks.empty()) { |
| DT->changeImmediateDominator(Bypass, SCEVCheckBlock); |
| DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); |
| } |
| |
| ReplaceInstWithInst( |
| SCEVCheckBlock->getTerminator(), |
| BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); |
| LoopBypassBlocks.push_back(SCEVCheckBlock); |
| AddedSafetyChecks = true; |
| } |
| |
| void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { |
| // VPlan-native path does not do any analysis for runtime checks currently. |
| if (EnableVPlanNativePath) |
| return; |
| |
| // Reuse existing vector loop preheader for runtime memory checks. |
| // Note that new preheader block is generated for vector loop. |
| BasicBlock *const MemCheckBlock = L->getLoopPreheader(); |
| |
| // Generate the code that checks at runtime whether the arrays overlap. We |
| // put the checks into a separate block to make the more common case of few |
| // elements faster. |
| Instruction *FirstCheckInst; |
| Instruction *MemRuntimeCheck; |
| std::tie(FirstCheckInst, MemRuntimeCheck) = |
| Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); |
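| // A null check value means no runtime pointer checks are required, so there |
| // is nothing to emit. |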
| if (!MemRuntimeCheck) |
| return; |
| |
| if (MemCheckBlock->getParent()->hasOptSize()) { |
| assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && |
| "Cannot emit memory checks when optimizing for size, unless forced " |
| "to vectorize."); |
| ORE->emit([&]() { |
| return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", |
| L->getStartLoc(), L->getHeader()) |
| << "Code-size may be reduced by not forcing " |
| "vectorization, or by source-code modifications " |
| "eliminating the need for runtime checks " |
| "(e.g., adding 'restrict')."; |
| }); |
| } |
| |
| MemCheckBlock->setName("vector.memcheck"); |
| // Create new preheader for vector loop. |
| LoopVectorPreHeader = |
| SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, |
| "vector.ph"); |
| |
| // Update dominator only if this is first RT check. |
| if (LoopBypassBlocks.empty()) { |
| DT->changeImmediateDominator(Bypass, MemCheckBlock); |
| DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); |
| } |
| |
| ReplaceInstWithInst( |
| MemCheckBlock->getTerminator(), |
| BranchInst:: |