 // third_party/llvm-10.0/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp - SwiftShader - Git at Google

 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 /// \file
 /// This pass adds instructions to enable whole quad mode for pixel
 /// shaders, and whole wavefront mode for all programs.
 ///
 /// Whole quad mode is required for derivative computations, but it interferes
 /// with shader side effects (stores and atomics). This pass is run on the
 /// scheduled machine IR but before register coalescing, so that machine SSA is
 /// available for analysis. It ensures that WQM is enabled when necessary, but
 /// disabled around stores and atomics.
 ///
 /// When necessary, this pass creates a function prolog
 ///
 ///   S_MOV_B64 LiveMask, EXEC
 ///   S_WQM_B64 EXEC, EXEC
 ///
 /// to enter WQM at the top of the function and surrounds blocks of Exact
 /// instructions by
 ///
 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
 ///   ...
 ///   S_MOV_B64 EXEC, Tmp
 ///
 /// We also compute when a sequence of instructions requires Whole Wavefront
 /// Mode (WWM) and insert instructions to save and restore it:
 ///
 ///   S_OR_SAVEEXEC_B64 Tmp, -1
 ///   ...
 ///   S_MOV_B64 EXEC, Tmp
 ///
 /// In order to avoid excessive switching during sequences of Exact
 /// instructions, the pass first analyzes which instructions must be run in WQM
 /// (aka which instructions produce values that lead to derivative
 /// computations).
 ///
 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
 ///
 /// There is room for improvement given better control flow analysis:
 ///
 ///  (1) at the top level (outside of control flow statements, and as long as
 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
 ///      the LiveMask (this is implemented for the entry block).
 ///
 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
 ///      consist of exact and don't-care instructions, the switch only has to
 ///      be done at the entry and exit points rather than potentially in each
 ///      block of the region.
 ///
 //===----------------------------------------------------------------------===//

 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cassert>
 #include <vector>

 using namespace llvm;

 #define DEBUG_TYPE "si-wqm"

 namespace {

 /// Execution-mask states tracked by this pass. Every state occupies its own
 /// bit so that sets of states can be combined and tested with bitwise
 /// operators.
 enum {
   StateWQM = 1 << 0,   // Whole quad mode.
   StateWWM = 1 << 1,   // Whole wavefront mode.
   StateExact = 1 << 2, // Exact execution mask (WQM/WWM disabled).
 };

 /// Thin wrapper around a state bitmask so that the mask can be streamed
 /// through a dedicated operator<< overload.
 struct PrintState {
   int State; // Bitwise combination of State* flags.

   explicit PrintState(int State) : State{State} {}
 };

 #ifndef NDEBUG
 /// Print the names of all states set in \p PS, separated by '|', in the fixed
 /// order WQM, WWM, Exact. Nothing is printed for an empty mask.
 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
   bool PrintedAny = false;

   // Emit Name if Bit is set, preceded by a separator when something was
   // already written.
   auto EmitIfSet = [&](int Bit, const char *Name) {
     if (!(PS.State & Bit))
       return;
     if (PrintedAny)
       OS << '|';
     OS << Name;
     PrintedAny = true;
   };

   EmitIfSet(StateWQM, "WQM");
   EmitIfSet(StateWWM, "WWM");
   EmitIfSet(StateExact, "Exact");

   return OS;
 }
 #endif

 /// Per-instruction analysis state. Every field is a bitmask of State* flags.
 struct InstrInfo {
   char Needs{0};    // States this instruction has been marked as requiring.
   char Disabled{0}; // States that requests for this instruction must drop.
   char OutNeeds{0}; // Needs propagated backwards from what follows it.
 };

 /// Per-basic-block analysis state. Every field is a bitmask of State* flags.
 struct BlockInfo {
   char Needs{0};    // States required by instructions inside the block.
   char InNeeds{0};  // States required on entry to the block.
   char OutNeeds{0}; // States required on exit, as demanded by successors.
 };

 struct WorkItem {
   MachineBasicBlock *MBB = nullptr;
   MachineInstr *MI = nullptr;

   WorkItem() = default;
   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
   WorkItem(MachineInstr *MI) : MI(MI) {}
 };

 /// Machine function pass that analyses which instructions need WQM, WWM or an
 /// Exact execution mask and inserts the EXEC manipulation required to switch
 /// between those states.
 class SIWholeQuadMode : public MachineFunctionPass {
 private:
   // Per-function context, (re)assigned at the top of runOnMachineFunction.
   // The pointers default to null so that a use before the pass has run is
   // not a read of an indeterminate value.
   CallingConv::ID CallingConv;
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
   const GCNSubtarget *ST = nullptr;
   MachineRegisterInfo *MRI = nullptr;
   LiveIntervals *LIS = nullptr;

   // Analysis results, cleared at the start of every function.
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
   // SI_PS_LIVE instructions to be replaced by a copy of the live mask.
   SmallVector<MachineInstr *, 1> LiveMaskQueries;
   // Pseudo instructions collected during scanning that lowerCopyInstrs()
   // later rewrites into a move or a COPY.
   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;

   void printInfo();

   // Analysis phase.
   void markInstruction(MachineInstr &MI, char Flag,
                        std::vector<WorkItem> &Worklist);
   void markInstructionUses(const MachineInstr &MI, char Flag,
                            std::vector<WorkItem> &Worklist);
   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
   char analyzeFunction(MachineFunction &MF);

   bool requiresCorrectState(const MachineInstr &MI) const;

   // Transformation phase.
   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator Before);
   MachineBasicBlock::iterator
   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                    MachineBasicBlock::iterator Last, bool PreferLast,
                    bool SaveSCC);
   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                unsigned SaveWQM, unsigned LiveMaskReg);
   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
              unsigned SavedWQM);
   void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
              unsigned SaveOrig);
   void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                unsigned SavedOrig);
   void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

   void lowerLiveMaskQueries(unsigned LiveMaskReg);
   void lowerCopyInstrs();

 public:
   static char ID;

   SIWholeQuadMode() : MachineFunctionPass(ID) {}

   bool runOnMachineFunction(MachineFunction &MF) override;

   StringRef getPassName() const override { return "SI Whole Quad Mode"; }

   /// The pass requires LiveIntervals and declares that it keeps SlotIndexes,
   /// LiveIntervals and the CFG intact.
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<LiveIntervals>();
     AU.addPreserved<SlotIndexes>();
     AU.addPreserved<LiveIntervals>();
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };

 } // end anonymous namespace

 // Definition of the pass's static ID member; the constructor hands it to
 // MachineFunctionPass.
 char SIWholeQuadMode::ID = 0;

 // Register the pass under DEBUG_TYPE ("si-wqm") and declare its dependency on
 // LiveIntervals.
 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                       false)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                     false)

 // Externally visible reference to the pass ID.
 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

 /// Factory for the pass: returns a newly heap-allocated SIWholeQuadMode
 /// instance.
 FunctionPass *llvm::createSIWholeQuadModePass() {
   return new SIWholeQuadMode();
 }

 #ifndef NDEBUG
 /// Debug helper: dump the computed state masks of every analysed block,
 /// followed by the masks of each of its instructions that has an entry in
 /// the Instructions map.
 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
   for (const auto &BlockEntry : Blocks) {
     MachineBasicBlock *Block = BlockEntry.first;
     const BlockInfo &Info = BlockEntry.second;

     dbgs() << "\n"
            << printMBBReference(*Block) << ":\n"
            << "  InNeeds = " << PrintState(Info.InNeeds)
            << ", Needs = " << PrintState(Info.Needs)
            << ", OutNeeds = " << PrintState(Info.OutNeeds) << "\n\n";

     for (const MachineInstr &MI : *Block) {
       auto Found = Instructions.find(&MI);
       if (Found != Instructions.end()) {
         const InstrInfo &MIInfo = Found->second;
         dbgs() << "  " << MI << "    Needs = " << PrintState(MIInfo.Needs)
                << ", OutNeeds = " << PrintState(MIInfo.OutNeeds) << '\n';
       }
     }
   }
 }
 #endif

 /// Record that \p MI needs the states in \p Flag and queue it for
 /// propagation if that adds anything new.
 ///
 /// \p Flag must be non-zero and must not contain StateExact.
 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                       std::vector<WorkItem> &Worklist) {
   assert(!(Flag & StateExact) && Flag != 0);

   InstrInfo &Info = Instructions[&MI];

   // States that are disabled for this instruction are dropped from the
   // request. The requesting user then sees an undefined value in the helper
   // lanes; e.g. when an atomic's result feeds an instruction that requires
   // WQM, ignoring the WQM request is correct as per the relevant specs.
   Flag &= ~Info.Disabled;

   // Nothing to do when every requested state is already recorded. This also
   // covers the case where all requested states were just disabled.
   const bool AddsNothing = (Flag & ~Info.Needs) == 0;
   if (AddsNothing)
     return;

   Info.Needs |= Flag;
   Worklist.emplace_back(&MI);
 }

 /// Mark every instruction that defines a register read by \p MI with
 /// \p Flag.
 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                           std::vector<WorkItem> &Worklist) {
   for (const MachineOperand &Operand : MI.uses()) {
     const bool IsRegisterUse = Operand.isReg() && Operand.isUse();
     if (!IsRegisterUse)
       continue;

     Register Reg = Operand.getReg();

     // Virtual registers: mark all of their defining instructions.
     if (Register::isVirtualRegister(Reg)) {
       for (MachineInstr &DefMI : MRI->def_instructions(Reg))
         markInstruction(DefMI, Flag, Worklist);
       continue;
     }

     // Physical registers that we need to track; this is mostly relevant for
     // VCC, which can appear as the (implicit) input of a uniform branch,
     // e.g. when a loop counter is stored in a VGPR. EXEC itself is not
     // tracked.
     if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
       continue;

     for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) {
       LiveRange &UnitRange = LIS->getRegUnit(*Unit);
       const VNInfo *LiveIn =
           UnitRange.Query(LIS->getInstructionIndex(MI)).valueIn();

       // Skip units with no live-in value. Values defined by a PHI are
       // skipped as well: since we're in machine SSA, physical registers do
       // not need to be tracked across basic blocks.
       if (!LiveIn || LiveIn->isPHIDef())
         continue;

       markInstruction(*LIS->getInstructionFromIndex(LiveIn->def), Flag,
                       Worklist);
     }
   }
 }

 // Scan instructions to determine which ones require an Exact execmask and
 // which ones seed WQM requirements.
 //
 // Every instruction of every reachable block is classified once. Seeds are
 // pushed onto \p Worklist, instructions that need later lowering are
 // collected in LiveMaskQueries / LowerToCopyInstrs / LowerToMovInstrs, and
 // the union of all states that are needed anywhere in \p MF is returned.
 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                        std::vector<WorkItem> &Worklist) {
   char GlobalFlags = 0;
   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
   // Collected here and only marked after the scan, once it is known whether
   // WQM is used anywhere in the function.
   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
   SmallVector<MachineInstr *, 4> SoftWQMInstrs;

   // We need to visit the basic blocks in reverse post-order so that we visit
   // defs before uses, in particular so that we don't accidentally mark an
   // instruction as needing e.g. WQM before visiting it and realizing it needs
   // WQM disabled.
   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
   for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
     MachineBasicBlock &MBB = **BI;
     BlockInfo &BBI = Blocks[&MBB];

     for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
       MachineInstr &MI = *II;
       InstrInfo &III = Instructions[&MI];
       unsigned Opcode = MI.getOpcode();
       // States this instruction itself must be marked with; stays 0 for
       // instructions that only seed requirements on their inputs.
       char Flags = 0;

       if (TII->isWQM(Opcode)) {
         // Sampling instructions don't need to produce results for all pixels
         // in a quad, they just require all inputs of a quad to have been
         // computed for derivatives.
         markInstructionUses(MI, StateWQM, Worklist);
         GlobalFlags |= StateWQM;
         continue;
       } else if (Opcode == AMDGPU::WQM) {
         // The WQM intrinsic requires its output to have all the helper lanes
         // correct, so we need it to be in WQM.
         Flags = StateWQM;
         LowerToCopyInstrs.push_back(&MI);
       } else if (Opcode == AMDGPU::SOFT_WQM) {
         // Only marked as WQM after the scan, and only if WQM is used
         // elsewhere in the function.
         LowerToCopyInstrs.push_back(&MI);
         SoftWQMInstrs.push_back(&MI);
         continue;
       } else if (Opcode == AMDGPU::WWM) {
         // The WWM intrinsic doesn't make the same guarantee, and plus it needs
         // to be executed in WQM or Exact so that its copy doesn't clobber
         // inactive lanes.
         markInstructionUses(MI, StateWWM, Worklist);
         GlobalFlags |= StateWWM;
         LowerToMovInstrs.push_back(&MI);
         continue;
       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
         // The instruction itself must not run in WWM.
         III.Disabled = StateWWM;
         MachineOperand &Inactive = MI.getOperand(2);
         if (Inactive.isReg()) {
           if (Inactive.isUndef()) {
             // An undef inactive value lets the instruction become a copy.
             LowerToCopyInstrs.push_back(&MI);
           } else {
             // The definitions of the inactive-lane value are marked WWM.
             Register Reg = Inactive.getReg();
             if (Register::isVirtualRegister(Reg)) {
               for (MachineInstr &DefMI : MRI->def_instructions(Reg))
                 markInstruction(DefMI, StateWWM, Worklist);
             }
           }
         }
         SetInactiveInstrs.push_back(&MI);
         continue;
       } else if (TII->isDisableWQM(MI)) {
         // The instruction needs an Exact mask: record that on the block and
         // queue the block so the requirement is propagated.
         BBI.Needs |= StateExact;
         if (!(BBI.InNeeds & StateExact)) {
           BBI.InNeeds |= StateExact;
           Worklist.push_back(&MBB);
         }
         GlobalFlags |= StateExact;
         III.Disabled = StateWQM | StateWWM;
         continue;
       } else {
         if (Opcode == AMDGPU::SI_PS_LIVE) {
           LiveMaskQueries.push_back(&MI);
         } else if (WQMOutputs) {
           // The function is in machine SSA form, which means that physical
           // VGPRs correspond to shader inputs and outputs. Inputs are
           // only used, outputs are only defined.
           for (const MachineOperand &MO : MI.defs()) {
             if (!MO.isReg())
               continue;

             Register Reg = MO.getReg();

             if (!Register::isVirtualRegister(Reg) &&
                 TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
               Flags = StateWQM;
               break;
             }
           }
         }

         if (!Flags)
           continue;
       }

       markInstruction(MI, Flags, Worklist);
       GlobalFlags |= Flags;
     }
   }

   // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
   // ever used anywhere in the function. This implements the corresponding
   // semantics of @llvm.amdgcn.set.inactive.
   // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
   if (GlobalFlags & StateWQM) {
     for (MachineInstr *MI : SetInactiveInstrs)
       markInstruction(*MI, StateWQM, Worklist);
     for (MachineInstr *MI : SoftWQMInstrs)
       markInstruction(*MI, StateWQM, Worklist);
   }

   return GlobalFlags;
 }

 /// Propagate the recorded needs of \p MI to its block, to the preceding
 /// instruction in the same block, and to the instructions that define its
 /// inputs. Newly affected items are pushed onto \p Worklist.
 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                            std::vector<WorkItem>& Worklist) {
   MachineBasicBlock *Parent = MI.getParent();
   // Work on a snapshot: later insertions into the Instructions map could
   // otherwise leave a reference dangling.
   InstrInfo Info = Instructions[&MI];
   BlockInfo &ParentInfo = Blocks[Parent];

   // Control flow-type instructions and stores to temporary memory that are
   // followed by WQM computations must themselves be in WQM.
   const bool FollowedByWQM = (Info.OutNeeds & StateWQM) != 0;
   const bool WQMPermitted = (Info.Disabled & StateWQM) == 0;
   if (FollowedByWQM && WQMPermitted &&
       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
     Instructions[&MI].Needs = StateWQM;
     Info.Needs = StateWQM;
   }

   // Block level: a WQM instruction makes its block need WQM, both inside
   // and on entry.
   if (Info.Needs & StateWQM) {
     ParentInfo.Needs |= StateWQM;
     if ((ParentInfo.InNeeds & StateWQM) == 0) {
       ParentInfo.InNeeds |= StateWQM;
       Worklist.push_back(Parent);
     }
   }

   // Backwards within the block: everything this instruction needs (except
   // WWM) or passes on becomes an out-need of its non-PHI predecessor.
   MachineInstr *PrevMI = MI.getPrevNode();
   if (PrevMI && !PrevMI->isPHI()) {
     const char InNeeds = (Info.Needs & ~StateWWM) | Info.OutNeeds;
     InstrInfo &PrevInfo = Instructions[PrevMI];
     const bool AddsBits = (InNeeds & ~PrevInfo.OutNeeds) != 0;
     if (AddsBits) {
       PrevInfo.OutNeeds |= InNeeds;
       Worklist.push_back(PrevMI);
     }
   }

   // Propagate the instruction's own needs to the definitions of its inputs.
   assert(!(Info.Needs & StateExact));

   if (Info.Needs != 0)
     markInstructionUses(MI, Info.Needs, Worklist);

   // Ensure we process a block containing WWM, even if it does not require
   // any WQM transitions.
   if (Info.Needs & StateWWM)
     ParentInfo.Needs |= StateWWM;
 }

 /// Propagate the in/out needs of \p MBB to its last instruction, its
 /// predecessors and its successors, queueing everything that changed.
 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                      std::vector<WorkItem>& Worklist) {
   // Snapshot of the block's info: the lookups below may insert into the
   // Blocks map, which would invalidate a reference.
   const BlockInfo Info = Blocks[&MBB];

   // Whatever must hold at the end of the block must hold after its last
   // instruction.
   if (!MBB.empty()) {
     MachineInstr *Tail = &*MBB.rbegin();
     InstrInfo &TailInfo = Instructions[Tail];
     if ((Info.OutNeeds & ~TailInfo.OutNeeds) != 0) {
       TailInfo.OutNeeds |= Info.OutNeeds;
       Worklist.push_back(Tail);
     }
   }

   // Predecessor blocks must provide for our WQM/Exact needs.
   for (MachineBasicBlock *Pred : MBB.predecessors()) {
     BlockInfo &PredInfo = Blocks[Pred];
     if ((Info.InNeeds & ~PredInfo.OutNeeds) != 0) {
       PredInfo.OutNeeds |= Info.InNeeds;
       PredInfo.InNeeds |= Info.InNeeds;
       Worklist.push_back(Pred);
     }
   }

   // All successors must be prepared to accept the same set of WQM/Exact
   // data.
   for (MachineBasicBlock *Succ : MBB.successors()) {
     BlockInfo &SuccInfo = Blocks[Succ];
     if ((Info.OutNeeds & ~SuccInfo.InNeeds) != 0) {
       SuccInfo.InNeeds |= Info.OutNeeds;
       Worklist.push_back(Succ);
     }
   }
 }

 /// Run the analysis on \p MF: seed the worklist by scanning all
 /// instructions, then propagate needs until the worklist is empty.
 ///
 /// \returns the union of states found by scanInstructions().
 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
   std::vector<WorkItem> Pending;
   const char GlobalFlags = scanInstructions(MF, Pending);

   // Items are taken from the back (LIFO). An item holds either an
   // instruction or a block.
   while (!Pending.empty()) {
     const WorkItem Item = Pending.back();
     Pending.pop_back();

     if (MachineInstr *MI = Item.MI) {
       propagateInstruction(*MI, Pending);
       continue;
     }

     propagateBlock(*Item.MBB, Pending);
   }

   return GlobalFlags;
 }

 /// Whether \p MI really requires the exec state computed during analysis.
 ///
 /// Scalar instructions must occasionally be marked WQM for correct propagation
 /// (e.g. thread masks leading up to branches), but when it comes to actual
 /// execution, they don't care about EXEC.
 bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
   if (MI.isTerminator())
     return true;

   // Skip instructions that are not affected by EXEC
   if (TII->isScalarUnit(MI))
     return false;

   // Generic instructions such as COPY will either disappear by register
   // coalescing or be lowered to SALU or VALU instructions.
   if (MI.isTransient()) {
     if (MI.getNumExplicitOperands() >= 1) {
       const MachineOperand &Op = MI.getOperand(0);
       if (Op.isReg()) {
         if (TRI->isSGPRReg(*MRI, Op.getReg())) {
           // SGPR instructions are not affected by EXEC
           return false;
         }
       }
     }
   }

   return true;
 }

 /// Insert a save/restore pair for SCC in front of \p Before: SCC is copied
 /// into a fresh virtual register and immediately copied back.
 ///
 /// \returns an iterator to the restoring copy, so that code inserted at the
 /// returned position lands between the save and the restore.
 MachineBasicBlock::iterator
 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator Before) {
   const MCInstrDesc &CopyDesc = TII->get(AMDGPU::COPY);
   Register Scratch = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

   // The save must be built first so that it precedes the restore.
   MachineInstr *SaveMI =
       BuildMI(MBB, Before, DebugLoc(), CopyDesc, Scratch).addReg(AMDGPU::SCC);
   MachineInstr *RestoreMI =
       BuildMI(MBB, Before, DebugLoc(), CopyDesc, AMDGPU::SCC).addReg(Scratch);

   // Register both copies and the new register with the live intervals.
   LIS->InsertMachineInstrInMaps(*SaveMI);
   LIS->InsertMachineInstrInMaps(*RestoreMI);
   LIS->createAndComputeVirtRegInterval(Scratch);

   return RestoreMI;
 }

 // Return an iterator in the (inclusive) range [First, Last] at which
 // instructions can be safely inserted, keeping in mind that some of the
 // instructions we want to add necessarily clobber SCC.
 //
 // Without SaveSCC the preferred end of the range is returned directly.
 // Otherwise the range is searched, starting from the preferred end, for a
 // point where SCC is not live; if none exists inside the range, SCC is saved
 // and restored around the chosen point.
 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
   if (!SaveSCC) {
     if (PreferLast)
       return Last;
     return First;
   }

   LiveRange &SCCRange = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
   const MachineBasicBlock::iterator BlockEnd = MBB.end();

   // Slot index of an iterator; the end iterator maps to the block's end
   // index.
   auto SlotOf = [&](MachineBasicBlock::iterator It) {
     return It != BlockEnd ? LIS->getInstructionIndex(*It)
                           : LIS->getMBBEndIdx(&MBB);
   };
   const SlotIndex FirstIdx = SlotOf(First);
   const SlotIndex LastIdx = SlotOf(Last);

   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;

   // Step across live segments of SCC until Idx is outside any segment
   // (Segment becomes null) or the next candidate would leave the range
   // (Segment stays non-null and SCC must be saved).
   const LiveRange::Segment *Segment = nullptr;
   while ((Segment = SCCRange.getSegmentContaining(Idx))) {
     const SlotIndex Candidate =
         PreferLast ? Segment->start.getBaseIndex()
                    : Segment->end.getNextIndex().getBaseIndex();
     const bool LeavesRange =
         PreferLast ? Candidate < FirstIdx : Candidate > LastIdx;
     if (LeavesRange)
       break;
     Idx = Candidate;
   }

   MachineBasicBlock::iterator InsertPt;
   if (MachineInstr *AtIdx = LIS->getInstructionFromIndex(Idx)) {
     InsertPt = AtIdx;
   } else {
     assert(Idx == LIS->getMBBEndIdx(&MBB));
     InsertPt = MBB.end();
   }

   // SCC is still live at the chosen point: preserve it around the insertion.
   if (Segment)
     InsertPt = saveSCC(MBB, InsertPt);

   return InsertPt;
 }

 /// Insert, in front of \p Before, the instruction that switches EXEC to the
 /// exact mask held in \p LiveMaskReg.
 ///
 /// When \p SaveWQM is a register, an S_AND_SAVEEXEC is emitted with it as
 /// the destination; when it is 0, EXEC is simply ANDed with the live mask.
 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator Before,
                               unsigned SaveWQM, unsigned LiveMaskReg) {
   const bool Wave32 = ST->isWave32();
   MachineInstr *Switch = nullptr;

   if (SaveWQM) {
     const unsigned Opc =
         Wave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
     Switch = BuildMI(MBB, Before, DebugLoc(), TII->get(Opc), SaveWQM)
                  .addReg(LiveMaskReg);
   } else {
     const unsigned ExecReg = Wave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
     const unsigned Opc = Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
     Switch = BuildMI(MBB, Before, DebugLoc(), TII->get(Opc), ExecReg)
                  .addReg(ExecReg)
                  .addReg(LiveMaskReg);
   }

   LIS->InsertMachineInstrInMaps(*Switch);
 }

 /// Insert, in front of \p Before, the instruction that switches EXEC to
 /// whole quad mode.
 ///
 /// When \p SavedWQM is a register, it is copied into EXEC; when it is 0, the
 /// WQM mask is recomputed from EXEC with S_WQM.
 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator Before,
                             unsigned SavedWQM) {
   const bool Wave32 = ST->isWave32();
   const unsigned ExecReg = Wave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   MachineInstr *Switch = nullptr;

   if (SavedWQM) {
     Switch = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), ExecReg)
                  .addReg(SavedWQM);
   } else {
     const unsigned Opc = Wave32 ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
     Switch = BuildMI(MBB, Before, DebugLoc(), TII->get(Opc), ExecReg)
                  .addReg(ExecReg);
   }

   LIS->InsertMachineInstrInMaps(*Switch);
 }

 /// Insert an ENTER_WWM pseudo in front of \p Before. Its destination is
 /// \p SaveOrig, which must be a valid (non-zero) register, and its operand
 /// is the immediate -1.
 void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator Before,
                             unsigned SaveOrig) {
   assert(SaveOrig);

   MachineInstr *Enter =
       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
           .addImm(-1);
   LIS->InsertMachineInstrInMaps(*Enter);
 }

 /// Insert an EXIT_WWM pseudo in front of \p Before that writes EXEC from
 /// \p SavedOrig, which must be a valid (non-zero) register.
 void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator Before,
                               unsigned SavedOrig) {
   assert(SavedOrig);

   const unsigned ExecReg = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   MachineInstr *Exit =
       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), ExecReg)
           .addReg(SavedOrig);
   LIS->InsertMachineInstrInMaps(*Exit);
 }

 /// Insert the EXEC state transitions required inside \p MBB.
 ///
 /// The block is walked once from its first non-PHI instruction to its end.
 /// For each position the set of admissible states is derived from the
 /// analysis results, and a transition is emitted whenever the current state
 /// is not admissible.
 ///
 /// \param MBB         block to process; ignored if it has no BlockInfo.
 /// \param LiveMaskReg register holding the saved live mask, or 0 if the
 ///                    function prolog did not save one.
 /// \param isEntry     whether \p MBB is the function's entry block.
 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                    bool isEntry) {
   auto BII = Blocks.find(&MBB);
   if (BII == Blocks.end())
     return;

   const BlockInfo &BI = BII->second;

   // This is a non-entry block that is WQM throughout, so no need to do
   // anything.
   if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
     return;

   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                     << ":\n");

   unsigned SavedWQMReg = 0;
   unsigned SavedNonWWMReg = 0;
   // In the entry block WQM is recomputed from EXEC instead of being restored
   // from a saved register.
   bool WQMFromExec = isEntry;
   char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
   char NonWWMState = 0;
   const TargetRegisterClass *BoolRC = TRI->getBoolRC();

   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
   // Skip the instruction that saves LiveMask. That copy only exists when a
   // live mask register was created, so a real first instruction must not be
   // skipped otherwise, and the end iterator must never be advanced.
   if (isEntry && LiveMaskReg && II != IE)
     ++II;

   // This stores the first instruction where it's safe to switch from WQM to
   // Exact or vice versa.
   MachineBasicBlock::iterator FirstWQM = IE;

   // This stores the first instruction where it's safe to switch from WWM to
   // Exact/WQM or to switch to WWM. It must always be the same as, or after,
   // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
   // switch to/from WQM as well.
   MachineBasicBlock::iterator FirstWWM = IE;
   for (;;) {
     MachineBasicBlock::iterator Next = II;
     char Needs = StateExact | StateWQM; // WWM is disabled by default
     char OutNeeds = 0;

     if (FirstWQM == IE)
       FirstWQM = II;

     if (FirstWWM == IE)
       FirstWWM = II;

     // First, figure out the allowed states (Needs) based on the propagated
     // flags.
     if (II != IE) {
       MachineInstr &MI = *II;

       if (requiresCorrectState(MI)) {
         auto III = Instructions.find(&MI);
         if (III != Instructions.end()) {
           if (III->second.Needs & StateWWM)
             Needs = StateWWM;
           else if (III->second.Needs & StateWQM)
             Needs = StateWQM;
           else
             Needs &= ~III->second.Disabled;
           OutNeeds = III->second.OutNeeds;
         }
       } else {
         // If the instruction doesn't actually need a correct EXEC, then we can
         // safely leave WWM enabled.
         Needs = StateExact | StateWQM | StateWWM;
       }

       if (MI.isTerminator() && OutNeeds == StateExact)
         Needs = StateExact;

       // Set the immediate operand of SI_ELSE when the block is left in Exact.
       if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
         MI.getOperand(3).setImm(1);

       ++Next;
     } else {
       // End of basic block
       if (BI.OutNeeds & StateWQM)
         Needs = StateWQM;
       else if (BI.OutNeeds == StateExact)
         Needs = StateExact;
       else
         Needs = StateWQM | StateExact;
     }

     // Now, transition if necessary.
     if (!(Needs & State)) {
       MachineBasicBlock::iterator First;
       if (State == StateWWM || Needs == StateWWM) {
         // We must switch to or from WWM
         First = FirstWWM;
       } else {
         // We only need to switch to/from WQM, so we can use FirstWQM
         First = FirstWQM;
       }

       MachineBasicBlock::iterator Before =
           prepareInsertion(MBB, First, II, Needs == StateWQM,
                            Needs == StateExact || WQMFromExec);

       if (State == StateWWM) {
         assert(SavedNonWWMReg);
         fromWWM(MBB, Before, SavedNonWWMReg);
         State = NonWWMState;
       }

       if (Needs == StateWWM) {
         NonWWMState = State;
         SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
         toWWM(MBB, Before, SavedNonWWMReg);
         State = StateWWM;
       } else {
         if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
           // Only save the WQM mask if it is needed again later and cannot be
           // recomputed from EXEC.
           if (!WQMFromExec && (OutNeeds & StateWQM))
             SavedWQMReg = MRI->createVirtualRegister(BoolRC);

           toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
           State = StateExact;
         } else if (State == StateExact && (Needs & StateWQM) &&
                    !(Needs & StateExact)) {
           assert(WQMFromExec == (SavedWQMReg == 0));

           toWQM(MBB, Before, SavedWQMReg);

           if (SavedWQMReg) {
             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
             SavedWQMReg = 0;
           }
           State = StateWQM;
         } else {
           // We can get here if we transitioned from WWM to a non-WWM state that
           // already matches our needs, but we shouldn't need to do anything.
           assert(Needs & State);
         }
       }
     }

     // An instruction that constrains the state ends the current window of
     // safe insertion points.
     if (Needs != (StateExact | StateWQM | StateWWM)) {
       if (Needs != (StateExact | StateWQM))
         FirstWQM = IE;
       FirstWWM = IE;
     }

     if (II == IE)
       break;
     II = Next;
   }
 }

 void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
   for (MachineInstr *MI : LiveMaskQueries) {
     const DebugLoc &DL = MI->getDebugLoc();
     Register Dest = MI->getOperand(0).getReg();
     MachineInstr *Copy =
         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
             .addReg(LiveMaskReg);

     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
     MI->eraseFromParent();
   }
 }

 /// Rewrite the pseudo instructions collected during scanning in place:
 /// entries of LowerToMovInstrs become a move (VGPR destination) or a COPY,
 /// and entries of LowerToCopyInstrs become a COPY.
 void SIWholeQuadMode::lowerCopyInstrs() {
   for (MachineInstr *MI : LowerToMovInstrs) {
     assert(MI->getNumExplicitOperands() == 2);

     const Register Dest = MI->getOperand(0).getReg();

     if (!TRI->isVGPR(*MRI, Dest)) {
       MI->setDesc(TII->get(AMDGPU::COPY));
       continue;
     }

     const TargetRegisterClass *DestRC = Register::isVirtualRegister(Dest)
                                             ? MRI->getRegClass(Dest)
                                             : TRI->getPhysRegClass(Dest);
     MI->setDesc(TII->get(TII->getMovOpcode(DestRC)));

     // And make it implicitly depend on exec (like all VALU movs should do).
     MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
   }

   for (MachineInstr *MI : LowerToCopyInstrs) {
     const unsigned Opc = MI->getOpcode();
     const bool IsSetInactive = Opc == AMDGPU::V_SET_INACTIVE_B32 ||
                                Opc == AMDGPU::V_SET_INACTIVE_B64;

     if (IsSetInactive) {
       assert(MI->getNumExplicitOperands() == 3);
       // The only reason a V_SET_INACTIVE ends up here is that its inactive
       // input is undef, so it is being replaced by a simple copy. That
       // second, undef source has to be removed.
       assert(MI->getOperand(2).isUndef());
       MI->RemoveOperand(2);
       MI->untieRegOperand(1);
     } else {
       assert(MI->getNumExplicitOperands() == 2);
     }

     MI->setDesc(TII->get(AMDGPU::COPY));
   }
 }

 /// Pass entry point: analyse \p MF, lower the collected pseudo instructions
 /// and insert the required EXEC state switches.
 ///
 /// \returns true if the function was modified.
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   // Reset all per-function state left over from a previous run.
   Instructions.clear();
   Blocks.clear();
   LiveMaskQueries.clear();
   LowerToCopyInstrs.clear();
   LowerToMovInstrs.clear();
   CallingConv = MF.getFunction().getCallingConv();

   ST = &MF.getSubtarget<GCNSubtarget>();

   TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
   MRI = &MF.getRegInfo();
   LIS = &getAnalysis<LiveIntervals>();

   char GlobalFlags = analyzeFunction(MF);
   unsigned LiveMaskReg = 0;
   unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   if (!(GlobalFlags & StateWQM)) {
     // Without WQM the live mask is simply EXEC.
     lowerLiveMaskQueries(Exec);
     if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() &&
         LowerToMovInstrs.empty())
       return !LiveMaskQueries.empty();
   } else {
     // Store a copy of the original live mask when required
     MachineBasicBlock &Entry = MF.front();
     MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

     if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
       LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
       MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
                                  TII->get(AMDGPU::COPY), LiveMaskReg)
                              .addReg(Exec);
       LIS->InsertMachineInstrInMaps(*MI);
     }

     lowerLiveMaskQueries(LiveMaskReg);

     if (GlobalFlags == StateWQM) {
       // For a shader that needs only WQM, we can just set it once.
       MachineInstr *EnterWQM =
           BuildMI(Entry, EntryMI, DebugLoc(),
                   TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32
                                           : AMDGPU::S_WQM_B64),
                   Exec)
               .addReg(Exec);
       // The pass declares SlotIndexes and LiveIntervals as preserved, so
       // every instruction it inserts has to be registered with them.
       LIS->InsertMachineInstrInMaps(*EnterWQM);

       lowerCopyInstrs();
       return true;
     }
   }

   LLVM_DEBUG(printInfo());

   lowerCopyInstrs();

   // Handle the general case. Blocks are visited in function layout order
   // rather than in the (pointer-keyed) map's iteration order, so that the
   // virtual registers created by processBlock are numbered deterministically.
   // processBlock ignores blocks that have no analysis entry.
   MachineBasicBlock &EntryMBB = MF.front();
   for (MachineBasicBlock &MBB : MF)
     processBlock(MBB, LiveMaskReg, &MBB == &EntryMBB);

   // Physical registers like SCC aren't tracked by default anyway, so just
   // removing the ranges we computed is the simplest option for maintaining
   // the analysis results.
   LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

   return true;
 }