third_party/llvm-16.0/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp - SwiftShader - Git at Google

 //===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 /// \file
 /// Insert s_delay_alu instructions to avoid stalls on GFX11+.
 //
 //===----------------------------------------------------------------------===//

 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
 #include "llvm/ADT/SetVector.h"

 using namespace llvm;

 #define DEBUG_TYPE "amdgpu-insert-delay-alu"

 namespace {

 class AMDGPUInsertDelayAlu : public MachineFunctionPass {
 public:
   static char ID;

   const SIInstrInfo *SII;
   const TargetRegisterInfo *TRI;

   TargetSchedModel SchedModel;

   AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}

   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }

   // Return true if MI waits for all outstanding VALU instructions to complete.
   static bool instructionWaitsForVALU(const MachineInstr &MI) {
     // These instruction types wait for VA_VDST==0 before issuing.
     const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
                                SIInstrFlags::FLAT | SIInstrFlags::MIMG |
                                SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
     if (MI.getDesc().TSFlags & VA_VDST_0)
       return true;
     if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
         MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
       return true;
     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         (MI.getOperand(0).getImm() & 0xf000) == 0)
       return true;
     return false;
   }

   // Types of delay that can be encoded in an s_delay_alu instruction.
   enum DelayType { VALU, TRANS, SALU, OTHER };

   // Get the delay type for an instruction with the specified TSFlags.
   static DelayType getDelayType(uint64_t TSFlags) {
     if (TSFlags & SIInstrFlags::TRANS)
       return TRANS;
     if (TSFlags & SIInstrFlags::VALU)
       return VALU;
     if (TSFlags & SIInstrFlags::SALU)
       return SALU;
     return OTHER;
   }

   // Information about the last instruction(s) that wrote to a particular
   // regunit. In straight-line code there will only be one such instruction, but
   // when control flow converges we merge the delay information from each path
   // to represent the union of the worst-case delays of each type.
   struct DelayInfo {
     // One larger than the maximum number of (non-TRANS) VALU instructions we
     // can encode in an s_delay_alu instruction.
     static const unsigned VALU_MAX = 5;

     // One larger than the maximum number of TRANS instructions we can encode in
     // an s_delay_alu instruction.
     static const unsigned TRANS_MAX = 4;

     // If it was written by a (non-TRANS) VALU, remember how many clock cycles
     // are left until it completes, and how many other (non-TRANS) VALU we have
     // seen since it was issued.
     uint8_t VALUCycles = 0;
     uint8_t VALUNum = VALU_MAX;

     // If it was written by a TRANS, remember how many clock cycles are left
     // until it completes, and how many other TRANS we have seen since it was
     // issued.
     uint8_t TRANSCycles = 0;
     uint8_t TRANSNum = TRANS_MAX;
     // Also remember how many other (non-TRANS) VALU we have seen since it was
     // issued. When an instruction depends on both a prior TRANS and a prior
     // non-TRANS VALU, this is used to decide whether to encode a wait for just
     // one or both of them.
     uint8_t TRANSNumVALU = VALU_MAX;

     // If it was written by an SALU, remember how many clock cycles are left
     // until it completes.
     uint8_t SALUCycles = 0;

     DelayInfo() = default;

     DelayInfo(DelayType Type, unsigned Cycles) {
       switch (Type) {
       default:
         llvm_unreachable("unexpected type");
       case VALU:
         VALUCycles = Cycles;
         VALUNum = 0;
         break;
       case TRANS:
         TRANSCycles = Cycles;
         TRANSNum = 0;
         TRANSNumVALU = 0;
         break;
       case SALU:
         SALUCycles = Cycles;
         break;
       }
     }

     bool operator==(const DelayInfo &RHS) const {
       return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
              TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
              TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
     }

     bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }

     // Merge another DelayInfo into this one, to represent the union of the
     // worst-case delays of each type.
     void merge(const DelayInfo &RHS) {
       VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
       VALUNum = std::min(VALUNum, RHS.VALUNum);
       TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
       TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
       TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
       SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
     }

     // Update this DelayInfo after issuing an instruction. IsVALU should be 1
     // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
     // a TRANS, else 0. Cycles is the number of cycles it takes to issue the
     // instruction.  Return true if there is no longer any useful delay info.
     bool advance(DelayType Type, unsigned Cycles) {
       bool Erase = true;

       VALUNum += (Type == VALU);
       if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
         // Forget about the VALU instruction. It was too far back or has
         // definitely completed by now.
         VALUNum = VALU_MAX;
         VALUCycles = 0;
       } else {
         VALUCycles -= Cycles;
         Erase = false;
       }

       TRANSNum += (Type == TRANS);
       TRANSNumVALU += (Type == VALU);
       if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
         // Forget about any TRANS instruction. It was too far back or has
         // definitely completed by now.
         TRANSNum = TRANS_MAX;
         TRANSNumVALU = VALU_MAX;
         TRANSCycles = 0;
       } else {
         TRANSCycles -= Cycles;
         Erase = false;
       }

       if (SALUCycles <= Cycles) {
         // Forget about any SALU instruction. It has definitely completed by
         // now.
         SALUCycles = 0;
       } else {
         SALUCycles -= Cycles;
         Erase = false;
       }

       return Erase;
     }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
     void dump() const {
       if (VALUCycles)
         dbgs() << " VALUCycles=" << (int)VALUCycles;
       if (VALUNum < VALU_MAX)
         dbgs() << " VALUNum=" << (int)VALUNum;
       if (TRANSCycles)
         dbgs() << " TRANSCycles=" << (int)TRANSCycles;
       if (TRANSNum < TRANS_MAX)
         dbgs() << " TRANSNum=" << (int)TRANSNum;
       if (TRANSNumVALU < VALU_MAX)
         dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
       if (SALUCycles)
         dbgs() << " SALUCycles=" << (int)SALUCycles;
     }
 #endif
   };

   // A map from regunits to the delay info for that regunit.
   struct DelayState : DenseMap<unsigned, DelayInfo> {
     // Merge another DelayState into this one by merging the delay info for each
     // regunit.
     void merge(const DelayState &RHS) {
       for (const auto &KV : RHS) {
         iterator It;
         bool Inserted;
         std::tie(It, Inserted) = insert(KV);
         if (!Inserted)
           It->second.merge(KV.second);
       }
     }

     // Advance the delay info for each regunit, erasing any that are no longer
     // useful.
     void advance(DelayType Type, unsigned Cycles) {
       iterator Next;
       for (auto I = begin(), E = end(); I != E; I = Next) {
         Next = std::next(I);
         if (I->second.advance(Type, Cycles))
           erase(I);
       }
     }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
     void dump(const TargetRegisterInfo *TRI) const {
       if (empty()) {
         dbgs() << "    empty\n";
         return;
       }

       // Dump DelayInfo for each RegUnit in numerical order.
       SmallVector<const_iterator, 8> Order;
       Order.reserve(size());
       for (const_iterator I = begin(), E = end(); I != E; ++I)
         Order.push_back(I);
       llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
         return A->first < B->first;
       });
       for (const_iterator I : Order) {
         dbgs() << "    " << printRegUnit(I->first, TRI);
         I->second.dump();
         dbgs() << "\n";
       }
     }
 #endif
   };

   // The saved delay state at the end of each basic block.
   DenseMap<MachineBasicBlock *, DelayState> BlockState;

   // Emit an s_delay_alu instruction if necessary before MI.
   MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
                              MachineInstr *LastDelayAlu) {
     unsigned Imm = 0;

     // Wait for a TRANS instruction.
     if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
       Imm |= 4 + Delay.TRANSNum;

     // Wait for a VALU instruction (if it's more recent than any TRANS
     // instruction that we're also waiting for).
     if (Delay.VALUNum < DelayInfo::VALU_MAX &&
         Delay.VALUNum <= Delay.TRANSNumVALU) {
       if (Imm & 0xf)
         Imm |= Delay.VALUNum << 7;
       else
         Imm |= Delay.VALUNum;
     }

     // Wait for an SALU instruction.
     if (Delay.SALUCycles) {
       if (Imm & 0x780) {
         // We have already encoded a VALU and a TRANS delay. There's no room in
         // the encoding for an SALU delay as well, so just drop it.
       } else if (Imm & 0xf) {
         Imm |= (Delay.SALUCycles + 8) << 7;
       } else {
         Imm |= Delay.SALUCycles + 8;
       }
     }

     // Don't emit the s_delay_alu instruction if there's nothing to wait for.
     if (!Imm)
       return LastDelayAlu;

     // If we only need to wait for one instruction, try encoding it in the last
     // s_delay_alu that we emitted.
     if (!(Imm & 0x780) && LastDelayAlu) {
       unsigned Skip = 0;
       for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
                 E = MachineBasicBlock::instr_iterator(MI);
            ++I != E;) {
         if (!I->isBundle() && !I->isMetaInstruction())
           ++Skip;
       }
       if (Skip < 6) {
         MachineOperand &Op = LastDelayAlu->getOperand(0);
         unsigned LastImm = Op.getImm();
         assert((LastImm & ~0xf) == 0 &&
                "Remembered an s_delay_alu with no room for another delay!");
         LastImm |= Imm << 7 | Skip << 4;
         Op.setImm(LastImm);
         return nullptr;
       }
     }

     auto &MBB = *MI.getParent();
     MachineInstr *DelayAlu =
         BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
     // Remember the s_delay_alu for next time if there is still room in it to
     // encode another delay.
     return (Imm & 0x780) ? nullptr : DelayAlu;
   }

   bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
     DelayState State;
     for (auto *Pred : MBB.predecessors())
       State.merge(BlockState[Pred]);

     LLVM_DEBUG(dbgs() << "  State at start of " << printMBBReference(MBB)
                       << "\n";
                State.dump(TRI););

     bool Changed = false;
     MachineInstr *LastDelayAlu = nullptr;

     // Iterate over the contents of bundles, but don't emit any instructions
     // inside a bundle.
     for (auto &MI : MBB.instrs()) {
       if (MI.isBundle() || MI.isMetaInstruction())
         continue;

       // Ignore some more instructions that do not generate any code.
       switch (MI.getOpcode()) {
       case AMDGPU::SI_RETURN_TO_EPILOG:
         continue;
       }

       DelayType Type = getDelayType(MI.getDesc().TSFlags);

       if (instructionWaitsForVALU(MI)) {
         // Forget about all outstanding VALU delays.
         State = DelayState();
       } else if (Type != OTHER) {
         DelayInfo Delay;
         // TODO: Scan implicit uses too?
         for (const auto &Op : MI.explicit_uses()) {
           if (Op.isReg()) {
             // One of the operands of the writelane is also the output operand.
             // This creates the insertion of redundant delays. Hence, we have to
             // ignore this operand.
             if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
               continue;
             for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
               auto It = State.find(*UI);
               if (It != State.end()) {
                 Delay.merge(It->second);
                 State.erase(*UI);
               }
             }
           }
         }
         if (Emit && !MI.isBundledWithPred()) {
           // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
           // just ignore them?
           LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
         }
       }

       if (Type != OTHER) {
         // TODO: Scan implicit defs too?
         for (const auto &Op : MI.defs()) {
           unsigned Latency = SchedModel.computeOperandLatency(
               &MI, MI.getOperandNo(&Op), nullptr, 0);
           for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
             State[*UI] = DelayInfo(Type, Latency);
         }
       }

       // Advance by the number of cycles it takes to issue this instruction.
       // TODO: Use a more advanced model that accounts for instructions that
       // take multiple cycles to issue on a particular pipeline.
       unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
       // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
       // instructions on the assumption that they will usually have to be issued
       // twice?
       State.advance(Type, Cycles);

       LLVM_DEBUG(dbgs() << "  State after " << MI; State.dump(TRI););
     }

     if (Emit) {
       assert(State == BlockState[&MBB] &&
              "Basic block state should not have changed on final pass!");
     } else if (State != BlockState[&MBB]) {
       BlockState[&MBB] = std::move(State);
       Changed = true;
     }
     return Changed;
   }

   bool runOnMachineFunction(MachineFunction &MF) override {
     if (skipFunction(MF.getFunction()))
       return false;

     LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
                       << "\n");

     const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
     if (!ST.hasDelayAlu())
       return false;

     SII = ST.getInstrInfo();
     TRI = ST.getRegisterInfo();

     SchedModel.init(&ST);

     // Calculate the delay state for each basic block, iterating until we reach
     // a fixed point.
     SetVector<MachineBasicBlock *> WorkList;
     for (auto &MBB : reverse(MF))
       WorkList.insert(&MBB);
     while (!WorkList.empty()) {
       auto &MBB = *WorkList.pop_back_val();
       bool Changed = runOnMachineBasicBlock(MBB, false);
       if (Changed)
         WorkList.insert(MBB.succ_begin(), MBB.succ_end());
     }

     LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");

     // Make one last pass over all basic blocks to emit s_delay_alu
     // instructions.
     bool Changed = false;
     for (auto &MBB : MF)
       Changed |= runOnMachineBasicBlock(MBB, true);
     return Changed;
   }
 };

 } // namespace

 char AMDGPUInsertDelayAlu::ID = 0;

 char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;

 INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
                 false, false)
	//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	/// \file
	/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
	//
	//===----------------------------------------------------------------------===//

	#include "AMDGPU.h"
	#include "GCNSubtarget.h"
	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
	#include "SIInstrInfo.h"
	#include "llvm/ADT/SetVector.h"

	using namespace llvm;

	#define DEBUG_TYPE "amdgpu-insert-delay-alu"

	namespace {

	class AMDGPUInsertDelayAlu : public MachineFunctionPass {
	public:
	static char ID;

	const SIInstrInfo *SII;
	const TargetRegisterInfo *TRI;

	TargetSchedModel SchedModel;

	AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}

	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.setPreservesCFG();
	MachineFunctionPass::getAnalysisUsage(AU);
	}

	// Return true if MI waits for all outstanding VALU instructions to complete.
	static bool instructionWaitsForVALU(const MachineInstr &MI) {
	// These instruction types wait for VA_VDST==0 before issuing.
	const uint64_t VA_VDST_0 = SIInstrFlags::DS \| SIInstrFlags::EXP \|
	SIInstrFlags::FLAT \| SIInstrFlags::MIMG \|
	SIInstrFlags::MTBUF \| SIInstrFlags::MUBUF;
	if (MI.getDesc().TSFlags & VA_VDST_0)
	return true;
	if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 \|\|
	MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
	return true;
	if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
	(MI.getOperand(0).getImm() & 0xf000) == 0)
	return true;
	return false;
	}

	// Types of delay that can be encoded in an s_delay_alu instruction.
	enum DelayType { VALU, TRANS, SALU, OTHER };

	// Get the delay type for an instruction with the specified TSFlags.
	static DelayType getDelayType(uint64_t TSFlags) {
	if (TSFlags & SIInstrFlags::TRANS)
	return TRANS;
	if (TSFlags & SIInstrFlags::VALU)
	return VALU;
	if (TSFlags & SIInstrFlags::SALU)
	return SALU;
	return OTHER;
	}

	// Information about the last instruction(s) that wrote to a particular
	// regunit. In straight-line code there will only be one such instruction, but
	// when control flow converges we merge the delay information from each path
	// to represent the union of the worst-case delays of each type.
	struct DelayInfo {
	// One larger than the maximum number of (non-TRANS) VALU instructions we
	// can encode in an s_delay_alu instruction.
	static const unsigned VALU_MAX = 5;

	// One larger than the maximum number of TRANS instructions we can encode in
	// an s_delay_alu instruction.
	static const unsigned TRANS_MAX = 4;

	// If it was written by a (non-TRANS) VALU, remember how many clock cycles
	// are left until it completes, and how many other (non-TRANS) VALU we have
	// seen since it was issued.
	uint8_t VALUCycles = 0;
	uint8_t VALUNum = VALU_MAX;

	// If it was written by a TRANS, remember how many clock cycles are left
	// until it completes, and how many other TRANS we have seen since it was
	// issued.
	uint8_t TRANSCycles = 0;
	uint8_t TRANSNum = TRANS_MAX;
	// Also remember how many other (non-TRANS) VALU we have seen since it was
	// issued. When an instruction depends on both a prior TRANS and a prior
	// non-TRANS VALU, this is used to decide whether to encode a wait for just
	// one or both of them.
	uint8_t TRANSNumVALU = VALU_MAX;

	// If it was written by an SALU, remember how many clock cycles are left
	// until it completes.
	uint8_t SALUCycles = 0;

	DelayInfo() = default;

	DelayInfo(DelayType Type, unsigned Cycles) {
	switch (Type) {
	default:
	llvm_unreachable("unexpected type");
	case VALU:
	VALUCycles = Cycles;
	VALUNum = 0;
	break;
	case TRANS:
	TRANSCycles = Cycles;
	TRANSNum = 0;
	TRANSNumVALU = 0;
	break;
	case SALU:
	SALUCycles = Cycles;
	break;
	}
	}

	bool operator==(const DelayInfo &RHS) const {
	return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
	TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
	TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
	}

	bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }

	// Merge another DelayInfo into this one, to represent the union of the
	// worst-case delays of each type.
	void merge(const DelayInfo &RHS) {
	VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
	VALUNum = std::min(VALUNum, RHS.VALUNum);
	TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
	TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
	TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
	SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
	}

	// Update this DelayInfo after issuing an instruction. IsVALU should be 1
	// when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
	// a TRANS, else 0. Cycles is the number of cycles it takes to issue the
	// instruction. Return true if there is no longer any useful delay info.
	bool advance(DelayType Type, unsigned Cycles) {
	bool Erase = true;

	VALUNum += (Type == VALU);
	if (VALUNum >= VALU_MAX \|\| VALUCycles <= Cycles) {
	// Forget about the VALU instruction. It was too far back or has
	// definitely completed by now.
	VALUNum = VALU_MAX;
	VALUCycles = 0;
	} else {
	VALUCycles -= Cycles;
	Erase = false;
	}

	TRANSNum += (Type == TRANS);
	TRANSNumVALU += (Type == VALU);
	if (TRANSNum >= TRANS_MAX \|\| TRANSCycles <= Cycles) {
	// Forget about any TRANS instruction. It was too far back or has
	// definitely completed by now.
	TRANSNum = TRANS_MAX;
	TRANSNumVALU = VALU_MAX;
	TRANSCycles = 0;
	} else {
	TRANSCycles -= Cycles;
	Erase = false;
	}

	if (SALUCycles <= Cycles) {
	// Forget about any SALU instruction. It has definitely completed by
	// now.
	SALUCycles = 0;
	} else {
	SALUCycles -= Cycles;
	Erase = false;
	}

	return Erase;
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	void dump() const {
	if (VALUCycles)
	dbgs() << " VALUCycles=" << (int)VALUCycles;
	if (VALUNum < VALU_MAX)
	dbgs() << " VALUNum=" << (int)VALUNum;
	if (TRANSCycles)
	dbgs() << " TRANSCycles=" << (int)TRANSCycles;
	if (TRANSNum < TRANS_MAX)
	dbgs() << " TRANSNum=" << (int)TRANSNum;
	if (TRANSNumVALU < VALU_MAX)
	dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
	if (SALUCycles)
	dbgs() << " SALUCycles=" << (int)SALUCycles;
	}
	#endif
	};

	// A map from regunits to the delay info for that regunit.
	struct DelayState : DenseMap<unsigned, DelayInfo> {
	// Merge another DelayState into this one by merging the delay info for each
	// regunit.
	void merge(const DelayState &RHS) {
	for (const auto &KV : RHS) {
	iterator It;
	bool Inserted;
	std::tie(It, Inserted) = insert(KV);
	if (!Inserted)
	It->second.merge(KV.second);
	}
	}

	// Advance the delay info for each regunit, erasing any that are no longer
	// useful.
	void advance(DelayType Type, unsigned Cycles) {
	iterator Next;
	for (auto I = begin(), E = end(); I != E; I = Next) {
	Next = std::next(I);
	if (I->second.advance(Type, Cycles))
	erase(I);
	}
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	void dump(const TargetRegisterInfo *TRI) const {
	if (empty()) {
	dbgs() << " empty\n";
	return;
	}

	// Dump DelayInfo for each RegUnit in numerical order.
	SmallVector<const_iterator, 8> Order;
	Order.reserve(size());
	for (const_iterator I = begin(), E = end(); I != E; ++I)
	Order.push_back(I);
	llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
	return A->first < B->first;
	});
	for (const_iterator I : Order) {
	dbgs() << " " << printRegUnit(I->first, TRI);
	I->second.dump();
	dbgs() << "\n";
	}
	}
	#endif
	};

	// The saved delay state at the end of each basic block.
	DenseMap<MachineBasicBlock *, DelayState> BlockState;

	// Emit an s_delay_alu instruction if necessary before MI.
	MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
	MachineInstr *LastDelayAlu) {
	unsigned Imm = 0;

	// Wait for a TRANS instruction.
	if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
	Imm \|= 4 + Delay.TRANSNum;

	// Wait for a VALU instruction (if it's more recent than any TRANS
	// instruction that we're also waiting for).
	if (Delay.VALUNum < DelayInfo::VALU_MAX &&
	Delay.VALUNum <= Delay.TRANSNumVALU) {
	if (Imm & 0xf)
	Imm \|= Delay.VALUNum << 7;
	else
	Imm \|= Delay.VALUNum;
	}

	// Wait for an SALU instruction.
	if (Delay.SALUCycles) {
	if (Imm & 0x780) {
	// We have already encoded a VALU and a TRANS delay. There's no room in
	// the encoding for an SALU delay as well, so just drop it.
	} else if (Imm & 0xf) {
	Imm \|= (Delay.SALUCycles + 8) << 7;
	} else {
	Imm \|= Delay.SALUCycles + 8;
	}
	}

	// Don't emit the s_delay_alu instruction if there's nothing to wait for.
	if (!Imm)
	return LastDelayAlu;

	// If we only need to wait for one instruction, try encoding it in the last
	// s_delay_alu that we emitted.
	if (!(Imm & 0x780) && LastDelayAlu) {
	unsigned Skip = 0;
	for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
	E = MachineBasicBlock::instr_iterator(MI);
	++I != E;) {
	if (!I->isBundle() && !I->isMetaInstruction())
	++Skip;
	}
	if (Skip < 6) {
	MachineOperand &Op = LastDelayAlu->getOperand(0);
	unsigned LastImm = Op.getImm();
	assert((LastImm & ~0xf) == 0 &&
	"Remembered an s_delay_alu with no room for another delay!");
	LastImm \|= Imm << 7 \| Skip << 4;
	Op.setImm(LastImm);
	return nullptr;
	}
	}

	auto &MBB = *MI.getParent();
	MachineInstr *DelayAlu =
	BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
	// Remember the s_delay_alu for next time if there is still room in it to
	// encode another delay.
	return (Imm & 0x780) ? nullptr : DelayAlu;
	}

	bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
	DelayState State;
	for (auto *Pred : MBB.predecessors())
	State.merge(BlockState[Pred]);

	LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
	<< "\n";
	State.dump(TRI););

	bool Changed = false;
	MachineInstr *LastDelayAlu = nullptr;

	// Iterate over the contents of bundles, but don't emit any instructions
	// inside a bundle.
	for (auto &MI : MBB.instrs()) {
	if (MI.isBundle() \|\| MI.isMetaInstruction())
	continue;

	// Ignore some more instructions that do not generate any code.
	switch (MI.getOpcode()) {
	case AMDGPU::SI_RETURN_TO_EPILOG:
	continue;
	}

	DelayType Type = getDelayType(MI.getDesc().TSFlags);

	if (instructionWaitsForVALU(MI)) {
	// Forget about all outstanding VALU delays.
	State = DelayState();
	} else if (Type != OTHER) {
	DelayInfo Delay;
	// TODO: Scan implicit uses too?
	for (const auto &Op : MI.explicit_uses()) {
	if (Op.isReg()) {
	// One of the operands of the writelane is also the output operand.
	// This creates the insertion of redundant delays. Hence, we have to
	// ignore this operand.
	if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
	continue;
	for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
	auto It = State.find(*UI);
	if (It != State.end()) {
	Delay.merge(It->second);
	State.erase(*UI);
	}
	}
	}
	}
	if (Emit && !MI.isBundledWithPred()) {
	// TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
	// just ignore them?
	LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
	}
	}

	if (Type != OTHER) {
	// TODO: Scan implicit defs too?
	for (const auto &Op : MI.defs()) {
	unsigned Latency = SchedModel.computeOperandLatency(
	&MI, MI.getOperandNo(&Op), nullptr, 0);
	for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
	State[*UI] = DelayInfo(Type, Latency);
	}
	}

	// Advance by the number of cycles it takes to issue this instruction.
	// TODO: Use a more advanced model that accounts for instructions that
	// take multiple cycles to issue on a particular pipeline.
	unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
	// TODO: In wave64 mode, double the number of cycles for VALU and VMEM
	// instructions on the assumption that they will usually have to be issued
	// twice?
	State.advance(Type, Cycles);

	LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
	}

	if (Emit) {
	assert(State == BlockState[&MBB] &&
	"Basic block state should not have changed on final pass!");
	} else if (State != BlockState[&MBB]) {
	BlockState[&MBB] = std::move(State);
	Changed = true;
	}
	return Changed;
	}

	bool runOnMachineFunction(MachineFunction &MF) override {
	if (skipFunction(MF.getFunction()))
	return false;

	LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
	<< "\n");

	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
	if (!ST.hasDelayAlu())
	return false;

	SII = ST.getInstrInfo();
	TRI = ST.getRegisterInfo();

	SchedModel.init(&ST);

	// Calculate the delay state for each basic block, iterating until we reach
	// a fixed point.
	SetVector<MachineBasicBlock *> WorkList;
	for (auto &MBB : reverse(MF))
	WorkList.insert(&MBB);
	while (!WorkList.empty()) {
	auto &MBB = *WorkList.pop_back_val();
	bool Changed = runOnMachineBasicBlock(MBB, false);
	if (Changed)
	WorkList.insert(MBB.succ_begin(), MBB.succ_end());
	}

	LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");

	// Make one last pass over all basic blocks to emit s_delay_alu
	// instructions.
	bool Changed = false;
	for (auto &MBB : MF)
	Changed \|= runOnMachineBasicBlock(MBB, true);
	return Changed;
	}
	};

	} // namespace

	char AMDGPUInsertDelayAlu::ID = 0;

	char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;

	INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
	false, false)