blob: c9cdbc89f3a41f043dd2c7686a7e362db5fa30ae [file] [log] [blame]
//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-insert-delay-alu"
namespace {
class AMDGPUInsertDelayAlu : public MachineFunctionPass {
public:
static char ID;
const SIInstrInfo *SII;
const TargetRegisterInfo *TRI;
TargetSchedModel SchedModel;
AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
// Return true if MI waits for all outstanding VALU instructions to complete.
static bool instructionWaitsForVALU(const MachineInstr &MI) {
// These instruction types wait for VA_VDST==0 before issuing.
const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
SIInstrFlags::FLAT | SIInstrFlags::MIMG |
SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
if (MI.getDesc().TSFlags & VA_VDST_0)
return true;
if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
return true;
if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
(MI.getOperand(0).getImm() & 0xf000) == 0)
return true;
return false;
}
// Types of delay that can be encoded in an s_delay_alu instruction.
enum DelayType { VALU, TRANS, SALU, OTHER };
// Get the delay type for an instruction with the specified TSFlags.
static DelayType getDelayType(uint64_t TSFlags) {
if (TSFlags & SIInstrFlags::TRANS)
return TRANS;
if (TSFlags & SIInstrFlags::VALU)
return VALU;
if (TSFlags & SIInstrFlags::SALU)
return SALU;
return OTHER;
}
// Information about the last instruction(s) that wrote to a particular
// regunit. In straight-line code there will only be one such instruction, but
// when control flow converges we merge the delay information from each path
// to represent the union of the worst-case delays of each type.
struct DelayInfo {
// One larger than the maximum number of (non-TRANS) VALU instructions we
// can encode in an s_delay_alu instruction.
static const unsigned VALU_MAX = 5;
// One larger than the maximum number of TRANS instructions we can encode in
// an s_delay_alu instruction.
static const unsigned TRANS_MAX = 4;
// If it was written by a (non-TRANS) VALU, remember how many clock cycles
// are left until it completes, and how many other (non-TRANS) VALU we have
// seen since it was issued.
uint8_t VALUCycles = 0;
uint8_t VALUNum = VALU_MAX;
// If it was written by a TRANS, remember how many clock cycles are left
// until it completes, and how many other TRANS we have seen since it was
// issued.
uint8_t TRANSCycles = 0;
uint8_t TRANSNum = TRANS_MAX;
// Also remember how many other (non-TRANS) VALU we have seen since it was
// issued. When an instruction depends on both a prior TRANS and a prior
// non-TRANS VALU, this is used to decide whether to encode a wait for just
// one or both of them.
uint8_t TRANSNumVALU = VALU_MAX;
// If it was written by an SALU, remember how many clock cycles are left
// until it completes.
uint8_t SALUCycles = 0;
DelayInfo() = default;
DelayInfo(DelayType Type, unsigned Cycles) {
switch (Type) {
default:
llvm_unreachable("unexpected type");
case VALU:
VALUCycles = Cycles;
VALUNum = 0;
break;
case TRANS:
TRANSCycles = Cycles;
TRANSNum = 0;
TRANSNumVALU = 0;
break;
case SALU:
SALUCycles = Cycles;
break;
}
}
bool operator==(const DelayInfo &RHS) const {
return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
}
bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
// Merge another DelayInfo into this one, to represent the union of the
// worst-case delays of each type.
void merge(const DelayInfo &RHS) {
VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
VALUNum = std::min(VALUNum, RHS.VALUNum);
TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
}
// Update this DelayInfo after issuing an instruction. IsVALU should be 1
// when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
// a TRANS, else 0. Cycles is the number of cycles it takes to issue the
// instruction. Return true if there is no longer any useful delay info.
bool advance(DelayType Type, unsigned Cycles) {
bool Erase = true;
VALUNum += (Type == VALU);
if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
// Forget about the VALU instruction. It was too far back or has
// definitely completed by now.
VALUNum = VALU_MAX;
VALUCycles = 0;
} else {
VALUCycles -= Cycles;
Erase = false;
}
TRANSNum += (Type == TRANS);
TRANSNumVALU += (Type == VALU);
if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
// Forget about any TRANS instruction. It was too far back or has
// definitely completed by now.
TRANSNum = TRANS_MAX;
TRANSNumVALU = VALU_MAX;
TRANSCycles = 0;
} else {
TRANSCycles -= Cycles;
Erase = false;
}
if (SALUCycles <= Cycles) {
// Forget about any SALU instruction. It has definitely completed by
// now.
SALUCycles = 0;
} else {
SALUCycles -= Cycles;
Erase = false;
}
return Erase;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump() const {
if (VALUCycles)
dbgs() << " VALUCycles=" << (int)VALUCycles;
if (VALUNum < VALU_MAX)
dbgs() << " VALUNum=" << (int)VALUNum;
if (TRANSCycles)
dbgs() << " TRANSCycles=" << (int)TRANSCycles;
if (TRANSNum < TRANS_MAX)
dbgs() << " TRANSNum=" << (int)TRANSNum;
if (TRANSNumVALU < VALU_MAX)
dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
if (SALUCycles)
dbgs() << " SALUCycles=" << (int)SALUCycles;
}
#endif
};
// A map from regunits to the delay info for that regunit.
struct DelayState : DenseMap<unsigned, DelayInfo> {
// Merge another DelayState into this one by merging the delay info for each
// regunit.
void merge(const DelayState &RHS) {
for (const auto &KV : RHS) {
iterator It;
bool Inserted;
std::tie(It, Inserted) = insert(KV);
if (!Inserted)
It->second.merge(KV.second);
}
}
// Advance the delay info for each regunit, erasing any that are no longer
// useful.
void advance(DelayType Type, unsigned Cycles) {
iterator Next;
for (auto I = begin(), E = end(); I != E; I = Next) {
Next = std::next(I);
if (I->second.advance(Type, Cycles))
erase(I);
}
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump(const TargetRegisterInfo *TRI) const {
if (empty()) {
dbgs() << " empty\n";
return;
}
// Dump DelayInfo for each RegUnit in numerical order.
SmallVector<const_iterator, 8> Order;
Order.reserve(size());
for (const_iterator I = begin(), E = end(); I != E; ++I)
Order.push_back(I);
llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
return A->first < B->first;
});
for (const_iterator I : Order) {
dbgs() << " " << printRegUnit(I->first, TRI);
I->second.dump();
dbgs() << "\n";
}
}
#endif
};
// The saved delay state at the end of each basic block.
DenseMap<MachineBasicBlock *, DelayState> BlockState;
// Emit an s_delay_alu instruction if necessary before MI.
MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
MachineInstr *LastDelayAlu) {
unsigned Imm = 0;
// Wait for a TRANS instruction.
if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
Imm |= 4 + Delay.TRANSNum;
// Wait for a VALU instruction (if it's more recent than any TRANS
// instruction that we're also waiting for).
if (Delay.VALUNum < DelayInfo::VALU_MAX &&
Delay.VALUNum <= Delay.TRANSNumVALU) {
if (Imm & 0xf)
Imm |= Delay.VALUNum << 7;
else
Imm |= Delay.VALUNum;
}
// Wait for an SALU instruction.
if (Delay.SALUCycles) {
if (Imm & 0x780) {
// We have already encoded a VALU and a TRANS delay. There's no room in
// the encoding for an SALU delay as well, so just drop it.
} else if (Imm & 0xf) {
Imm |= (Delay.SALUCycles + 8) << 7;
} else {
Imm |= Delay.SALUCycles + 8;
}
}
// Don't emit the s_delay_alu instruction if there's nothing to wait for.
if (!Imm)
return LastDelayAlu;
// If we only need to wait for one instruction, try encoding it in the last
// s_delay_alu that we emitted.
if (!(Imm & 0x780) && LastDelayAlu) {
unsigned Skip = 0;
for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
E = MachineBasicBlock::instr_iterator(MI);
++I != E;) {
if (!I->isBundle() && !I->isMetaInstruction())
++Skip;
}
if (Skip < 6) {
MachineOperand &Op = LastDelayAlu->getOperand(0);
unsigned LastImm = Op.getImm();
assert((LastImm & ~0xf) == 0 &&
"Remembered an s_delay_alu with no room for another delay!");
LastImm |= Imm << 7 | Skip << 4;
Op.setImm(LastImm);
return nullptr;
}
}
auto &MBB = *MI.getParent();
MachineInstr *DelayAlu =
BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
// Remember the s_delay_alu for next time if there is still room in it to
// encode another delay.
return (Imm & 0x780) ? nullptr : DelayAlu;
}
bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
DelayState State;
for (auto *Pred : MBB.predecessors())
State.merge(BlockState[Pred]);
LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
<< "\n";
State.dump(TRI););
bool Changed = false;
MachineInstr *LastDelayAlu = nullptr;
// Iterate over the contents of bundles, but don't emit any instructions
// inside a bundle.
for (auto &MI : MBB.instrs()) {
if (MI.isBundle() || MI.isMetaInstruction())
continue;
// Ignore some more instructions that do not generate any code.
switch (MI.getOpcode()) {
case AMDGPU::SI_RETURN_TO_EPILOG:
continue;
}
DelayType Type = getDelayType(MI.getDesc().TSFlags);
if (instructionWaitsForVALU(MI)) {
// Forget about all outstanding VALU delays.
State = DelayState();
} else if (Type != OTHER) {
DelayInfo Delay;
// TODO: Scan implicit uses too?
for (const auto &Op : MI.explicit_uses()) {
if (Op.isReg()) {
// One of the operands of the writelane is also the output operand.
// This creates the insertion of redundant delays. Hence, we have to
// ignore this operand.
if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
continue;
for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
auto It = State.find(*UI);
if (It != State.end()) {
Delay.merge(It->second);
State.erase(*UI);
}
}
}
}
if (Emit && !MI.isBundledWithPred()) {
// TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
// just ignore them?
LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
}
}
if (Type != OTHER) {
// TODO: Scan implicit defs too?
for (const auto &Op : MI.defs()) {
unsigned Latency = SchedModel.computeOperandLatency(
&MI, MI.getOperandNo(&Op), nullptr, 0);
for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
State[*UI] = DelayInfo(Type, Latency);
}
}
// Advance by the number of cycles it takes to issue this instruction.
// TODO: Use a more advanced model that accounts for instructions that
// take multiple cycles to issue on a particular pipeline.
unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
// TODO: In wave64 mode, double the number of cycles for VALU and VMEM
// instructions on the assumption that they will usually have to be issued
// twice?
State.advance(Type, Cycles);
LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
}
if (Emit) {
assert(State == BlockState[&MBB] &&
"Basic block state should not have changed on final pass!");
} else if (State != BlockState[&MBB]) {
BlockState[&MBB] = std::move(State);
Changed = true;
}
return Changed;
}
bool runOnMachineFunction(MachineFunction &MF) override {
if (skipFunction(MF.getFunction()))
return false;
LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
<< "\n");
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (!ST.hasDelayAlu())
return false;
SII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
SchedModel.init(&ST);
// Calculate the delay state for each basic block, iterating until we reach
// a fixed point.
SetVector<MachineBasicBlock *> WorkList;
for (auto &MBB : reverse(MF))
WorkList.insert(&MBB);
while (!WorkList.empty()) {
auto &MBB = *WorkList.pop_back_val();
bool Changed = runOnMachineBasicBlock(MBB, false);
if (Changed)
WorkList.insert(MBB.succ_begin(), MBB.succ_end());
}
LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
// Make one last pass over all basic blocks to emit s_delay_alu
// instructions.
bool Changed = false;
for (auto &MBB : MF)
Changed |= runOnMachineBasicBlock(MBB, true);
return Changed;
}
};
} // namespace
char AMDGPUInsertDelayAlu::ID = 0;
char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
false, false)