third_party/llvm-16.0/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp - SwiftShader - Git at Google

 //===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 /// \file This pass does attempts to make use of reqd_work_group_size metadata
 /// to eliminate loads from the dispatch packet and to constant fold OpenCL
 /// get_local_size-like functions.
 //
 //===----------------------------------------------------------------------===//

 #include "AMDGPU.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"

 #define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

 using namespace llvm;

 namespace {

 // Field offsets in hsa_kernel_dispatch_packet_t.
 enum DispatchPackedOffsets {
   WORKGROUP_SIZE_X = 4,
   WORKGROUP_SIZE_Y = 6,
   WORKGROUP_SIZE_Z = 8,

   GRID_SIZE_X = 12,
   GRID_SIZE_Y = 16,
   GRID_SIZE_Z = 20
 };

 // Field offsets to implicit kernel argument pointer.
 enum ImplicitArgOffsets {
   HIDDEN_BLOCK_COUNT_X = 0,
   HIDDEN_BLOCK_COUNT_Y = 4,
   HIDDEN_BLOCK_COUNT_Z = 8,

   HIDDEN_GROUP_SIZE_X = 12,
   HIDDEN_GROUP_SIZE_Y = 14,
   HIDDEN_GROUP_SIZE_Z = 16,

   HIDDEN_REMAINDER_X = 18,
   HIDDEN_REMAINDER_Y = 20,
   HIDDEN_REMAINDER_Z = 22,
 };

 class AMDGPULowerKernelAttributes : public ModulePass {
 public:
   static char ID;

   AMDGPULowerKernelAttributes() : ModulePass(ID) {}

   bool runOnModule(Module &M) override;

   StringRef getPassName() const override {
     return "AMDGPU Kernel Attributes";
   }

   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
  }
 };

 Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
   auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                  : Intrinsic::amdgcn_dispatch_ptr;
   StringRef Name = Intrinsic::getName(IntrinsicId);
   return M.getFunction(Name);
 }

 } // end anonymous namespace

 static bool processUse(CallInst *CI, bool IsV5OrAbove) {
   Function *F = CI->getParent()->getParent();

   auto MD = F->getMetadata("reqd_work_group_size");
   const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

   const bool HasUniformWorkGroupSize =
     F->getFnAttribute("uniform-work-group-size").getValueAsBool();

   if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
     return false;

   Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
   Value *GroupSizes[3]  = {nullptr, nullptr, nullptr};
   Value *Remainders[3]  = {nullptr, nullptr, nullptr};
   Value *GridSizes[3]   = {nullptr, nullptr, nullptr};

   const DataLayout &DL = F->getParent()->getDataLayout();

   // We expect to see several GEP users, casted to the appropriate type and
   // loaded.
   for (User *U : CI->users()) {
     if (!U->hasOneUse())
       continue;

     int64_t Offset = 0;
     auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
     auto *BCI = dyn_cast<BitCastInst>(U);
     if (!Load && !BCI) {
       if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
         continue;
       Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
       BCI = dyn_cast<BitCastInst>(*U->user_begin());
     }

     if (BCI) {
       if (!BCI->hasOneUse())
         continue;
       Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
     }

     if (!Load || !Load->isSimple())
       continue;

     unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

     // TODO: Handle merged loads.
     if (IsV5OrAbove) { // Base is ImplicitArgPtr.
       switch (Offset) {
       case HIDDEN_BLOCK_COUNT_X:
         if (LoadSize == 4)
           BlockCounts[0] = Load;
         break;
       case HIDDEN_BLOCK_COUNT_Y:
         if (LoadSize == 4)
           BlockCounts[1] = Load;
         break;
       case HIDDEN_BLOCK_COUNT_Z:
         if (LoadSize == 4)
           BlockCounts[2] = Load;
         break;
       case HIDDEN_GROUP_SIZE_X:
         if (LoadSize == 2)
           GroupSizes[0] = Load;
         break;
       case HIDDEN_GROUP_SIZE_Y:
         if (LoadSize == 2)
           GroupSizes[1] = Load;
         break;
       case HIDDEN_GROUP_SIZE_Z:
         if (LoadSize == 2)
           GroupSizes[2] = Load;
         break;
       case HIDDEN_REMAINDER_X:
         if (LoadSize == 2)
           Remainders[0] = Load;
         break;
       case HIDDEN_REMAINDER_Y:
         if (LoadSize == 2)
           Remainders[1] = Load;
         break;
       case HIDDEN_REMAINDER_Z:
         if (LoadSize == 2)
           Remainders[2] = Load;
         break;
       default:
         break;
       }
     } else { // Base is DispatchPtr.
       switch (Offset) {
       case WORKGROUP_SIZE_X:
         if (LoadSize == 2)
           GroupSizes[0] = Load;
         break;
       case WORKGROUP_SIZE_Y:
         if (LoadSize == 2)
           GroupSizes[1] = Load;
         break;
       case WORKGROUP_SIZE_Z:
         if (LoadSize == 2)
           GroupSizes[2] = Load;
         break;
       case GRID_SIZE_X:
         if (LoadSize == 4)
           GridSizes[0] = Load;
         break;
       case GRID_SIZE_Y:
         if (LoadSize == 4)
           GridSizes[1] = Load;
         break;
       case GRID_SIZE_Z:
         if (LoadSize == 4)
           GridSizes[2] = Load;
         break;
       default:
         break;
       }
     }
   }

   bool MadeChange = false;
   if (IsV5OrAbove && HasUniformWorkGroupSize) {
     // Under v5  __ockl_get_local_size returns the value computed by the expression:
     //
     //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
     //
     // For functions with the attribute uniform-work-group-size=true. we can evaluate
     // workgroup_id < hidden_block_count as true, and thus hidden_group_size is returned
     // for __ockl_get_local_size.
     for (int I = 0; I < 3; ++I) {
       Value *BlockCount = BlockCounts[I];
       if (!BlockCount)
         continue;

       using namespace llvm::PatternMatch;
       auto GroupIDIntrin =
           I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                  : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                            : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

       for (User *ICmp : BlockCount->users()) {
         ICmpInst::Predicate Pred;
         if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
           if (Pred != ICmpInst::ICMP_ULT)
             continue;
           ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
           MadeChange = true;
         }
       }
     }

     // All remainders should be 0 with uniform work group size.
     for (Value *Remainder : Remainders) {
       if (!Remainder)
         continue;
       Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
       MadeChange = true;
     }
   } else if (HasUniformWorkGroupSize) { // Pre-V5.
     // Pattern match the code used to handle partial workgroup dispatches in the
     // library implementation of get_local_size, so the entire function can be
     // constant folded with a known group size.
     //
     // uint r = grid_size - group_id * group_size;
     // get_local_size = (r < group_size) ? r : group_size;
     //
     // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
     // the grid_size is required to be a multiple of group_size). In this case:
     //
     // grid_size - (group_id * group_size) < group_size
     // ->
     // grid_size < group_size + (group_id * group_size)
     //
     // (grid_size / group_size) < 1 + group_id
     //
     // grid_size / group_size is at least 1, so we can conclude the select
     // condition is false (except for group_id == 0, where the select result is
     // the same).
     for (int I = 0; I < 3; ++I) {
       Value *GroupSize = GroupSizes[I];
       Value *GridSize = GridSizes[I];
       if (!GroupSize || !GridSize)
         continue;

       using namespace llvm::PatternMatch;
       auto GroupIDIntrin =
           I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                  : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                            : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

       for (User *U : GroupSize->users()) {
         auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
         if (!ZextGroupSize)
           continue;

         for (User *UMin : ZextGroupSize->users()) {
           if (match(UMin,
                     m_UMin(m_Sub(m_Specific(GridSize),
                                  m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
                            m_Specific(ZextGroupSize)))) {
             if (HasReqdWorkGroupSize) {
               ConstantInt *KnownSize
                 = mdconst::extract<ConstantInt>(MD->getOperand(I));
               UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast(
                   KnownSize, UMin->getType(), false));
             } else {
               UMin->replaceAllUsesWith(ZextGroupSize);
             }

             MadeChange = true;
           }
         }
       }
     }
   }

   // If reqd_work_group_size is set, we can replace work group size with it.
   if (!HasReqdWorkGroupSize)
     return MadeChange;

   for (int I = 0; I < 3; I++) {
     Value *GroupSize = GroupSizes[I];
     if (!GroupSize)
       continue;

     ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
     GroupSize->replaceAllUsesWith(
         ConstantExpr::getIntegerCast(KnownSize, GroupSize->getType(), false));
     MadeChange = true;
   }

   return MadeChange;
 }


 // TODO: Move makeLIDRangeMetadata usage into here. Seem to not get
 // TargetPassConfig for subtarget.
 bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
   bool MadeChange = false;
   bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5;
   Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

   if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
     return false;

   SmallPtrSet<Instruction *, 4> HandledUses;
   for (auto *U : BasePtr->users()) {
     CallInst *CI = cast<CallInst>(U);
     if (HandledUses.insert(CI).second) {
       if (processUse(CI, IsV5OrAbove))
         MadeChange = true;
     }
   }

   return MadeChange;
 }


 INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                       "AMDGPU Kernel Attributes", false, false)
 INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                     "AMDGPU Kernel Attributes", false, false)

 char AMDGPULowerKernelAttributes::ID = 0;

 ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
   return new AMDGPULowerKernelAttributes();
 }

 PreservedAnalyses
 AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
   bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5;
   Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);

   if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
     return PreservedAnalyses::all();

   for (Instruction &I : instructions(F)) {
     if (CallInst *CI = dyn_cast<CallInst>(&I)) {
       if (CI->getCalledFunction() == BasePtr)
         processUse(CI, IsV5OrAbove);
     }
   }

   return PreservedAnalyses::all();
 }
	//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	/// \file This pass does attempts to make use of reqd_work_group_size metadata
	/// to eliminate loads from the dispatch packet and to constant fold OpenCL
	/// get_local_size-like functions.
	//
	//===----------------------------------------------------------------------===//

	#include "AMDGPU.h"
	#include "Utils/AMDGPUBaseInfo.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/CodeGen/Passes.h"
	#include "llvm/CodeGen/TargetPassConfig.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/InstIterator.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicsAMDGPU.h"
	#include "llvm/IR/PatternMatch.h"
	#include "llvm/Pass.h"

	#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

	using namespace llvm;

	namespace {

	// Field offsets in hsa_kernel_dispatch_packet_t.
	enum DispatchPackedOffsets {
	WORKGROUP_SIZE_X = 4,
	WORKGROUP_SIZE_Y = 6,
	WORKGROUP_SIZE_Z = 8,

	GRID_SIZE_X = 12,
	GRID_SIZE_Y = 16,
	GRID_SIZE_Z = 20
	};

	// Field offsets to implicit kernel argument pointer.
	enum ImplicitArgOffsets {
	HIDDEN_BLOCK_COUNT_X = 0,
	HIDDEN_BLOCK_COUNT_Y = 4,
	HIDDEN_BLOCK_COUNT_Z = 8,

	HIDDEN_GROUP_SIZE_X = 12,
	HIDDEN_GROUP_SIZE_Y = 14,
	HIDDEN_GROUP_SIZE_Z = 16,

	HIDDEN_REMAINDER_X = 18,
	HIDDEN_REMAINDER_Y = 20,
	HIDDEN_REMAINDER_Z = 22,
	};

	class AMDGPULowerKernelAttributes : public ModulePass {
	public:
	static char ID;

	AMDGPULowerKernelAttributes() : ModulePass(ID) {}

	bool runOnModule(Module &M) override;

	StringRef getPassName() const override {
	return "AMDGPU Kernel Attributes";
	}

	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.setPreservesAll();
	}
	};

	Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
	auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
	: Intrinsic::amdgcn_dispatch_ptr;
	StringRef Name = Intrinsic::getName(IntrinsicId);
	return M.getFunction(Name);
	}

	} // end anonymous namespace

	static bool processUse(CallInst *CI, bool IsV5OrAbove) {
	Function *F = CI->getParent()->getParent();

	auto MD = F->getMetadata("reqd_work_group_size");
	const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

	const bool HasUniformWorkGroupSize =
	F->getFnAttribute("uniform-work-group-size").getValueAsBool();

	if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
	return false;

	Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
	Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
	Value *Remainders[3] = {nullptr, nullptr, nullptr};
	Value *GridSizes[3] = {nullptr, nullptr, nullptr};

	const DataLayout &DL = F->getParent()->getDataLayout();

	// We expect to see several GEP users, casted to the appropriate type and
	// loaded.
	for (User *U : CI->users()) {
	if (!U->hasOneUse())
	continue;

	int64_t Offset = 0;
	auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
	auto *BCI = dyn_cast<BitCastInst>(U);
	if (!Load && !BCI) {
	if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
	continue;
	Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
	BCI = dyn_cast<BitCastInst>(*U->user_begin());
	}

	if (BCI) {
	if (!BCI->hasOneUse())
	continue;
	Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
	}

	if (!Load \|\| !Load->isSimple())
	continue;

	unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

	// TODO: Handle merged loads.
	if (IsV5OrAbove) { // Base is ImplicitArgPtr.
	switch (Offset) {
	case HIDDEN_BLOCK_COUNT_X:
	if (LoadSize == 4)
	BlockCounts[0] = Load;
	break;
	case HIDDEN_BLOCK_COUNT_Y:
	if (LoadSize == 4)
	BlockCounts[1] = Load;
	break;
	case HIDDEN_BLOCK_COUNT_Z:
	if (LoadSize == 4)
	BlockCounts[2] = Load;
	break;
	case HIDDEN_GROUP_SIZE_X:
	if (LoadSize == 2)
	GroupSizes[0] = Load;
	break;
	case HIDDEN_GROUP_SIZE_Y:
	if (LoadSize == 2)
	GroupSizes[1] = Load;
	break;
	case HIDDEN_GROUP_SIZE_Z:
	if (LoadSize == 2)
	GroupSizes[2] = Load;
	break;
	case HIDDEN_REMAINDER_X:
	if (LoadSize == 2)
	Remainders[0] = Load;
	break;
	case HIDDEN_REMAINDER_Y:
	if (LoadSize == 2)
	Remainders[1] = Load;
	break;
	case HIDDEN_REMAINDER_Z:
	if (LoadSize == 2)
	Remainders[2] = Load;
	break;
	default:
	break;
	}
	} else { // Base is DispatchPtr.
	switch (Offset) {
	case WORKGROUP_SIZE_X:
	if (LoadSize == 2)
	GroupSizes[0] = Load;
	break;
	case WORKGROUP_SIZE_Y:
	if (LoadSize == 2)
	GroupSizes[1] = Load;
	break;
	case WORKGROUP_SIZE_Z:
	if (LoadSize == 2)
	GroupSizes[2] = Load;
	break;
	case GRID_SIZE_X:
	if (LoadSize == 4)
	GridSizes[0] = Load;
	break;
	case GRID_SIZE_Y:
	if (LoadSize == 4)
	GridSizes[1] = Load;
	break;
	case GRID_SIZE_Z:
	if (LoadSize == 4)
	GridSizes[2] = Load;
	break;
	default:
	break;
	}
	}
	}

	bool MadeChange = false;
	if (IsV5OrAbove && HasUniformWorkGroupSize) {
	// Under v5 __ockl_get_local_size returns the value computed by the expression:
	//
	// workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
	//
	// For functions with the attribute uniform-work-group-size=true. we can evaluate
	// workgroup_id < hidden_block_count as true, and thus hidden_group_size is returned
	// for __ockl_get_local_size.
	for (int I = 0; I < 3; ++I) {
	Value *BlockCount = BlockCounts[I];
	if (!BlockCount)
	continue;

	using namespace llvm::PatternMatch;
	auto GroupIDIntrin =
	I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
	: (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
	: m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

	for (User *ICmp : BlockCount->users()) {
	ICmpInst::Predicate Pred;
	if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
	if (Pred != ICmpInst::ICMP_ULT)
	continue;
	ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
	MadeChange = true;
	}
	}
	}

	// All remainders should be 0 with uniform work group size.
	for (Value *Remainder : Remainders) {
	if (!Remainder)
	continue;
	Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
	MadeChange = true;
	}
	} else if (HasUniformWorkGroupSize) { // Pre-V5.
	// Pattern match the code used to handle partial workgroup dispatches in the
	// library implementation of get_local_size, so the entire function can be
	// constant folded with a known group size.
	//
	// uint r = grid_size - group_id * group_size;
	// get_local_size = (r < group_size) ? r : group_size;
	//
	// If we have uniform-work-group-size (which is the default in OpenCL 1.2),
	// the grid_size is required to be a multiple of group_size). In this case:
	//
	// grid_size - (group_id * group_size) < group_size
	// ->
	// grid_size < group_size + (group_id * group_size)
	//
	// (grid_size / group_size) < 1 + group_id
	//
	// grid_size / group_size is at least 1, so we can conclude the select
	// condition is false (except for group_id == 0, where the select result is
	// the same).
	for (int I = 0; I < 3; ++I) {
	Value *GroupSize = GroupSizes[I];
	Value *GridSize = GridSizes[I];
	if (!GroupSize \|\| !GridSize)
	continue;

	using namespace llvm::PatternMatch;
	auto GroupIDIntrin =
	I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
	: (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
	: m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

	for (User *U : GroupSize->users()) {
	auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
	if (!ZextGroupSize)
	continue;

	for (User *UMin : ZextGroupSize->users()) {
	if (match(UMin,
	m_UMin(m_Sub(m_Specific(GridSize),
	m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
	m_Specific(ZextGroupSize)))) {
	if (HasReqdWorkGroupSize) {
	ConstantInt *KnownSize
	= mdconst::extract<ConstantInt>(MD->getOperand(I));
	UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast(
	KnownSize, UMin->getType(), false));
	} else {
	UMin->replaceAllUsesWith(ZextGroupSize);
	}

	MadeChange = true;
	}
	}
	}
	}
	}

	// If reqd_work_group_size is set, we can replace work group size with it.
	if (!HasReqdWorkGroupSize)
	return MadeChange;

	for (int I = 0; I < 3; I++) {
	Value *GroupSize = GroupSizes[I];
	if (!GroupSize)
	continue;

	ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
	GroupSize->replaceAllUsesWith(
	ConstantExpr::getIntegerCast(KnownSize, GroupSize->getType(), false));
	MadeChange = true;
	}

	return MadeChange;
	}


	// TODO: Move makeLIDRangeMetadata usage into here. Seem to not get
	// TargetPassConfig for subtarget.
	bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
	bool MadeChange = false;
	bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5;
	Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

	if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
	return false;

	SmallPtrSet<Instruction *, 4> HandledUses;
	for (auto *U : BasePtr->users()) {
	CallInst *CI = cast<CallInst>(U);
	if (HandledUses.insert(CI).second) {
	if (processUse(CI, IsV5OrAbove))
	MadeChange = true;
	}
	}

	return MadeChange;
	}


	INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
	"AMDGPU Kernel Attributes", false, false)
	INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
	"AMDGPU Kernel Attributes", false, false)

	char AMDGPULowerKernelAttributes::ID = 0;

	ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
	return new AMDGPULowerKernelAttributes();
	}

	PreservedAnalyses
	AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
	bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5;
	Function BasePtr = getBasePtrIntrinsic(F.getParent(), IsV5OrAbove);

	if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
	return PreservedAnalyses::all();

	for (Instruction &I : instructions(F)) {
	if (CallInst *CI = dyn_cast<CallInst>(&I)) {
	if (CI->getCalledFunction() == BasePtr)
	processUse(CI, IsV5OrAbove);
	}
	}

	return PreservedAnalyses::all();
	}