104#include "llvm/IR/IntrinsicsAMDGPU.h"
114#define DEBUG_TYPE "amdgpu-sw-lower-lds"
115#define COV5_HIDDEN_DYN_LDS_SIZE_ARG 15
118using namespace AMDGPU;
123 AsanInstrumentLDS(
"amdgpu-asan-instrument-lds",
124 cl::desc(
"Run asan instrumentation on LDS instructions "
125 "lowered to global memory"),
130struct LDSAccessTypeInfo {
138struct KernelLDSParameters {
142 LDSAccessTypeInfo DirectAccess;
143 LDSAccessTypeInfo IndirectAccess;
145 LDSToReplacementIndicesMap;
153struct NonKernelLDSParameters {
160struct AsanInstrumentInfo {
166struct FunctionsAndLDSAccess {
174class AMDGPUSwLowerLDS {
177 DomTreeCallback Callback)
178 : M(
Mod), AMDGPUTM(TM), IRB(M.getContext()), DTCallback(Callback) {}
180 void getUsesOfLDSByNonKernels();
181 void getNonKernelsWithLDSArguments(
const CallGraph &CG);
186 void buildSwLDSGlobal(
Function *Func);
187 void buildSwDynLDSGlobal(
Function *Func);
188 void populateSwMetadataGlobal(
Function *Func);
189 void populateSwLDSAttributeAndMetadata(
Function *Func);
190 void populateLDSToReplacementIndicesMap(
Function *Func);
191 void getLDSMemoryInstructions(
Function *Func,
193 void replaceKernelLDSAccesses(
Function *Func);
194 Value *getTranslatedGlobalMemoryPtrOfLDS(
Value *LoadMallocPtr,
Value *LDSPtr);
195 void translateLDSMemoryOperationsToGlobalMemory(
200 void buildNonKernelLDSOffsetTable(NonKernelLDSParameters &NKLDSParams);
201 void buildNonKernelLDSBaseTable(NonKernelLDSParameters &NKLDSParams);
203 getAddressesOfVariablesInKernel(
Function *Func,
205 void lowerNonKernelLDSAccesses(
Function *Func,
207 NonKernelLDSParameters &NKLDSParams);
209 updateMallocSizeForDynamicLDS(
Function *Func,
Value **CurrMallocSize,
210 Value *HiddenDynLDSSize,
218 DomTreeCallback DTCallback;
219 FunctionsAndLDSAccess FuncLDSAccessInfo;
220 AsanInstrumentInfo AsanInfo;
223template <
typename T>
SetVector<T> sortByName(std::vector<T> &&V) {
226 sort(V, [](
const auto *L,
const auto *R) {
227 return L->getName() < R->getName();
236 std::vector<GlobalVariable *>(Variables.
begin(), Variables.
end()));
244 if (Kernels.size() > UINT32_MAX) {
248 sortByName(std::vector<Function *>(Kernels.begin(), Kernels.end()));
249 for (
size_t i = 0; i < Kernels.size(); i++) {
254 Func->setMetadata(
"llvm.amdgcn.lds.kernel.id",
257 return OrderedKernels;
260void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(
const CallGraph &CG) {
264 for (
auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
269 for (
auto &
I : *CGN) {
278 Type *ArgTy = (*AI).getType();
283 FuncLDSAccessInfo.NonKernelsWithLDSArgument.insert(CalledFunc);
286 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(Func);
292void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
293 for (
GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) {
298 if (
auto *
I = dyn_cast<Instruction>(V)) {
301 FuncLDSAccessInfo.NonKernelToLDSAccessMap[
F].insert(GV);
315 ConstantInt::get(IntTy,
Address + 1));
316 GV->
setMetadata(LLVMContext::MD_absolute_symbol, MetadataNode);
327 Func->addFnAttr(
"amdgpu-lds-size", Buffer);
333 IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());
336 Intrinsic::donothing, {});
338 Value *UseInstance[1] = {
345void AMDGPUSwLowerLDS::buildSwLDSGlobal(
Function *Func) {
348 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
356 LDSParams.SwLDS->setSanitizerMetadata(MD);
359void AMDGPUSwLowerLDS::buildSwDynLDSGlobal(
Function *Func) {
361 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
362 if (LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
363 LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
366 auto *emptyCharArray = ArrayType::get(IRB.getInt8Ty(), 0);
369 "llvm.amdgcn." + Func->getName() +
".dynlds",
nullptr,
371 markUsedByKernel(Func, LDSParams.SwDynLDS);
374 LDSParams.SwDynLDS->setSanitizerMetadata(MD);
377void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(
Function *Func) {
378 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
379 bool IsDynLDSUsed = LDSParams.SwDynLDS;
381 recordLDSAbsoluteAddress(M, LDSParams.SwLDS, 0);
382 addLDSSizeAttribute(Func,
Offset, IsDynLDSUsed);
383 if (LDSParams.SwDynLDS)
384 recordLDSAbsoluteAddress(M, LDSParams.SwDynLDS,
Offset);
387void AMDGPUSwLowerLDS::populateSwMetadataGlobal(
Function *Func) {
390 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
391 auto &Ctx = M.getContext();
392 auto &
DL = M.getDataLayout();
393 std::vector<Type *> Items;
395 std::vector<Constant *> Initializers;
396 Align MaxAlignment(1);
399 MaxAlignment = std::max(MaxAlignment, GVAlign);
402 for (
GlobalVariable *GV : LDSParams.DirectAccess.StaticLDSGlobals)
403 UpdateMaxAlignment(GV);
405 for (
GlobalVariable *GV : LDSParams.DirectAccess.DynamicLDSGlobals)
406 UpdateMaxAlignment(GV);
408 for (
GlobalVariable *GV : LDSParams.IndirectAccess.StaticLDSGlobals)
409 UpdateMaxAlignment(GV);
411 for (
GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals)
412 UpdateMaxAlignment(GV);
417 MDItemOS <<
"llvm.amdgcn.sw.lds." << Func->getName() <<
".md.item";
421 uint32_t &MallocSize = LDSParams.MallocSize;
423 int AsanScale = AsanInfo.Scale;
424 auto buildInitializerForSwLDSMD =
426 for (
auto &GV : LDSGlobals) {
429 UniqueLDSGlobals.
insert(GV);
432 const uint64_t SizeInBytes =
DL.getTypeAllocSize(Ty);
434 Constant *ItemStartOffset = ConstantInt::get(Int32Ty, MallocSize);
435 Constant *SizeInBytesConst = ConstantInt::get(Int32Ty, SizeInBytes);
440 MallocSize += SizeInBytes;
442 LDSParams.RedzoneOffsetAndSizeVector.emplace_back(MallocSize,
444 MallocSize += RightRedzoneSize;
447 alignTo(SizeInBytes + RightRedzoneSize, MaxAlignment);
449 ConstantInt::get(Int32Ty, AlignedSize);
451 MallocSize =
alignTo(MallocSize, MaxAlignment);
454 AlignedSizeInBytesConst});
455 Initializers.push_back(InitItem);
459 SwLDSVector.
insert(LDSParams.SwLDS);
460 buildInitializerForSwLDSMD(SwLDSVector);
461 buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals);
462 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals);
463 buildInitializerForSwLDSMD(LDSParams.DirectAccess.DynamicLDSGlobals);
464 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.DynamicLDSGlobals);
467 Type *Ty = LDSParams.SwLDS->getValueType();
468 const uint64_t SizeInBytes =
DL.getTypeAllocSize(Ty);
470 LDSParams.LDSSize = AlignedSize;
473 MDTypeOS <<
"llvm.amdgcn.sw.lds." << Func->getName() <<
".md.type";
478 MDOS <<
"llvm.amdgcn.sw.lds." << Func->getName() <<
".md";
484 LDSParams.SwLDSMetadata->setInitializer(
data);
487 LDSParams.SwLDS->setAlignment(MaxAlignment);
488 if (LDSParams.SwDynLDS)
489 LDSParams.SwDynLDS->setAlignment(MaxAlignment);
492 LDSParams.SwLDSMetadata->setSanitizerMetadata(MD);
495void AMDGPUSwLowerLDS::populateLDSToReplacementIndicesMap(
Function *Func) {
498 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
502 for (
auto &GV : LDSGlobals) {
505 UniqueLDSGlobals.
insert(GV);
506 LDSParams.LDSToReplacementIndicesMap[GV] = {0,
Idx, 0};
512 SwLDSVector.
insert(LDSParams.SwLDS);
513 PopulateIndices(SwLDSVector,
Idx);
514 PopulateIndices(LDSParams.DirectAccess.StaticLDSGlobals,
Idx);
515 PopulateIndices(LDSParams.IndirectAccess.StaticLDSGlobals,
Idx);
516 PopulateIndices(LDSParams.DirectAccess.DynamicLDSGlobals,
Idx);
517 PopulateIndices(LDSParams.IndirectAccess.DynamicLDSGlobals,
Idx);
521 Value *Replacement) {
523 auto ReplaceUsesLambda = [Func](
const Use &U) ->
bool {
524 auto *V = U.getUser();
525 if (
auto *Inst = dyn_cast<Instruction>(V)) {
526 auto *Func1 = Inst->getParent()->getParent();
535void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(
Function *Func) {
536 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
544 auto &IndirectAccess = LDSParams.IndirectAccess;
545 auto &DirectAccess = LDSParams.DirectAccess;
549 for (
auto &GV : LDSGlobals) {
552 if ((IndirectAccess.StaticLDSGlobals.contains(GV) ||
553 IndirectAccess.DynamicLDSGlobals.contains(GV)) &&
554 (!DirectAccess.StaticLDSGlobals.contains(GV) &&
555 !DirectAccess.DynamicLDSGlobals.contains(GV)))
559 UniqueLDSGlobals.
insert(GV);
560 auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
561 assert(Indices.size() == 3);
562 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
563 ConstantInt::get(Int32Ty, Indices[1]),
564 ConstantInt::get(Int32Ty, Indices[2])};
566 SwLDSMetadataStructType, SwLDSMetadata, GEPIdx,
true);
568 Value *BasePlusOffset =
569 IRB.CreateInBoundsGEP(IRB.getInt8Ty(), SwLDS, {Offset});
572 replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
575 ReplaceLDSGlobalUses(DirectAccess.StaticLDSGlobals);
576 ReplaceLDSGlobalUses(IndirectAccess.StaticLDSGlobals);
577 ReplaceLDSGlobalUses(DirectAccess.DynamicLDSGlobals);
578 ReplaceLDSGlobalUses(IndirectAccess.DynamicLDSGlobals);
581void AMDGPUSwLowerLDS::updateMallocSizeForDynamicLDS(
584 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
589 assert(SwLDS && SwLDSMetadata);
593 Value *MaxAlignValue = IRB.getInt32(MaxAlignment);
594 Value *MaxAlignValueMinusOne = IRB.getInt32(MaxAlignment - 1);
597 auto &Indices = LDSParams.LDSToReplacementIndicesMap[DynGV];
599 Constant *Index0 = ConstantInt::get(Int32Ty, 0);
600 Constant *Index1 = ConstantInt::get(Int32Ty, Indices[1]);
602 Constant *Index2Offset = ConstantInt::get(Int32Ty, 0);
603 auto *GEPForOffset = IRB.CreateInBoundsGEP(
604 MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2Offset});
606 IRB.CreateStore(*CurrMallocSize, GEPForOffset);
608 Constant *Index2Size = ConstantInt::get(Int32Ty, 1);
609 auto *GEPForSize = IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
610 {Index0, Index1, Index2Size});
612 Value *CurrDynLDSSize = IRB.CreateLoad(Int32Ty, HiddenDynLDSSize);
613 IRB.CreateStore(CurrDynLDSSize, GEPForSize);
614 Constant *Index2AlignedSize = ConstantInt::get(Int32Ty, 2);
615 auto *GEPForAlignedSize = IRB.CreateInBoundsGEP(
616 MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2AlignedSize});
618 Value *AlignedDynLDSSize =
619 IRB.CreateAdd(CurrDynLDSSize, MaxAlignValueMinusOne);
620 AlignedDynLDSSize = IRB.CreateUDiv(AlignedDynLDSSize, MaxAlignValue);
621 AlignedDynLDSSize = IRB.CreateMul(AlignedDynLDSSize, MaxAlignValue);
622 IRB.CreateStore(AlignedDynLDSSize, GEPForAlignedSize);
625 *CurrMallocSize = IRB.CreateAdd(*CurrMallocSize, AlignedDynLDSSize);
635 return DILocation::get(SP->getContext(), SP->getLine(), 1, SP);
639void AMDGPUSwLowerLDS::getLDSMemoryInstructions(
643 if (
LoadInst *LI = dyn_cast<LoadInst>(&Inst)) {
645 LDSInstructions.
insert(&Inst);
646 }
else if (
StoreInst *SI = dyn_cast<StoreInst>(&Inst)) {
648 LDSInstructions.
insert(&Inst);
649 }
else if (
AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(&Inst)) {
651 LDSInstructions.
insert(&Inst);
654 LDSInstructions.
insert(&Inst);
658 LDSInstructions.
insert(&Inst);
665Value *AMDGPUSwLowerLDS::getTranslatedGlobalMemoryPtrOfLDS(
Value *LoadMallocPtr,
667 assert(LDSPtr &&
"Invalid LDS pointer operand");
672 if (
auto *VecPtrTy = dyn_cast<VectorType>(LDSPtrType)) {
675 IntTy = VectorType::get(IntTy, NumElements);
677 Value *GepIndex = IRB.CreatePtrToInt(LDSPtr, IntTy);
678 return IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {GepIndex});
681void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
684 LLVM_DEBUG(
dbgs() <<
"Translating LDS memory operations to global memory : "
687 IRB.SetInsertPoint(Inst);
688 if (
LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
689 Value *LIOperand = LI->getPointerOperand();
691 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LIOperand);
692 LoadInst *NewLI = IRB.CreateAlignedLoad(LI->getType(), Replacement,
693 LI->getAlign(), LI->isVolatile());
694 NewLI->
setAtomic(LI->getOrdering(), LI->getSyncScopeID());
695 AsanInfo.Instructions.insert(NewLI);
696 LI->replaceAllUsesWith(NewLI);
697 LI->eraseFromParent();
698 }
else if (
StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
699 Value *SIOperand = SI->getPointerOperand();
701 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, SIOperand);
702 StoreInst *NewSI = IRB.CreateAlignedStore(
703 SI->getValueOperand(), Replacement, SI->getAlign(), SI->isVolatile());
704 NewSI->
setAtomic(SI->getOrdering(), SI->getSyncScopeID());
705 AsanInfo.Instructions.insert(NewSI);
706 SI->replaceAllUsesWith(NewSI);
707 SI->eraseFromParent();
708 }
else if (
AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
709 Value *RMWPtrOperand = RMW->getPointerOperand();
710 Value *RMWValOperand = RMW->getValOperand();
712 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, RMWPtrOperand);
714 RMW->getOperation(), Replacement, RMWValOperand, RMW->getAlign(),
715 RMW->getOrdering(), RMW->getSyncScopeID());
717 AsanInfo.Instructions.insert(NewRMW);
718 RMW->replaceAllUsesWith(NewRMW);
719 RMW->eraseFromParent();
721 Value *XCHGPtrOperand = XCHG->getPointerOperand();
723 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, XCHGPtrOperand);
725 Replacement, XCHG->getCompareOperand(), XCHG->getNewValOperand(),
726 XCHG->getAlign(), XCHG->getSuccessOrdering(),
727 XCHG->getFailureOrdering(), XCHG->getSyncScopeID());
729 AsanInfo.Instructions.insert(NewXCHG);
730 XCHG->replaceAllUsesWith(NewXCHG);
731 XCHG->eraseFromParent();
733 Value *AIOperand = ASC->getPointerOperand();
735 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, AIOperand);
736 Value *NewAI = IRB.CreateAddrSpaceCast(Replacement, ASC->
getType());
741 ASC->eraseFromParent();
747void AMDGPUSwLowerLDS::poisonRedzones(
Function *Func,
Value *MallocPtr) {
748 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
752 "__asan_poison_region",
753 FunctionType::get(VoidTy, {Int64Ty, Int64Ty},
false));
755 auto RedzonesVec = LDSParams.RedzoneOffsetAndSizeVector;
756 size_t VecSize = RedzonesVec.size();
757 for (
unsigned i = 0; i < VecSize; i++) {
758 auto &RedzonePair = RedzonesVec[i];
759 uint64_t RedzoneOffset = RedzonePair.first;
760 uint64_t RedzoneSize = RedzonePair.second;
761 Value *RedzoneAddrOffset = IRB.CreateInBoundsGEP(
762 IRB.getInt8Ty(), MallocPtr, {IRB.getInt64(RedzoneOffset)});
763 Value *RedzoneAddress = IRB.CreatePtrToInt(RedzoneAddrOffset, Int64Ty);
764 IRB.CreateCall(AsanPoisonRegion,
765 {RedzoneAddress, IRB.getInt64(RedzoneSize)});
769void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(
Function *Func,
771 LLVM_DEBUG(
dbgs() <<
"Sw Lowering Kernel LDS for : " << Func->getName());
772 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
773 auto &Ctx = M.getContext();
774 auto *PrevEntryBlock = &Func->getEntryBlock();
776 getLDSMemoryInstructions(Func, LDSInstructions);
784 IRB.SetInsertPoint(WIdBlock, WIdBlock->begin());
787 IRB.SetCurrentDebugLocation(FirstDL);
788 Value *WIdx = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
789 Value *WIdy = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_y, {});
790 Value *WIdz = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_z, {});
791 Value *XYOr = IRB.CreateOr(WIdx, WIdy);
792 Value *XYZOr = IRB.CreateOr(XYOr, WIdz);
793 Value *WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0));
797 IRB.CreateCondBr(WIdzCond, MallocBlock, PrevEntryBlock);
800 IRB.SetInsertPoint(MallocBlock, MallocBlock->begin());
807 assert(SwLDS && SwLDSMetadata);
811 Value *CurrMallocSize;
817 for (
auto &GV : LDSGlobals) {
820 UniqueLDSGlobals.
insert(GV);
824 GetUniqueLDSGlobals(LDSParams.DirectAccess.StaticLDSGlobals);
825 GetUniqueLDSGlobals(LDSParams.IndirectAccess.StaticLDSGlobals);
826 unsigned NumStaticLDS = 1 + UniqueLDSGlobals.
size();
827 UniqueLDSGlobals.
clear();
830 auto *GEPForEndStaticLDSOffset =
831 IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
832 {ConstantInt::get(Int32Ty, 0),
833 ConstantInt::get(Int32Ty, NumStaticLDS - 1),
834 ConstantInt::get(Int32Ty, 0)});
836 auto *GEPForEndStaticLDSSize =
837 IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
838 {ConstantInt::get(Int32Ty, 0),
839 ConstantInt::get(Int32Ty, NumStaticLDS - 1),
840 ConstantInt::get(Int32Ty, 2)});
842 Value *EndStaticLDSOffset =
843 IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSOffset);
844 Value *EndStaticLDSSize = IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSSize);
845 CurrMallocSize = IRB.CreateAdd(EndStaticLDSOffset, EndStaticLDSSize);
847 CurrMallocSize = IRB.getInt32(MallocSize);
849 if (LDSParams.SwDynLDS) {
852 "Dynamic LDS size query is only supported for CO V5 and later.");
855 IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {});
856 Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP(
857 ImplicitArg->
getType(), ImplicitArg,
859 UniqueLDSGlobals.
clear();
860 GetUniqueLDSGlobals(LDSParams.DirectAccess.DynamicLDSGlobals);
861 GetUniqueLDSGlobals(LDSParams.IndirectAccess.DynamicLDSGlobals);
862 updateMallocSizeForDynamicLDS(Func, &CurrMallocSize, HiddenDynLDSSize,
866 CurrMallocSize = IRB.CreateZExt(CurrMallocSize, Int64Ty);
870 Value *ReturnAddress =
871 IRB.CreateIntrinsic(Intrinsic::returnaddress, {IRB.getInt32(0)});
874 FunctionType::get(Int64Ty, {Int64Ty, Int64Ty},
false));
875 Value *RAPtrToInt = IRB.CreatePtrToInt(ReturnAddress, Int64Ty);
876 Value *MallocCall = IRB.CreateCall(MallocFunc, {CurrMallocSize, RAPtrToInt});
882 IRB.CreateStore(MallocPtr, SwLDS);
885 poisonRedzones(Func, MallocPtr);
888 IRB.CreateBr(PrevEntryBlock);
892 IRB.SetInsertPoint(PrevEntryBlock, PrevEntryBlock->begin());
893 auto *XYZCondPhi = IRB.CreatePHI(Int1Ty, 2,
"xyzCond");
894 XYZCondPhi->addIncoming(IRB.getInt1(0), WIdBlock);
895 XYZCondPhi->addIncoming(IRB.getInt1(1), MallocBlock);
897 IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {});
900 Value *LoadMallocPtr =
904 replaceKernelLDSAccesses(Func);
908 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
916 if (
ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back())) {
917 RI->eraseFromParent();
918 IRB.SetInsertPoint(&BB, BB.end());
919 IRB.CreateBr(CondFreeBlock);
925 IRB.SetInsertPoint(CondFreeBlock, CondFreeBlock->begin());
926 IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {});
927 IRB.CreateCondBr(XYZCondPhi, FreeBlock, EndBlock);
930 IRB.SetInsertPoint(FreeBlock, FreeBlock->begin());
935 FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty},
false));
937 IRB.CreateIntrinsic(Intrinsic::returnaddress, IRB.getInt32(0));
938 Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty);
939 Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty);
940 IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt});
942 IRB.CreateBr(EndBlock);
945 IRB.SetInsertPoint(EndBlock, EndBlock->begin());
948 DTU.
applyUpdates({{DominatorTree::Insert, WIdBlock, MallocBlock},
949 {DominatorTree::Insert, MallocBlock, PrevEntryBlock},
950 {DominatorTree::Insert, CondFreeBlock, FreeBlock},
951 {DominatorTree::Insert, FreeBlock, EndBlock}});
954Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
957 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
961 auto *SwLDSMetadataStructType =
967 for (
auto *GV : Variables) {
968 auto It = LDSParams.LDSToReplacementIndicesMap.find(GV);
969 if (It == LDSParams.LDSToReplacementIndicesMap.end()) {
974 auto &Indices = It->second;
975 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
976 ConstantInt::get(Int32Ty, Indices[1]),
977 ConstantInt::get(Int32Ty, Indices[2])};
979 SwLDSMetadata, GEPIdx,
true);
980 Elements.push_back(
GEP);
985void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
986 NonKernelLDSParameters &NKLDSParams) {
990 auto &Kernels = NKLDSParams.OrderedKernels;
994 const size_t NumberKernels = Kernels.size();
997 std::vector<Constant *> OverallConstantExprElts(NumberKernels);
998 for (
size_t i = 0; i < NumberKernels; i++) {
1000 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1003 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, 0)};
1006 OverallConstantExprElts[i] =
GEP;
1016 NKLDSParams.LDSBaseTable->setSanitizerMetadata(MD);
1019void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable(
1020 NonKernelLDSParameters &NKLDSParams) {
1028 auto &Variables = NKLDSParams.OrdereLDSGlobals;
1029 auto &Kernels = NKLDSParams.OrderedKernels;
1030 if (Variables.
empty() || Kernels.empty())
1032 const size_t NumberVariables = Variables.
size();
1033 const size_t NumberKernels = Kernels.size();
1039 ArrayType::get(KernelOffsetsType, NumberKernels);
1040 std::vector<Constant *> overallConstantExprElts(NumberKernels);
1041 for (
size_t i = 0; i < NumberKernels; i++) {
1043 overallConstantExprElts[i] =
1044 getAddressesOfVariablesInKernel(Func, Variables);
1054 NKLDSParams.LDSOffsetTable->setSanitizerMetadata(MD);
1057void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
1059 NonKernelLDSParameters &NKLDSParams) {
1062 LLVM_DEBUG(
dbgs() <<
"Sw LDS lowering, lower non-kernel access for : "
1063 << Func->getName());
1064 auto InsertAt = Func->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
1065 IRB.SetInsertPoint(InsertAt);
1069 getLDSMemoryInstructions(Func, LDSInstructions);
1071 auto *KernelId = IRB.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {});
1074 auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals;
1075 Value *BaseGEP = IRB.CreateInBoundsGEP(
1076 LDSBaseTable->
getValueType(), LDSBaseTable, {IRB.getInt32(0), KernelId});
1079 Value *LoadMallocPtr =
1083 const auto *GVIt =
llvm::find(OrdereLDSGlobals, GV);
1084 assert(GVIt != OrdereLDSGlobals.end());
1085 uint32_t GVOffset = std::distance(OrdereLDSGlobals.begin(), GVIt);
1087 Value *OffsetGEP = IRB.CreateInBoundsGEP(
1089 {IRB.getInt32(0), KernelId, IRB.getInt32(GVOffset)});
1092 Value *
Offset = IRB.CreateLoad(IRB.getInt32Ty(), OffsetLoad);
1093 Value *BasePlusOffset =
1094 IRB.CreateInBoundsGEP(IRB.getInt8Ty(), BaseLoad, {Offset});
1095 LLVM_DEBUG(
dbgs() <<
"Sw LDS Lowering, Replace non-kernel LDS for "
1097 replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
1099 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
1103static void reorderStaticDynamicIndirectLDSSet(KernelLDSParameters &LDSParams) {
1106 auto &DirectAccess = LDSParams.DirectAccess;
1107 auto &IndirectAccess = LDSParams.IndirectAccess;
1108 LDSParams.DirectAccess.StaticLDSGlobals = sortByName(
1109 std::vector<GlobalVariable *>(DirectAccess.StaticLDSGlobals.begin(),
1110 DirectAccess.StaticLDSGlobals.end()));
1111 LDSParams.DirectAccess.DynamicLDSGlobals = sortByName(
1112 std::vector<GlobalVariable *>(DirectAccess.DynamicLDSGlobals.begin(),
1113 DirectAccess.DynamicLDSGlobals.end()));
1114 LDSParams.IndirectAccess.StaticLDSGlobals = sortByName(
1115 std::vector<GlobalVariable *>(IndirectAccess.StaticLDSGlobals.begin(),
1116 IndirectAccess.StaticLDSGlobals.end()));
1117 LDSParams.IndirectAccess.DynamicLDSGlobals = sortByName(
1118 std::vector<GlobalVariable *>(IndirectAccess.DynamicLDSGlobals.begin(),
1119 IndirectAccess.DynamicLDSGlobals.end()));
1122void AMDGPUSwLowerLDS::initAsanInfo() {
1128 bool OrShadowOffset;
1130 &
Offset, &Scale, &OrShadowOffset);
1131 AsanInfo.Scale = Scale;
1132 AsanInfo.Offset =
Offset;
1136 for (
auto &K : LDSAccesses) {
1140 if (
F->hasFnAttribute(Attribute::SanitizeAddress))
1146bool AMDGPUSwLowerLDS::run() {
1147 bool Changed =
false;
1158 bool LowerAllLDS = hasFnWithSanitizeAddressAttr(LDSUsesInfo.
direct_access) ||
1166 bool DirectAccess) {
1167 for (
auto &K : LDSAccesses) {
1169 if (!
F || K.second.empty())
1175 FuncLDSAccessInfo.KernelToLDSParametersMap.insert(
1176 {
F, KernelLDSParameters()});
1178 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[
F];
1180 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(
F);
1182 if (!DirectAccess) {
1184 LDSParams.IndirectAccess.DynamicLDSGlobals.insert(GV);
1186 LDSParams.IndirectAccess.StaticLDSGlobals.insert(GV);
1187 FuncLDSAccessInfo.AllNonKernelLDSAccess.insert(GV);
1190 LDSParams.DirectAccess.DynamicLDSGlobals.insert(GV);
1192 LDSParams.DirectAccess.StaticLDSGlobals.insert(GV);
1198 PopulateKernelStaticDynamicLDS(LDSUsesInfo.
direct_access,
true);
1204 for (
auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
1206 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1207 if (LDSParams.DirectAccess.StaticLDSGlobals.empty() &&
1208 LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
1209 LDSParams.IndirectAccess.StaticLDSGlobals.empty() &&
1210 LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) {
1215 {
"amdgpu-no-workitem-id-x",
"amdgpu-no-workitem-id-y",
1216 "amdgpu-no-workitem-id-z",
"amdgpu-no-heap-ptr"});
1217 if (!LDSParams.IndirectAccess.StaticLDSGlobals.empty() ||
1218 !LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
1220 reorderStaticDynamicIndirectLDSSet(LDSParams);
1221 buildSwLDSGlobal(Func);
1222 buildSwDynLDSGlobal(Func);
1223 populateSwMetadataGlobal(Func);
1224 populateSwLDSAttributeAndMetadata(Func);
1225 populateLDSToReplacementIndicesMap(Func);
1227 DomTreeUpdater::UpdateStrategy::Lazy);
1228 lowerKernelLDSAccesses(Func, DTU);
1234 getUsesOfLDSByNonKernels();
1237 getNonKernelsWithLDSArguments(CG);
1240 if (!FuncLDSAccessInfo.NonKernelToLDSAccessMap.empty() ||
1241 !FuncLDSAccessInfo.NonKernelsWithLDSArgument.empty()) {
1242 NonKernelLDSParameters NKLDSParams;
1243 NKLDSParams.OrderedKernels = getOrderedIndirectLDSAccessingKernels(
1244 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess);
1245 NKLDSParams.OrdereLDSGlobals = getOrderedNonKernelAllLDSGlobals(
1246 FuncLDSAccessInfo.AllNonKernelLDSAccess);
1247 buildNonKernelLDSBaseTable(NKLDSParams);
1248 buildNonKernelLDSOffsetTable(NKLDSParams);
1249 for (
auto &K : FuncLDSAccessInfo.NonKernelToLDSAccessMap) {
1253 std::vector<GlobalVariable *>(LDSGlobals.
begin(), LDSGlobals.
end()));
1254 lowerNonKernelLDSAccesses(Func, OrderedLDSGlobals, NKLDSParams);
1256 for (
Function *Func : FuncLDSAccessInfo.NonKernelsWithLDSArgument) {
1257 auto &K = FuncLDSAccessInfo.NonKernelToLDSAccessMap;
1258 if (K.contains(Func))
1261 lowerNonKernelLDSAccesses(Func, Vec, NKLDSParams);
1278 if (AsanInstrumentLDS) {
1285 for (
auto &Operand : OperandsToInstrument) {
1288 Operand.Alignment.valueOrOne(), Operand.TypeStoreSize,
1289 Operand.IsWrite,
nullptr,
false,
false, AsanInfo.Scale,
1298class AMDGPUSwLowerLDSLegacy :
public ModulePass {
1311char AMDGPUSwLowerLDSLegacy::ID = 0;
1315 "AMDGPU Software lowering of LDS",
false,
false)
1320bool AMDGPUSwLowerLDSLegacy::runOnModule(
Module &M) {
1323 if (!M.getModuleFlag(
"nosanitize_address"))
1326 getAnalysisIfAvailable<DominatorTreeWrapperPass>();
1331 auto &TPC = getAnalysis<TargetPassConfig>();
1334 AMDGPUSwLowerLDS SwLowerLDSImpl(M, *AMDGPUTM, DTCallback);
1335 bool IsChanged = SwLowerLDSImpl.run();
1341 return new AMDGPUSwLowerLDSLegacy(TM);
1348 if (!M.getModuleFlag(
"nosanitize_address"))
1354 AMDGPUSwLowerLDS SwLowerLDSImpl(M,
TM, DTCallback);
1355 bool IsChanged = SwLowerLDSImpl.run();
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu sw lower AMDGPU Software lowering of LDS
#define COV5_HIDDEN_DYN_LDS_SIZE_ARG
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
pre isel intrinsic lowering
This file implements a set that has insertion order iteration characteristics.
Target-Independent Code Generator Pass Configuration Options pass.
static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore, DISubprogram *SP)
This class represents a conversion between pointers from one address space to another.
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
An instruction that atomically checks whether a specified value is in a memory location,...
void setVolatile(bool V)
Specify whether this is a volatile cmpxchg.
an instruction that atomically reads a memory location, combines it with another value,...
void setVolatile(bool V)
Specify whether this is a volatile RMW or not.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
A node in the call graph for a module.
Function * getFunction() const
Returns the function that this call graph node represents.
The basic data container for the call graph of a Module of IR.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static Constant * getGetElementPtr(Type *Ty, Constant *C, ArrayRef< Constant * > IdxList, GEPNoWrapFlags NW=GEPNoWrapFlags::none(), std::optional< ConstantRange > InRange=std::nullopt, Type *OnlyIfReducedTy=nullptr)
Getelementptr form.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
LLVM_ABI void removeDeadConstantUsers() const
If there are any dead constant users dangling off of this constant, remove them.
Subprogram description. Uses SubclassData1.
A parsed version of the target data layout string, and methods for querying it.
Implements a dense probed hash-table based set.
Analysis pass which computes a DominatorTree.
Legacy analysis pass which computes a DominatorTree.
DominatorTree & getDomTree()
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
void applyUpdates(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set a particular kind of metadata attachment.
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
PointerType * getType() const
Global values are always pointers.
@ InternalLinkage
Rename collisions when linking (static functions).
@ ExternalLinkage
Externally visible function.
Type * getValueType() const
uint64_t getAlignment() const
FIXME: Remove this function once transition to Align is over.
LLVM_ABI void eraseFromParent()
eraseFromParent - This method unlinks 'this' from the containing module and deletes it.
Value * CreateConstInBoundsGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
void push_back(MachineInstr *MI)
ModulePass class - This class is used to implement unstructured interprocedural optimizations and analyses.
virtual bool runOnModule(Module &M)=0
runOnModule - Virtual method overridden by subclasses to process the module being operated on.
A Module instance is used to store all the information related to an LLVM module.
A container for an operand bundle being viewed as a set of values rather than a set of uses.
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do their job.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Return a value (possibly void), from a function.
A vector that has set insertion semantics.
size_type size() const
Determine the number of elements in the SetVector.
iterator end()
Get an iterator to the end of the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
iterator begin()
Get an iterator to the beginning of the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Class to represent struct types.
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Target-Independent Code Generator Pass Configuration Options.
The instances of the Type class are immutable: once they are created, they are never changed.
bool isPointerTy() const
True if this is an instance of PointerType.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
A Use represents the edge between a Value definition and its users.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "New" if the callback ShouldReplace returns true for the given Use.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
An efficient, type-erasing, non-owning reference to a callable.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
void getInterestingMemoryOperands(Module &M, Instruction *I, SmallVectorImpl< InterestingMemoryOperand > &Interesting)
Get all the memory operands from the instruction that needs to be instrumented.
bool isDynamicLDS(const GlobalVariable &GV)
unsigned getAMDHSACodeObjectVersion(const Module &M)
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot, ArrayRef< StringRef > FnAttrs)
Strip FnAttr attribute from any functions where we may have introduced its use.
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M)
bool isLDSVariableToLower(const GlobalVariable &GV)
bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M)
Align getAlign(const DataLayout &DL, const GlobalVariable *GV)
bool isKernelLDS(const Function *F)
void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns, Instruction *InsertBefore, Value *Addr, Align Alignment, TypeSize TypeStoreSize, bool IsWrite, Value *SizeArgument, bool UseCalls, bool Recover, int AsanScale, int AsanOffset)
Instrument the memory operand Addr.
uint64_t getRedzoneSizeForGlobal(int AsanScale, uint64_t SizeInBytes)
Given SizeInBytes of the Value to be instrumented, returns the redzone size corresponding to it.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
ModulePass * createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine *TM=nullptr)
constexpr from_range_t from_range
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
char & AMDGPUSwLowerLDSLegacyPassID
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
void getAddressSanitizerParams(const Triple &TargetTriple, int LongSize, bool IsKasan, uint64_t *ShadowBase, int *MappingScale, bool *OrShadowOffset)
const AMDGPUTargetMachine & TM
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
FunctionVariableMap direct_access
FunctionVariableMap indirect_access
This struct is a compact representation of a valid (non-zero power of two) alignment.