LLVM 22.0.0git
AMDGPUSwLowerLDS.cpp
Go to the documentation of this file.
1//===-- AMDGPUSwLowerLDS.cpp -----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass lowers the local data store, LDS, uses in kernel and non-kernel
10// functions in module to use dynamically allocated global memory.
11// Packed LDS Layout is emulated in the global memory.
12// The lowered memory instructions from LDS to global memory are then
13// instrumented for address sanitizer, to catch addressing errors.
14// This pass only works when address sanitizer has been enabled and has
15// instrumented the IR. It identifies that IR has been instrumented using
16// "nosanitize_address" module flag.
17//
18// Replacement of Kernel LDS accesses:
19// For a kernel, LDS access can be static or dynamic which are direct
20// (accessed within kernel) and indirect (accessed through non-kernels).
21// All these LDS accesses corresponding to kernel will be packed together,
22// where all static LDS accesses will be allocated first and then dynamic
23// LDS follows. The total size with alignment is calculated. A new LDS global
24// will be created for the kernel called "SW LDS" and it will have the
25// attribute "amdgpu-lds-size" attached with value of the size calculated.
26// All the LDS accesses in the module will be replaced by GEP with offset
27// into the "Sw LDS".
28// A new "llvm.amdgcn.<kernel>.dynlds" is created per kernel accessing
29// the dynamic LDS. This will be marked used by kernel and will have
30// MD_absolute_symbol metadata set to total static LDS size, since dynamic
31// LDS allocation starts after all static LDS allocation.
32//
33// A device global memory equal to the total LDS size will be allocated.
34// At the prologue of the kernel, a single work-item from the
35// work-group, does a "malloc" and stores the pointer of the
36// allocation in "SW LDS".
37//
38// To store the offsets corresponding to all LDS accesses, another global
39// variable is created which will be called "SW LDS metadata" in this pass.
40// - SW LDS Global:
41// It is LDS global of ptr type with name
42// "llvm.amdgcn.sw.lds.<kernel-name>".
43// - Metadata Global:
44// It is of struct type, with n members. n equals the number of LDS
45// globals accessed by the kernel(direct and indirect). Each member of
46// struct is another struct of type {i32, i32, i32}. First member
47// corresponds to offset, second member corresponds to size of LDS global
48// being replaced and third represents the total aligned size. It will
49// have name "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have
50// an initializer with static LDS related offsets and sizes initialized.
51// But for dynamic LDS related entries, offsets will be initialized to
52// previous static LDS allocation end offset. Sizes for them will be zero
53// initially. These dynamic LDS offset and size values will be updated
54// within the kernel, since kernel can read the dynamic LDS size
55// allocation done at runtime with query to "hidden_dynamic_lds_size"
56// hidden kernel argument.
57//
58// At the epilogue of kernel, allocated memory would be made free by the same
59// single work-item.
60//
61// Replacement of non-kernel LDS accesses:
62// Multiple kernels can access the same non-kernel function.
63// All the kernels accessing LDS through non-kernels are sorted and
64// assigned a kernel-id. All the LDS globals accessed by non-kernels
65// are sorted. This information is used to build two tables:
66// - Base table:
67// Base table will have single row, with elements of the row
68// placed as per kernel ID. Each element in the row corresponds
69// to ptr of "SW LDS" variable created for that kernel.
70// - Offset table:
71// Offset table will have multiple rows and columns.
72// Rows are assumed to be from 0 to (n-1). n is total number
73// of kernels accessing the LDS through non-kernels.
74// Each row will have m elements. m is the total number of
75// unique LDS globals accessed by all non-kernels.
76// Each element in the row corresponds to the ptr of
77// the replacement of LDS global done by that particular kernel.
78// A LDS variable in non-kernel will be replaced based on the information
79// from base and offset tables. Based on kernel-id query, ptr of "SW
80// LDS" for that corresponding kernel is obtained from base table.
81// The Offset into the base "SW LDS" is obtained from
82// corresponding element in offset table. With this information, replacement
83// value is obtained.
84//===----------------------------------------------------------------------===//
85
86#include "AMDGPU.h"
88#include "AMDGPUMemoryUtils.h"
89#include "AMDGPUTargetMachine.h"
90#include "llvm/ADT/DenseMap.h"
91#include "llvm/ADT/DenseSet.h"
92#include "llvm/ADT/SetVector.h"
94#include "llvm/ADT/StringRef.h"
98#include "llvm/IR/Constants.h"
99#include "llvm/IR/DIBuilder.h"
100#include "llvm/IR/DebugInfo.h"
102#include "llvm/IR/IRBuilder.h"
103#include "llvm/IR/Instructions.h"
104#include "llvm/IR/IntrinsicsAMDGPU.h"
105#include "llvm/IR/MDBuilder.h"
107#include "llvm/Pass.h"
111
112#include <algorithm>
113
114#define DEBUG_TYPE "amdgpu-sw-lower-lds"
115#define COV5_HIDDEN_DYN_LDS_SIZE_ARG 15
116
117using namespace llvm;
118using namespace AMDGPU;
119
120namespace {
121
// Hidden command-line option "amdgpu-asan-instrument-lds", initialised to
// true. Per its description it selects whether asan instrumentation is run on
// the LDS instructions that were lowered to global memory.
// NOTE(review): the line carrying the declaration's storage class and type
// (original line 122) is not visible in this listing — presumably a boolean
// cl::opt; confirm against the upstream file.
123 AsanInstrumentLDS("amdgpu-asan-instrument-lds",
124 cl::desc("Run asan instrumentation on LDS instructions "
125 "lowered to global memory"),
126 cl::init(true), cl::Hidden);
127
// Callable (function_ref) that takes a Function and returns a pointer to a
// DominatorTree. The pass object stores one such callback in DTCallback.
128using DomTreeCallback = function_ref<DominatorTree *(Function &F)>;
129
130struct LDSAccessTypeInfo {
131 SetVector<GlobalVariable *> StaticLDSGlobals;
132 SetVector<GlobalVariable *> DynamicLDSGlobals;
133};
134
135// Struct to hold all the Metadata required for a kernel
136// to replace a LDS global uses with corresponding offset
137// in to device global memory.
138struct KernelLDSParameters {
// Per-kernel "llvm.amdgcn.sw.lds.<kernel>" global; set by buildSwLDSGlobal.
139 GlobalVariable *SwLDS = nullptr;
// Per-kernel "llvm.amdgcn.<kernel>.dynlds" global; set by buildSwDynLDSGlobal
// only when the kernel has dynamic LDS globals, otherwise stays null.
140 GlobalVariable *SwDynLDS = nullptr;
// Per-kernel "llvm.amdgcn.sw.lds.<kernel>.md" global; set by
// populateSwMetadataGlobal.
141 GlobalVariable *SwLDSMetadata = nullptr;
// LDS globals accessed within the kernel itself.
142 LDSAccessTypeInfo DirectAccess;
// LDS globals the kernel reaches through non-kernel functions.
143 LDSAccessTypeInfo IndirectAccess;
// Maps each LDS global to the {0, item-index, 0} indices of its entry in the
// metadata global (filled by populateLDSToReplacementIndicesMap).
// NOTE(review): the line with this member's type (original line 144) is not
// visible in this listing — confirm against the upstream file.
145 LDSToReplacementIndicesMap;
// Running byte total of sizes plus redzones, aligned after every item;
// accumulated in populateSwMetadataGlobal.
146 uint32_t MallocSize = 0;
// Alloc size of the SwLDS value type rounded up to the maximum alignment;
// set in populateSwMetadataGlobal.
147 uint32_t LDSSize = 0;
// (offset, size) of the redzone that follows each non-dynamic item; read by
// poisonRedzones.
148 SmallVector<std::pair<uint32_t, uint32_t>, 64> RedzoneOffsetAndSizeVector;
149};
150
151// Struct to store information for creation of offset table
152// for all the non-kernel LDS accesses.
153struct NonKernelLDSParameters {
154 GlobalVariable *LDSBaseTable = nullptr;
155 GlobalVariable *LDSOffsetTable = nullptr;
156 SetVector<Function *> OrderedKernels;
157 SetVector<GlobalVariable *> OrdereLDSGlobals;
158};
159
160struct AsanInstrumentInfo {
161 int Scale = 0;
162 uint32_t Offset = 0;
163 SetVector<Instruction *> Instructions;
164};
165
166struct FunctionsAndLDSAccess {
167 DenseMap<Function *, KernelLDSParameters> KernelToLDSParametersMap;
168 SetVector<Function *> KernelsWithIndirectLDSAccess;
169 SetVector<Function *> NonKernelsWithLDSArgument;
170 SetVector<GlobalVariable *> AllNonKernelLDSAccess;
171 FunctionVariableMap NonKernelToLDSAccessMap;
172};
173
// Driver for the software LDS lowering of one Module. It keeps the module,
// the target machine, one IRBuilder shared by all member functions, a
// dominator-tree callback, and the LDS-access / asan bookkeeping.
174class AMDGPUSwLowerLDS {
175public:
// Binds the pass to Mod and TM; the IRBuilder is created on the module's
// context and Callback is stored for later dominator-tree queries.
176 AMDGPUSwLowerLDS(Module &Mod, const AMDGPUTargetMachine &TM,
177 DomTreeCallback Callback)
178 : M(Mod), AMDGPUTM(TM), IRB(M.getContext()), DTCallback(Callback) {}
179 bool run();
// Collection of non-kernel LDS uses and of non-kernels taking LDS arguments.
180 void getUsesOfLDSByNonKernels();
181 void getNonKernelsWithLDSArguments(const CallGraph &CG);
// NOTE(review): the return-type lines of the next two declarations (original
// lines 182 and 184) are not visible in this listing; the definitions further
// down return SetVector<Function *> and SetVector<GlobalVariable *>.
183 getOrderedIndirectLDSAccessingKernels(SetVector<Function *> &Kernels);
185 getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &Variables);
// Creation and population of the per-kernel globals (SW LDS, dynamic LDS,
// metadata) and of the attributes/metadata attached to them.
186 void buildSwLDSGlobal(Function *Func);
187 void buildSwDynLDSGlobal(Function *Func);
188 void populateSwMetadataGlobal(Function *Func);
189 void populateSwLDSAttributeAndMetadata(Function *Func);
190 void populateLDSToReplacementIndicesMap(Function *Func);
// Gathers the LDS-addressed memory instructions of Func.
191 void getLDSMemoryInstructions(Function *Func,
192 SetVector<Instruction *> &LDSInstructions);
193 void replaceKernelLDSAccesses(Function *Func);
// Rewrites LDS pointers / memory operations to address the malloc'ed memory.
194 Value *getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr);
195 void translateLDSMemoryOperationsToGlobalMemory(
196 Function *Func, Value *LoadMallocPtr,
197 SetVector<Instruction *> &LDSInstructions);
// Emits one __asan_poison_region call per recorded redzone.
198 void poisonRedzones(Function *Func, Value *MallocPtr);
199 void lowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU);
200 void buildNonKernelLDSOffsetTable(NonKernelLDSParameters &NKLDSParams);
201 void buildNonKernelLDSBaseTable(NonKernelLDSParameters &NKLDSParams);
202 Constant *
203 getAddressesOfVariablesInKernel(Function *Func,
204 SetVector<GlobalVariable *> &Variables);
205 void lowerNonKernelLDSAccesses(Function *Func,
206 SetVector<GlobalVariable *> &LDSGlobals,
207 NonKernelLDSParameters &NKLDSParams);
// Emits IR that fills the metadata entries of the dynamic LDS globals and
// advances *CurrMallocSize by each aligned dynamic size.
208 void
209 updateMallocSizeForDynamicLDS(Function *Func, Value **CurrMallocSize,
210 Value *HiddenDynLDSSize,
211 SetVector<GlobalVariable *> &DynamicLDSGlobals);
212 void initAsanInfo();
213
214private:
// Module being lowered.
215 Module &M;
216 const AMDGPUTargetMachine &AMDGPUTM;
// Builder shared by every member function; callers set its insert point.
217 IRBuilder<> IRB;
218 DomTreeCallback DTCallback;
// LDS-access information collected for kernels and non-kernels.
219 FunctionsAndLDSAccess FuncLDSAccessInfo;
// Asan scale/offset and the instructions queued for instrumentation.
220 AsanInstrumentInfo AsanInfo;
221};
222
223template <typename T> SetVector<T> sortByName(std::vector<T> &&V) {
224 // Sort the vector of globals or Functions based on their name.
225 // Returns a SetVector of globals/Functions.
226 sort(V, [](const auto *L, const auto *R) {
227 return L->getName() < R->getName();
228 });
229 return {SetVector<T>(llvm::from_range, V)};
230}
231
232SetVector<GlobalVariable *> AMDGPUSwLowerLDS::getOrderedNonKernelAllLDSGlobals(
233 SetVector<GlobalVariable *> &Variables) {
234 // Sort all the non-kernel LDS accesses based on their name.
235 return sortByName(
236 std::vector<GlobalVariable *>(Variables.begin(), Variables.end()));
237}
238
239SetVector<Function *> AMDGPUSwLowerLDS::getOrderedIndirectLDSAccessingKernels(
240 SetVector<Function *> &Kernels) {
241 // Sort the non-kernels accessing LDS based on their name.
242 // Also assign a kernel ID metadata based on the sorted order.
243 LLVMContext &Ctx = M.getContext();
244 if (Kernels.size() > UINT32_MAX) {
245 report_fatal_error("Unimplemented SW LDS lowering for > 2**32 kernels");
246 }
247 SetVector<Function *> OrderedKernels =
248 sortByName(std::vector<Function *>(Kernels.begin(), Kernels.end()));
249 for (size_t i = 0; i < Kernels.size(); i++) {
250 Metadata *AttrMDArgs[1] = {
251 ConstantAsMetadata::get(IRB.getInt32(i)),
252 };
253 Function *Func = OrderedKernels[i];
254 Func->setMetadata("llvm.amdgcn.lds.kernel.id",
255 MDNode::get(Ctx, AttrMDArgs));
256 }
257 return OrderedKernels;
258}
259
// Walks the call-graph node of every kernel in KernelToLDSParametersMap and
// records the defined, non-kernel callees that take pointer arguments,
// together with the kernel that calls them.
260void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) {
261 // Among the kernels accessing LDS, get list of
262 // Non-kernels to which a call is made and a ptr
263 // to addrspace(3) is passed as argument.
264 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
265 Function *Func = K.first;
266 const CallGraphNode *CGN = CG[Func];
267 if (!CGN)
268 continue;
269 for (auto &I : *CGN) {
// Despite the variable's name, the function taken from this node is used as
// the callee (CalledFunc) below.
270 CallGraphNode *CallerCGN = I.second;
271 Function *CalledFunc = CallerCGN->getFunction();
// Skip nodes without a function, declarations and other kernels.
272 if (!CalledFunc || CalledFunc->isDeclaration())
273 continue;
274 if (AMDGPU::isKernelLDS(CalledFunc))
275 continue;
276 for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end();
277 AI != E; ++AI) {
278 Type *ArgTy = (*AI).getType();
279 if (!ArgTy->isPointerTy())
280 continue;
// NOTE(review): the condition guarding the next `continue` (original line
// 281) is not visible in this listing — presumably it rejects pointers that
// are not in the LDS address space; confirm against the upstream file.
282 continue;
283 FuncLDSAccessInfo.NonKernelsWithLDSArgument.insert(CalledFunc);
284 // Also add the Calling function to KernelsWithIndirectLDSAccess list
285 // so that base table of LDS is generated.
286 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(Func);
287 }
288 }
289 }
290}
291
// Fills NonKernelToLDSAccessMap: for every global in AllNonKernelLDSAccess,
// each instruction user that lives in a defined non-kernel function adds the
// global to that function's set.
292void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
293 for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) {
// NOTE(review): the condition guarding this `continue` (original line 294) is
// not visible in this listing — presumably a filter on which globals are
// lowered; confirm against the upstream file.
295 continue;
296
// Only direct instruction users are considered; other kinds of users are
// ignored by this loop.
297 for (User *V : GV->users()) {
298 if (auto *I = dyn_cast<Instruction>(V)) {
299 Function *F = I->getFunction();
300 if (!isKernelLDS(F) && !F->isDeclaration())
301 FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(GV);
302 }
303 }
304 }
305}
306
// Attaches MD_absolute_symbol metadata to GV describing the range
// [Address, Address + 1), built with the pointer-sized integer type of the
// LDS (local) address space.
// NOTE(review): the end of the parameter list (original line 308, declaring
// `Address`) is not visible in this listing — confirm against the upstream
// file.
307static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV,
309 // Write the specified address into metadata where it can be retrieved by
310 // the assembler. Format is a half open range, [Address Address+1)
311 LLVMContext &Ctx = M.getContext();
312 auto *IntTy = M.getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
313 MDBuilder MDB(Ctx);
314 MDNode *MetadataNode = MDB.createRange(ConstantInt::get(IntTy, Address),
315 ConstantInt::get(IntTy, Address + 1));
316 GV->setMetadata(LLVMContext::MD_absolute_symbol, MetadataNode);
317}
318
319static void addLDSSizeAttribute(Function *Func, uint32_t Offset,
320 bool IsDynLDS) {
321 if (Offset != 0) {
322 std::string Buffer;
323 raw_string_ostream SS{Buffer};
324 SS << Offset;
325 if (IsDynLDS)
326 SS << "," << Offset;
327 Func->addFnAttr("amdgpu-lds-size", Buffer);
328 }
329}
330
331static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
332 BasicBlock *Entry = &Func->getEntryBlock();
333 IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());
334
335 Function *Decl = Intrinsic::getOrInsertDeclaration(Func->getParent(),
336 Intrinsic::donothing, {});
337
338 Value *UseInstance[1] = {
339 Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)};
340
341 Builder.CreateCall(Decl, {},
342 {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
343}
344
// Creates the per-kernel "llvm.amdgcn.sw.lds.<kernel>" global of pointer
// type (internal linkage, poison initializer), stores it in the kernel's
// LDS parameters, and marks it NoAddress in its sanitizer metadata.
345void AMDGPUSwLowerLDS::buildSwLDSGlobal(Function *Func) {
346 // Create new LDS global required for each kernel to store
347 // device global memory pointer.
348 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
349 // Create new global pointer variable
350 LDSParams.SwLDS = new GlobalVariable(
351 M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage,
352 PoisonValue::get(IRB.getPtrTy()), "llvm.amdgcn.sw.lds." + Func->getName(),
// NOTE(review): the remaining constructor arguments and the declaration of
// `MD` (original lines 353-354) are not visible in this listing — confirm
// against the upstream file.
355 MD.NoAddress = true;
356 LDSParams.SwLDS->setSanitizerMetadata(MD);
357}
358
// Creates the per-kernel "llvm.amdgcn.<kernel>.dynlds" global (a zero-length
// i8 array, external linkage, no initializer) when the kernel accesses
// dynamic LDS directly or indirectly. The global is marked used by the
// kernel and flagged NoAddress in its sanitizer metadata.
359void AMDGPUSwLowerLDS::buildSwDynLDSGlobal(Function *Func) {
360 // Create new Dyn LDS global if kernel accesses dyn LDS.
361 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
// Nothing to build when the kernel has no dynamic LDS globals at all.
362 if (LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
363 LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
364 return;
365 // Create new global pointer variable
366 auto *emptyCharArray = ArrayType::get(IRB.getInt8Ty(), 0);
367 LDSParams.SwDynLDS = new GlobalVariable(
368 M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
369 "llvm.amdgcn." + Func->getName() + ".dynlds", nullptr,
// NOTE(review): the remaining constructor arguments (original line 370) and
// the declaration of `MD` (original line 372) are not visible in this
// listing — confirm against the upstream file.
371 markUsedByKernel(Func, LDSParams.SwDynLDS);
373 MD.NoAddress = true;
374 LDSParams.SwDynLDS->setSanitizerMetadata(MD);
375}
376
377void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) {
378 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
379 bool IsDynLDSUsed = LDSParams.SwDynLDS;
380 uint32_t Offset = LDSParams.LDSSize;
381 recordLDSAbsoluteAddress(M, LDSParams.SwLDS, 0);
382 addLDSSizeAttribute(Func, Offset, IsDynLDSUsed);
383 if (LDSParams.SwDynLDS)
384 recordLDSAbsoluteAddress(M, LDSParams.SwDynLDS, Offset);
385}
386
// Builds the per-kernel metadata global "llvm.amdgcn.sw.lds.<kernel>.md".
// The global is a struct with one {i32, i32, i32} item per unique global, in
// this order: the SW LDS global itself, direct static, indirect static,
// direct dynamic, indirect dynamic. While building the initializer the
// function accumulates LDSParams.MallocSize, records the redzone placement of
// every non-dynamic item, sets LDSParams.LDSSize, and applies the maximum
// alignment to SwLDS (and SwDynLDS when present).
387void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
388 // Create new metadata global for every kernel and initialize the
389 // start offsets and sizes corresponding to each LDS accesses.
390 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
391 auto &Ctx = M.getContext();
392 auto &DL = M.getDataLayout();
393 std::vector<Type *> Items;
394 Type *Int32Ty = IRB.getInt32Ty();
395 std::vector<Constant *> Initializers;
// MaxAlignment ends up as the largest alignment over all four lists of LDS
// globals of this kernel (at least 1).
396 Align MaxAlignment(1);
397 auto UpdateMaxAlignment = [&MaxAlignment, &DL](GlobalVariable *GV) {
398 Align GVAlign = AMDGPU::getAlign(DL, GV);
399 MaxAlignment = std::max(MaxAlignment, GVAlign);
400 };
401
402 for (GlobalVariable *GV : LDSParams.DirectAccess.StaticLDSGlobals)
403 UpdateMaxAlignment(GV);
404
405 for (GlobalVariable *GV : LDSParams.DirectAccess.DynamicLDSGlobals)
406 UpdateMaxAlignment(GV);
407
408 for (GlobalVariable *GV : LDSParams.IndirectAccess.StaticLDSGlobals)
409 UpdateMaxAlignment(GV);
410
411 for (GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals)
412 UpdateMaxAlignment(GV);
413
414 // Each item is {StartOffset, SizeInBytes, AlignedSizeInBytes}.
415 SmallString<128> MDItemStr;
416 raw_svector_ostream MDItemOS(MDItemStr);
417 MDItemOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.item";
418
419 StructType *LDSItemTy =
420 StructType::create(Ctx, {Int32Ty, Int32Ty, Int32Ty}, MDItemOS.str());
// MallocSize aliases the kernel's field, so the lambda below updates it in
// place.
421 uint32_t &MallocSize = LDSParams.MallocSize;
// Globals already given an item; a global present in several lists is only
// emitted once.
422 SetVector<GlobalVariable *> UniqueLDSGlobals;
423 int AsanScale = AsanInfo.Scale;
// Appends one item type and one initializer per not-yet-seen global of the
// given list.
424 auto buildInitializerForSwLDSMD =
425 [&](SetVector<GlobalVariable *> &LDSGlobals) {
426 for (auto &GV : LDSGlobals) {
427 if (is_contained(UniqueLDSGlobals, GV))
428 continue;
429 UniqueLDSGlobals.insert(GV);
430
431 Type *Ty = GV->getValueType();
432 const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
433 Items.push_back(LDSItemTy);
// The item starts at the current (already aligned) running total.
434 Constant *ItemStartOffset = ConstantInt::get(Int32Ty, MallocSize);
435 Constant *SizeInBytesConst = ConstantInt::get(Int32Ty, SizeInBytes);
436 // Get redzone size corresponding a size.
437 const uint64_t RightRedzoneSize =
438 AMDGPU::getRedzoneSizeForGlobal(AsanScale, SizeInBytes);
439 // Update MallocSize with current size and redzone size.
440 MallocSize += SizeInBytes;
// The redzone sits right after the item's data; it is recorded for
// non-dynamic globals only.
441 if (!AMDGPU::isDynamicLDS(*GV))
442 LDSParams.RedzoneOffsetAndSizeVector.emplace_back(MallocSize,
443 RightRedzoneSize);
444 MallocSize += RightRedzoneSize;
445 // Align current size plus redzone.
446 uint64_t AlignedSize =
447 alignTo(SizeInBytes + RightRedzoneSize, MaxAlignment);
448 Constant *AlignedSizeInBytesConst =
449 ConstantInt::get(Int32Ty, AlignedSize);
450 // Align MallocSize
451 MallocSize = alignTo(MallocSize, MaxAlignment);
452 Constant *InitItem =
453 ConstantStruct::get(LDSItemTy, {ItemStartOffset, SizeInBytesConst,
454 AlignedSizeInBytesConst});
455 Initializers.push_back(InitItem);
456 }
457 };
// The SW LDS global itself always gets the first item.
458 SetVector<GlobalVariable *> SwLDSVector;
459 SwLDSVector.insert(LDSParams.SwLDS);
460 buildInitializerForSwLDSMD(SwLDSVector);
461 buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals);
462 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals);
463 buildInitializerForSwLDSMD(LDSParams.DirectAccess.DynamicLDSGlobals);
464 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.DynamicLDSGlobals);
465
466 // Update the LDS size used by the kernel.
467 Type *Ty = LDSParams.SwLDS->getValueType();
468 const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
469 uint64_t AlignedSize = alignTo(SizeInBytes, MaxAlignment);
470 LDSParams.LDSSize = AlignedSize;
471 SmallString<128> MDTypeStr;
472 raw_svector_ostream MDTypeOS(MDTypeStr);
473 MDTypeOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.type";
474 StructType *MetadataStructType =
475 StructType::create(Ctx, Items, MDTypeOS.str());
476 SmallString<128> MDStr;
477 raw_svector_ostream MDOS(MDStr);
478 MDOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md";
479 LDSParams.SwLDSMetadata = new GlobalVariable(
480 M, MetadataStructType, false, GlobalValue::InternalLinkage,
481 PoisonValue::get(MetadataStructType), MDOS.str(), nullptr,
// NOTE(review): the remaining constructor arguments (original line 482) are
// not visible in this listing — confirm against the upstream file.
// The poison placeholder initializer is replaced with the real item values.
483 Constant *data = ConstantStruct::get(MetadataStructType, Initializers);
484 LDSParams.SwLDSMetadata->setInitializer(data);
485 assert(LDSParams.SwLDS);
486 // Set the alignment to MaxAlignment for SwLDS.
487 LDSParams.SwLDS->setAlignment(MaxAlignment);
488 if (LDSParams.SwDynLDS)
489 LDSParams.SwDynLDS->setAlignment(MaxAlignment);
// NOTE(review): the declaration of `MD` (original line 490) is not visible in
// this listing — confirm against the upstream file.
491 MD.NoAddress = true;
492 LDSParams.SwLDSMetadata->setSanitizerMetadata(MD);
493}
494
495void AMDGPUSwLowerLDS::populateLDSToReplacementIndicesMap(Function *Func) {
496 // Fill the corresponding LDS replacement indices for each LDS access
497 // related to this kernel.
498 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
499 SetVector<GlobalVariable *> UniqueLDSGlobals;
500 auto PopulateIndices = [&](SetVector<GlobalVariable *> &LDSGlobals,
501 uint32_t &Idx) {
502 for (auto &GV : LDSGlobals) {
503 if (is_contained(UniqueLDSGlobals, GV))
504 continue;
505 UniqueLDSGlobals.insert(GV);
506 LDSParams.LDSToReplacementIndicesMap[GV] = {0, Idx, 0};
507 ++Idx;
508 }
509 };
510 uint32_t Idx = 0;
511 SetVector<GlobalVariable *> SwLDSVector;
512 SwLDSVector.insert(LDSParams.SwLDS);
513 PopulateIndices(SwLDSVector, Idx);
514 PopulateIndices(LDSParams.DirectAccess.StaticLDSGlobals, Idx);
515 PopulateIndices(LDSParams.IndirectAccess.StaticLDSGlobals, Idx);
516 PopulateIndices(LDSParams.DirectAccess.DynamicLDSGlobals, Idx);
517 PopulateIndices(LDSParams.IndirectAccess.DynamicLDSGlobals, Idx);
518}
519
520static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
521 Value *Replacement) {
522 // Replace all uses of LDS global in this Function with a Replacement.
523 auto ReplaceUsesLambda = [Func](const Use &U) -> bool {
524 auto *V = U.getUser();
525 if (auto *Inst = dyn_cast<Instruction>(V)) {
526 auto *Func1 = Inst->getParent()->getParent();
527 if (Func == Func1)
528 return true;
529 }
530 return false;
531 };
532 GV->replaceUsesWithIf(Replacement, ReplaceUsesLambda);
533}
534
// Replaces, inside kernel Func, the uses of every directly accessed LDS
// global. For each one the offset is loaded from the global's entry in the
// metadata global, and the uses are rewritten to an i8 GEP of SwLDS by that
// offset. The loads and GEPs are emitted at IRB's current insert point.
535void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
536 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
537 GlobalVariable *SwLDS = LDSParams.SwLDS;
538 assert(SwLDS);
539 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
540 assert(SwLDSMetadata);
541 StructType *SwLDSMetadataStructType =
542 cast<StructType>(SwLDSMetadata->getValueType());
543 Type *Int32Ty = IRB.getInt32Ty();
544 auto &IndirectAccess = LDSParams.IndirectAccess;
545 auto &DirectAccess = LDSParams.DirectAccess;
546 // Replace all uses of LDS global in this Function with a Replacement.
// Globals already replaced, so one listed several times is handled once.
547 SetVector<GlobalVariable *> UniqueLDSGlobals;
548 auto ReplaceLDSGlobalUses = [&](SetVector<GlobalVariable *> &LDSGlobals) {
549 for (auto &GV : LDSGlobals) {
550 // Do not generate instructions if LDS access is in non-kernel
551 // i.e indirect-access.
552 if ((IndirectAccess.StaticLDSGlobals.contains(GV) ||
553 IndirectAccess.DynamicLDSGlobals.contains(GV)) &&
554 (!DirectAccess.StaticLDSGlobals.contains(GV) &&
555 !DirectAccess.DynamicLDSGlobals.contains(GV)))
556 continue;
557 if (is_contained(UniqueLDSGlobals, GV))
558 continue;
559 UniqueLDSGlobals.insert(GV);
// Indices address the first field (the offset) of the global's item.
560 auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
561 assert(Indices.size() == 3);
562 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
563 ConstantInt::get(Int32Ty, Indices[1]),
564 ConstantInt::get(Int32Ty, Indices[2])};
// NOTE(review): the start of the statement that defines `GEP` (original line
// 565) is not visible in this listing — presumably a constant GEP expression
// into the metadata global; confirm against the upstream file.
566 SwLDSMetadataStructType, SwLDSMetadata, GEPIdx, true);
567 Value *Offset = IRB.CreateLoad(Int32Ty, GEP);
568 Value *BasePlusOffset =
569 IRB.CreateInBoundsGEP(IRB.getInt8Ty(), SwLDS, {Offset});
570 LLVM_DEBUG(GV->printAsOperand(dbgs() << "Sw LDS Lowering, Replacing LDS ",
571 false));
572 replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
573 }
574 };
575 ReplaceLDSGlobalUses(DirectAccess.StaticLDSGlobals);
576 ReplaceLDSGlobalUses(IndirectAccess.StaticLDSGlobals);
577 ReplaceLDSGlobalUses(DirectAccess.DynamicLDSGlobals);
578 ReplaceLDSGlobalUses(IndirectAccess.DynamicLDSGlobals);
579}
580
581void AMDGPUSwLowerLDS::updateMallocSizeForDynamicLDS(
582 Function *Func, Value **CurrMallocSize, Value *HiddenDynLDSSize,
583 SetVector<GlobalVariable *> &DynamicLDSGlobals) {
584 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
585 Type *Int32Ty = IRB.getInt32Ty();
586
587 GlobalVariable *SwLDS = LDSParams.SwLDS;
588 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
589 assert(SwLDS && SwLDSMetadata);
590 StructType *MetadataStructType =
591 cast<StructType>(SwLDSMetadata->getValueType());
592 unsigned MaxAlignment = SwLDS->getAlignment();
593 Value *MaxAlignValue = IRB.getInt32(MaxAlignment);
594 Value *MaxAlignValueMinusOne = IRB.getInt32(MaxAlignment - 1);
595
596 for (GlobalVariable *DynGV : DynamicLDSGlobals) {
597 auto &Indices = LDSParams.LDSToReplacementIndicesMap[DynGV];
598 // Update the Offset metadata.
599 Constant *Index0 = ConstantInt::get(Int32Ty, 0);
600 Constant *Index1 = ConstantInt::get(Int32Ty, Indices[1]);
601
602 Constant *Index2Offset = ConstantInt::get(Int32Ty, 0);
603 auto *GEPForOffset = IRB.CreateInBoundsGEP(
604 MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2Offset});
605
606 IRB.CreateStore(*CurrMallocSize, GEPForOffset);
607 // Update the size and Aligned Size metadata.
608 Constant *Index2Size = ConstantInt::get(Int32Ty, 1);
609 auto *GEPForSize = IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
610 {Index0, Index1, Index2Size});
611
612 Value *CurrDynLDSSize = IRB.CreateLoad(Int32Ty, HiddenDynLDSSize);
613 IRB.CreateStore(CurrDynLDSSize, GEPForSize);
614 Constant *Index2AlignedSize = ConstantInt::get(Int32Ty, 2);
615 auto *GEPForAlignedSize = IRB.CreateInBoundsGEP(
616 MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2AlignedSize});
617
618 Value *AlignedDynLDSSize =
619 IRB.CreateAdd(CurrDynLDSSize, MaxAlignValueMinusOne);
620 AlignedDynLDSSize = IRB.CreateUDiv(AlignedDynLDSSize, MaxAlignValue);
621 AlignedDynLDSSize = IRB.CreateMul(AlignedDynLDSSize, MaxAlignValue);
622 IRB.CreateStore(AlignedDynLDSSize, GEPForAlignedSize);
623
624 // Update the Current Malloc Size
625 *CurrMallocSize = IRB.CreateAdd(*CurrMallocSize, AlignedDynLDSSize);
626 }
627}
628
629static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore,
630 DISubprogram *SP) {
631 assert(InsertBefore);
632 if (InsertBefore->getDebugLoc())
633 return InsertBefore->getDebugLoc();
634 if (SP)
635 return DILocation::get(SP->getContext(), SP->getLine(), 1, SP);
636 return DebugLoc();
637}
638
639void AMDGPUSwLowerLDS::getLDSMemoryInstructions(
640 Function *Func, SetVector<Instruction *> &LDSInstructions) {
641 for (BasicBlock &BB : *Func) {
642 for (Instruction &Inst : BB) {
643 if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) {
644 if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
645 LDSInstructions.insert(&Inst);
646 } else if (StoreInst *SI = dyn_cast<StoreInst>(&Inst)) {
647 if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
648 LDSInstructions.insert(&Inst);
649 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(&Inst)) {
650 if (RMW->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
651 LDSInstructions.insert(&Inst);
652 } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(&Inst)) {
653 if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
654 LDSInstructions.insert(&Inst);
655 } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&Inst)) {
656 if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
657 ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS)
658 LDSInstructions.insert(&Inst);
659 } else
660 continue;
661 }
662 }
663}
664
665Value *AMDGPUSwLowerLDS::getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr,
666 Value *LDSPtr) {
667 assert(LDSPtr && "Invalid LDS pointer operand");
668 Type *LDSPtrType = LDSPtr->getType();
669 LLVMContext &Ctx = M.getContext();
670 const DataLayout &DL = M.getDataLayout();
671 Type *IntTy = DL.getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
672 if (auto *VecPtrTy = dyn_cast<VectorType>(LDSPtrType)) {
673 // Handle vector of pointers
674 ElementCount NumElements = VecPtrTy->getElementCount();
675 IntTy = VectorType::get(IntTy, NumElements);
676 }
677 Value *GepIndex = IRB.CreatePtrToInt(LDSPtr, IntTy);
678 return IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {GepIndex});
679}
680
681void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
682 Function *Func, Value *LoadMallocPtr,
683 SetVector<Instruction *> &LDSInstructions) {
684 LLVM_DEBUG(dbgs() << "Translating LDS memory operations to global memory : "
685 << Func->getName());
686 for (Instruction *Inst : LDSInstructions) {
687 IRB.SetInsertPoint(Inst);
688 if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
689 Value *LIOperand = LI->getPointerOperand();
690 Value *Replacement =
691 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LIOperand);
692 LoadInst *NewLI = IRB.CreateAlignedLoad(LI->getType(), Replacement,
693 LI->getAlign(), LI->isVolatile());
694 NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
695 AsanInfo.Instructions.insert(NewLI);
696 LI->replaceAllUsesWith(NewLI);
697 LI->eraseFromParent();
698 } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
699 Value *SIOperand = SI->getPointerOperand();
700 Value *Replacement =
701 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, SIOperand);
702 StoreInst *NewSI = IRB.CreateAlignedStore(
703 SI->getValueOperand(), Replacement, SI->getAlign(), SI->isVolatile());
704 NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
705 AsanInfo.Instructions.insert(NewSI);
706 SI->replaceAllUsesWith(NewSI);
707 SI->eraseFromParent();
708 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
709 Value *RMWPtrOperand = RMW->getPointerOperand();
710 Value *RMWValOperand = RMW->getValOperand();
711 Value *Replacement =
712 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, RMWPtrOperand);
713 AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW(
714 RMW->getOperation(), Replacement, RMWValOperand, RMW->getAlign(),
715 RMW->getOrdering(), RMW->getSyncScopeID());
716 NewRMW->setVolatile(RMW->isVolatile());
717 AsanInfo.Instructions.insert(NewRMW);
718 RMW->replaceAllUsesWith(NewRMW);
719 RMW->eraseFromParent();
720 } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Inst)) {
721 Value *XCHGPtrOperand = XCHG->getPointerOperand();
722 Value *Replacement =
723 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, XCHGPtrOperand);
724 AtomicCmpXchgInst *NewXCHG = IRB.CreateAtomicCmpXchg(
725 Replacement, XCHG->getCompareOperand(), XCHG->getNewValOperand(),
726 XCHG->getAlign(), XCHG->getSuccessOrdering(),
727 XCHG->getFailureOrdering(), XCHG->getSyncScopeID());
728 NewXCHG->setVolatile(XCHG->isVolatile());
729 AsanInfo.Instructions.insert(NewXCHG);
730 XCHG->replaceAllUsesWith(NewXCHG);
731 XCHG->eraseFromParent();
732 } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(Inst)) {
733 Value *AIOperand = ASC->getPointerOperand();
734 Value *Replacement =
735 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, AIOperand);
736 Value *NewAI = IRB.CreateAddrSpaceCast(Replacement, ASC->getType());
737 // Note: No need to add the instruction to AsanInfo instructions to be
738 // instrumented list. FLAT_ADDRESS ptr would have been already
739 // instrumented by asan pass prior to this pass.
740 ASC->replaceAllUsesWith(NewAI);
741 ASC->eraseFromParent();
742 } else
743 report_fatal_error("Unimplemented LDS lowering instruction");
744 }
745}
746
747void AMDGPUSwLowerLDS::poisonRedzones(Function *Func, Value *MallocPtr) {
748 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
749 Type *Int64Ty = IRB.getInt64Ty();
750 Type *VoidTy = IRB.getVoidTy();
751 FunctionCallee AsanPoisonRegion = M.getOrInsertFunction(
752 "__asan_poison_region",
753 FunctionType::get(VoidTy, {Int64Ty, Int64Ty}, false));
754
755 auto RedzonesVec = LDSParams.RedzoneOffsetAndSizeVector;
756 size_t VecSize = RedzonesVec.size();
757 for (unsigned i = 0; i < VecSize; i++) {
758 auto &RedzonePair = RedzonesVec[i];
759 uint64_t RedzoneOffset = RedzonePair.first;
760 uint64_t RedzoneSize = RedzonePair.second;
761 Value *RedzoneAddrOffset = IRB.CreateInBoundsGEP(
762 IRB.getInt8Ty(), MallocPtr, {IRB.getInt64(RedzoneOffset)});
763 Value *RedzoneAddress = IRB.CreatePtrToInt(RedzoneAddrOffset, Int64Ty);
764 IRB.CreateCall(AsanPoisonRegion,
765 {RedzoneAddress, IRB.getInt64(RedzoneSize)});
766 }
767}
768
// Lowers the LDS accesses of kernel `Func` onto a buffer obtained from
// `__asan_malloc_impl`, and frees that buffer on every return path.
//
// Control flow built here (block names as passed to BasicBlock::Create):
//   WId      - new first block; ORs the x/y/z work-item ids and compares the
//              result with 0. Branches to Malloc when it is 0, otherwise
//              straight to the previous entry block.
//   Malloc   - computes the allocation size from the Sw LDS metadata global,
//              calls `__asan_malloc_impl`, stores the resulting pointer into
//              the Sw LDS global, poisons the redzones, then branches to the
//              previous entry block.
//   (previous entry block) - gets an i1 PHI recording which path was taken,
//              an s_barrier, and a load of the malloc pointer from Sw LDS.
//   CondFree - every `ret` in the function is replaced by a branch here;
//              s_barrier, then branches on the PHI to Free or End.
//   Free     - calls `__asan_free_impl` on the loaded malloc pointer.
//   End      - `ret void`.
//
// `DTU` receives edge insertions for part of the new control flow at the end.
769void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
770 DomTreeUpdater &DTU) {
771 LLVM_DEBUG(dbgs() << "Sw Lowering Kernel LDS for : " << Func->getName());
772 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
773 auto &Ctx = M.getContext();
774 auto *PrevEntryBlock = &Func->getEntryBlock();
// Collect the LDS memory instructions up front, before any new instruction is
// emitted into the function.
775 SetVector<Instruction *> LDSInstructions;
776 getLDSMemoryInstructions(Func, LDSInstructions);
777
778 // Create malloc block.
779 auto *MallocBlock = BasicBlock::Create(Ctx, "Malloc", Func, PrevEntryBlock);
780
781 // Create WIdBlock block which has instructions related to selection of
782 // {0,0,0} index work item in the work group.
783 auto *WIdBlock = BasicBlock::Create(Ctx, "WId", Func, MallocBlock);
784 IRB.SetInsertPoint(WIdBlock, WIdBlock->begin());
785 DebugLoc FirstDL =
786 getOrCreateDebugLoc(&*PrevEntryBlock->begin(), Func->getSubprogram());
787 IRB.SetCurrentDebugLocation(FirstDL);
788 Value *WIdx = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
789 Value *WIdy = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_y, {});
790 Value *WIdz = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_z, {});
// (x | y | z) == 0 holds only when all three ids are 0.
791 Value *XYOr = IRB.CreateOr(WIdx, WIdy);
792 Value *XYZOr = IRB.CreateOr(XYOr, WIdz);
793 Value *WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0));
794
795 // All work items will branch to PrevEntryBlock except {0,0,0} index
796 // work item which will branch to malloc block.
797 IRB.CreateCondBr(WIdzCond, MallocBlock, PrevEntryBlock);
798
799 // Malloc block
800 IRB.SetInsertPoint(MallocBlock, MallocBlock->begin());
801
802 // If Dynamic LDS globals are accessed by the kernel,
803 // Get the size of dyn lds from hidden dyn_lds_size kernel arg.
804 // Update the corresponding metadata global entries for this dyn lds global.
805 GlobalVariable *SwLDS = LDSParams.SwLDS;
806 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
807 assert(SwLDS && SwLDSMetadata);
808 StructType *MetadataStructType =
809 cast<StructType>(SwLDSMetadata->getValueType());
810 uint32_t MallocSize = 0;
811 Value *CurrMallocSize;
812 Type *Int32Ty = IRB.getInt32Ty();
813 Type *Int64Ty = IRB.getInt64Ty();
814
// Helper: add each global of LDSGlobals to UniqueLDSGlobals once.
815 SetVector<GlobalVariable *> UniqueLDSGlobals;
816 auto GetUniqueLDSGlobals = [&](SetVector<GlobalVariable *> &LDSGlobals) {
817 for (auto &GV : LDSGlobals) {
818 if (is_contained(UniqueLDSGlobals, GV))
819 continue;
820 UniqueLDSGlobals.insert(GV);
821 }
822 };
823
824 GetUniqueLDSGlobals(LDSParams.DirectAccess.StaticLDSGlobals);
825 GetUniqueLDSGlobals(LDSParams.IndirectAccess.StaticLDSGlobals);
// One more than the number of unique static LDS globals, so this is never 0
// and the `else` branch below (which would use MallocSize == 0) is not taken.
826 unsigned NumStaticLDS = 1 + UniqueLDSGlobals.size();
827 UniqueLDSGlobals.clear();
828
829 if (NumStaticLDS) {
// Initial malloc size = field 0 + field 2 of metadata entry
// (NumStaticLDS - 1); the variable names indicate these are that entry's
// offset and size.
830 auto *GEPForEndStaticLDSOffset =
831 IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
832 {ConstantInt::get(Int32Ty, 0),
833 ConstantInt::get(Int32Ty, NumStaticLDS - 1),
834 ConstantInt::get(Int32Ty, 0)});
835
836 auto *GEPForEndStaticLDSSize =
837 IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
838 {ConstantInt::get(Int32Ty, 0),
839 ConstantInt::get(Int32Ty, NumStaticLDS - 1),
840 ConstantInt::get(Int32Ty, 2)});
841
842 Value *EndStaticLDSOffset =
843 IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSOffset);
844 Value *EndStaticLDSSize = IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSSize);
845 CurrMallocSize = IRB.CreateAdd(EndStaticLDSOffset, EndStaticLDSSize);
846 } else
847 CurrMallocSize = IRB.getInt32(MallocSize);
848
849 if (LDSParams.SwDynLDS) {
// NOTE(review): source lines 850-851 are missing from this listing; the string
// below is the tail of a statement whose beginning is not visible here
// (presumably a code-object-version check that reports a fatal error; confirm
// against the real file).
852 "Dynamic LDS size query is only supported for CO V5 and later.");
853 // Get size from hidden dyn_lds_size argument of kernel
854 Value *ImplicitArg =
855 IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {});
856 Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP(
857 ImplicitArg->getType(), ImplicitArg,
858 {ConstantInt::get(Int64Ty, COV5_HIDDEN_DYN_LDS_SIZE_ARG)});
859 UniqueLDSGlobals.clear();
860 GetUniqueLDSGlobals(LDSParams.DirectAccess.DynamicLDSGlobals);
861 GetUniqueLDSGlobals(LDSParams.IndirectAccess.DynamicLDSGlobals);
// CurrMallocSize is passed by address, so the helper may replace it.
862 updateMallocSizeForDynamicLDS(Func, &CurrMallocSize, HiddenDynLDSSize,
863 UniqueLDSGlobals);
864 }
865
// The size was computed as i32; `__asan_malloc_impl` is declared with i64
// parameters below.
866 CurrMallocSize = IRB.CreateZExt(CurrMallocSize, Int64Ty);
867
868 // Create a call to malloc function which does device global memory allocation
869 // with size equals to all LDS global accesses size in this kernel.
870 Value *ReturnAddress =
871 IRB.CreateIntrinsic(Intrinsic::returnaddress, {IRB.getInt32(0)});
872 FunctionCallee MallocFunc = M.getOrInsertFunction(
873 StringRef("__asan_malloc_impl"),
874 FunctionType::get(Int64Ty, {Int64Ty, Int64Ty}, false));
875 Value *RAPtrToInt = IRB.CreatePtrToInt(ReturnAddress, Int64Ty);
876 Value *MallocCall = IRB.CreateCall(MallocFunc, {CurrMallocSize, RAPtrToInt});
877
878 Value *MallocPtr =
879 IRB.CreateIntToPtr(MallocCall, IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS));
880
881 // Create store of malloc to new global
882 IRB.CreateStore(MallocPtr, SwLDS);
883
884 // Create calls to __asan_poison_region to poison redzones.
885 poisonRedzones(Func, MallocPtr);
886
887 // Create branch to PrevEntryBlock
888 IRB.CreateBr(PrevEntryBlock);
889
890 // Create wave-group barrier at the starting of Previous entry block
891 Type *Int1Ty = IRB.getInt1Ty();
892 IRB.SetInsertPoint(PrevEntryBlock, PrevEntryBlock->begin());
// The PHI is true only when control arrived through the Malloc block; it is
// reused in CondFree to decide whether to branch to the Free block.
893 auto *XYZCondPhi = IRB.CreatePHI(Int1Ty, 2, "xyzCond");
894 XYZCondPhi->addIncoming(IRB.getInt1(0), WIdBlock);
895 XYZCondPhi->addIncoming(IRB.getInt1(1), MallocBlock);
896
897 IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {});
898
899 // Load malloc pointer from Sw LDS.
900 Value *LoadMallocPtr =
901 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), SwLDS);
902
903 // Replace All uses of LDS globals with new LDS pointers.
904 replaceKernelLDSAccesses(Func);
905
906 // Replace Memory Operations on LDS with corresponding
907 // global memory pointers.
908 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
909 LDSInstructions);
910
911 auto *CondFreeBlock = BasicBlock::Create(Ctx, "CondFree", Func);
912 auto *FreeBlock = BasicBlock::Create(Ctx, "Free", Func);
913 auto *EndBlock = BasicBlock::Create(Ctx, "End", Func);
// Redirect every existing return to CondFree. The three new blocks are still
// empty at this point, so the !BB.empty() check skips them.
914 for (BasicBlock &BB : *Func) {
915 if (!BB.empty()) {
916 if (ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back())) {
917 RI->eraseFromParent();
918 IRB.SetInsertPoint(&BB, BB.end());
919 IRB.CreateBr(CondFreeBlock);
920 }
921 }
922 }
923
924 // Cond Free Block
925 IRB.SetInsertPoint(CondFreeBlock, CondFreeBlock->begin());
926 IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {});
927 IRB.CreateCondBr(XYZCondPhi, FreeBlock, EndBlock);
928
929 // Free Block
930 IRB.SetInsertPoint(FreeBlock, FreeBlock->begin());
931
932 // Free the previously allocated device global memory.
933 FunctionCallee AsanFreeFunc = M.getOrInsertFunction(
934 StringRef("__asan_free_impl"),
935 FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false));
936 Value *ReturnAddr =
937 IRB.CreateIntrinsic(Intrinsic::returnaddress, IRB.getInt32(0));
938 Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty);
939 Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty);
940 IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt});
941
942 IRB.CreateBr(EndBlock);
943
944 // End Block
945 IRB.SetInsertPoint(EndBlock, EndBlock->begin());
946 IRB.CreateRetVoid();
947 // Update the DomTree with corresponding links to basic blocks.
// NOTE(review): the conditional branches created above also add the edges
// WIdBlock -> PrevEntryBlock and CondFreeBlock -> EndBlock, and each rewritten
// return adds an edge into CondFreeBlock; none of those appear in this list.
// Confirm whether that is intentional.
948 DTU.applyUpdates({{DominatorTree::Insert, WIdBlock, MallocBlock},
949 {DominatorTree::Insert, MallocBlock, PrevEntryBlock},
950 {DominatorTree::Insert, CondFreeBlock, FreeBlock},
951 {DominatorTree::Insert, FreeBlock, EndBlock}});
952}
953
// Builds a constant array with one global-address-space pointer per entry of
// `Variables`, for the kernel `Func`.
//
// For a variable found in the kernel's LDSToReplacementIndicesMap, the element
// is a constant inbounds GEP into the kernel's Sw LDS metadata global, using
// the three indices recorded in the map. Variables not in the map take the
// early `continue` path (see the note there).
//
// Returns the ConstantArray of type [Variables.size() x ptr addrspace(GLOBAL)].
954Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
955 Function *Func, SetVector<GlobalVariable *> &Variables) {
956 Type *Int32Ty = IRB.getInt32Ty();
957 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
958
959 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
960 assert(SwLDSMetadata);
961 auto *SwLDSMetadataStructType =
962 cast<StructType>(SwLDSMetadata->getValueType());
963 ArrayType *KernelOffsetsType =
964 ArrayType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), Variables.size());
965
// NOTE(review): source line 966 is missing from this listing; `Elements` is
// used below but its declaration is not visible here.
967 for (auto *GV : Variables) {
968 auto It = LDSParams.LDSToReplacementIndicesMap.find(GV);
969 if (It == LDSParams.LDSToReplacementIndicesMap.end()) {
// NOTE(review): source line 971 (the argument of this push_back) is missing
// from this listing. Presumably a placeholder constant is pushed for a
// variable this kernel does not replace; confirm against the real file.
970 Elements.push_back(
972 continue;
973 }
// Indices[0..2] are the GEP indices into the metadata struct for this global.
974 auto &Indices = It->second;
975 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
976 ConstantInt::get(Int32Ty, Indices[1]),
977 ConstantInt::get(Int32Ty, Indices[2])};
978 Constant *GEP = ConstantExpr::getGetElementPtr(SwLDSMetadataStructType,
979 SwLDSMetadata, GEPIdx, true);
980 Elements.push_back(GEP);
981 }
982 return ConstantArray::get(KernelOffsetsType, Elements);
983}
984
// Creates the internal constant global "llvm.amdgcn.sw.lds.base.table" and
// stores it in NKLDSParams.LDSBaseTable.
//
// The table is a one-dimensional array with one LOCAL_ADDRESS pointer per
// kernel in NKLDSParams.OrderedKernels (same order); element i is a constant
// GEP on that kernel's Sw LDS global. Does nothing when there are no kernels.
985void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
986 NonKernelLDSParameters &NKLDSParams) {
987 // Base table will have single row, with elements of the row
988 // placed as per kernel ID. Each element in the row corresponds
989 // to address of "SW LDS" global of the kernel.
990 auto &Kernels = NKLDSParams.OrderedKernels;
991 if (Kernels.empty())
992 return;
993 Type *Int32Ty = IRB.getInt32Ty();
994 const size_t NumberKernels = Kernels.size();
995 ArrayType *AllKernelsOffsetsType =
996 ArrayType::get(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), NumberKernels);
997 std::vector<Constant *> OverallConstantExprElts(NumberKernels);
998 for (size_t i = 0; i < NumberKernels; i++) {
999 Function *Func = Kernels[i];
1000 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1001 GlobalVariable *SwLDS = LDSParams.SwLDS;
1002 assert(SwLDS);
// A GEP with the single index 0 on SwLDS.
1003 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, 0)};
1004 Constant *GEP =
1005 ConstantExpr::getGetElementPtr(SwLDS->getType(), SwLDS, GEPIdx, true);
1006 OverallConstantExprElts[i] = GEP;
1007 }
1008 Constant *init =
1009 ConstantArray::get(AllKernelsOffsetsType, OverallConstantExprElts);
1010 NKLDSParams.LDSBaseTable = new GlobalVariable(
1011 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
1012 "llvm.amdgcn.sw.lds.base.table", nullptr, GlobalValue::NotThreadLocal,
// NOTE(review): source lines 1013-1014 are missing from this listing: the
// remaining constructor argument(s) and the declaration of `MD` are not
// visible here.
// Sets NoAddress in the table's sanitizer metadata.
1015 MD.NoAddress = true;
1016 NKLDSParams.LDSBaseTable->setSanitizerMetadata(MD);
1017}
1018
// Creates the internal constant global "llvm.amdgcn.sw.lds.offset.table" and
// stores it in NKLDSParams.LDSOffsetTable.
//
// The table is a two-dimensional array: one row per kernel in
// NKLDSParams.OrderedKernels, one column per global in
// NKLDSParams.OrdereLDSGlobals. Each row is produced by
// getAddressesOfVariablesInKernel. Does nothing when either list is empty.
1019void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable(
1020 NonKernelLDSParameters &NKLDSParams) {
1021 // Offset table will have multiple rows and columns.
1022 // Rows are assumed to be from 0 to (n-1). n is total number
1023 // of kernels accessing the LDS through non-kernels.
1024 // Each row will have m elements. m is the total number of
1025 // unique LDS globals accessed by non-kernels.
1026 // Each element in the row corresponds to the address of
1027 // the replacement of LDS global done by that particular kernel.
1028 auto &Variables = NKLDSParams.OrdereLDSGlobals;
1029 auto &Kernels = NKLDSParams.OrderedKernels;
1030 if (Variables.empty() || Kernels.empty())
1031 return;
1032 const size_t NumberVariables = Variables.size();
1033 const size_t NumberKernels = Kernels.size();
1034
// Row type: [NumberVariables x ptr addrspace(GLOBAL)].
1035 ArrayType *KernelOffsetsType =
1036 ArrayType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), NumberVariables);
1037
// Table type: [NumberKernels x row type].
1038 ArrayType *AllKernelsOffsetsType =
1039 ArrayType::get(KernelOffsetsType, NumberKernels);
1040 std::vector<Constant *> overallConstantExprElts(NumberKernels);
1041 for (size_t i = 0; i < NumberKernels; i++) {
1042 Function *Func = Kernels[i];
1043 overallConstantExprElts[i] =
1044 getAddressesOfVariablesInKernel(Func, Variables);
1045 }
1046 Constant *Init =
1047 ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);
1048 NKLDSParams.LDSOffsetTable = new GlobalVariable(
1049 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, Init,
1050 "llvm.amdgcn.sw.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
// NOTE(review): source lines 1051-1052 are missing from this listing: the
// remaining constructor argument(s) and the declaration of `MD` are not
// visible here.
// Sets NoAddress in the table's sanitizer metadata.
1053 MD.NoAddress = true;
1054 NKLDSParams.LDSOffsetTable->setSanitizerMetadata(MD);
1055}
1056
1057void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
1058 Function *Func, SetVector<GlobalVariable *> &LDSGlobals,
1059 NonKernelLDSParameters &NKLDSParams) {
1060 // Replace LDS access in non-kernel with replacement queried from
1061 // Base table and offset from offset table.
1062 LLVM_DEBUG(dbgs() << "Sw LDS lowering, lower non-kernel access for : "
1063 << Func->getName());
1064 auto InsertAt = Func->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
1065 IRB.SetInsertPoint(InsertAt);
1066
1067 // Get LDS memory instructions.
1068 SetVector<Instruction *> LDSInstructions;
1069 getLDSMemoryInstructions(Func, LDSInstructions);
1070
1071 auto *KernelId = IRB.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {});
1072 GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable;
1073 GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable;
1074 auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals;
1075 Value *BaseGEP = IRB.CreateInBoundsGEP(
1076 LDSBaseTable->getValueType(), LDSBaseTable, {IRB.getInt32(0), KernelId});
1077 Value *BaseLoad =
1078 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), BaseGEP);
1079 Value *LoadMallocPtr =
1080 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), BaseLoad);
1081
1082 for (GlobalVariable *GV : LDSGlobals) {
1083 const auto *GVIt = llvm::find(OrdereLDSGlobals, GV);
1084 assert(GVIt != OrdereLDSGlobals.end());
1085 uint32_t GVOffset = std::distance(OrdereLDSGlobals.begin(), GVIt);
1086
1087 Value *OffsetGEP = IRB.CreateInBoundsGEP(
1088 LDSOffsetTable->getValueType(), LDSOffsetTable,
1089 {IRB.getInt32(0), KernelId, IRB.getInt32(GVOffset)});
1090 Value *OffsetLoad =
1091 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), OffsetGEP);
1092 Value *Offset = IRB.CreateLoad(IRB.getInt32Ty(), OffsetLoad);
1093 Value *BasePlusOffset =
1094 IRB.CreateInBoundsGEP(IRB.getInt8Ty(), BaseLoad, {Offset});
1095 LLVM_DEBUG(dbgs() << "Sw LDS Lowering, Replace non-kernel LDS for "
1096 << GV->getName());
1097 replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
1098 }
1099 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
1100 LDSInstructions);
1101}
1102
1103static void reorderStaticDynamicIndirectLDSSet(KernelLDSParameters &LDSParams) {
1104 // Sort Static, dynamic LDS globals which are either
1105 // direct or indirect access on basis of name.
1106 auto &DirectAccess = LDSParams.DirectAccess;
1107 auto &IndirectAccess = LDSParams.IndirectAccess;
1108 LDSParams.DirectAccess.StaticLDSGlobals = sortByName(
1109 std::vector<GlobalVariable *>(DirectAccess.StaticLDSGlobals.begin(),
1110 DirectAccess.StaticLDSGlobals.end()));
1111 LDSParams.DirectAccess.DynamicLDSGlobals = sortByName(
1112 std::vector<GlobalVariable *>(DirectAccess.DynamicLDSGlobals.begin(),
1113 DirectAccess.DynamicLDSGlobals.end()));
1114 LDSParams.IndirectAccess.StaticLDSGlobals = sortByName(
1115 std::vector<GlobalVariable *>(IndirectAccess.StaticLDSGlobals.begin(),
1116 IndirectAccess.StaticLDSGlobals.end()));
1117 LDSParams.IndirectAccess.DynamicLDSGlobals = sortByName(
1118 std::vector<GlobalVariable *>(IndirectAccess.DynamicLDSGlobals.begin(),
1119 IndirectAccess.DynamicLDSGlobals.end()));
1120}
1121
// Queries the AddressSanitizer shadow-mapping parameters for the target and
// caches the scale and offset in AsanInfo. The pointer width passed to the
// query is that of the GLOBAL address space.
1122void AMDGPUSwLowerLDS::initAsanInfo() {
1123 // Get Shadow mapping scale and offset.
1124 unsigned LongSize =
1125 M.getDataLayout().getPointerSizeInBits(AMDGPUAS::GLOBAL_ADDRESS);
// NOTE(review): source line 1126 is missing from this listing; `Offset` is
// used below but its declaration is not visible here.
1127 int Scale;
1128 bool OrShadowOffset;
// OrShadowOffset is filled in by the call but not used afterwards.
1129 llvm::getAddressSanitizerParams(AMDGPUTM.getTargetTriple(), LongSize, false,
1130 &Offset, &Scale, &OrShadowOffset);
1131 AsanInfo.Scale = Scale;
1132 AsanInfo.Offset = Offset;
1133}
1134
1135static bool hasFnWithSanitizeAddressAttr(FunctionVariableMap &LDSAccesses) {
1136 for (auto &K : LDSAccesses) {
1137 Function *F = K.first;
1138 if (!F)
1139 continue;
1140 if (F->hasFnAttribute(Attribute::SanitizeAddress))
1141 return true;
1142 }
1143 return false;
1144}
1145
// Driver of the lowering. Returns true when the module was changed.
//
// Visible steps:
//   1. Compute the transitive direct/indirect LDS uses per kernel.
//   2. Bail out unless some function in those maps has the SanitizeAddress
//      attribute.
//   3. Classify every kernel's LDS globals into direct/indirect and
//      static/dynamic sets.
//   4. For each kernel with any LDS global: build the Sw LDS globals and
//      metadata, then lower the kernel's accesses.
//   5. Build the base/offset tables and lower the non-kernel functions.
//   6. Erase LDS globals that have no uses left.
//   7. When AsanInstrumentLDS is set, instrument the memory operations that
//      were recorded in AsanInfo.Instructions.
1146bool AMDGPUSwLowerLDS::run() {
1147 bool Changed = false;
1148
1149 CallGraph CG = CallGraph(M);
1150
// NOTE(review): source line 1151 is missing from this listing.
1152
1153 // Get all the direct and indirect access of LDS for all the kernels.
1154 LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
1155
1156 // Flag to decide whether to lower all the LDS accesses
1157 // based on sanitize_address attribute.
1158 bool LowerAllLDS = hasFnWithSanitizeAddressAttr(LDSUsesInfo.direct_access) ||
1159 hasFnWithSanitizeAddressAttr(LDSUsesInfo.indirect_access);
1160
1161 if (!LowerAllLDS)
1162 return Changed;
1163
1164 // Utility to group LDS access into direct, indirect, static and dynamic.
1165 auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
1166 bool DirectAccess) {
1167 for (auto &K : LDSAccesses) {
1168 Function *F = K.first;
1169 if (!F || K.second.empty())
1170 continue;
1171
// NOTE(review): source line 1172 is missing from this listing.
1173
1174 // Only inserts if key isn't already in the map.
1175 FuncLDSAccessInfo.KernelToLDSParametersMap.insert(
1176 {F, KernelLDSParameters()});
1177
1178 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[F];
1179 if (!DirectAccess)
1180 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(F);
1181 for (GlobalVariable *GV : K.second) {
1182 if (!DirectAccess) {
1183 if (AMDGPU::isDynamicLDS(*GV))
1184 LDSParams.IndirectAccess.DynamicLDSGlobals.insert(GV);
1185 else
1186 LDSParams.IndirectAccess.StaticLDSGlobals.insert(GV);
// Indirectly accessed globals are also recorded module-wide; that set is
// passed to getOrderedNonKernelAllLDSGlobals further down.
1187 FuncLDSAccessInfo.AllNonKernelLDSAccess.insert(GV);
1188 } else {
1189 if (AMDGPU::isDynamicLDS(*GV))
1190 LDSParams.DirectAccess.DynamicLDSGlobals.insert(GV);
1191 else
1192 LDSParams.DirectAccess.StaticLDSGlobals.insert(GV);
1193 }
1194 }
1195 }
1196 };
1197
1198 PopulateKernelStaticDynamicLDS(LDSUsesInfo.direct_access, true);
1199 PopulateKernelStaticDynamicLDS(LDSUsesInfo.indirect_access, false);
1200
1201 // Get address sanitizer scale.
1202 initAsanInfo();
1203
1204 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
1205 Function *Func = K.first;
1206 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1207 if (LDSParams.DirectAccess.StaticLDSGlobals.empty() &&
1208 LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
1209 LDSParams.IndirectAccess.StaticLDSGlobals.empty() &&
1210 LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) {
// NOTE(review): this assignment would discard a `true` set by an earlier
// iteration. The populate lambda above only adds map entries for non-empty
// access sets, so this branch may be unreachable in practice; confirm.
1211 Changed = false;
1212 } else {
// NOTE(review): source line 1213 is missing from this listing; the next three
// lines are the arguments of a call whose callee is not visible here.
1214 CG, Func,
1215 {"amdgpu-no-workitem-id-x", "amdgpu-no-workitem-id-y",
1216 "amdgpu-no-workitem-id-z", "amdgpu-no-heap-ptr"});
1217 if (!LDSParams.IndirectAccess.StaticLDSGlobals.empty() ||
1218 !LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
1219 removeFnAttrFromReachable(CG, Func, {"amdgpu-no-lds-kernel-id"});
1220 reorderStaticDynamicIndirectLDSSet(LDSParams);
1221 buildSwLDSGlobal(Func);
1222 buildSwDynLDSGlobal(Func);
1223 populateSwMetadataGlobal(Func);
1224 populateSwLDSAttributeAndMetadata(Func);
1225 populateLDSToReplacementIndicesMap(Func);
1226 DomTreeUpdater DTU(DTCallback(*Func),
1227 DomTreeUpdater::UpdateStrategy::Lazy);
1228 lowerKernelLDSAccesses(Func, DTU);
1229 Changed = true;
1230 }
1231 }
1232
1233 // Get the Uses of LDS from non-kernels.
1234 getUsesOfLDSByNonKernels();
1235
1236 // Get non-kernels with LDS ptr as argument and called by kernels.
1237 getNonKernelsWithLDSArguments(CG);
1238
1239 // Lower LDS accesses in non-kernels.
1240 if (!FuncLDSAccessInfo.NonKernelToLDSAccessMap.empty() ||
1241 !FuncLDSAccessInfo.NonKernelsWithLDSArgument.empty()) {
1242 NonKernelLDSParameters NKLDSParams;
1243 NKLDSParams.OrderedKernels = getOrderedIndirectLDSAccessingKernels(
1244 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess);
1245 NKLDSParams.OrdereLDSGlobals = getOrderedNonKernelAllLDSGlobals(
1246 FuncLDSAccessInfo.AllNonKernelLDSAccess);
1247 buildNonKernelLDSBaseTable(NKLDSParams);
1248 buildNonKernelLDSOffsetTable(NKLDSParams);
1249 for (auto &K : FuncLDSAccessInfo.NonKernelToLDSAccessMap) {
1250 Function *Func = K.first;
1251 DenseSet<GlobalVariable *> &LDSGlobals = K.second;
// The DenseSet is copied into a vector and passed through sortByName before
// lowering.
1252 SetVector<GlobalVariable *> OrderedLDSGlobals = sortByName(
1253 std::vector<GlobalVariable *>(LDSGlobals.begin(), LDSGlobals.end()));
1254 lowerNonKernelLDSAccesses(Func, OrderedLDSGlobals, NKLDSParams);
1255 }
// Functions that only receive LDS pointers as arguments; those already handled
// by the loop above are skipped.
1256 for (Function *Func : FuncLDSAccessInfo.NonKernelsWithLDSArgument) {
1257 auto &K = FuncLDSAccessInfo.NonKernelToLDSAccessMap;
1258 if (K.contains(Func))
1259 continue;
// NOTE(review): source line 1260 is missing from this listing; `Vec` is used
// below but its declaration is not visible here.
1261 lowerNonKernelLDSAccesses(Func, Vec, NKLDSParams);
1262 }
1263 Changed = true;
1264 }
1265
1266 if (!Changed)
1267 return Changed;
1268
1269 for (auto &GV : make_early_inc_range(M.globals())) {
// NOTE(review): source lines 1270 and 1272 are missing from this listing; the
// extra closing brace at 1275 matches a statement opened on one of them.
1271 // probably want to remove from used lists
1273 if (GV.use_empty())
1274 GV.eraseFromParent();
1275 }
1276 }
1277
1278 if (AsanInstrumentLDS) {
// First gather every interesting memory operand of the recorded instructions,
// then instrument each one.
1279 SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
1280 for (Instruction *Inst : AsanInfo.Instructions) {
1281 SmallVector<InterestingMemoryOperand, 1> InterestingOperands;
1282 getInterestingMemoryOperands(M, Inst, InterestingOperands);
1283 llvm::append_range(OperandsToInstrument, InterestingOperands);
1284 }
1285 for (auto &Operand : OperandsToInstrument) {
1286 Value *Addr = Operand.getPtr();
1287 instrumentAddress(M, IRB, Operand.getInsn(), Operand.getInsn(), Addr,
1288 Operand.Alignment.valueOrOne(), Operand.TypeStoreSize,
1289 Operand.IsWrite, nullptr, false, false, AsanInfo.Scale,
1290 AsanInfo.Offset);
1291 Changed = true;
1292 }
1293 }
1294
1295 return Changed;
1296}
1297
// Legacy pass manager wrapper around AMDGPUSwLowerLDS.
1298class AMDGPUSwLowerLDSLegacy : public ModulePass {
1299public:
// Target machine handed to the implementation. May be null at construction;
// runOnModule then obtains it from TargetPassConfig.
1300 const AMDGPUTargetMachine *AMDGPUTM;
1301 static char ID;
1302 AMDGPUSwLowerLDSLegacy(const AMDGPUTargetMachine *TM)
1303 : ModulePass(ID), AMDGPUTM(TM) {}
1304 bool runOnModule(Module &M) override;
1305 void getAnalysisUsage(AnalysisUsage &AU) const override {
// NOTE(review): source line 1306 (the body of getAnalysisUsage) is missing
// from this listing.
1307 }
1308};
1309} // namespace
1310
// Definition of the legacy pass ID, and the reference to it that is defined
// in namespace llvm.
1311char AMDGPUSwLowerLDSLegacy::ID = 0;
1312char &llvm::AMDGPUSwLowerLDSLegacyPassID = AMDGPUSwLowerLDSLegacy::ID;
1313
// Legacy pass registration under the name "amdgpu-sw-lower-lds".
1314INITIALIZE_PASS_BEGIN(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
1315 "AMDGPU Software lowering of LDS", false, false)
// NOTE(review): source line 1316 is missing from this listing (presumably an
// INITIALIZE_PASS_DEPENDENCY entry; confirm against the real file).
1317INITIALIZE_PASS_END(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
1318 "AMDGPU Software lowering of LDS", false, false)
1319
1320bool AMDGPUSwLowerLDSLegacy::runOnModule(Module &M) {
1321 // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1322 // instrumented the IR. Return early if the flag is not present.
1323 if (!M.getModuleFlag("nosanitize_address"))
1324 return false;
1325 DominatorTreeWrapperPass *const DTW =
1326 getAnalysisIfAvailable<DominatorTreeWrapperPass>();
1327 auto DTCallback = [&DTW](Function &F) -> DominatorTree * {
1328 return DTW ? &DTW->getDomTree() : nullptr;
1329 };
1330 if (!AMDGPUTM) {
1331 auto &TPC = getAnalysis<TargetPassConfig>();
1332 AMDGPUTM = &TPC.getTM<AMDGPUTargetMachine>();
1333 }
1334 AMDGPUSwLowerLDS SwLowerLDSImpl(M, *AMDGPUTM, DTCallback);
1335 bool IsChanged = SwLowerLDSImpl.run();
1336 return IsChanged;
1337}
1338
// Factory returning a newly allocated AMDGPUSwLowerLDSLegacy pass constructed
// from `TM`.
// NOTE(review): source line 1340 (the function name and parameter list) is
// missing from this listing.
1339ModulePass *
1341 return new AMDGPUSwLowerLDSLegacy(TM);
1342}
1343
// NOTE(review): the signature of this function (source lines 1343-1345) is
// missing from this listing; from the body it takes a module `M` and an
// analysis manager `AM`, and returns PreservedAnalyses.
//
// Reports all analyses preserved when the "nosanitize_address" module flag is
// absent or when the lowering made no change.
1346 // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1347 // instrumented the IR. Return early if the flag is not present.
1348 if (!M.getModuleFlag("nosanitize_address"))
1349 return PreservedAnalyses::all();
1350 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1351 auto DTCallback = [&FAM](Function &F) -> DominatorTree * {
// NOTE(review): source line 1352 (the lambda body) is missing from this
// listing.
1353 };
1354 AMDGPUSwLowerLDS SwLowerLDSImpl(M, TM, DTCallback);
1355 bool IsChanged = SwLowerLDSImpl.run();
1356 if (!IsChanged)
1357 return PreservedAnalyses::all();
1358
// NOTE(review): source lines 1359-1360 are missing from this listing; `PA` is
// returned below but its construction is not visible here.
1361 return PA;
1362}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu sw lower lds
amdgpu sw lower AMDGPU Software lowering of LDS
#define COV5_HIDDEN_DYN_LDS_SIZE_ARG
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
uint64_t Addr
Hexagon Common GEP
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:39
pre isel intrinsic lowering
This file implements a set that has insertion order iteration characteristics.
static Split data
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition: Debug.h:119
Target-Independent Code Generator Pass Configuration Options pass.
static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore, DISubprogram *SP)
This class represents a conversion between pointers from one address space to another.
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:255
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:412
Represent the analysis usage information of a pass.
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:506
void setVolatile(bool V)
Specify whether this is a volatile cmpxchg.
Definition: Instructions.h:564
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:709
void setVolatile(bool V)
Specify whether this is a volatile RMW or not.
Definition: Instructions.h:857
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:206
A node in the call graph for a module.
Definition: CallGraph.h:162
Function * getFunction() const
Returns the function that this call graph node represents.
Definition: CallGraph.h:193
The basic data container for the call graph of a Module of IR.
Definition: CallGraph.h:72
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1314
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:535
static Constant * getGetElementPtr(Type *Ty, Constant *C, ArrayRef< Constant * > IdxList, GEPNoWrapFlags NW=GEPNoWrapFlags::none(), std::optional< ConstantRange > InRange=std::nullopt, Type *OnlyIfReducedTy=nullptr)
Getelementptr form.
Definition: Constants.h:1274
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1380
This is an important base class in LLVM.
Definition: Constant.h:43
LLVM_ABI void removeDeadConstantUsers() const
If there are any dead constant users dangling off of this constant, remove them.
Definition: Constants.cpp:739
Subprogram description. Uses SubclassData1.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
A debug info location.
Definition: DebugLoc.h:124
Implements a dense probed hash-table based set.
Definition: DenseSet.h:263
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:284
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:322
DominatorTree & getDomTree()
Definition: Dominators.h:330
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:165
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
arg_iterator arg_end()
Definition: Function.h:875
arg_iterator arg_begin()
Definition: Function.h:866
void applyUpdates(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set a particular kind of metadata attachment.
Definition: Metadata.cpp:1571
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:316
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:296
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:60
@ ExternalLinkage
Externally visible function.
Definition: GlobalValue.h:53
Type * getValueType() const
Definition: GlobalValue.h:298
uint64_t getAlignment() const
FIXME: Remove this function once transition to Align is over.
LLVM_ABI void eraseFromParent()
eraseFromParent - This method unlinks 'this' from the containing module and deletes it.
Definition: Globals.cpp:507
Value * CreateConstInBoundsGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1946
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2508
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:585
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:513
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
An instruction for reading from memory.
Definition: Instructions.h:180
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:245
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:96
Metadata node.
Definition: Metadata.h:1077
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1565
void push_back(MachineInstr *MI)
Root of the metadata hierarchy.
Definition: Metadata.h:63
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:255
virtual bool runOnModule(Module &M)=0
runOnModule - Virtual method overriden by subclasses to process the module being operated on.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
A container for an operand bundle being viewed as a set of values rather than a set of uses.
Definition: InstrTypes.h:1069
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:112
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1885
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:118
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition: Analysis.h:132
Return a value (possibly void), from a function.
A vector that has set insertion semantics.
Definition: SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:104
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:119
void clear()
Completely clear the SetVector.
Definition: SetVector.h:284
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:99
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:109
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:168
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
An instruction for storing to memory.
Definition: Instructions.h:296
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:369
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
Class to represent struct types.
Definition: DerivedTypes.h:218
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:620
Target-Independent Code Generator Pass Configuration Options.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546
iterator_range< user_iterator > users()
Definition: Value.h:426
LLVM_ABI void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
Definition: AsmWriter.cpp:5305
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:554
bool use_empty() const
Definition: Value.h:346
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
An efficient, type-erasing, non-owning reference to a callable.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:662
A raw_ostream that writes to a SmallVector or SmallString.
Definition: raw_ostream.h:692
StringRef str() const
Return a StringRef for the vector contents.
Definition: raw_ostream.h:721
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
void getInterestingMemoryOperands(Module &M, Instruction *I, SmallVectorImpl< InterestingMemoryOperand > &Interesting)
Get all the memory operands from the instruction that need to be instrumented.
bool isDynamicLDS(const GlobalVariable &GV)
unsigned getAMDHSACodeObjectVersion(const Module &M)
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot, ArrayRef< StringRef > FnAttrs)
Strip FnAttr attribute from any functions where we may have introduced its use.
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M)
bool isLDSVariableToLower(const GlobalVariable &GV)
bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M)
Align getAlign(const DataLayout &DL, const GlobalVariable *GV)
bool isKernelLDS(const Function *F)
void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns, Instruction *InsertBefore, Value *Addr, Align Alignment, TypeSize TypeStoreSize, bool IsWrite, Value *SizeArgument, bool UseCalls, bool Recover, int AsanScale, int AsanOffset)
Instrument the memory operand Addr.
uint64_t getRedzoneSizeForGlobal(int AsanScale, uint64_t SizeInBytes)
Given SizeInBytes of the Value to be instrumented, returns the redzone size corresponding to it.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:751
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:477
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1770
ModulePass * createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine *TM=nullptr)
constexpr from_range_t from_range
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2155
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:663
char & AMDGPUSwLowerLDSLegacyPassID
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1669
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
void getAddressSanitizerParams(const Triple &TargetTriple, int LongSize, bool IsKasan, uint64_t *ShadowBase, int *MappingScale, bool *OrShadowOffset)
const AMDGPUTargetMachine & TM
Definition: AMDGPU.h:307
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
FunctionVariableMap direct_access
FunctionVariableMap indirect_access
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39