NVPTXLowerArgs.cpp
1//===-- NVPTXLowerArgs.cpp - Lower arguments ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9//
10// Arguments to kernel and device functions are passed via param space,
11// which imposes certain restrictions:
12// http://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces
13//
14// Kernel parameters are read-only and accessible only via ld.param
15// instruction, directly or via a pointer.
16//
17// Device function parameters are directly accessible via
18// ld.param/st.param, but taking the address of one returns a pointer
19// to a copy created in local space which *can't* be used with
20// ld.param/st.param.
21//
22// Copying a byval struct into local memory in IR allows us to enforce
23// the param space restrictions, gives the rest of IR a pointer w/o
24// param space restrictions, and gives us an opportunity to eliminate
25// the copy.
26//
27// Pointer arguments to kernel functions need more work to be lowered:
28//
29// 1. Convert non-byval pointer arguments of CUDA kernels to pointers in the
30// global address space. This allows later optimizations to emit
31// ld.global.*/st.global.* for accessing these pointer arguments. For
32// example,
33//
34// define void @foo(float* %input) {
35// %v = load float, float* %input, align 4
36// ...
37// }
38//
39// becomes
40//
41// define void @foo(float* %input) {
42// %input2 = addrspacecast float* %input to float addrspace(1)*
43// %input3 = addrspacecast float addrspace(1)* %input2 to float*
44// %v = load float, float* %input3, align 4
45// ...
46// }
47//
48// Later, NVPTXInferAddressSpaces will optimize it to
49//
50// define void @foo(float* %input) {
51// %input2 = addrspacecast float* %input to float addrspace(1)*
52// %v = load float, float addrspace(1)* %input2, align 4
53// ...
54// }
55//
56// 2. Convert byval kernel parameters to pointers in the param address space
57// (so that NVPTX emits ld/st.param). Convert pointers *within* a byval
58// kernel parameter to pointers in the global address space. This allows
59// NVPTX to emit ld/st.global.
60//
61// struct S {
62// int *x;
63// int *y;
64// };
65// __global__ void foo(S s) {
66// int *b = s.y;
67// // use b
68// }
69//
70// "b" points to the global address space. In the IR level,
71//
72// define void @foo(ptr byval %input) {
73// %b_ptr = getelementptr {ptr, ptr}, ptr %input, i64 0, i32 1
74// %b = load ptr, ptr %b_ptr
75// ; use %b
76// }
77//
78// becomes
79//
80// define void @foo(ptr byval %input) {
81// %b_param = addrspacecast ptr %input to ptr addrspace(101)
82// %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param, i64 0, i32 1
83// %b = load ptr, ptr addrspace(101) %b_ptr
84// %b_global = addrspacecast ptr %b to ptr addrspace(1)
85// ; %b_global is then cast back to generic and used as %b_generic
86// }
87//
88// Create a local copy (in an alloca) of any kernel byval parameter that is used
89// in a way that *might* mutate the parameter. Mutations to "grid_constant"
90// parameters are undefined behaviour, and don't require local copies.
91//
92// define void @foo(ptr byval(%struct.s) align 4 %input) {
93// store i32 42, ptr %input
94// ret void
95// }
96//
97// becomes
98//
99// define void @foo(ptr byval(%struct.s) align 4 %input) #1 {
100// %input1 = alloca %struct.s, align 4
101// %input2 = addrspacecast ptr %input to ptr addrspace(101)
102// %input3 = load %struct.s, ptr addrspace(101) %input2, align 4
103// store %struct.s %input3, ptr %input1, align 4
104// store i32 42, ptr %input1, align 4
105// ret void
106// }
107//
108// If %input were passed to a device function, or written to memory,
109// conservatively assume that %input gets mutated, and create a local copy.
110//
111// Convert param pointers to grid_constant byval kernel parameters that are
112// passed into calls (device functions, intrinsics, inline asm), or otherwise
113// "escape" (into stores/ptrtoints) to the generic address space, using the
114// `nvvm.ptr.param.to.gen` intrinsic, so that NVPTX emits cvta.param
115// (available for sm70+)
116//
117// define void @foo(ptr byval(%struct.s) %input) {
118// ; %input is a grid_constant
119// %call = call i32 @escape(ptr %input)
120// ret void
121// }
122//
123// becomes
124//
125// define void @foo(ptr byval(%struct.s) %input) {
126// %input1 = addrspacecast ptr %input to ptr addrspace(101)
127// ; the following intrinsic converts the pointer to generic. We don't use an
128// ; addrspacecast, so that the generic -> param -> generic pair doesn't get cancelled out
129// %input1.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)
130// %call = call i32 @escape(ptr %input1.gen)
131// ret void
132// }
133//
134// TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't
135// cancel the addrspacecast pair this pass emits.
136//===----------------------------------------------------------------------===//
137
139#include "NVPTX.h"
140#include "NVPTXTargetMachine.h"
141#include "NVPTXUtilities.h"
142#include "llvm/ADT/STLExtras.h"
146#include "llvm/IR/Function.h"
147#include "llvm/IR/IRBuilder.h"
148#include "llvm/IR/Instructions.h"
150#include "llvm/IR/IntrinsicsNVPTX.h"
151#include "llvm/IR/Type.h"
153#include "llvm/Pass.h"
154#include "llvm/Support/Debug.h"
157#include <numeric>
158#include <queue>
159
160#define DEBUG_TYPE "nvptx-lower-args"
161
162using namespace llvm;
163
164namespace {
165class NVPTXLowerArgsLegacyPass : public FunctionPass {
166 bool runOnFunction(Function &F) override;
167
168public:
169 static char ID; // Pass identification, replacement for typeid
170 NVPTXLowerArgsLegacyPass() : FunctionPass(ID) {}
171 StringRef getPassName() const override {
172 return "Lower pointer arguments of CUDA kernels";
173 }
174 void getAnalysisUsage(AnalysisUsage &AU) const override {
175 AU.addRequired<TargetPassConfig>();
176 }
177};
178} // namespace
179
180char NVPTXLowerArgsLegacyPass::ID = 1;
181
182INITIALIZE_PASS_BEGIN(NVPTXLowerArgsLegacyPass, "nvptx-lower-args",
183 "Lower arguments (NVPTX)", false, false)
184INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
185INITIALIZE_PASS_END(NVPTXLowerArgsLegacyPass, "nvptx-lower-args",
186 "Lower arguments (NVPTX)", false, false)
187
188// =============================================================================
189// If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
190// and we can't guarantee that the only accesses are loads,
191// then add the following instructions to the first basic block:
192//
193// %temp = alloca %struct.x, align 8
194// %tempd = addrspacecast %struct.x* %d to %struct.x addrspace(101)*
195// %tv = load %struct.x addrspace(101)* %tempd
196// store %struct.x %tv, %struct.x* %temp, align 8
197//
198// The above code allocates some space in the stack and copies the incoming
199// struct from param space to local space.
200// Then replace all occurrences of %d by %temp.
201//
202// In case we know that all users are GEPs or Loads, replace them with the same
203// ones in parameter AS, so we can access them using ld.param.
204// =============================================================================
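//
// A sketch of that read-only rewrite (illustrative IR, not verbatim output;
// in the current implementation the param-space pointer is produced by the
// llvm.nvvm.internal.addrspace.wrap intrinsic rather than an addrspacecast):
//
//   %p = getelementptr %struct.x, ptr %d, i64 0, i32 1
//   %v = load i32, ptr %p
//
// becomes
//
//   %d.param = <param-space view of %d>                  ; ptr addrspace(101)
//   %p.param = getelementptr %struct.x, ptr addrspace(101) %d.param, i64 0, i32 1
//   %v = load i32, ptr addrspace(101) %p.param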
205
206// For Loads, replaces the \p OldUse of the pointer with a Use of the same
207// pointer in parameter AS.
208// For "escapes" (to memory, a function call, or a ptrtoint), cast the OldUse to
209// generic using cvta.param.
210static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam,
211 bool IsGridConstant) {
212 Instruction *I = dyn_cast<Instruction>(OldUse->getUser());
213 assert(I && "OldUse must be in an instruction");
214 struct IP {
215 Use *OldUse;
216 Instruction *OldInstruction;
217 Value *NewParam;
218 };
219 SmallVector<IP> ItemsToConvert = {{OldUse, I, Param}};
220 SmallVector<Instruction *> InstructionsToDelete;
221
222 auto CloneInstInParamAS = [HasCvtaParam,
223 IsGridConstant](const IP &I) -> Value * {
224 if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction)) {
225 LI->setOperand(0, I.NewParam);
226 return LI;
227 }
228 if (auto *GEP = dyn_cast<GetElementPtrInst>(I.OldInstruction)) {
229 SmallVector<Value *, 4> Indices(GEP->indices());
230 auto *NewGEP = GetElementPtrInst::Create(
231 GEP->getSourceElementType(), I.NewParam, Indices, GEP->getName(),
232 GEP->getIterator());
233 NewGEP->setIsInBounds(GEP->isInBounds());
234 return NewGEP;
235 }
236 if (auto *BC = dyn_cast<BitCastInst>(I.OldInstruction)) {
237 auto *NewBCType = PointerType::get(BC->getContext(), ADDRESS_SPACE_PARAM);
238 return BitCastInst::Create(BC->getOpcode(), I.NewParam, NewBCType,
239 BC->getName(), BC->getIterator());
240 }
241 if (auto *ASC = dyn_cast<AddrSpaceCastInst>(I.OldInstruction)) {
242 assert(ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM);
243 (void)ASC;
244 // Just pass through the argument, the old ASC is no longer needed.
245 return I.NewParam;
246 }
247 if (auto *MI = dyn_cast<MemTransferInst>(I.OldInstruction)) {
248 if (MI->getRawSource() == I.OldUse->get()) {
249 // convert to memcpy/memmove from param space.
250 IRBuilder<> Builder(I.OldInstruction);
251 Intrinsic::ID ID = MI->getIntrinsicID();
252
253 CallInst *B = Builder.CreateMemTransferInst(
254 ID, MI->getRawDest(), MI->getDestAlign(), I.NewParam,
255 MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
256 for (unsigned I : {0, 1})
257 if (uint64_t Bytes = MI->getParamDereferenceableBytes(I))
258 B->addDereferenceableParamAttr(I, Bytes);
259 return B;
260 }
261 // We may be able to handle other cases if the argument is
262 // __grid_constant__
263 }
264
265 if (HasCvtaParam) {
266 auto GetParamAddrCastToGeneric =
267 [](Value *Addr, Instruction *OriginalUser) -> Value * {
268 IRBuilder<> IRB(OriginalUser);
269 Type *GenTy = IRB.getPtrTy(ADDRESS_SPACE_GENERIC);
270 return IRB.CreateAddrSpaceCast(Addr, GenTy, Addr->getName() + ".gen");
271 };
272 auto *ParamInGenericAS =
273 GetParamAddrCastToGeneric(I.NewParam, I.OldInstruction);
274
275 // phi/select could use generic arg pointers w/o __grid_constant__
276 if (auto *PHI = dyn_cast<PHINode>(I.OldInstruction)) {
277 for (auto [Idx, V] : enumerate(PHI->incoming_values())) {
278 if (V.get() == I.OldUse->get())
279 PHI->setIncomingValue(Idx, ParamInGenericAS);
280 }
281 }
282 if (auto *SI = dyn_cast<SelectInst>(I.OldInstruction)) {
283 if (SI->getTrueValue() == I.OldUse->get())
284 SI->setTrueValue(ParamInGenericAS);
285 if (SI->getFalseValue() == I.OldUse->get())
286 SI->setFalseValue(ParamInGenericAS);
287 }
288
289 // Escapes or writes can only use generic param pointers if
290 // __grid_constant__ is in effect.
291 if (IsGridConstant) {
292 if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
293 I.OldUse->set(ParamInGenericAS);
294 return CI;
295 }
296 if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
297 // byval address is being stored, cast it to generic
298 if (SI->getValueOperand() == I.OldUse->get())
299 SI->setOperand(0, ParamInGenericAS);
300 return SI;
301 }
302 if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
303 if (PI->getPointerOperand() == I.OldUse->get())
304 PI->setOperand(0, ParamInGenericAS);
305 return PI;
306 }
307 // TODO: If we allow stores, we should allow memcpy/memset to
308 // parameter, too.
309 }
310 }
311
312 llvm_unreachable("Unsupported instruction");
313 };
314
315 while (!ItemsToConvert.empty()) {
316 IP I = ItemsToConvert.pop_back_val();
317 Value *NewInst = CloneInstInParamAS(I);
318
319 if (NewInst && NewInst != I.OldInstruction) {
320 // We've created a new instruction. Queue users of the old instruction to
321 // be converted and the instruction itself to be deleted. We can't delete
322 // the old instruction yet, because it's still in use by a load somewhere.
323 for (Use &U : I.OldInstruction->uses())
324 ItemsToConvert.push_back({&U, cast<Instruction>(U.getUser()), NewInst});
325
326 InstructionsToDelete.push_back(I.OldInstruction);
327 }
328 }
329
330 // Now we know that all argument loads are using addresses in parameter space
331 // and we can finally remove the old instructions in generic AS. Instructions
332 // scheduled for removal should be processed in reverse order so the ones
333 // closest to the load are deleted first. Otherwise they may still be in use.
334 // E.g if we have Value = Load(BitCast(GEP(arg))), InstructionsToDelete will
335 // have {GEP,BitCast}. GEP can't be deleted first, because it's still used by
336 // the BitCast.
337 for (Instruction *I : llvm::reverse(InstructionsToDelete))
338 I->eraseFromParent();
339}
340
341// Adjust the alignment of arguments passed byval in the .param address space.
342// We can increase the alignment of such arguments so that their loads can be
343// vectorized effectively. We also traverse all loads from the byval pointer
344// and adjust their alignment whenever they use a known constant offset.
345// Such alignment changes must stay consistent with the parameter store and
346// load lowering in NVPTXTargetLowering::LowerCall.
347static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
348 const NVPTXTargetLowering *TLI) {
349 Function *Func = Arg->getParent();
350 Type *StructType = Arg->getParamByValType();
351 const DataLayout &DL = Func->getDataLayout();
352
353 const Align NewArgAlign =
354 TLI->getFunctionParamOptimizedAlign(Func, StructType, DL);
355 const Align CurArgAlign = Arg->getParamAlign().valueOrOne();
356
357 if (CurArgAlign >= NewArgAlign)
358 return;
359
360 LLVM_DEBUG(dbgs() << "Try to use alignment " << NewArgAlign.value()
361 << " instead of " << CurArgAlign.value() << " for " << *Arg
362 << '\n');
363
364 auto NewAlignAttr =
365 Attribute::getWithAlignment(Func->getContext(), NewArgAlign);
366 Arg->removeAttr(Attribute::Alignment);
367 Arg->addAttr(NewAlignAttr);
368
369 struct Load {
370 LoadInst *Inst;
371 uint64_t Offset;
372 };
373
374 struct LoadContext {
375 Value *InitialVal;
376 uint64_t Offset;
377 };
378
379 SmallVector<Load> Loads;
380 std::queue<LoadContext> Worklist;
381 Worklist.push({ArgInParamAS, 0});
382
383 while (!Worklist.empty()) {
384 LoadContext Ctx = Worklist.front();
385 Worklist.pop();
386
387 for (User *CurUser : Ctx.InitialVal->users()) {
388 if (auto *I = dyn_cast<LoadInst>(CurUser))
389 Loads.push_back({I, Ctx.Offset});
390 else if (isa<BitCastInst>(CurUser) || isa<AddrSpaceCastInst>(CurUser))
391 Worklist.push({cast<Instruction>(CurUser), Ctx.Offset});
392 else if (auto *I = dyn_cast<GetElementPtrInst>(CurUser)) {
393 APInt OffsetAccumulated =
394 APInt::getZero(DL.getIndexSizeInBits(ADDRESS_SPACE_PARAM));
395
396 if (!I->accumulateConstantOffset(DL, OffsetAccumulated))
397 continue;
398
399 uint64_t OffsetLimit = -1;
400 uint64_t Offset = OffsetAccumulated.getLimitedValue(OffsetLimit);
401 assert(Offset != OffsetLimit && "Expect Offset less than UINT64_MAX");
402
403 Worklist.push({I, Ctx.Offset + Offset});
404 }
405 }
406 }
407
408 for (Load &CurLoad : Loads) {
409 Align NewLoadAlign(std::gcd(NewArgAlign.value(), CurLoad.Offset));
410 Align CurLoadAlign = CurLoad.Inst->getAlign();
411 CurLoad.Inst->setAlignment(std::max(NewLoadAlign, CurLoadAlign));
412 }
413}
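// A worked example of the alignment computation above (values illustrative):
// with NewArgAlign = 16, a load at constant offset 12 from the byval pointer
// gets gcd(16, 12) = 4, so its alignment may be raised to 4 (unless it is
// already higher); a load at offset 0 gets gcd(16, 0) = 16.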
414
415// Create a call to the nvvm_internal_addrspace_wrap intrinsic and set the
416// alignment of the return value based on the alignment of the argument.
417static CallInst *createNVVMInternalAddrspaceWrap(IRBuilder<> &IRB,
418 Argument &Arg) {
419 CallInst *ArgInParam =
420 IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
421 {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()},
422 &Arg, {}, Arg.getName() + ".param");
423
424 if (MaybeAlign ParamAlign = Arg.getParamAlign())
425 ArgInParam->addRetAttr(
426 Attribute::getWithAlignment(ArgInParam->getContext(), *ParamAlign));
427
428 return ArgInParam;
429}
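// The call created above looks roughly like this (the intrinsic's type
// mangling and the alignment value are illustrative):
//
//   %arg.param = call align 4 ptr addrspace(101)
//       @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr %arg)
//
// i.e. a no-op re-interpretation of the generic argument pointer as a
// param-space pointer that other passes won't fold away.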
430
431namespace {
432struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
433 using Base = PtrUseVisitor<ArgUseChecker>;
434
435 bool IsGridConstant;
436 // Set of phi/select instructions using the Arg
437 SmallPtrSet<Instruction *, 4> Conditionals;
438
439 ArgUseChecker(const DataLayout &DL, bool IsGridConstant)
440 : PtrUseVisitor(DL), IsGridConstant(IsGridConstant) {}
441
442 PtrInfo visitArgPtr(Argument &A) {
443 assert(A.getType()->isPointerTy());
444 IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(A.getType()));
445 IsOffsetKnown = false;
446 Offset = APInt(IntIdxTy->getBitWidth(), 0);
447 PI.reset();
448 Conditionals.clear();
449
450 LLVM_DEBUG(dbgs() << "Checking Argument " << A << "\n");
451 // Enqueue the uses of this pointer.
452 enqueueUsers(A);
453
454 // Visit all the uses off the worklist until it is empty.
455 // Note that unlike PtrUseVisitor we intentionally do not track offsets.
456 // We're only interested in how we use the pointer.
457 while (!(Worklist.empty() || PI.isAborted())) {
458 UseToVisit ToVisit = Worklist.pop_back_val();
459 U = ToVisit.UseAndIsOffsetKnown.getPointer();
460 Instruction *I = cast<Instruction>(U->getUser());
461 if (isa<PHINode>(I) || isa<SelectInst>(I))
462 Conditionals.insert(I);
463 LLVM_DEBUG(dbgs() << "Processing " << *I << "\n");
464 Base::visit(I);
465 }
466 if (PI.isEscaped())
467 LLVM_DEBUG(dbgs() << "Argument pointer escaped: " << *PI.getEscapingInst()
468 << "\n");
469 else if (PI.isAborted())
470 LLVM_DEBUG(dbgs() << "Pointer use needs a copy: " << *PI.getAbortingInst()
471 << "\n");
472 LLVM_DEBUG(dbgs() << "Traversed " << Conditionals.size()
473 << " conditionals\n");
474 return PI;
475 }
476
477 void visitStoreInst(StoreInst &SI) {
478 // Storing the pointer escapes it.
479 if (U->get() == SI.getValueOperand())
480 return PI.setEscapedAndAborted(&SI);
481 // Writes to the pointer are UB w/ __grid_constant__, but do not force a
482 // copy.
483 if (!IsGridConstant)
484 return PI.setAborted(&SI);
485 }
486
487 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
488 // ASC to param space are no-ops and do not need a copy
489 if (ASC.getDestAddressSpace() != ADDRESS_SPACE_PARAM)
490 return PI.setEscapedAndAborted(&ASC);
491 Base::visitAddrSpaceCastInst(ASC);
492 }
493
494 void visitPtrToIntInst(PtrToIntInst &I) {
495 if (IsGridConstant)
496 return;
497 Base::visitPtrToIntInst(I);
498 }
499 void visitPHINodeOrSelectInst(Instruction &I) {
500 assert(isa<PHINode>(I) || isa<SelectInst>(I));
501 }
502 // PHI and select just pass through the pointers.
503 void visitPHINode(PHINode &PN) { enqueueUsers(PN); }
504 void visitSelectInst(SelectInst &SI) { enqueueUsers(SI); }
505
506 void visitMemTransferInst(MemTransferInst &II) {
507 if (*U == II.getRawDest() && !IsGridConstant)
508 PI.setAborted(&II);
509 // memcpy/memmove are OK when the pointer is source. We can convert them to
510 // AS-specific memcpy.
511 }
512
513 void visitMemSetInst(MemSetInst &II) {
514 if (!IsGridConstant)
515 PI.setAborted(&II);
516 }
517}; // struct ArgUseChecker
518
519void copyByValParam(Function &F, Argument &Arg) {
520 LLVM_DEBUG(dbgs() << "Creating a local copy of " << Arg << "\n");
521 // Otherwise we have to create a temporary copy.
522 BasicBlock::iterator FirstInst = F.getEntryBlock().begin();
523 Type *StructType = Arg.getParamByValType();
524 const DataLayout &DL = F.getDataLayout();
525 IRBuilder<> IRB(&*FirstInst);
526 AllocaInst *AllocA = IRB.CreateAlloca(StructType, nullptr, Arg.getName());
527 // Set the alignment to the alignment of the byval parameter, because later
528 // loads/stores assume that alignment, and we are going to replace
529 // the uses of the byval parameter with this alloca instruction.
530 AllocA->setAlignment(
531 Arg.getParamAlign().value_or(DL.getPrefTypeAlign(StructType)));
532 Arg.replaceAllUsesWith(AllocA);
533
534 CallInst *ArgInParam = createNVVMInternalAddrspaceWrap(IRB, Arg);
535
536 // Be sure to propagate alignment to this copy; LLVM doesn't know that the
537 // NVPTX addrspacecast preserves alignment. Since params are constant, this
538 // copy is definitely not volatile.
539 const auto ArgSize = *AllocA->getAllocationSize(DL);
540 IRB.CreateMemCpy(AllocA, AllocA->getAlign(), ArgInParam, AllocA->getAlign(),
541 ArgSize);
542}
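// For a byval argument %input of type %struct.s, the code above produces
// roughly the following (illustrative IR; value names and sizes are not
// literal):
//
//   %input1 = alloca %struct.s, align 4
//   %input.param = <param-space view of %input>          ; ptr addrspace(101)
//   call void @llvm.memcpy.p0.p101.i64(ptr align 4 %input1,
//       ptr addrspace(101) align 4 %input.param, i64 <size>, i1 false)
//
// and all prior uses of %input are rewritten to use %input1.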
543} // namespace
544
545static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
546 Function *Func = Arg->getParent();
547 assert(isKernelFunction(*Func));
548 const bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam();
549 const bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg);
550 const DataLayout &DL = Func->getDataLayout();
551 BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
552 [[maybe_unused]] Type *StructType = Arg->getParamByValType();
553 assert(StructType && "Missing byval type");
554
555 ArgUseChecker AUC(DL, IsGridConstant);
556 ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg);
557 bool ArgUseIsReadOnly = !(PI.isEscaped() || PI.isAborted());
558 // Easy case: accessing the parameter directly is fine.
559 if (ArgUseIsReadOnly && AUC.Conditionals.empty()) {
560 // Convert all loads and intermediate operations to use parameter AS and
561 // skip creation of a local copy of the argument.
562 SmallVector<Use *, 16> UsesToUpdate(llvm::make_pointer_range(Arg->uses()));
563
564 IRBuilder<> IRB(&*FirstInst);
565 CallInst *ArgInParamAS = createNVVMInternalAddrspaceWrap(IRB, *Arg);
566
567 for (Use *U : UsesToUpdate)
568 convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
569 LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");
570
571 const auto *TLI =
572 cast<NVPTXTargetLowering>(TM.getSubtargetImpl()->getTargetLowering());
573
574 adjustByValArgAlignment(Arg, ArgInParamAS, TLI);
575
576 return;
577 }
578
579 // We can't access the byval arg directly and need a pointer. On sm_70+ we
580 // can take a pointer to the argument without making a local copy.
581 // However, we're still not allowed to write to it. If the user specified
582 // `__grid_constant__` for the argument, we'll consider an escaped pointer
583 // as read-only.
584 if (IsGridConstant || (HasCvtaParam && ArgUseIsReadOnly)) {
585 LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n");
586 // Replace all argument pointer uses (which might include a device function
587 // call) with a cast to the generic address space using cvta.param
588 // instruction, which avoids a local copy.
589 IRBuilder<> IRB(&Func->getEntryBlock().front());
590
591 // Cast the argument to the param address space. Because the backend will
592 // emit the argument already in the param address space, we need to use the
593 // no-op intrinsic; this has the added benefit of preventing other
594 // optimizations from folding away this pair of addrspacecasts.
595 auto *ParamSpaceArg = createNVVMInternalAddrspaceWrap(IRB, *Arg);
596
597 // Cast param address to generic address space.
598 Value *GenericArg = IRB.CreateAddrSpaceCast(
599 ParamSpaceArg, IRB.getPtrTy(ADDRESS_SPACE_GENERIC),
600 Arg->getName() + ".gen");
601
602 Arg->replaceAllUsesWith(GenericArg);
603
604 // Do not replace Arg in the cast to param space
605 ParamSpaceArg->setOperand(0, Arg);
606 } else
607 copyByValParam(*Func, *Arg);
608}
609
610static void markPointerAsAS(Value *Ptr, const unsigned AS) {
611 if (Ptr->getType()->getPointerAddressSpace() != ADDRESS_SPACE_GENERIC)
612 return;
613
614 // Deciding where to emit the addrspacecast pair.
615 BasicBlock::iterator InsertPt;
616 if (Argument *Arg = dyn_cast<Argument>(Ptr)) {
617 // Insert at the function entry if Ptr is an argument.
618 InsertPt = Arg->getParent()->getEntryBlock().begin();
619 } else {
620 // Insert right after Ptr if Ptr is an instruction.
621 InsertPt = ++cast<Instruction>(Ptr)->getIterator();
622 assert(InsertPt != InsertPt->getParent()->end() &&
623 "We don't call this function with Ptr being a terminator.");
624 }
625
626 Instruction *PtrInGlobal = new AddrSpaceCastInst(
627 Ptr, PointerType::get(Ptr->getContext(), AS), Ptr->getName(), InsertPt);
628 Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(),
629 Ptr->getName(), InsertPt);
630 // Replace with PtrInGeneric all uses of Ptr except PtrInGlobal.
631 Ptr->replaceAllUsesWith(PtrInGeneric);
632 PtrInGlobal->setOperand(0, Ptr);
633}
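// For example, marking a loaded pointer %b as global (AS == ADDRESS_SPACE_GLOBAL)
// inserts, right after %b (value names illustrative):
//
//   %b.global  = addrspacecast ptr %b to ptr addrspace(1)
//   %b.generic = addrspacecast ptr addrspace(1) %b.global to ptr
//
// and rewrites every other use of %b to %b.generic, leaving the later
// address-space-inference pass to fold the pair into direct global accesses.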
634
635static void markPointerAsGlobal(Value *Ptr) {
636 markPointerAsAS(Ptr, ADDRESS_SPACE_GLOBAL);
637}
638
639// =============================================================================
640// Main function for this pass.
641// =============================================================================
642static bool runOnKernelFunction(const NVPTXTargetMachine &TM, Function &F) {
643 // Copying of byval aggregates + SROA may result in pointers being loaded as
644 // integers, followed by inttoptr. We may want to mark those as global, too,
645 // but only if the loaded integer is used exclusively for conversion to a
646 // pointer with inttoptr.
647 auto HandleIntToPtr = [](Value &V) {
648 if (llvm::all_of(V.users(), [](User *U) { return isa<IntToPtrInst>(U); })) {
649 SmallVector<User *, 16> UsersToUpdate(V.users());
650 for (User *U : UsersToUpdate)
651 markPointerAsGlobal(U);
652 }
653 };
654 if (TM.getDrvInterface() == NVPTX::CUDA) {
655 // Mark pointers in byval structs as global.
656 for (auto &B : F) {
657 for (auto &I : B) {
658 if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
659 if (LI->getType()->isPointerTy() || LI->getType()->isIntegerTy()) {
660 Value *UO = getUnderlyingObject(LI->getPointerOperand());
661 if (Argument *Arg = dyn_cast<Argument>(UO)) {
662 if (Arg->hasByValAttr()) {
663 // LI is a load from a pointer within a byval kernel parameter.
664 if (LI->getType()->isPointerTy())
665 markPointerAsGlobal(LI);
666 else
667 HandleIntToPtr(*LI);
668 }
669 }
670 }
671 }
672 }
673 }
674 }
675
676 LLVM_DEBUG(dbgs() << "Lowering kernel args of " << F.getName() << "\n");
677 for (Argument &Arg : F.args()) {
678 if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) {
679 handleByValParam(TM, &Arg);
680 } else if (Arg.getType()->isIntegerTy() &&
681 TM.getDrvInterface() == NVPTX::CUDA) {
682 HandleIntToPtr(Arg);
683 }
684 }
685 return true;
686}
687
688 // For device functions, we only adjust the alignment of their byval args here.
689static bool runOnDeviceFunction(const NVPTXTargetMachine &TM, Function &F) {
690 LLVM_DEBUG(dbgs() << "Lowering function args of " << F.getName() << "\n");
691
692 const auto *TLI =
693 cast<NVPTXTargetLowering>(TM.getSubtargetImpl()->getTargetLowering());
694
695 for (Argument &Arg : F.args())
696 if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
697 adjustByValArgAlignment(&Arg, &Arg, TLI);
698
699 return true;
700}
701
702static bool processFunction(Function &F, NVPTXTargetMachine &TM) {
703 return isKernelFunction(F) ? runOnKernelFunction(TM, F)
704 : runOnDeviceFunction(TM, F);
705}
706
707bool NVPTXLowerArgsLegacyPass::runOnFunction(Function &F) {
708 auto &TM = getAnalysis<TargetPassConfig>().getTM<NVPTXTargetMachine>();
709 return processFunction(F, TM);
710}
711FunctionPass *llvm::createNVPTXLowerArgsPass() {
712 return new NVPTXLowerArgsLegacyPass();
713}
714
715static bool copyFunctionByValArgs(Function &F) {
716 LLVM_DEBUG(dbgs() << "Creating a copy of byval args of " << F.getName()
717 << "\n");
718 bool Changed = false;
719 if (isKernelFunction(F)) {
720 for (Argument &Arg : F.args())
721 if (Arg.getType()->isPointerTy() && Arg.hasByValAttr() &&
722 !isParamGridConstant(Arg)) {
723 copyByValParam(F, Arg);
724 Changed = true;
725 }
726 }
727 return Changed;
728}
729
730PreservedAnalyses NVPTXCopyByValArgsPass::run(Function &F,
731 FunctionAnalysisManager &AM) {
732 return copyFunctionByValArgs(F) ? PreservedAnalyses::none()
733 : PreservedAnalyses::all();
734}
735