LLVM 21.0.0git
AMDGPUCallLowering.cpp
Go to the documentation of this file.
1//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements the lowering of LLVM calls to machine code calls for
11/// GlobalISel.
12///
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUCallLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPULegalizerInfo.h"
19#include "SIRegisterInfo.h"
24#include "llvm/IR/IntrinsicsAMDGPU.h"
25
26#define DEBUG_TYPE "amdgpu-call-lowering"
27
28using namespace llvm;
29
30namespace {
31
32/// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
33static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
34 Register ValVReg, const CCValAssign &VA) {
35 if (VA.getLocVT().getSizeInBits() < 32) {
36 // 16-bit types are reported as legal for 32-bit registers. We need to
37 // extend and do a 32-bit copy to avoid the verifier complaining about it.
38 return Handler.MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
39 }
40
41 return Handler.extendRegister(ValVReg, VA);
42}
43
// Outgoing value handler that only supports values assigned to registers:
// both address-related hooks visible below are llvm_unreachable.
// NOTE(review): listing lines 46, 49 and 51-52 are absent from this extract
// (the tail of the constructor parameter list, presumably the declaration of
// the MIB member initialised below, and the head of the signature that ends
// at "Flags) override") - restore them from the upstream file.
44struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
45 AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
47 : OutgoingValueHandler(B, MRI), MIB(MIB) {}
48
50
53 ISD::ArgFlagsTy Flags) override {
54 llvm_unreachable("not implemented");
55 }
56
// Storing an outgoing value to memory is not supported by this handler.
57 void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
58 const MachinePointerInfo &MPO,
59 const CCValAssign &VA) override {
60 llvm_unreachable("not implemented");
61 }
62
// Copy ValVReg into PhysReg (after widening, and after a readfirstlane when
// PhysReg is an SGPR) and record PhysReg as an implicit use on MIB.
63 void assignValueToReg(Register ValVReg, Register PhysReg,
64 const CCValAssign &VA) override {
// Sub-32-bit locations are widened to 32 bits first.
65 Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
66
67 // If this is a scalar return, insert a readfirstlane just in case the value
68 // ends up in a VGPR.
69 // FIXME: Assert this is a shader return.
70 const SIRegisterInfo *TRI
71 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
72 if (TRI->isSGPRReg(MRI, PhysReg)) {
73 LLT Ty = MRI.getType(ExtReg);
74 LLT S32 = LLT::scalar(32);
// The intrinsic is only built on s32 here: other 32-bit types are converted
// with ptrtoint (pointers) or bitcast (everything else).
75 if (Ty != S32) {
76 // FIXME: We should probably support readfirstlane intrinsics with all
77 // legal 32-bit types.
78 assert(Ty.getSizeInBits() == 32);
79 if (Ty.isPointer())
80 ExtReg = MIRBuilder.buildPtrToInt(S32, ExtReg).getReg(0);
81 else
82 ExtReg = MIRBuilder.buildBitcast(S32, ExtReg).getReg(0);
83 }
84
85 auto ToSGPR = MIRBuilder
86 .buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
87 {MRI.getType(ExtReg)})
88 .addReg(ExtReg);
89 ExtReg = ToSGPR.getReg(0);
90 }
91
92 MIRBuilder.buildCopy(PhysReg, ExtReg);
93 MIB.addUse(PhysReg, RegState::Implicit);
94 }
95};
96
// Common base for handlers of incoming values. Subclasses decide how a used
// physical register is recorded by implementing markPhysRegUsed().
// NOTE(review): listing lines 103-104, 114 and 145 are absent from this
// extract (the head of the signature ending at "Flags) override", the
// arguments of buildFrameIndex, and the leading arguments of
// getMachineMemOperand) - restore them from the upstream file.
97struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
// Largest Size + Offset seen so far by the stack-address hook below.
98 uint64_t StackUsed = 0;
99
100 AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
101 : IncomingValueHandler(B, MRI) {}
102
// Creates a fixed stack object for the argument, points MPO at it and returns
// the register holding the frame-index address.
105 ISD::ArgFlagsTy Flags) override {
106 auto &MFI = MIRBuilder.getMF().getFrameInfo();
107
108 // Byval is assumed to be writable memory, but other stack passed arguments
109 // are not.
110 const bool IsImmutable = !Flags.isByVal();
111 int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable);
112 MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
113 auto AddrReg = MIRBuilder.buildFrameIndex(
115 StackUsed = std::max(StackUsed, Size + Offset);
116 return AddrReg.getReg(0);
117 }
118
// Marks PhysReg as used, then copies it into ValVReg. Locations narrower
// than 32 bits are copied as s32 and truncated.
119 void assignValueToReg(Register ValVReg, Register PhysReg,
120 const CCValAssign &VA) override {
121 markPhysRegUsed(PhysReg);
122
123 if (VA.getLocVT().getSizeInBits() < 32) {
124 // 16-bit types are reported as legal for 32-bit registers. We need to do
125 // a 32-bit copy, and truncate to avoid the verifier complaining about it.
126 auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
127
128 // If we have signext/zeroext, it applies to the whole 32-bit register
129 // before truncation.
130 auto Extended =
131 buildExtensionHint(VA, Copy.getReg(0), LLT(VA.getLocVT()));
132 MIRBuilder.buildTrunc(ValVReg, Extended);
133 return;
134 }
135
136 IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
137 }
138
// Loads ValVReg from Addr using a memory operand whose alignment is inferred
// from MPO.
139 void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
140 const MachinePointerInfo &MPO,
141 const CCValAssign &VA) override {
142 MachineFunction &MF = MIRBuilder.getMF();
143
144 auto *MMO = MF.getMachineMemOperand(
146 inferAlignFromPtrInfo(MF, MPO));
147 MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
148 }
149
150 /// How the physical register gets marked varies between formal
151 /// parameters (it's a basic-block live-in), and a call instruction
152 /// (it's an implicit-def of the BL).
153 virtual void markPhysRegUsed(unsigned PhysReg) = 0;
154};
155
156struct FormalArgHandler : public AMDGPUIncomingArgHandler {
158 : AMDGPUIncomingArgHandler(B, MRI) {}
159
160 void markPhysRegUsed(unsigned PhysReg) override {
161 MIRBuilder.getMBB().addLiveIn(PhysReg);
162 }
163};
164
// Incoming-value handler for the results of a call: every physical register
// carrying a result is added to MIB as an implicit def.
// NOTE(review): listing lines 167 and 174 are absent from this extract
// (the tail of the constructor parameter list and presumably the declaration
// of the MIB member initialised below) - restore them from the upstream file.
165struct CallReturnHandler : public AMDGPUIncomingArgHandler {
166 CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
168 : AMDGPUIncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}
169
170 void markPhysRegUsed(unsigned PhysReg) override {
171 MIB.addDef(PhysReg, RegState::Implicit);
172 }
173
175};
176
// Outgoing handler for call arguments. Extends AMDGPUOutgoingValueHandler
// with stack-passed arguments: it computes stack addresses and emits stores.
// NOTE(review): listing lines 188, 194, 204, 208 and 228 are absent from this
// extract (constructor parameters, one getStackAddress parameter, presumably
// the declaration of the MFI pointer used below, and two statements just
// before the returns of getStackAddress) - restore from the upstream file.
177struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
178 /// For tail calls, the byte offset of the call's argument area from the
179 /// callee's. Unused elsewhere.
180 int FPDiff;
181
182 // Cache the SP register vreg if we need it more than once in this call site.
183 Register SPReg;
184
185 bool IsTailCall;
186
187 AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
189 bool IsTailCall = false, int FPDiff = 0)
190 : AMDGPUOutgoingValueHandler(MIRBuilder, MRI, MIB), FPDiff(FPDiff),
191 IsTailCall(IsTailCall) {}
192
// Returns a register holding the private-address-space pointer for a stack
// argument at Offset. Tail calls use a fixed frame object at Offset + FPDiff;
// other calls add Offset to the (cached) stack pointer value.
193 Register getStackAddress(uint64_t Size, int64_t Offset,
195 ISD::ArgFlagsTy Flags) override {
196 MachineFunction &MF = MIRBuilder.getMF();
197 const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
198 const LLT S32 = LLT::scalar(32);
199
200 if (IsTailCall) {
201 Offset += FPDiff;
202 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
203 auto FIReg = MIRBuilder.buildFrameIndex(PtrTy, FI);
205 return FIReg.getReg(0);
206 }
207
209
// SPReg is materialised at most once per handler instance.
210 if (!SPReg) {
211 const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>();
212 if (ST.enableFlatScratch()) {
213 // The stack is accessed unswizzled, so we can use a regular copy.
214 SPReg = MIRBuilder.buildCopy(PtrTy,
215 MFI->getStackPtrOffsetReg()).getReg(0);
216 } else {
217 // The address we produce here, without knowing the use context, is going
218 // to be interpreted as a vector address, so we need to convert to a
219 // swizzled address.
220 SPReg = MIRBuilder.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {PtrTy},
221 {MFI->getStackPtrOffsetReg()}).getReg(0);
222 }
223 }
224
225 auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);
226
227 auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
229 return AddrReg.getReg(0);
230 }
231
// Stores ValVReg to Addr. The store alignment is the common alignment of the
// subtarget stack alignment and the location's memory offset.
232 void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
233 const MachinePointerInfo &MPO,
234 const CCValAssign &VA) override {
235 MachineFunction &MF = MIRBuilder.getMF();
236 uint64_t LocMemOffset = VA.getLocMemOffset();
237 const auto &ST = MF.getSubtarget<GCNSubtarget>();
238
239 auto *MMO = MF.getMachineMemOperand(
240 MPO, MachineMemOperand::MOStore, MemTy,
241 commonAlignment(ST.getStackAlignment(), LocMemOffset));
242 MIRBuilder.buildStore(ValVReg, Addr, *MMO);
243 }
244
// ArgInfo overload: extends the selected register unless the location info
// is FPExt, then forwards to the register-based overload above.
245 void assignValueToAddress(const CallLowering::ArgInfo &Arg,
246 unsigned ValRegIndex, Register Addr, LLT MemTy,
247 const MachinePointerInfo &MPO,
248 const CCValAssign &VA) override {
249 Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
250 ? extendRegister(Arg.Regs[ValRegIndex], VA)
251 : Arg.Regs[ValRegIndex];
252 assignValueToAddress(ValVReg, Addr, MemTy, MPO, VA);
253 }
254};
255} // anonymous namespace
256
// NOTE(review): listing line 257, which should carry the constructor's
// declarator, is absent from this extract. The visible remainder passes the
// address of TLI to the CallLowering base class and has an empty body.
258 : CallLowering(&TLI) {
259}
260
261// FIXME: Compatibility shim
263 switch (MIOpc) {
264 case TargetOpcode::G_SEXT:
265 return ISD::SIGN_EXTEND;
266 case TargetOpcode::G_ZEXT:
267 return ISD::ZERO_EXTEND;
268 case TargetOpcode::G_ANYEXT:
269 return ISD::ANY_EXTEND;
270 default:
271 llvm_unreachable("not an extend opcode");
272 }
273}
274
// Reports whether a return with the given calling convention can be lowered.
// Entry-function calling conventions always return true; otherwise the result
// is whatever checkReturn reports for the return assignment function.
// NOTE(review): listing lines 277 and 283 are absent from this extract
// (presumably the parameter named Outs and the declaration of ArgLocs, both
// used below) - restore them from the upstream file.
275bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
276 CallingConv::ID CallConv,
278 bool IsVarArg) const {
279 // For shaders. Vector types should be explicitly handled by CC.
280 if (AMDGPU::isEntryFunctionCC(CallConv))
281 return true;
282
284 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
285 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
286 MF.getFunction().getContext());
287
288 return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
289}
290
291/// Lower the return value for the already existing \p Ret. This assumes that
292/// \p B's insertion point is correct.
// Returns true immediately when there is no value; otherwise returns the
// result of determineAndHandleAssignments.
// NOTE(review): listing lines 319 and 343 are absent from this extract. The
// surviving comment at 342 suggests 343 recomputes RetInfo's flags; 319 is
// presumably the initial flag setup - confirm against the upstream file.
293bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
294 const Value *Val, ArrayRef<Register> VRegs,
295 MachineInstrBuilder &Ret) const {
296 if (!Val)
297 return true;
298
299 auto &MF = B.getMF();
300 const auto &F = MF.getFunction();
301 const DataLayout &DL = MF.getDataLayout();
302 MachineRegisterInfo *MRI = B.getMRI();
303 LLVMContext &Ctx = F.getContext();
304
305 CallingConv::ID CC = F.getCallingConv();
306 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
307
// One EVT per piece of the return type; VRegs must line up one-to-one.
308 SmallVector<EVT, 8> SplitEVTs;
309 ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
310 assert(VRegs.size() == SplitEVTs.size() &&
311 "For each split Type there should be exactly one VReg.");
312
313 SmallVector<ArgInfo, 8> SplitRetInfos;
314
315 for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
316 EVT VT = SplitEVTs[i];
317 Register Reg = VRegs[i];
318 ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx), 0);
320
// Scalar integers may need widening: use G_SEXT / G_ZEXT when the matching
// flag is set, G_ANYEXT otherwise, and extend only if the target asks for a
// different return type.
321 if (VT.isScalarInteger()) {
322 unsigned ExtendOp = TargetOpcode::G_ANYEXT;
323 if (RetInfo.Flags[0].isSExt()) {
324 assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
325 ExtendOp = TargetOpcode::G_SEXT;
326 } else if (RetInfo.Flags[0].isZExt()) {
327 assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
328 ExtendOp = TargetOpcode::G_ZEXT;
329 }
330
331 EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
332 extOpcodeToISDExtOpcode(ExtendOp));
333 if (ExtVT != VT) {
334 RetInfo.Ty = ExtVT.getTypeForEVT(Ctx);
335 LLT ExtTy = getLLTForType(*RetInfo.Ty, DL);
336 Reg = B.buildInstr(ExtendOp, {ExtTy}, {Reg}).getReg(0);
337 }
338 }
339
340 if (Reg != RetInfo.Regs[0]) {
341 RetInfo.Regs[0] = Reg;
342 // Reset the arg flags after modifying Reg.
344 }
345
346 splitToValueTypes(RetInfo, SplitRetInfos, DL, CC);
347 }
348
349 CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
350
351 OutgoingValueAssigner Assigner(AssignFn);
352 AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
353 return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
354 CC, F.isVarArg());
355}
356
// Emits the return for the current function.
// NOTE(review): listing lines 357 and 362 are absent from this extract (the
// head of the signature, presumably declaring B and Val, and presumably the
// declaration of MFI used below) - restore them from the upstream file.
358 ArrayRef<Register> VRegs,
359 FunctionLoweringInfo &FLI) const {
360
361 MachineFunction &MF = B.getMF();
363 MFI->setIfReturnsVoid(!Val);
364
365 assert(!Val == VRegs.empty() && "Return value without a vreg");
366
367 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
368 const bool IsShader = AMDGPU::isShader(CC);
// Kernels and void-returning shaders end with S_ENDPGM 0 instead of a return.
369 const bool IsWaveEnd =
370 (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
371 if (IsWaveEnd) {
372 B.buildInstr(AMDGPU::S_ENDPGM)
373 .addImm(0);
374 return true;
375 }
376
// Ret is created un-inserted, the return value is lowered, and only then is
// Ret inserted (see B.insertInstr below).
377 unsigned ReturnOpc =
378 IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN;
379 auto Ret = B.buildInstrNoInsert(ReturnOpc);
380
// When the return cannot be lowered directly, store it through the demoted
// sret register instead.
381 if (!FLI.CanLowerReturn)
382 insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
383 else if (!lowerReturnVal(B, Val, VRegs, Ret))
384 return false;
385
386 // TODO: Handle CalleeSavedRegsViaCopy.
387
388 B.insertInstr(Ret);
389 return true;
390}
391
// Defines DstReg as the live-in virtual register of the kernarg segment
// pointer plus a 64-bit constant Offset.
// NOTE(review): listing lines 395-396 and 398 are absent from this extract
// (presumably the declaration of MRI used below, among others, and the
// initialiser of KernArgSegmentPtr) - restore them from the upstream file.
392void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
393 uint64_t Offset) const {
394 MachineFunction &MF = B.getMF();
397 Register KernArgSegmentPtr =
399 Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
400
401 auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);
402
403 B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
404}
405
// Splits OrigArg into value-typed pieces and emits one load per piece from
// the address produced by lowerParameterPtr at Offset + FieldOffsets[Idx].
// NOTE(review): listing lines 407, 412, 414, 434 and 436-437 are absent from
// this extract (presumably the Offset parameter, the declarations of PtrInfo
// and the outer PtrTy, the start of the MMO declaration and its flag
// arguments) - restore them from the upstream file.
406void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg,
408 Align Alignment) const {
409 MachineFunction &MF = B.getMF();
410 const Function &F = MF.getFunction();
411 const DataLayout &DL = F.getDataLayout();
413
415
416 SmallVector<ArgInfo, 32> SplitArgs;
417 SmallVector<uint64_t> FieldOffsets;
418 splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv(), &FieldOffsets);
419
// Idx walks FieldOffsets in step with SplitArgs.
420 unsigned Idx = 0;
421 for (ArgInfo &SplitArg : SplitArgs) {
422 Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
423 lowerParameterPtr(PtrReg, B, Offset + FieldOffsets[Idx]);
424
425 LLT ArgTy = getLLTForType(*SplitArg.Ty, DL);
426 if (SplitArg.Flags[0].isPointer()) {
427 // Compensate for losing pointeriness in splitValueTypes.
// This inner PtrTy shadows the outer one for the rest of the if-body.
428 LLT PtrTy = LLT::pointer(SplitArg.Flags[0].getPointerAddrSpace(),
429 ArgTy.getScalarSizeInBits());
430 ArgTy = ArgTy.isVector() ? LLT::vector(ArgTy.getElementCount(), PtrTy)
431 : PtrTy;
432 }
433
435 PtrInfo,
438 ArgTy, commonAlignment(Alignment, FieldOffsets[Idx]));
439
440 assert(SplitArg.Regs.size() == 1);
441
442 B.buildLoad(SplitArg.Regs[0], PtrReg, *MMO);
443 ++Idx;
444 }
445}
446
447// Allocate special inputs passed in user SGPRs.
// For each input that UserSGPRInfo reports as present, the register is
// obtained from Info, marked live-in and reserved in CCInfo. The kernarg
// segment pointer is additionally copied into a fresh virtual register.
// NOTE(review): listing lines 449, 474 and 476 are absent from this extract
// (a parameter - presumably the MachineIRBuilder B used below - and
// presumably the declarations of MRI and P4) - restore from upstream.
448static void allocateHSAUserSGPRs(CCState &CCInfo,
450 MachineFunction &MF,
451 const SIRegisterInfo &TRI,
452 SIMachineFunctionInfo &Info) {
453 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
454 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
455 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
456 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
457 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
458 CCInfo.AllocateReg(PrivateSegmentBufferReg);
459 }
460
461 if (UserSGPRInfo.hasDispatchPtr()) {
462 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
463 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
464 CCInfo.AllocateReg(DispatchPtrReg);
465 }
466
467 if (UserSGPRInfo.hasQueuePtr()) {
468 Register QueuePtrReg = Info.addQueuePtr(TRI);
469 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
470 CCInfo.AllocateReg(QueuePtrReg);
471 }
472
// Unlike the other inputs, this one is bound to a generic virtual register
// and copied at the builder's insertion point.
473 if (UserSGPRInfo.hasKernargSegmentPtr()) {
475 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
477 Register VReg = MRI.createGenericVirtualRegister(P4);
478 MRI.addLiveIn(InputPtrReg, VReg);
479 B.getMBB().addLiveIn(InputPtrReg);
480 B.buildCopy(VReg, InputPtrReg);
481 CCInfo.AllocateReg(InputPtrReg);
482 }
483
484 if (UserSGPRInfo.hasDispatchID()) {
485 Register DispatchIDReg = Info.addDispatchID(TRI);
486 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
487 CCInfo.AllocateReg(DispatchIDReg);
488 }
489
490 if (UserSGPRInfo.hasFlatScratchInit()) {
491 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
492 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
493 CCInfo.AllocateReg(FlatScratchInitReg);
494 }
495
496 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
497 // these from the dispatch pointer.
498}
499
// Lowers the formal arguments of a kernel: each used argument is loaded from
// (or, for byref, pointed into) the kernarg segment at its computed offset.
// Returns false for arguments carrying "amdgpu-hidden-argument".
// NOTE(review): listing lines 500, 505-506 and 511 are absent from this
// extract (the head of the signature and presumably the declarations of MRI,
// Info and ArgLocs used below) - restore them from the upstream file.
501 MachineIRBuilder &B, const Function &F,
502 ArrayRef<ArrayRef<Register>> VRegs) const {
503 MachineFunction &MF = B.getMF();
504 const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
507 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
508 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
509 const DataLayout &DL = F.getDataLayout();
510
512 CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
513
514 allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);
515
// i indexes VRegs. ExplicitArgOffset is the running offset of the explicit
// arguments; BaseOffset is added to obtain each argument's final offset.
516 unsigned i = 0;
517 const Align KernArgBaseAlign(16);
518 const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
519 uint64_t ExplicitArgOffset = 0;
520
521 // TODO: Align down to dword alignment and extract bits for extending loads.
522 for (auto &Arg : F.args()) {
523 // TODO: Add support for kernarg preload.
524 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
525 LLVM_DEBUG(dbgs() << "Preloading hidden arguments is not supported\n");
526 return false;
527 }
528
529 const bool IsByRef = Arg.hasByRefAttr();
530 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
531 unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
// Zero-sized arguments are skipped without advancing i.
532 if (AllocSize == 0)
533 continue;
534
535 MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
536 Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
537
538 uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
539 ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
540
// Unused arguments still consume offset space and a VRegs slot, but no code
// is emitted for them.
541 if (Arg.use_empty()) {
542 ++i;
543 continue;
544 }
545
546 Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
547
// byref: the argument's value is a pointer into the kernarg segment, built
// directly in the constant address space or address-space-cast otherwise.
548 if (IsByRef) {
549 unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();
550
551 assert(VRegs[i].size() == 1 &&
552 "expected only one register for byval pointers");
553 if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
554 lowerParameterPtr(VRegs[i][0], B, ArgOffset);
555 } else {
556 const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
557 Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
558 lowerParameterPtr(PtrReg, B, ArgOffset);
559
560 B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
561 }
562 } else {
563 ArgInfo OrigArg(VRegs[i], Arg, i);
564 const unsigned OrigArgIdx = i + AttributeList::FirstArgIndex;
565 setArgFlags(OrigArg, OrigArgIdx, DL, F);
566 lowerParameter(B, OrigArg, ArgOffset, Alignment);
567 }
568
569 ++i;
570 }
571
572 TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
573 TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
574 return true;
575}
576
// Lowers the formal arguments of a function: reserves special input
// registers, splits the IR arguments into value-typed pieces, assigns them
// with the calling convention and emits the copies/loads via FormalArgHandler.
// NOTE(review): listing lines 577-578, 585, 593-594 and 599 are absent from
// this extract (the head of the signature, presumably the condition guarding
// the kernel path below, and presumably the declarations of Info, MRI and
// ArgLocs used below) - restore them from the upstream file.
579 FunctionLoweringInfo &FLI) const {
580 CallingConv::ID CC = F.getCallingConv();
581
582 // The infrastructure for normal calling convention lowering is essentially
583 // useless for kernels. We want to avoid any kind of legalization or argument
584 // splitting.
586 return lowerFormalArgumentsKernel(B, F, VRegs);
587
588 const bool IsGraphics = AMDGPU::isGraphics(CC);
589 const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
590
591 MachineFunction &MF = B.getMF();
592 MachineBasicBlock &MBB = B.getMBB();
595 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
596 const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
597 const DataLayout &DL = F.getDataLayout();
598
600 CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
601 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
602
603 if (UserSGPRInfo.hasImplicitBufferPtr()) {
604 Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
605 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
606 CCInfo.AllocateReg(ImplicitBufferPtrReg);
607 }
608
609 // FIXME: This probably isn't defined for mesa
610 if (UserSGPRInfo.hasFlatScratchInit() && !Subtarget.isAmdPalOS()) {
611 Register FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
612 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
613 CCInfo.AllocateReg(FlatScratchInitReg);
614 }
615
// Idx indexes VRegs; PSInputNum counts the pixel-shader inputs seen so far.
616 SmallVector<ArgInfo, 32> SplitArgs;
617 unsigned Idx = 0;
618 unsigned PSInputNum = 0;
619
620 // Insert the hidden sret parameter if the return value won't fit in the
621 // return registers.
622 if (!FLI.CanLowerReturn)
623 insertSRetIncomingArgument(F, SplitArgs, FLI.DemoteRegister, MRI, DL);
624
625 for (auto &Arg : F.args()) {
// Zero-sized arguments are skipped without advancing Idx.
626 if (DL.getTypeStoreSize(Arg.getType()) == 0)
627 continue;
628
629 const bool InReg = Arg.hasAttribute(Attribute::InReg);
630
// swiftself, swifterror and nest arguments make the lowering fail.
631 if (Arg.hasAttribute(Attribute::SwiftSelf) ||
632 Arg.hasAttribute(Attribute::SwiftError) ||
633 Arg.hasAttribute(Attribute::Nest))
634 return false;
635
// The first 16 non-inreg arguments of an AMDGPU_PS function are treated as
// PS inputs. One that is unused and not already allocated is skipped and its
// vregs are defined as undef.
636 if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
637 const bool ArgUsed = !Arg.use_empty();
638 bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);
639
640 if (!SkipArg) {
641 Info->markPSInputAllocated(PSInputNum);
642 if (ArgUsed)
643 Info->markPSInputEnabled(PSInputNum);
644 }
645
646 ++PSInputNum;
647
648 if (SkipArg) {
649 for (Register R : VRegs[Idx])
650 B.buildUndef(R);
651
652 ++Idx;
653 continue;
654 }
655 }
656
657 ArgInfo OrigArg(VRegs[Idx], Arg, Idx);
658 const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
659 setArgFlags(OrigArg, OrigArgIdx, DL, F);
660
661 splitToValueTypes(OrigArg, SplitArgs, DL, CC);
662 ++Idx;
663 }
664
665 // At least one interpolation mode must be enabled or else the GPU will
666 // hang.
667 //
668 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
669 // set PSInputAddr, the user wants to enable some bits after the compilation
670 // based on run-time states. Since we can't know what the final PSInputEna
671 // will look like, so we shouldn't do anything here and the user should take
672 // responsibility for the correct programming.
673 //
674 // Otherwise, the following restrictions apply:
675 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
676 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
677 // enabled too.
678 if (CC == CallingConv::AMDGPU_PS) {
679 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
680 ((Info->getPSInputAddr() & 0xF) == 0 &&
681 Info->isPSInputAllocated(11))) {
// Force input 0 on: reserve VGPR0/VGPR1 and mark it allocated and enabled.
682 CCInfo.AllocateReg(AMDGPU::VGPR0);
683 CCInfo.AllocateReg(AMDGPU::VGPR1);
684 Info->markPSInputAllocated(0);
685 Info->markPSInputEnabled(0);
686 }
687
688 if (Subtarget.isAmdPalOS()) {
689 // For isAmdPalOS, the user does not enable some bits after compilation
690 // based on run-time states; the register values being generated here are
691 // the final ones set in hardware. Therefore we need to apply the
692 // workaround to PSInputAddr and PSInputEnable together. (The case where
693 // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
694 // set up an input arg for a particular interpolation mode, but nothing
695 // uses that input arg. Really we should have an earlier pass that removes
696 // such an arg.)
697 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
698 if ((PsInputBits & 0x7F) == 0 ||
699 ((PsInputBits & 0xF) == 0 &&
700 (PsInputBits >> 11 & 1)))
701 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
702 }
703 }
704
705 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
706 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());
707
// Emit the argument code at the start of a non-empty block; the insertion
// point is moved back to the end of the block before returning.
708 if (!MBB.empty())
709 B.setInstr(*MBB.begin());
710
711 if (!IsEntryFunc && !IsGraphics) {
712 // For the fixed ABI, pass workitem IDs in the last argument register.
713 TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
714
715 if (!Subtarget.enableFlatScratch())
716 CCInfo.AllocateReg(Info->getScratchRSrcReg());
717 TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
718 }
719
720 IncomingValueAssigner Assigner(AssignFn);
721 if (!determineAssignments(Assigner, SplitArgs, CCInfo))
722 return false;
723
724 FormalArgHandler Handler(B, MRI);
725 if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
726 return false;
727
728 uint64_t StackSize = Assigner.StackSize;
729
730 // Start adding system SGPRs.
731 if (IsEntryFunc)
732 TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
733
734 // When we tail call, we need to check if the callee's arguments will fit on
735 // the caller's stack. So, whenever we lower formal arguments, we should keep
736 // track of this information, since we might lower a tail call in this
737 // function later.
738 Info->setBytesInStackArgArea(StackSize);
739
740 // Move back to the end of the basic block.
741 B.setMBB(MBB);
742
743 return true;
744}
745
// Collects the implicit inputs a call must pass: for each one a virtual
// register is created and paired with the callee's expected physical register
// in ArgRegs, and that register is reserved in CCInfo. Returns false when an
// implicit input would have to be passed on the stack.
// NOTE(review): many listing lines are absent from this extract: 746 (head of
// the signature), 758 (initialiser of CalleeArgInfo), 760 and 789 (presumably
// the declarations of MFI and MRI used below), 767-775 (presumably the
// InputRegs table iterated below, whose closing "};" survives at 776), 824,
// 853, 856, 859 (right-hand sides of assignments) and 926 (start of the
// IncomingArg declaration) - restore them from the upstream file.
747 CCState &CCInfo,
748 SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
749 CallLoweringInfo &Info) const {
750 MachineFunction &MF = MIRBuilder.getMF();
751
752 // If there's no call site, this doesn't correspond to a call from the IR and
753 // doesn't need implicit inputs.
754 if (!Info.CB)
755 return true;
756
757 const AMDGPUFunctionArgInfo *CalleeArgInfo
759
761 const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();
762
763
764 // TODO: Unify with private memory register handling. This is complicated by
765 // the fact that at least in kernels, the input argument is not necessarily
766 // in the same location as the input.
776 };
777
// Indexed in lockstep with InputRegs through the counter I in the loop below.
778 static constexpr StringLiteral ImplicitAttrNames[] = {
779 "amdgpu-no-dispatch-ptr",
780 "amdgpu-no-queue-ptr",
781 "amdgpu-no-implicitarg-ptr",
782 "amdgpu-no-dispatch-id",
783 "amdgpu-no-workgroup-id-x",
784 "amdgpu-no-workgroup-id-y",
785 "amdgpu-no-workgroup-id-z",
786 "amdgpu-no-lds-kernel-id",
787 };
788
790
791 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
792 const AMDGPULegalizerInfo *LI
793 = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());
794
795 unsigned I = 0;
796 for (auto InputID : InputRegs) {
797 const ArgDescriptor *OutgoingArg;
798 const TargetRegisterClass *ArgRC;
799 LLT ArgTy;
800
801 // If the callee does not use the attribute value, skip copying the value.
802 if (Info.CB->hasFnAttr(ImplicitAttrNames[I++]))
803 continue;
804
// Skip inputs for which the callee has no descriptor.
805 std::tie(OutgoingArg, ArgRC, ArgTy) =
806 CalleeArgInfo->getPreloadedValue(InputID);
807 if (!OutgoingArg)
808 continue;
809
810 const ArgDescriptor *IncomingArg;
811 const TargetRegisterClass *IncomingArgRC;
812 std::tie(IncomingArg, IncomingArgRC, ArgTy) =
813 CallerArgInfo.getPreloadedValue(InputID);
814 assert(IncomingArgRC == ArgRC);
815
816 Register InputReg = MRI.createGenericVirtualRegister(ArgTy);
817
// Source of the value, in order of preference: the caller's own incoming
// copy, a recomputed implicit-arg pointer, a constant LDS kernel id (undef
// when no id is available), or undef.
818 if (IncomingArg) {
819 LI->buildLoadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
820 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
821 LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
822 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
823 std::optional<uint32_t> Id =
825 if (Id) {
826 MIRBuilder.buildConstant(InputReg, *Id);
827 } else {
828 MIRBuilder.buildUndef(InputReg);
829 }
830 } else {
831 // We may have proven the input wasn't needed, although the ABI is
832 // requiring it. We just need to allocate the register appropriately.
833 MIRBuilder.buildUndef(InputReg);
834 }
835
836 if (OutgoingArg->isRegister()) {
837 ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
838 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
839 report_fatal_error("failed to allocate implicit input argument");
840 } else {
841 LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
842 return false;
843 }
844 }
845
846 // Pack workitem IDs into a single register or pass it as is if already
847 // packed.
848 const ArgDescriptor *OutgoingArg;
849 const TargetRegisterClass *ArgRC;
850 LLT ArgTy;
851
// Three successive attempts to obtain an outgoing descriptor; fail if none
// of them yields one.
852 std::tie(OutgoingArg, ArgRC, ArgTy) =
854 if (!OutgoingArg)
855 std::tie(OutgoingArg, ArgRC, ArgTy) =
857 if (!OutgoingArg)
858 std::tie(OutgoingArg, ArgRC, ArgTy) =
860 if (!OutgoingArg)
861 return false;
862
863 auto WorkitemIDX =
864 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
865 auto WorkitemIDY =
866 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
867 auto WorkitemIDZ =
868 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
869
870 const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
871 const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
872 const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
873 const LLT S32 = LLT::scalar(32);
874
875 const bool NeedWorkItemIDX = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-x");
876 const bool NeedWorkItemIDY = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-y");
877 const bool NeedWorkItemIDZ = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-z");
878
879 // If incoming ids are not packed we need to pack them.
880 // FIXME: Should consider known workgroup size to eliminate known 0 cases.
// Packing as built below: X is used as is, Y is shifted left by 10 and Z by
// 20, and the present components are ORed together.
881 Register InputReg;
882 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
883 NeedWorkItemIDX) {
884 if (ST.getMaxWorkitemID(MF.getFunction(), 0) != 0) {
885 InputReg = MRI.createGenericVirtualRegister(S32);
886 LI->buildLoadInputValue(InputReg, MIRBuilder, IncomingArgX,
887 std::get<1>(WorkitemIDX),
888 std::get<2>(WorkitemIDX));
889 } else {
890 InputReg = MIRBuilder.buildConstant(S32, 0).getReg(0);
891 }
892 }
893
894 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
895 NeedWorkItemIDY && ST.getMaxWorkitemID(MF.getFunction(), 1) != 0) {
896 Register Y = MRI.createGenericVirtualRegister(S32);
897 LI->buildLoadInputValue(Y, MIRBuilder, IncomingArgY,
898 std::get<1>(WorkitemIDY), std::get<2>(WorkitemIDY));
899
900 Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
901 InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
902 }
903
904 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
905 NeedWorkItemIDZ && ST.getMaxWorkitemID(MF.getFunction(), 2) != 0) {
906 Register Z = MRI.createGenericVirtualRegister(S32);
907 LI->buildLoadInputValue(Z, MIRBuilder, IncomingArgZ,
908 std::get<1>(WorkitemIDZ), std::get<2>(WorkitemIDZ));
909
910 Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
911 InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
912 }
913
// Nothing was packed above although at least one id is needed.
914 if (!InputReg &&
915 (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
916 InputReg = MRI.createGenericVirtualRegister(S32);
917 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
918 // We're in a situation where the outgoing function requires the workitem
919 // ID, but the calling function does not have it (e.g a graphics function
920 // calling a C calling convention function). This is illegal, but we need
921 // to produce something.
922 MIRBuilder.buildUndef(InputReg);
923 } else {
924 // Workitem ids are already packed, any of present incoming arguments will
925 // carry all required fields.
927 IncomingArgX ? *IncomingArgX :
928 IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
929 LI->buildLoadInputValue(InputReg, MIRBuilder, &IncomingArg,
930 &AMDGPU::VGPR_32RegClass, S32);
931 }
932 }
933
// The outgoing register is reserved even when no value is passed in it.
934 if (OutgoingArg->isRegister()) {
935 if (InputReg)
936 ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
937
938 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
939 report_fatal_error("failed to allocate implicit input argument");
940 } else {
941 LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
942 return false;
943 }
944
945 return true;
946}
947
948/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
949/// CC.
950static std::pair<CCAssignFn *, CCAssignFn *>
952 return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
953}
954
/// Select the opcode used to emit a call.
///
/// Non-tail calls always use G_SI_CALL. Tail calls use either one of the
/// wave-size specific SI_CS_CHAIN_TC_W32 / SI_CS_CHAIN_TC_W64 opcodes or a
/// SI_TCRETURN opcode, where the AMDGPU_Gfx calling convention selects
/// SI_TCRETURN_GFX.
///
/// The assertion rejects the combination "indirect and tail call" unless the
/// calling convention is a chain calling convention.
955static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
956 bool IsTailCall, bool isWave32,
958 // For calls to amdgpu_cs_chain functions, the address is known to be uniform.
959 assert((AMDGPU::isChainCC(CC) || !IsIndirect || !IsTailCall) &&
960 "Indirect calls can't be tail calls, "
961 "because the address can be divergent");
962 if (!IsTailCall)
963 return AMDGPU::G_SI_CALL;
964
 // NOTE(review): the condition guarding the following return is not visible
 // here; presumably it tests AMDGPU::isChainCC(CC) -- confirm.
966 return isWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
967
968 return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
969 AMDGPU::SI_TCRETURN;
970}
971
972// Add operands to call instruction to track the callee.
//
// A register callee is added as a register operand followed by an immediate 0.
// A global callee with a zero offset is first materialized into a register
// with G_GLOBAL_VALUE (as a 64-bit pointer in the global's address space);
// that register is added, followed by the original callee operand.
//
// Returns false for any other kind of callee (including a global with a
// non-zero offset), true otherwise.
974 MachineIRBuilder &MIRBuilder,
976 if (Info.Callee.isReg()) {
977 CallInst.addReg(Info.Callee.getReg());
978 CallInst.addImm(0);
979 } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
980 // The call lowering lightly assumed we can directly encode a call target in
981 // the instruction, which is not the case. Materialize the address here.
982 const GlobalValue *GV = Info.Callee.getGlobal();
983 auto Ptr = MIRBuilder.buildGlobalValue(
984 LLT::pointer(GV->getAddressSpace(), 64), GV);
985 CallInst.addReg(Ptr.getReg(0));
986 CallInst.add(Info.Callee);
987 } else
988 return false;
989
990 return true;
991}
992
995 SmallVectorImpl<ArgInfo> &InArgs) const {
 // Tail-call compatibility check between the caller's and the callee's
 // calling conventions. Identical conventions are accepted immediately.
 // Otherwise two checks must pass: the subset check on the call-preserved
 // register masks of both conventions, and resultsCompatible() on InArgs
 // using the assignment functions of both conventions.
996 const Function &CallerF = MF.getFunction();
997 CallingConv::ID CalleeCC = Info.CallConv;
998 CallingConv::ID CallerCC = CallerF.getCallingConv();
999
1000 // If the calling conventions match, then everything must be the same.
1001 if (CalleeCC == CallerCC)
1002 return true;
1003
1004 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1005
1006 // Make sure that the caller and callee preserve all of the same registers.
1007 const auto *TRI = ST.getRegisterInfo();
1008
1009 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
1010 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
1011 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
1012 return false;
1013
1014 // Check if the caller and callee will handle arguments in the same way.
1015 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
 // Fixed and vararg assignment functions for the callee's convention.
1016 CCAssignFn *CalleeAssignFnFixed;
1017 CCAssignFn *CalleeAssignFnVarArg;
1018 std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) =
1019 getAssignFnsForCC(CalleeCC, TLI);
1020
 // The same pair for the caller's convention.
1021 CCAssignFn *CallerAssignFnFixed;
1022 CCAssignFn *CallerAssignFnVarArg;
1023 std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) =
1024 getAssignFnsForCC(CallerCC, TLI);
1025
1026 // FIXME: We are not accounting for potential differences in implicitly passed
1027 // inputs, but only the fixed ABI is supported now anyway.
1028 IncomingValueAssigner CalleeAssigner(CalleeAssignFnFixed,
1029 CalleeAssignFnVarArg);
1030 IncomingValueAssigner CallerAssigner(CallerAssignFnFixed,
1031 CallerAssignFnVarArg);
1032 return resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner);
1033}
1034
1037 SmallVectorImpl<ArgInfo> &OutArgs) const {
 // Check that the outgoing arguments of a prospective tail call can be
 // passed. With no outgoing arguments the answer is trivially true.
 // Otherwise the arguments are assigned using the callee's calling
 // convention and must (a) be analyzable, (b) need no more stack than the
 // caller's stack argument area provides, and (c) pass the
 // parametersInCSRMatch() check against the caller's preserved mask.
1038 // If there are no outgoing arguments, then we are done.
1039 if (OutArgs.empty())
1040 return true;
1041
1042 const Function &CallerF = MF.getFunction();
1043 CallingConv::ID CalleeCC = Info.CallConv;
1044 CallingConv::ID CallerCC = CallerF.getCallingConv();
1045 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1046
1047 CCAssignFn *AssignFnFixed;
1048 CCAssignFn *AssignFnVarArg;
1049 std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
1050
1051 // We have outgoing arguments. Make sure that we can tail call with them.
1053 CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
1054 OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1055
1056 if (!determineAssignments(Assigner, OutArgs, OutInfo)) {
1057 LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
1058 return false;
1059 }
1060
1061 // Make sure that they can fit on the caller's stack.
1062 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1063 if (OutInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) {
1064 LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
1065 return false;
1066 }
1067
1068 // Verify that the parameters in callee-saved registers match.
1069 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1070 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1071 const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
1073 return parametersInCSRMatch(MRI, CallerPreservedMask, OutLocs, OutArgs);
1074}
1075
1076/// Return true if the calling convention is one that we can guarantee TCO for.
/// Only CallingConv::Fast qualifies.
1078 return CC == CallingConv::Fast;
1079}
1080
1081/// Return true if we might ever do TCO for calls with this calling convention.
/// Conventions listed explicitly in the switch (such as CallingConv::C) are
/// accepted outright; every other convention is accepted only if
/// canGuaranteeTCO(CC) holds.
1083 switch (CC) {
1084 case CallingConv::C:
1086 return true;
1087 default:
1088 return canGuaranteeTCO(CC);
1089 }
1090}
1091
1094 SmallVectorImpl<ArgInfo> &InArgs, SmallVectorImpl<ArgInfo> &OutArgs) const {
 // Decide whether this call may be lowered as a tail call. The call is
 // rejected when it is not marked as a tail call, when the callee is a
 // register, when the caller has no call-preserved mask, when the callee's
 // calling convention fails mayTailCallThisCC(), or when the caller has byval
 // or swifterror arguments. Otherwise both the incoming-value and the
 // outgoing-argument compatibility checks below must pass.
1095 // Must pass all target-independent checks in order to tail call optimize.
1096 if (!Info.IsTailCall)
1097 return false;
1098
1099 // Indirect calls can't be tail calls, because the address can be divergent.
1100 // TODO Check divergence info if the call really is divergent.
1101 if (Info.Callee.isReg())
1102 return false;
1103
1104 MachineFunction &MF = B.getMF();
1105 const Function &CallerF = MF.getFunction();
1106 CallingConv::ID CalleeCC = Info.CallConv;
1107 CallingConv::ID CallerCC = CallerF.getCallingConv();
1108
1109 const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1110 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
1111 // Kernels aren't callable, and don't have a live in return address so it
1112 // doesn't make sense to do a tail call with entry functions.
1113 if (!CallerPreserved)
1114 return false;
1115
1116 if (!mayTailCallThisCC(CalleeCC)) {
1117 LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
1118 return false;
1119 }
1120
 // The check is on the caller's own formal arguments, not on the call's
 // operands.
1121 if (any_of(CallerF.args(), [](const Argument &A) {
1122 return A.hasByValAttr() || A.hasSwiftErrorAttr();
1123 })) {
1124 LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval "
1125 "or swifterror arguments\n");
1126 return false;
1127 }
1128
1129 // If we have -tailcallopt, then we're done.
 // NOTE(review): the condition guarding the following return is not visible
 // here; presumably it tests the GuaranteedTailCallOpt target option --
 // confirm.
1131 return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv();
1132
1133 // Verify that the incoming and outgoing arguments from the callee are
1134 // safe to tail call.
1135 if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
1136 LLVM_DEBUG(
1137 dbgs()
1138 << "... Caller and callee have incompatible calling conventions.\n");
1139 return false;
1140 }
1141
1142 // FIXME: We need to check if any arguments passed in SGPR are uniform. If
1143 // they are not, this cannot be a tail call. If they are uniform, but may be
1144 // VGPR, we need to insert readfirstlanes.
1145 if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
1146 return false;
1147
1148 LLVM_DEBUG(dbgs() << "... Call is eligible for tail call optimization.\n");
1149 return true;
1150}
1151
1152// Insert outgoing implicit arguments for a call, by inserting copies to the
1153// implicit argument registers and adding the necessary implicit uses to the
1154// call instruction.
//
// When flat scratch is not enabled, the function's scratch resource
// descriptor is additionally copied into the register the callee reads it
// from (SGPR48-SGPR51 for chain calling conventions, SGPR0-SGPR3 otherwise)
// and that register is added to the call as an implicit use.
//
// Each entry of ImplicitArgRegs is a (physical register, source virtual
// register) pair: the virtual register is copied into the physical register,
// which is then added to the call as an implicit use.
1157 const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
1158 CallingConv::ID CalleeCC,
1159 ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
1160 if (!ST.enableFlatScratch()) {
1161 // Insert copies for the SRD. In the HSA case, this should be an identity
1162 // copy.
1163 auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32),
1164 FuncInfo.getScratchRSrcReg());
1165
1166 auto CalleeRSrcReg = AMDGPU::isChainCC(CalleeCC)
1167 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
1168 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
1169
1170 MIRBuilder.buildCopy(CalleeRSrcReg, ScratchRSrcReg);
1171 CallInst.addReg(CalleeRSrcReg, RegState::Implicit);
1172 }
1173
1174 for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
1175 MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
1176 CallInst.addReg(ArgReg.first, RegState::Implicit);
1177 }
1178}
1179
1181 MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
1182 SmallVectorImpl<ArgInfo> &OutArgs) const {
 // Emit a call as a tail call. The call instruction is built detached
 // (buildInstrNoInsert), gets its target, EXEC (chain calls only), register
 // mask, argument and implicit operands, and is inserted only once all of
 // them are in place. Returns false if any step cannot be handled; on
 // success Info.LoweredTailCall is set.
1183 MachineFunction &MF = MIRBuilder.getMF();
1184 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1186 const Function &F = MF.getFunction();
1188 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1189
1190 // True when we're tail calling, but without -tailcallopt.
1191 bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
1192
1193 // Find out which ABI gets to decide where things go.
1194 CallingConv::ID CalleeCC = Info.CallConv;
1195 CCAssignFn *AssignFnFixed;
1196 CCAssignFn *AssignFnVarArg;
1197 std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
1198
 // Only non-sibling tail calls get a call sequence; its immediates are
 // filled in near the end of this function once NumBytes is known.
1199 MachineInstrBuilder CallSeqStart;
1200 if (!IsSibCall)
1201 CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);
1202
1203 unsigned Opc =
1204 getCallOpcode(MF, Info.Callee.isReg(), true, ST.isWave32(), CalleeCC);
1205 auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
1206 if (!addCallTargetOperands(MIB, MIRBuilder, Info))
1207 return false;
1208
1209 // Byte offset for the tail call. When we are sibcalling, this will always
1210 // be 0.
1211 MIB.addImm(0);
1212
1213 // If this is a chain call, we need to pass in the EXEC mask.
1214 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1215 if (AMDGPU::isChainCC(Info.CallConv)) {
 // The EXEC value is taken from the second original argument.
1216 ArgInfo ExecArg = Info.OrigArgs[1];
1217 assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC");
1218
 // EXEC must be an integer exactly as wide as the wavefront.
1219 if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize()))
1220 return false;
1221
 // A constant EXEC value is encoded as an immediate; otherwise its
 // register is added and constrained to the operand's register class.
1222 if (const auto *CI = dyn_cast<ConstantInt>(ExecArg.OrigValue)) {
1223 MIB.addImm(CI->getSExtValue());
1224 } else {
1225 MIB.addReg(ExecArg.Regs[0]);
1226 unsigned Idx = MIB->getNumOperands() - 1;
1227 MIB->getOperand(Idx).setReg(constrainOperandRegClass(
1228 MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
1229 MIB->getDesc(), MIB->getOperand(Idx), Idx));
1230 }
1231 }
1232
1233 // Tell the call which registers are clobbered.
1234 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
1235 MIB.addRegMask(Mask);
1236
1237 // FPDiff is the byte offset of the call's argument area from the callee's.
1238 // Stores to callee stack arguments will be placed in FixedStackSlots offset
1239 // by this amount for a tail call. In a sibling call it must be 0 because the
1240 // caller will deallocate the entire stack and the callee still expects its
1241 // arguments to begin at SP+0.
1242 int FPDiff = 0;
1243
1244 // This will be 0 for sibcalls, potentially nonzero for tail calls produced
1245 // by -tailcallopt. For sibcalls, the memory operands for the call are
1246 // already available in the caller's incoming argument space.
1247 unsigned NumBytes = 0;
1248 if (!IsSibCall) {
1249 // We aren't sibcalling, so we need to compute FPDiff. We need to do this
1250 // before handling assignments, because FPDiff must be known for memory
1251 // arguments.
1252 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
1254 CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
1255
1256 // FIXME: Not accounting for callee implicit inputs
1257 OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg);
1258 if (!determineAssignments(CalleeAssigner, OutArgs, OutInfo))
1259 return false;
1260
1261 // The callee will pop the argument stack as a tail call. Thus, we must
1262 // keep it 16-byte aligned.
1263 NumBytes = alignTo(OutInfo.getStackSize(), ST.getStackAlignment());
1264
1265 // FPDiff will be negative if this tail call requires more space than we
1266 // would automatically have in our incoming argument space. Positive if we
1267 // actually shrink the stack.
1268 FPDiff = NumReusableBytes - NumBytes;
1269
1270 // The stack pointer must be 16-byte aligned at all times it's used for a
1271 // memory operation, which in practice means at *all* times and in
1272 // particular across call boundaries. Therefore our own arguments started at
1273 // a 16-byte aligned SP and the delta applied for the tail call should
1274 // satisfy the same constraint.
1275 assert(isAligned(ST.getStackAlignment(), FPDiff) &&
1276 "unaligned stack on tail call");
1277 }
1278
1280 CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1281
1282 // We could pass MIB and directly add the implicit uses to the call
1283 // now. However, as an aesthetic choice, place implicit argument operands
1284 // after the ordinary user argument registers.
1286
 // Special inputs are skipped for AMDGPU_Gfx and for chain calling
 // conventions.
1287 if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
1288 !AMDGPU::isChainCC(Info.CallConv)) {
1289 // With a fixed ABI, allocate fixed registers before user arguments.
1290 if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
1291 return false;
1292 }
1293
1294 OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1295
1296 if (!determineAssignments(Assigner, OutArgs, CCInfo))
1297 return false;
1298
1299 // Do the actual argument marshalling.
1300 AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, true, FPDiff);
1301 if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
1302 return false;
1303
1304 if (Info.ConvergenceCtrlToken) {
1305 MIB.addUse(Info.ConvergenceCtrlToken, RegState::Implicit);
1306 }
1307 handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, CalleeCC,
1308 ImplicitArgRegs);
1309
1310 // If we have -tailcallopt, we need to adjust the stack. We'll do the call
1311 // sequence start and end here.
1312 if (!IsSibCall) {
1313 MIB->getOperand(1).setImm(FPDiff);
1314 CallSeqStart.addImm(NumBytes).addImm(0);
1315 // End the call sequence *before* emitting the call. Normally, we would
1316 // tidy the frame up after the call. However, here, we've laid out the
1317 // parameters so that when SP is reset, they will be in the correct
1318 // location.
1319 MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN).addImm(NumBytes).addImm(0);
1320 }
1321
1322 // Now we can add the actual call instruction to the correct basic block.
1323 MIRBuilder.insertInstr(MIB);
1324
1325 // If Callee is a reg, since it is used by a target specific
1326 // instruction, it must have a register class matching the
1327 // constraint of that instruction.
1328
1329 // FIXME: We should define regbankselectable call instructions to handle
1330 // divergent call targets.
1331 if (MIB->getOperand(0).isReg()) {
1333 MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
1334 MIB->getDesc(), MIB->getOperand(0), 0));
1335 }
1336
 // Record on the call info that the call was lowered as a tail call.
1338 Info.LoweredTailCall = true;
1339 return true;
1340}
1341
1342/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
///
/// The intrinsic's operands are read positionally: operand 0 is the function
/// to jump to, operand 2 holds the SGPR arguments, operand 3 the VGPR
/// arguments and operand 4 the flags (which must be a zero constant). Exactly
/// five operands are supported. The call info is rewritten to describe a
/// direct or register call to the real callee and then handed to
/// lowerTailCall() as a musttail call.
1344 CallLoweringInfo &Info) const {
1345 ArgInfo Callee = Info.OrigArgs[0];
1346 ArgInfo SGPRArgs = Info.OrigArgs[2];
1347 ArgInfo VGPRArgs = Info.OrigArgs[3];
1348 ArgInfo Flags = Info.OrigArgs[4];
1349
1350 assert(cast<ConstantInt>(Flags.OrigValue)->isZero() &&
1351 "Non-zero flags aren't supported yet.");
1352 assert(Info.OrigArgs.size() == 5 && "Additional args aren't supported yet.");
1353
1354 MachineFunction &MF = MIRBuilder.getMF();
1355 const Function &F = MF.getFunction();
1356 const DataLayout &DL = F.getDataLayout();
1357
1358 // The function to jump to is actually the first argument, so we'll change the
1359 // Callee and other info to match that before using our existing helper.
1360 const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
 // A statically known function supplies its own calling convention; any
 // other callee value is called through its register.
1361 if (const Function *F = dyn_cast<Function>(CalleeV)) {
1362 Info.Callee = MachineOperand::CreateGA(F, 0);
1363 Info.CallConv = F->getCallingConv();
1364 } else {
1365 assert(Callee.Regs.size() == 1 && "Too many regs for the callee");
1366 Info.Callee = MachineOperand::CreateReg(Callee.Regs[0], false);
1367 Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve
1368 // behaves the same here.
1369 }
1370
1371 // The function that we're calling cannot be vararg (only the intrinsic is).
1372 Info.IsVarArg = false;
1373
1374 assert(
1375 all_of(SGPRArgs.Flags, [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
1376 "SGPR arguments should be marked inreg");
1377 assert(
1378 none_of(VGPRArgs.Flags, [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
1379 "VGPR arguments should not be marked inreg");
1380
 // Outgoing arguments are the split SGPR arguments followed by the split
 // VGPR arguments.
1382 splitToValueTypes(SGPRArgs, OutArgs, DL, Info.CallConv);
1383 splitToValueTypes(VGPRArgs, OutArgs, DL, Info.CallConv);
1384
1385 Info.IsMustTailCall = true;
1386 return lowerTailCall(MIRBuilder, Info, OutArgs);
1387}
1388
1390 CallLoweringInfo &Info) const {
 // Lower a call. A call whose callee is an intrinsic is expected to be
 // llvm.amdgcn.cs.chain and is forwarded to lowerChainCall(). Variadic calls
 // are rejected. Calls that are eligible for tail call optimization go to
 // lowerTailCall(); a musttail call that is not eligible fails. Everything
 // else is emitted as a regular call bracketed by ADJCALLSTACKUP and
 // ADJCALLSTACKDOWN.
1391 if (Function *F = Info.CB->getCalledFunction())
1392 if (F->isIntrinsic()) {
1393 assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
1394 "Unexpected intrinsic");
1395 return lowerChainCall(MIRBuilder, Info);
1396 }
1397
1398 if (Info.IsVarArg) {
1399 LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
1400 return false;
1401 }
1402
1403 MachineFunction &MF = MIRBuilder.getMF();
1404 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1405 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1406
1407 const Function &F = MF.getFunction();
1409 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1410 const DataLayout &DL = F.getDataLayout();
1411
 // Split every original argument into pieces the calling convention can
 // process.
1413 for (auto &OrigArg : Info.OrigArgs)
1414 splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);
1415
 // The return value is only split when it is non-void and can be lowered
 // directly.
1417 if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())
1418 splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv);
1419
1420 // If we can lower as a tail call, do that instead.
1421 bool CanTailCallOpt =
1422 isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs);
1423
1424 // We must emit a tail call if we have musttail.
1425 if (Info.IsMustTailCall && !CanTailCallOpt) {
1426 LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
1427 return false;
1428 }
1429
1430 Info.IsTailCall = CanTailCallOpt;
1431 if (CanTailCallOpt)
1432 return lowerTailCall(MIRBuilder, Info, OutArgs);
1433
1434 // Find out which ABI gets to decide where things go.
1435 CCAssignFn *AssignFnFixed;
1436 CCAssignFn *AssignFnVarArg;
1437 std::tie(AssignFnFixed, AssignFnVarArg) =
1438 getAssignFnsForCC(Info.CallConv, TLI);
1439
1440 MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
1441 .addImm(0)
1442 .addImm(0);
1443
1444 // Create a temporarily-floating call instruction so we can add the implicit
1445 // uses of arg registers.
1446 unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, ST.isWave32(),
1447 Info.CallConv);
1448
1449 auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
 // The call defines the return address register.
1450 MIB.addDef(TRI->getReturnAddressReg(MF));
1451
1452 if (!Info.IsConvergent)
1454
1455 if (!addCallTargetOperands(MIB, MIRBuilder, Info))
1456 return false;
1457
1458 // Tell the call which registers are clobbered.
1459 const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
1460 MIB.addRegMask(Mask);
1461
1463 CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1464
1465 // We could pass MIB and directly add the implicit uses to the call
1466 // now. However, as an aesthetic choice, place implicit argument operands
1467 // after the ordinary user argument registers.
1469
1470 if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
1471 // With a fixed ABI, allocate fixed registers before user arguments.
1472 if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
1473 return false;
1474 }
1475
1476 // Do the actual argument marshalling.
1477 SmallVector<Register, 8> PhysRegs;
1478
1479 OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1480 if (!determineAssignments(Assigner, OutArgs, CCInfo))
1481 return false;
1482
1483 AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, false);
1484 if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
1485 return false;
1486
1488
1489 if (Info.ConvergenceCtrlToken) {
1490 MIB.addUse(Info.ConvergenceCtrlToken, RegState::Implicit);
1491 }
1492 handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, Info.CallConv,
1493 ImplicitArgRegs);
1494
1495 // Get a count of how many bytes are to be pushed on the stack.
1496 unsigned NumBytes = CCInfo.getStackSize();
1497
1498 // If Callee is a reg, since it is used by a target specific
1499 // instruction, it must have a register class matching the
1500 // constraint of that instruction.
1501
1502 // FIXME: We should define regbankselectable call instructions to handle
1503 // divergent call targets.
 // Operand 0 is the return address def added above, so the callee is
 // operand 1 here.
1504 if (MIB->getOperand(1).isReg()) {
1505 MIB->getOperand(1).setReg(constrainOperandRegClass(
1506 MF, *TRI, MRI, *ST.getInstrInfo(),
1507 *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
1508 1));
1509 }
1510
1511 // Now we can add the actual call instruction to the correct position.
1512 MIRBuilder.insertInstr(MIB);
1513
1514 // Finally we can copy the returned value back into its virtual-register. In
1515 // symmetry with the arguments, the physical register must be an
1516 // implicit-define of the call instruction.
1517 if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
1518 CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
1519 Info.IsVarArg);
1520 IncomingValueAssigner Assigner(RetAssignFn);
1521 CallReturnHandler Handler(MIRBuilder, MRI, MIB);
1522 if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
1523 Info.CallConv, Info.IsVarArg))
1524 return false;
1525 }
1526
 // The outgoing argument stack size is passed as the second immediate of
 // ADJCALLSTACKDOWN.
1527 uint64_t CalleePopBytes = NumBytes;
1528
1529 MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN)
1530 .addImm(0)
1531 .addImm(CalleePopBytes);
1532
 // When the return value could not be lowered directly, load it back from
 // the demoted stack slot into the original return registers.
1533 if (!Info.CanLowerReturn) {
1534 insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
1535 Info.DemoteRegister, Info.DemoteStackIndex);
1536 }
1537
1538 return true;
1539}
unsigned const MachineRegisterInfo * MRI
static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, bool IsTailCall, std::optional< CallLowering::PtrAuthInfo > &PAI, MachineRegisterInfo &MRI)
static std::pair< CCAssignFn *, CCAssignFn * > getAssignFnsForCC(CallingConv::ID CC, const AArch64TargetLowering &TLI)
Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for CC.
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static bool addCallTargetOperands(MachineInstrBuilder &CallInst, MachineIRBuilder &MIRBuilder, AMDGPUCallLowering::CallLoweringInfo &Info)
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc)
static void allocateHSAUserSGPRs(CCState &CCInfo, MachineIRBuilder &B, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
This file describes how to lower LLVM calls to machine code calls.
static const LLT S32
This file declares the targeting of the Machinelegalizer class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
uint64_t Size
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file declares the MachineIRBuilder class.
unsigned const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, SmallVectorImpl< ArgInfo > &OutArgs) const
bool isEligibleForTailCallOptimization(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, SmallVectorImpl< ArgInfo > &InArgs, SmallVectorImpl< ArgInfo > &OutArgs) const
Returns true if the call can be lowered as a tail call.
bool lowerFormalArgumentsKernel(MachineIRBuilder &B, const Function &F, ArrayRef< ArrayRef< Register > > VRegs) const
bool lowerReturn(MachineIRBuilder &B, const Value *Val, ArrayRef< Register > VRegs, FunctionLoweringInfo &FLI) const override
This hook behaves as the extended lowerReturn function, but for targets that do not support swifterro...
void handleImplicitCallArguments(MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst, const GCNSubtarget &ST, const SIMachineFunctionInfo &MFI, CallingConv::ID CalleeCC, ArrayRef< std::pair< MCRegister, Register > > ImplicitArgRegs) const
bool areCalleeOutgoingArgsTailCallable(CallLoweringInfo &Info, MachineFunction &MF, SmallVectorImpl< ArgInfo > &OutArgs) const
bool lowerChainCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const
Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
bool passSpecialInputs(MachineIRBuilder &MIRBuilder, CCState &CCInfo, SmallVectorImpl< std::pair< MCRegister, Register > > &ArgRegs, CallLoweringInfo &Info) const
bool lowerFormalArguments(MachineIRBuilder &B, const Function &F, ArrayRef< ArrayRef< Register > > VRegs, FunctionLoweringInfo &FLI) const override
This hook must be implemented to lower the incoming (formal) arguments, described by VRegs,...
bool lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const override
This hook must be implemented to lower the given call instruction, including argument and return valu...
bool doCallerAndCalleePassArgsTheSameWay(CallLoweringInfo &Info, MachineFunction &MF, SmallVectorImpl< ArgInfo > &InArgs) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
CCState - This class holds information needed while lowering arguments and return values.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
This class represents a function call, abstracting a target machine's calling convention.
void insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy, ArrayRef< Register > VRegs, Register DemoteReg, int FI) const
Load the returned value from the stack into virtual registers in VRegs.
bool handleAssignments(ValueHandler &Handler, SmallVectorImpl< ArgInfo > &Args, CCState &CCState, SmallVectorImpl< CCValAssign > &ArgLocs, MachineIRBuilder &MIRBuilder, ArrayRef< Register > ThisReturnRegs={}) const
Use Handler to insert code to handle the argument/return values represented by Args.
bool resultsCompatible(CallLoweringInfo &Info, MachineFunction &MF, SmallVectorImpl< ArgInfo > &InArgs, ValueAssigner &CalleeAssigner, ValueAssigner &CallerAssigner) const
void splitToValueTypes(const ArgInfo &OrigArgInfo, SmallVectorImpl< ArgInfo > &SplitArgs, const DataLayout &DL, CallingConv::ID CallConv, SmallVectorImpl< uint64_t > *Offsets=nullptr) const
Break OrigArgInfo into one or more pieces the calling convention can process, returned in SplitArgs.
void insertSRetIncomingArgument(const Function &F, SmallVectorImpl< ArgInfo > &SplitArgs, Register &DemoteReg, MachineRegisterInfo &MRI, const DataLayout &DL) const
Insert the hidden sret ArgInfo to the beginning of SplitArgs.
bool determineAndHandleAssignments(ValueHandler &Handler, ValueAssigner &Assigner, SmallVectorImpl< ArgInfo > &Args, MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, bool IsVarArg, ArrayRef< Register > ThisReturnRegs={}) const
Invoke ValueAssigner::assignArg on each of the given Args and then use Handler to move them to the as...
void insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy, ArrayRef< Register > VRegs, Register DemoteReg) const
Store the return value given by VRegs into stack starting at the offset specified in DemoteReg.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ArgInfo > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
bool determineAssignments(ValueAssigner &Assigner, SmallVectorImpl< ArgInfo > &Args, CCState &CCInfo) const
Analyze the argument list in Args, using Assigner to populate CCInfo.
bool checkReturn(CCState &CCInfo, SmallVectorImpl< BaseArgInfo > &Outs, CCAssignFn *Fn) const
void setArgFlags(ArgInfo &Arg, unsigned OpIdx, const DataLayout &DL, const FuncInfoTy &FuncInfo) const
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
iterator_range< arg_iterator > args()
Definition: Function.h:898
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:277
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:291
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
unsigned getAddressSpace() const
Definition: GlobalValue.h:206
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:264
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
Definition: LowLevelType.h:64
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
constexpr bool isVector() const
Definition: LowLevelType.h:148
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:190
constexpr bool isPointer() const
Definition: LowLevelType.h:149
constexpr ElementCount getElementCount() const
Definition: LowLevelType.h:183
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
void setHasTailCall(bool V=true)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual register for it.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
Helper class to build MachineInstr.
MachineInstrBuilder insertInstr(MachineInstrBuilder MIB)
Insert an existing instruction at the insertion point.
MachineInstrBuilder buildGlobalValue(const DstOp &Res, const GlobalValue *GV)
Build and insert Res = G_GLOBAL_VALUE GV.
MachineInstrBuilder buildUndef(const DstOp &Res)
Build and insert Res = IMPLICIT_DEF.
MachineInstrBuilder buildPtrAdd(const DstOp &Res, const SrcOp &Op0, const SrcOp &Op1, std::optional< unsigned > Flags=std::nullopt)
Build and insert Res = G_PTR_ADD Op0, Op1.
MachineInstrBuilder buildShl(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1, std::optional< unsigned > Flags=std::nullopt)
MachineInstrBuilder buildStore(const SrcOp &Val, const SrcOp &Addr, MachineMemOperand &MMO)
Build and insert G_STORE Val, Addr, MMO.
MachineInstrBuilder buildInstr(unsigned Opcode)
Build and insert <empty> = Opcode <empty>.
MachineInstrBuilder buildFrameIndex(const DstOp &Res, int Idx)
Build and insert Res = G_FRAME_INDEX Idx.
MachineFunction & getMF()
Getter for the function we currently build.
MachineInstrBuilder buildAnyExt(const DstOp &Res, const SrcOp &Op)
Build and insert Res = G_ANYEXT Op0.
MachineInstrBuilder buildOr(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1, std::optional< unsigned > Flags=std::nullopt)
Build and insert Res = G_OR Op0, Op1.
MachineInstrBuilder buildInstrNoInsert(unsigned Opcode)
Build but don't insert <empty> = Opcode <empty>.
MachineInstrBuilder buildCopy(const DstOp &Res, const SrcOp &Op)
Build and insert Res = COPY Op.
virtual MachineInstrBuilder buildConstant(const DstOp &Res, const ConstantInt &Val)
Build and insert Res = G_CONSTANT Val.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:587
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
MCRegister getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const
AMDGPUFunctionArgInfo & getArgInfo()
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:573
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs with the length computed at compile time.
Definition: StringRef.h:853
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ PRIVATE_ADDRESS
Address space for private memory.
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isChainCC(CallingConv::ID CC)
bool isShader(CallingConv::ID cc)
bool isGraphics(CallingConv::ID cc)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ Implicit
Not emitted register (e.g. carry, or temporary result).
Reg
All possible values of the reg field in the ModR/M byte.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
Definition: Utils.cpp:56
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
bool isAligned(Align Lhs, uint64_t SizeInBytes)
Checks that SizeInBytes is a multiple of the alignment.
Definition: Alignment.h:145
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual underlying non-aggregate types that comprise it.
Definition: Analysis.cpp:79
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
LLT getLLTForType(Type &Ty, const DataLayout &DL)
Construct a low-level type based on an LLVM type.
Align inferAlignFromPtrInfo(MachineFunction &MF, const MachinePointerInfo &MPO)
Definition: Utils.cpp:893
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
const Value * OrigValue
Optionally track the original IR value for the argument.
Definition: CallLowering.h:73
SmallVector< Register, 4 > Regs
Definition: CallLowering.h:63
SmallVector< ISD::ArgFlagsTy, 4 > Flags
Definition: CallLowering.h:51
Base class for ValueHandlers used for arguments coming into the current function, or for return value...
Definition: CallLowering.h:331
void assignValueToReg(Register ValVReg, Register PhysReg, const CCValAssign &VA) override
Provides a default implementation for argument handling.
Register buildExtensionHint(const CCValAssign &VA, Register SrcReg, LLT NarrowTy)
Insert G_ASSERT_ZEXT/G_ASSERT_SEXT or other hint instruction based on VA, returning the new register if a hint was inserted.
Base class for ValueHandlers used for arguments passed to a function call, or for return values.
Definition: CallLowering.h:347
uint64_t StackSize
The size of the currently allocated portion of the stack.
Definition: CallLowering.h:217
virtual Register getStackAddress(uint64_t MemSize, int64_t Offset, MachinePointerInfo &MPO, ISD::ArgFlagsTy Flags)=0
Materialize a VReg containing the address of the specified stack-based object.
virtual void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy, const MachinePointerInfo &MPO, const CCValAssign &VA)=0
The specified value has been assigned to a stack location.
Register extendRegister(Register ValReg, const CCValAssign &VA, unsigned MaxSizeBits=0)
Extend a register to the location type given in VA, capped at extending to at most MaxSize bits.
virtual void assignValueToReg(Register ValVReg, Register PhysReg, const CCValAssign &VA)=0
The specified value has been assigned to a physical register, handle the appropriate COPY (either to or from) and mark any relevant uses/defines as needed.
Extended Value Type.
Definition: ValueTypes.h:35
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
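This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.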
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117