AMDGPUPromoteAlloca.cpp
1//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Eliminates allocas by either converting them into vectors or by migrating
10// them to local address space.
11//
12// Two passes are exposed by this file:
13// - "promote-alloca-to-vector", which runs early in the pipeline and only
14// promotes to vector. Promotion to vector is almost always profitable
15// except when the alloca is too big and the promotion would result in
16// very high register pressure.
17// - "promote-alloca", which does both promotion to vector and LDS and runs
18// much later in the pipeline. This runs after SROA because promoting to
19// LDS is of course less profitable than getting rid of the alloca or
20// vectorizing it, thus we only want to do it when the only alternative is
21// lowering the alloca to stack.
22//
23// Note that both of them exist for the old and new PMs. The new PM passes are
24// declared in AMDGPU.h and the legacy PM ones are declared here.
25//
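//
// As a rough illustration of the first transformation (the IR below is a
// hypothetical sketch, not taken from an actual test), a small private array
//
//   %stack = alloca [4 x float], align 4, addrspace(5)
//   %gep   = getelementptr [4 x float], ptr addrspace(5) %stack, i32 0, i32 %i
//   store float %v, ptr addrspace(5) %gep
//   %r     = load float, ptr addrspace(5) %gep
//
// is rewritten into pure SSA vector operations, roughly
//
//   %vec.new = insertelement <4 x float> %vec, float %v, i32 %i
//   %r       = extractelement <4 x float> %vec.new, i32 %i
//
// so that no scratch memory is ever allocated for %stack.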
26//===----------------------------------------------------------------------===//
27
28#include "AMDGPU.h"
29#include "GCNSubtarget.h"
31#include "llvm/ADT/STLExtras.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Pass.h"
47
48#define DEBUG_TYPE "amdgpu-promote-alloca"
49
50using namespace llvm;
51
52namespace {
53
54static cl::opt<bool>
55 DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
56 cl::desc("Disable promote alloca to vector"),
57 cl::init(false));
58
59static cl::opt<bool>
60 DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
61 cl::desc("Disable promote alloca to LDS"),
62 cl::init(false));
63
64static cl::opt<unsigned> PromoteAllocaToVectorLimit(
65 "amdgpu-promote-alloca-to-vector-limit",
66 cl::desc("Maximum byte size to consider promote alloca to vector"),
67 cl::init(0));
68
69static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
70 "amdgpu-promote-alloca-to-vector-max-regs",
72 "Maximum vector size (in 32b registers) to use when promoting alloca"),
73 cl::init(32));
74
75// Use up to 1/4 of available register budget for vectorization.
76// FIXME: Increase the limit for whole function budgets? Perhaps x2?
77static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
78 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
79 cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
80 cl::init(4));
81
82static cl::opt<unsigned>
 83 LoopUserWeight("promote-alloca-vector-loop-user-weight",
84 cl::desc("The bonus weight of users of allocas within loop "
85 "when sorting profitable allocas"),
86 cl::init(4));
87
88// Shared implementation which can do both promotion to vector and to LDS.
89class AMDGPUPromoteAllocaImpl {
90private:
91 const TargetMachine &TM;
92 LoopInfo &LI;
93 Module *Mod = nullptr;
94 const DataLayout *DL = nullptr;
95
96 // FIXME: This should be per-kernel.
97 uint32_t LocalMemLimit = 0;
98 uint32_t CurrentLocalMemUsage = 0;
99 unsigned MaxVGPRs;
100 unsigned VGPRBudgetRatio;
101 unsigned MaxVectorRegs;
102
103 bool IsAMDGCN = false;
104 bool IsAMDHSA = false;
105
106 std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
107 Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
108
109 /// BaseAlloca is the alloca root the search started from.
110 /// Val may be that alloca or a recursive user of it.
111 bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
112 std::vector<Value *> &WorkList) const;
113
114 /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
115 /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
116 /// Returns true if both operands are derived from the same alloca. Val should
117 /// be the same value as one of the input operands of UseInst.
118 bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
119 Instruction *UseInst, int OpIdx0,
120 int OpIdx1) const;
121
122 /// Check whether we have enough local memory for promotion.
123 bool hasSufficientLocalMem(const Function &F);
124
125 bool tryPromoteAllocaToVector(AllocaInst &I);
126 bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
127
128 void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
129
130 void setFunctionLimits(const Function &F);
131
132public:
133 AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
134
135 const Triple &TT = TM.getTargetTriple();
136 IsAMDGCN = TT.isAMDGCN();
137 IsAMDHSA = TT.getOS() == Triple::AMDHSA;
138 }
139
140 bool run(Function &F, bool PromoteToLDS);
141};
142
143// FIXME: This can create globals so should be a module pass.
144class AMDGPUPromoteAlloca : public FunctionPass {
145public:
146 static char ID;
147
148 AMDGPUPromoteAlloca() : FunctionPass(ID) {}
149
150 bool runOnFunction(Function &F) override {
151 if (skipFunction(F))
152 return false;
153 if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
154 return AMDGPUPromoteAllocaImpl(
155 TPC->getTM<TargetMachine>(),
156 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
157 .run(F, /*PromoteToLDS*/ true);
158 return false;
159 }
160
161 StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
162
163 void getAnalysisUsage(AnalysisUsage &AU) const override {
164 AU.setPreservesCFG();
 165 AU.addRequired<LoopInfoWrapperPass>();
 166 FunctionPass::getAnalysisUsage(AU);
 167 }
168};
169
170static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
171 const Function &F) {
172 if (!TM.getTargetTriple().isAMDGCN())
173 return 128;
174
175 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
176
177 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
178 // Temporarily check both the attribute and the subtarget feature, until the
179 // latter is removed.
180 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
181 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
182
183 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
184 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
185 DynamicVGPRBlockSize);
186
187 // A non-entry function has only 32 caller preserved registers.
188 // Do not promote alloca which will force spilling unless we know the function
189 // will be inlined.
190 if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
191 !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
192 MaxVGPRs = std::min(MaxVGPRs, 32u);
193 return MaxVGPRs;
194}
195
196} // end anonymous namespace
197
198char AMDGPUPromoteAlloca::ID = 0;
199
201 "AMDGPU promote alloca to vector or LDS", false, false)
202// Move LDS uses from functions to kernels before promote alloca for accurate
203// estimation of LDS available
204INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
205INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
206INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
207 "AMDGPU promote alloca to vector or LDS", false, false)
208
209char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
210
211PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
212 FunctionAnalysisManager &AM) {
 213 auto &LI = AM.getResult<LoopAnalysis>(F);
214 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
215 if (Changed) {
 216 PreservedAnalyses PA;
 217 PA.preserveSet<CFGAnalyses>();
 218 return PA;
219 }
220 return PreservedAnalyses::all();
221}
222
223PreservedAnalyses
224AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
 225 auto &LI = AM.getResult<LoopAnalysis>(F);
226 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
227 if (Changed) {
 228 PreservedAnalyses PA;
 229 PA.preserveSet<CFGAnalyses>();
 230 return PA;
231 }
232 return PreservedAnalyses::all();
233}
234
235FunctionPass *llvm::createAMDGPUPromoteAlloca() {
 236 return new AMDGPUPromoteAlloca();
237}
238
239static void collectAllocaUses(AllocaInst &Alloca,
 240 SmallVectorImpl<Use *> &Uses) {
 241 SmallVector<Instruction *, 4> WorkList({&Alloca});
242 while (!WorkList.empty()) {
243 auto *Cur = WorkList.pop_back_val();
244 for (auto &U : Cur->uses()) {
245 Uses.push_back(&U);
246
247 if (isa<GetElementPtrInst>(U.getUser()))
248 WorkList.push_back(cast<Instruction>(U.getUser()));
249 }
250 }
251}
252
253void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
 254 SmallVectorImpl<AllocaInst *> &Allocas) {
 255 DenseMap<AllocaInst *, unsigned> Scores;
 256
257 for (auto *Alloca : Allocas) {
258 LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
259 unsigned &Score = Scores[Alloca];
260 // Increment score by one for each user + a bonus for users within loops.
 261 SmallVector<Use *, 8> Uses;
 262 collectAllocaUses(*Alloca, Uses);
263 for (auto *U : Uses) {
264 Instruction *Inst = cast<Instruction>(U->getUser());
265 if (isa<GetElementPtrInst>(Inst))
266 continue;
267 unsigned UserScore =
268 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
269 LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
270 Score += UserScore;
271 }
272 LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
273 }
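 // Illustrative arithmetic (hypothetical numbers): with the default
 // LoopUserWeight of 4, a load of the alloca at loop depth 2 adds
 // 1 + 4 * 2 = 9 to the score, while a store outside any loop adds only
 // 1 + 4 * 0 = 1, so allocas that are heavily used inside loops end up at
 // the front of the worklist when the vectorization budget is tight.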
274
275 stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
276 return Scores.at(A) > Scores.at(B);
277 });
278
279 // clang-format off
 280 LLVM_DEBUG(
 281 dbgs() << "Sorted Worklist:\n";
282 for (auto *A: Allocas)
283 dbgs() << " " << *A << "\n";
284 );
285 // clang-format on
286}
287
288void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
289 // Load per function limits, overriding with global options where appropriate.
290 // R600 register tuples/aliasing are fragile with large vector promotions so
291 // apply architecture specific limit here.
292 const int R600MaxVectorRegs = 16;
293 MaxVectorRegs = F.getFnAttributeAsParsedInteger(
294 "amdgpu-promote-alloca-to-vector-max-regs",
295 IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
296 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
297 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
298 VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
299 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
300 PromoteAllocaToVectorVGPRRatio);
301 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
302 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
303}
304
305bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
306 Mod = F.getParent();
307 DL = &Mod->getDataLayout();
308
 309 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
 310 if (!ST.isPromoteAllocaEnabled())
311 return false;
312
313 bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
314 MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
315 setFunctionLimits(F);
316
317 unsigned VectorizationBudget =
318 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
319 : (MaxVGPRs * 32)) /
320 VGPRBudgetRatio;
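 // Illustrative arithmetic (hypothetical kernel): with MaxVGPRs = 256 and the
 // default VGPRBudgetRatio of 4, the budget is 256 * 32 / 4 = 2048 bits,
 // i.e. enough for 64 promoted i32 elements in total. Passing
 // -amdgpu-promote-alloca-to-vector-limit=128 (bytes) instead yields
 // 128 * 8 / 4 = 256 bits.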
321
 322 SmallVector<AllocaInst *, 16> Allocas;
 323 for (Instruction &I : F.getEntryBlock()) {
324 if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
325 // Array allocations are probably not worth handling, since an allocation
326 // of the array type is the canonical form.
327 if (!AI->isStaticAlloca() || AI->isArrayAllocation())
328 continue;
329 Allocas.push_back(AI);
330 }
331 }
332
333 sortAllocasToPromote(Allocas);
334
335 bool Changed = false;
336 for (AllocaInst *AI : Allocas) {
337 const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
338 // First, check if we have enough budget to vectorize this alloca.
339 if (AllocaCost <= VectorizationBudget) {
340 // If we do, attempt vectorization, otherwise, fall through and try
341 // promoting to LDS instead.
342 if (tryPromoteAllocaToVector(*AI)) {
343 Changed = true;
344 assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
345 "Underflow!");
346 VectorizationBudget -= AllocaCost;
347 LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
348 << VectorizationBudget << "\n");
349 continue;
350 }
351 } else {
352 LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
353 << AllocaCost << ", budget:" << VectorizationBudget
354 << "): " << *AI << "\n");
355 }
356
357 if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
358 Changed = true;
359 }
360
361 // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
362 // dangling pointers. If we want to reuse it past this point, the loop above
363 // would need to be updated to remove successfully promoted allocas.
364
365 return Changed;
366}
367
368struct MemTransferInfo {
 369 ConstantInt *SrcIndex = nullptr;
370 ConstantInt *DestIndex = nullptr;
371};
372
373// Checks if the instruction I is a memset user of the alloca AI that we can
374// deal with. Currently, only non-volatile memsets that affect the whole alloca
375// are handled.
376static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
 377 const DataLayout &DL) {
378 using namespace PatternMatch;
379 // For now we only care about non-volatile memsets that affect the whole type
380 // (start at index 0 and fill the whole alloca).
381 //
382 // TODO: Now that we moved to PromoteAlloca we could handle any memsets
383 // (except maybe volatile ones?) - we just need to use shufflevector if it
384 // only affects a subset of the vector.
385 const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
386 return I->getOperand(0) == AI &&
387 match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
388}
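// For illustration of the check above (hypothetical IR): given
//   %a = alloca [8 x i16], align 2, addrspace(5)
// the call
//   call void @llvm.memset.p5.i32(ptr addrspace(5) %a, i8 0, i32 16, i1 false)
// is accepted because it is non-volatile, targets the alloca itself, and its
// length equals the 16-byte store size of the alloca, whereas a memset of
// fewer bytes or one through a GEP into the middle of %a is rejected.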
389
390static Value *calculateVectorIndex(
 391 Value *Ptr, const std::map<GetElementPtrInst *, WeakTrackingVH> &GEPIdx) {
392 auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
393 if (!GEP)
394 return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
395
396 auto I = GEPIdx.find(GEP);
397 assert(I != GEPIdx.end() && "Must have entry for GEP!");
398
399 Value *IndexValue = I->second;
400 assert(IndexValue && "index value missing from GEP index map");
401 return IndexValue;
402}
403
404static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
 405 Type *VecElemTy, const DataLayout &DL,
406 SmallVector<Instruction *> &NewInsts) {
407 // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
408 // helper.
409 unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
 410 SmallMapVector<Value *, APInt, 4> VarOffsets;
 411 APInt ConstOffset(BW, 0);
412
413 // Walk backwards through nested GEPs to collect both constant and variable
414 // offsets, so that nested vector GEP chains can be lowered in one step.
415 //
416 // Given this IR fragment as input:
417 //
418 // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
419 // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
420 // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
421 // %3 = load i32, ptr addrspace(5) %2, align 4
422 //
423 // Combine both GEP operations in a single pass, producing:
424 // BasePtr = %0
425 // ConstOffset = 4
426 // VarOffsets = { %j -> element_size(<2 x i32>) }
427 //
428 // That lets us emit a single buffer_load directly into a VGPR, without ever
429 // allocating scratch memory for the intermediate pointer.
430 Value *CurPtr = GEP;
431 while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
432 if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
433 return nullptr;
434
435 // Move to the next outer pointer.
436 CurPtr = CurGEP->getPointerOperand();
437 }
438
439 assert(CurPtr == Alloca && "GEP not based on alloca");
440
441 unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
442 if (VarOffsets.size() > 1)
443 return nullptr;
444
445 APInt IndexQuot;
446 uint64_t Rem;
447 APInt::udivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
448 if (Rem != 0)
449 return nullptr;
450 if (VarOffsets.size() == 0)
451 return ConstantInt::get(GEP->getContext(), IndexQuot);
452
453 IRBuilder<> Builder(GEP);
454
455 const auto &VarOffset = VarOffsets.front();
456 APInt OffsetQuot;
457 APInt::udivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
458 if (Rem != 0 || OffsetQuot.isZero())
459 return nullptr;
460
461 Value *Offset = VarOffset.first;
462 auto *OffsetType = dyn_cast<IntegerType>(Offset->getType());
463 if (!OffsetType)
464 return nullptr;
465
466 if (!OffsetQuot.isOne()) {
467 ConstantInt *ConstMul =
468 ConstantInt::get(OffsetType, OffsetQuot.getZExtValue());
469 Offset = Builder.CreateMul(Offset, ConstMul);
470 if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
471 NewInsts.push_back(NewInst);
472 }
473 if (ConstOffset.isZero())
474 return Offset;
475
476 ConstantInt *ConstIndex =
477 ConstantInt::get(OffsetType, IndexQuot.getZExtValue());
478 Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
479 if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
480 NewInsts.push_back(NewInst);
481 return IndexAdd;
482}
483
484/// Promotes a single user of the alloca to a vector form.
485///
486/// \param Inst Instruction to be promoted.
487/// \param DL Module Data Layout.
488/// \param VectorTy Vectorized Type.
489/// \param VecStoreSize Size of \p VectorTy in bytes.
490/// \param ElementSize Size of \p VectorTy element type in bytes.
491/// \param TransferInfo MemTransferInst info map.
492/// \param GEPVectorIdx GEP -> VectorIdx cache.
493/// \param CurVal Current value of the vector (e.g. last stored value)
494/// \param[out] DeferredLoads \p Inst is added to this vector if it can't
495/// be promoted now. This happens when promoting requires \p
496/// CurVal, but \p CurVal is nullptr.
497/// \return the stored value if \p Inst would have written to the alloca, or
498/// nullptr otherwise.
499static Value *promoteAllocaUserToVector(
 500 Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
 501 unsigned VecStoreSize, unsigned ElementSize,
 502 DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
503 std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal,
504 SmallVectorImpl<LoadInst *> &DeferredLoads) {
505 // Note: we use InstSimplifyFolder because it can leverage the DataLayout
506 // to do more folding, especially in the case of vector splats.
 507 IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
 508 InstSimplifyFolder(DL));
 509 Builder.SetInsertPoint(Inst);
510
511 const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
512 if (CurVal)
513 return CurVal;
514
515 // If the current value is not known, insert a dummy load and lower it on
516 // the second pass.
517 LoadInst *Dummy =
518 Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()),
519 "promotealloca.dummyload");
520 DeferredLoads.push_back(Dummy);
521 return Dummy;
522 };
523
524 const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
525 Type *PtrTy) -> Value * {
526 assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
527 const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
528 if (!PtrTy->isVectorTy())
529 return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
530 const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
 531 // If we want to cast, e.g., a <2 x ptr> into a <4 x i32>, we need to
532 // first cast the ptr vector to <2 x i64>.
 533 assert((Size % NumPtrElts == 0) && "Vector size not divisible");
534 Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
535 return Builder.CreateBitOrPointerCast(
536 Val, FixedVectorType::get(EltTy, NumPtrElts));
537 };
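 // Working through the example mentioned above (numbers are illustrative):
 // for a <2 x ptr> value of 64-bit flat pointers, Size = 128 and
 // NumPtrElts = 2, so EltTy = i64 and the pointers are first converted to
 // <2 x i64>; the caller is then free to bitcast that result to <4 x i32>.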
538
539 Type *VecEltTy = VectorTy->getElementType();
540
541 switch (Inst->getOpcode()) {
542 case Instruction::Load: {
543 // Loads can only be lowered if the value is known.
544 if (!CurVal) {
545 DeferredLoads.push_back(cast<LoadInst>(Inst));
546 return nullptr;
547 }
548
 549 Value *Index = calculateVectorIndex(
 550 cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
551
552 // We're loading the full vector.
553 Type *AccessTy = Inst->getType();
554 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
555 if (Constant *CI = dyn_cast<Constant>(Index)) {
556 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
557 if (AccessTy->isPtrOrPtrVectorTy())
558 CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
559 else if (CurVal->getType()->isPtrOrPtrVectorTy())
560 CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
561 Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
562 Inst->replaceAllUsesWith(NewVal);
563 return nullptr;
564 }
565 }
566
567 // Loading a subvector.
568 if (isa<FixedVectorType>(AccessTy)) {
569 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
570 const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
571 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
572 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
573
574 Value *SubVec = PoisonValue::get(SubVecTy);
575 for (unsigned K = 0; K < NumLoadedElts; ++K) {
576 Value *CurIdx =
577 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
578 SubVec = Builder.CreateInsertElement(
579 SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
580 }
581
582 if (AccessTy->isPtrOrPtrVectorTy())
583 SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
584 else if (SubVecTy->isPtrOrPtrVectorTy())
585 SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
586
587 SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
588 Inst->replaceAllUsesWith(SubVec);
589 return nullptr;
590 }
591
592 // We're loading one element.
593 Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
594 if (AccessTy != VecEltTy)
595 ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);
596
597 Inst->replaceAllUsesWith(ExtractElement);
598 return nullptr;
599 }
600 case Instruction::Store: {
601 // For stores, it's a bit trickier and it depends on whether we're storing
602 // the full vector or not. If we're storing the full vector, we don't need
603 // to know the current value. If this is a store of a single element, we
604 // need to know the value.
605 StoreInst *SI = cast<StoreInst>(Inst);
606 Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
607 Value *Val = SI->getValueOperand();
608
 609 // If we're storing the full vector, we can handle this without knowing CurVal.
610 Type *AccessTy = Val->getType();
611 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
612 if (Constant *CI = dyn_cast<Constant>(Index)) {
613 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
614 if (AccessTy->isPtrOrPtrVectorTy())
615 Val = CreateTempPtrIntCast(Val, AccessTy);
616 else if (VectorTy->isPtrOrPtrVectorTy())
617 Val = CreateTempPtrIntCast(Val, VectorTy);
618 return Builder.CreateBitOrPointerCast(Val, VectorTy);
619 }
620 }
621
622 // Storing a subvector.
623 if (isa<FixedVectorType>(AccessTy)) {
624 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
625 const unsigned NumWrittenElts =
626 AccessSize / DL.getTypeStoreSize(VecEltTy);
627 const unsigned NumVecElts = VectorTy->getNumElements();
628 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
629 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
630
631 if (SubVecTy->isPtrOrPtrVectorTy())
632 Val = CreateTempPtrIntCast(Val, SubVecTy);
633 else if (AccessTy->isPtrOrPtrVectorTy())
634 Val = CreateTempPtrIntCast(Val, AccessTy);
635
636 Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
637
638 Value *CurVec = GetOrLoadCurrentVectorValue();
639 for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
640 K < NumElts; ++K) {
641 Value *CurIdx =
642 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
643 CurVec = Builder.CreateInsertElement(
644 CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
645 }
646 return CurVec;
647 }
648
649 if (Val->getType() != VecEltTy)
650 Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
651 return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
652 Index);
653 }
654 case Instruction::Call: {
655 if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
656 // For memcpy, we need to know curval.
657 ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
658 unsigned NumCopied = Length->getZExtValue() / ElementSize;
659 MemTransferInfo *TI = &TransferInfo[MTI];
660 unsigned SrcBegin = TI->SrcIndex->getZExtValue();
661 unsigned DestBegin = TI->DestIndex->getZExtValue();
662
663 SmallVector<int> Mask;
664 for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
665 if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
666 Mask.push_back(SrcBegin < VectorTy->getNumElements()
667 ? SrcBegin++
 668 : PoisonMaskElem);
 669 } else {
670 Mask.push_back(Idx);
671 }
672 }
673
674 return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);
675 }
676
677 if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
678 // For memset, we don't need to know the previous value because we
679 // currently only allow memsets that cover the whole alloca.
680 Value *Elt = MSI->getOperand(1);
681 const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
682 if (BytesPerElt > 1) {
683 Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);
684
685 // If the element type of the vector is a pointer, we need to first cast
686 // to an integer, then use a PtrCast.
687 if (VecEltTy->isPointerTy()) {
688 Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
689 Elt = Builder.CreateBitCast(EltBytes, PtrInt);
690 Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
691 } else
692 Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
693 }
694
695 return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
696 }
697
698 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
699 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
700 Intr->replaceAllUsesWith(
701 Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
702 DL.getTypeAllocSize(VectorTy)));
703 return nullptr;
704 }
705 }
706
707 llvm_unreachable("Unsupported call when promoting alloca to vector");
708 }
709
710 default:
711 llvm_unreachable("Inconsistency in instructions promotable to vector");
712 }
713
714 llvm_unreachable("Did not return after promoting instruction!");
715}
716
717static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
718 const DataLayout &DL) {
719 // Access as a vector type can work if the size of the access vector is a
720 // multiple of the size of the alloca's vector element type.
721 //
722 // Examples:
723 // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
724 // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
725 // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
726 // - 3*32 is not a multiple of 64
727 //
728 // We could handle more complicated cases, but it'd make things a lot more
729 // complicated.
730 if (isa<FixedVectorType>(AccessTy)) {
731 TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
732 // If the type size and the store size don't match, we would need to do more
 733 // than just bitcast to translate between extracted/insertable subvectors
734 // and the accessed value.
735 if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
736 return false;
737 TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
738 return AccTS.isKnownMultipleOf(VecTS);
739 }
740
 741 return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy,
 742 DL);
743}
744
745/// Iterates over an instruction worklist that may contain multiple instructions
746/// from the same basic block, possibly out of program order.
747template <typename InstContainer>
748static void forEachWorkListItem(const InstContainer &WorkList,
749 std::function<void(Instruction *)> Fn) {
750 // Bucket up uses of the alloca by the block they occur in.
751 // This is important because we have to handle multiple defs/uses in a block
752 // ourselves: SSAUpdater is purely for cross-block references.
 753 SmallDenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
 754 for (Instruction *User : WorkList)
755 UsesByBlock[User->getParent()].insert(User);
756
757 for (Instruction *User : WorkList) {
758 BasicBlock *BB = User->getParent();
759 auto &BlockUses = UsesByBlock[BB];
760
761 // Already processed, skip.
762 if (BlockUses.empty())
763 continue;
764
765 // Only user in the block, directly process it.
766 if (BlockUses.size() == 1) {
767 Fn(User);
768 continue;
769 }
770
771 // Multiple users in the block, do a linear scan to see users in order.
772 for (Instruction &Inst : *BB) {
773 if (!BlockUses.contains(&Inst))
774 continue;
775
776 Fn(&Inst);
777 }
778
779 // Clear the block so we know it's been processed.
780 BlockUses.clear();
781 }
782}
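// Illustrative scenario for the guarantee above (hypothetical): if the
// worklist happens to contain two stores from the same block in reverse
// order, the linear scan still replays them in program order, so the value
// the caller records for that block reflects the later store, which is what
// SSAUpdater needs to see.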
783
784/// Find an insert point after an alloca, after all other allocas clustered at
785/// the start of the block.
786static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
787 BasicBlock::iterator I) {
 788 for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(*I); ++I)
789 ;
790 return I;
791}
792
793// FIXME: Should try to pick the most likely to be profitable allocas first.
794bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
795 LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
796
797 if (DisablePromoteAllocaToVector) {
798 LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n");
799 return false;
800 }
801
802 Type *AllocaTy = Alloca.getAllocatedType();
803 auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
804 if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
805 uint64_t NumElems = 1;
806 Type *ElemTy;
807 do {
808 NumElems *= ArrayTy->getNumElements();
809 ElemTy = ArrayTy->getElementType();
810 } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
811
812 // Check for array of vectors
813 auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
814 if (InnerVectorTy) {
815 NumElems *= InnerVectorTy->getNumElements();
816 ElemTy = InnerVectorTy->getElementType();
817 }
818
819 if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
820 unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
821 if (ElementSize > 0) {
822 unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
823 // Expand vector if required to match padding of inner type,
824 // i.e. odd size subvectors.
825 // Storage size of new vector must match that of alloca for correct
826 // behaviour of byte offsets and GEP computation.
827 if (NumElems * ElementSize != AllocaSize)
828 NumElems = AllocaSize / ElementSize;
829 if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
830 VectorTy = FixedVectorType::get(ElemTy, NumElems);
831 }
832 }
833 }
834
835 if (!VectorTy) {
836 LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
837 return false;
838 }
839
840 const unsigned MaxElements =
841 (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
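 // Illustrative arithmetic (hypothetical values): with the default limit of
 // 32 registers, an alloca of i32 elements may become at most a
 // 32 * 32 / 32 = 32 element vector, while one of double elements is capped
 // at 32 * 32 / 64 = 16 elements.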
842
843 if (VectorTy->getNumElements() > MaxElements ||
844 VectorTy->getNumElements() < 2) {
845 LLVM_DEBUG(dbgs() << " " << *VectorTy
846 << " has an unsupported number of elements\n");
847 return false;
848 }
849
850 std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx;
 851 SmallVector<Instruction *> WorkList;
 852 SmallVector<Instruction *> UsersToRemove;
853 SmallVector<Instruction *> DeferredInsts;
854 SmallVector<Instruction *> NewGEPInsts;
 855 DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
 856
857 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
858 LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
859 << " " << *Inst << "\n");
860 for (auto *Inst : reverse(NewGEPInsts))
861 Inst->eraseFromParent();
862 return false;
863 };
864
 865 SmallVector<Use *, 8> Uses;
 866 collectAllocaUses(Alloca, Uses);
867
868 LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");
869
870 Type *VecEltTy = VectorTy->getElementType();
871 unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
872 if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
873 LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
874 "does not match the type's size\n");
875 return false;
876 }
877 unsigned ElementSize = ElementSizeInBits / 8;
878 assert(ElementSize > 0);
879 for (auto *U : Uses) {
880 Instruction *Inst = cast<Instruction>(U->getUser());
881
882 if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
883 // This is a store of the pointer, not to the pointer.
884 if (isa<StoreInst>(Inst) &&
885 U->getOperandNo() != StoreInst::getPointerOperandIndex())
886 return RejectUser(Inst, "pointer is being stored");
887
888 Type *AccessTy = getLoadStoreType(Inst);
889 if (AccessTy->isAggregateType())
890 return RejectUser(Inst, "unsupported load/store as aggregate");
891 assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
892
893 // Check that this is a simple access of a vector element.
894 bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
895 : cast<StoreInst>(Inst)->isSimple();
896 if (!IsSimple)
897 return RejectUser(Inst, "not a simple load or store");
898
899 Ptr = Ptr->stripPointerCasts();
900
901 // Alloca already accessed as vector.
902 if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
903 DL->getTypeStoreSize(AccessTy)) {
904 WorkList.push_back(Inst);
905 continue;
906 }
907
908 if (!isSupportedAccessType(VectorTy, AccessTy, *DL))
909 return RejectUser(Inst, "not a supported access type");
910
911 WorkList.push_back(Inst);
912 continue;
913 }
914
915 if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
916 // If we can't compute a vector index from this GEP, then we can't
917 // promote this alloca to vector.
918 Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
919 if (!Index)
920 return RejectUser(Inst, "cannot compute vector index for GEP");
921
922 GEPVectorIdx[GEP] = Index;
923 UsersToRemove.push_back(Inst);
924 continue;
925 }
926
927 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
928 MSI && isSupportedMemset(MSI, &Alloca, *DL)) {
929 WorkList.push_back(Inst);
930 continue;
931 }
932
933 if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
934 if (TransferInst->isVolatile())
935 return RejectUser(Inst, "mem transfer inst is volatile");
936
937 ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
938 if (!Len || (Len->getZExtValue() % ElementSize))
939 return RejectUser(Inst, "mem transfer inst length is non-constant or "
940 "not a multiple of the vector element size");
941
942 if (TransferInfo.try_emplace(TransferInst).second) {
943 DeferredInsts.push_back(Inst);
944 WorkList.push_back(Inst);
945 }
946
947 auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * {
948 GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
949 if (Ptr != &Alloca && !GEPVectorIdx.count(GEP))
950 return nullptr;
951
952 return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx));
953 };
954
955 unsigned OpNum = U->getOperandNo();
956 MemTransferInfo *TI = &TransferInfo[TransferInst];
957 if (OpNum == 0) {
958 Value *Dest = TransferInst->getDest();
959 ConstantInt *Index = getPointerIndexOfAlloca(Dest);
960 if (!Index)
961 return RejectUser(Inst, "could not calculate constant dest index");
962 TI->DestIndex = Index;
963 } else {
964 assert(OpNum == 1);
965 Value *Src = TransferInst->getSource();
966 ConstantInt *Index = getPointerIndexOfAlloca(Src);
967 if (!Index)
968 return RejectUser(Inst, "could not calculate constant src index");
969 TI->SrcIndex = Index;
970 }
971 continue;
972 }
973
974 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
975 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
976 WorkList.push_back(Inst);
977 continue;
978 }
979 }
980
981 // Ignore assume-like intrinsics and comparisons used in assumes.
982 if (isAssumeLikeIntrinsic(Inst)) {
983 if (!Inst->use_empty())
984 return RejectUser(Inst, "assume-like intrinsic cannot have any users");
985 UsersToRemove.push_back(Inst);
986 continue;
987 }
988
989 if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
990 return isAssumeLikeIntrinsic(cast<Instruction>(U));
991 })) {
992 UsersToRemove.push_back(Inst);
993 continue;
994 }
995
996 return RejectUser(Inst, "unhandled alloca user");
997 }
998
999 while (!DeferredInsts.empty()) {
1000 Instruction *Inst = DeferredInsts.pop_back_val();
1001 MemTransferInst *TransferInst = cast<MemTransferInst>(Inst);
1002 // TODO: Support the case if the pointers are from different alloca or
1003 // from different address spaces.
1004 MemTransferInfo &Info = TransferInfo[TransferInst];
1005 if (!Info.SrcIndex || !Info.DestIndex)
1006 return RejectUser(
1007 Inst, "mem transfer inst is missing constant src and/or dst index");
1008 }
1009
1010 LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
1011 << *VectorTy << '\n');
1012 const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy);
1013
1014 // Alloca is uninitialized memory. Imitate that by making the first value
1015 // undef.
1016 SSAUpdater Updater;
1017 Updater.Initialize(VectorTy, "promotealloca");
1018
1019 BasicBlock *EntryBB = Alloca.getParent();
1020 BasicBlock::iterator InitInsertPos =
1021 skipToNonAllocaInsertPt(*EntryBB, Alloca.getIterator());
1022 // Alloca memory is undefined to begin, not poison.
1023 Value *AllocaInitValue =
1024 new FreezeInst(PoisonValue::get(VectorTy), "", InitInsertPos);
1025 AllocaInitValue->takeName(&Alloca);
1026
1027 Updater.AddAvailableValue(EntryBB, AllocaInitValue);
1028
1029 // First handle the initial worklist.
1030 SmallVector<LoadInst *, 4> DeferredLoads;
1031 forEachWorkListItem(WorkList, [&](Instruction *I) {
1032 BasicBlock *BB = I->getParent();
1033 // On the first pass, we only take values that are trivially known, i.e.
1034 // where AddAvailableValue was already called in this block.
 1035 Value *Result = promoteAllocaUserToVector(
 1036 I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
1037 Updater.FindValueForBlock(BB), DeferredLoads);
1038 if (Result)
1039 Updater.AddAvailableValue(BB, Result);
1040 });
1041
1042 // Then handle deferred loads.
1043 forEachWorkListItem(DeferredLoads, [&](Instruction *I) {
 1044 SmallVector<LoadInst *, 0> NewDLs;
 1045 BasicBlock *BB = I->getParent();
1046 // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
1047 // get a value, inserting PHIs as needed.
 1048 Value *Result = promoteAllocaUserToVector(
 1049 I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
1050 Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs);
1051 if (Result)
1052 Updater.AddAvailableValue(BB, Result);
1053 assert(NewDLs.empty() && "No more deferred loads should be queued!");
1054 });
1055
1056 // Delete all instructions. On the first pass, new dummy loads may have been
1057 // added so we need to collect them too.
1058 DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
1059 InstsToDelete.insert_range(DeferredLoads);
1060 for (Instruction *I : InstsToDelete) {
1061 assert(I->use_empty());
1062 I->eraseFromParent();
1063 }
1064
1065 // Delete all the users that are known to be removeable.
1066 for (Instruction *I : reverse(UsersToRemove)) {
1067 I->dropDroppableUses();
1068 assert(I->use_empty());
1069 I->eraseFromParent();
1070 }
1071
1072 // Alloca should now be dead too.
1073 assert(Alloca.use_empty());
1074 Alloca.eraseFromParent();
1075 return true;
1076}
1077
1078std::pair<Value *, Value *>
1079AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
1080 Function &F = *Builder.GetInsertBlock()->getParent();
 1081 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
 1082
1083 if (!IsAMDHSA) {
1084 CallInst *LocalSizeY =
1085 Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {});
1086 CallInst *LocalSizeZ =
1087 Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {});
1088
1089 ST.makeLIDRangeMetadata(LocalSizeY);
1090 ST.makeLIDRangeMetadata(LocalSizeZ);
1091
1092 return std::pair(LocalSizeY, LocalSizeZ);
1093 }
1094
1095 // We must read the size out of the dispatch pointer.
1096 assert(IsAMDGCN);
1097
1098 // We are indexing into this struct, and want to extract the workgroup_size_*
1099 // fields.
1100 //
1101 // typedef struct hsa_kernel_dispatch_packet_s {
1102 // uint16_t header;
1103 // uint16_t setup;
1104 // uint16_t workgroup_size_x ;
1105 // uint16_t workgroup_size_y;
1106 // uint16_t workgroup_size_z;
1107 // uint16_t reserved0;
1108 // uint32_t grid_size_x ;
1109 // uint32_t grid_size_y ;
1110 // uint32_t grid_size_z;
1111 //
1112 // uint32_t private_segment_size;
1113 // uint32_t group_segment_size;
1114 // uint64_t kernel_object;
1115 //
1116 // #ifdef HSA_LARGE_MODEL
1117 // void *kernarg_address;
1118 // #elif defined HSA_LITTLE_ENDIAN
1119 // void *kernarg_address;
1120 // uint32_t reserved1;
1121 // #else
1122 // uint32_t reserved1;
1123 // void *kernarg_address;
1124 // #endif
1125 // uint64_t reserved2;
1126 // hsa_signal_t completion_signal; // uint64_t wrapper
1127 // } hsa_kernel_dispatch_packet_t
1128 //
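 // Viewing the packet as i32 words (offsets follow from the struct layout
 // above; the concrete numbers are only spelled out for illustration):
 //   word 1 = bytes 4..7  -> workgroup_size_x (low 16) | workgroup_size_y (high 16)
 //   word 2 = bytes 8..11 -> workgroup_size_z (low 16) | reserved0 (high 16)
 // which is why the code below loads the i32s at indices 1 and 2 and shifts
 // the first load right by 16 to recover the Y size.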
1129 CallInst *DispatchPtr =
1130 Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {});
1131 DispatchPtr->addRetAttr(Attribute::NoAlias);
1132 DispatchPtr->addRetAttr(Attribute::NonNull);
1133 F.removeFnAttr("amdgpu-no-dispatch-ptr");
1134
1135 // Size of the dispatch packet struct.
1136 DispatchPtr->addDereferenceableRetAttr(64);
1137
1138 Type *I32Ty = Type::getInt32Ty(Mod->getContext());
1139
1140 // We could do a single 64-bit load here, but it's likely that the basic
1141 // 32-bit and extract sequence is already present, and it is probably easier
1142 // to CSE this. The loads should be mergeable later anyway.
1143 Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1);
1144 LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
1145
1146 Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2);
1147 LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
1148
1149 MDNode *MD = MDNode::get(Mod->getContext(), {});
1150 LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
1151 LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
1152 ST.makeLIDRangeMetadata(LoadZU);
1153
1154 // Extract y component. Upper half of LoadZU should be zero already.
1155 Value *Y = Builder.CreateLShr(LoadXY, 16);
1156
1157 return std::pair(Y, LoadZU);
1158}
1159
1160Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1161 unsigned N) {
1162 Function *F = Builder.GetInsertBlock()->getParent();
 1163 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
 1164 Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
 1165 StringRef AttrName;
1166
1167 switch (N) {
1168 case 0:
1169 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1170 : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1171 AttrName = "amdgpu-no-workitem-id-x";
1172 break;
1173 case 1:
1174 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1175 : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1176 AttrName = "amdgpu-no-workitem-id-y";
1177 break;
1178
1179 case 2:
1180 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1181 : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1182 AttrName = "amdgpu-no-workitem-id-z";
1183 break;
1184 default:
1185 llvm_unreachable("invalid dimension");
1186 }
1187
1188 Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID);
1189 CallInst *CI = Builder.CreateCall(WorkitemIdFn);
1190 ST.makeLIDRangeMetadata(CI);
1191 F->removeFnAttr(AttrName);
1192
1193 return CI;
1194}
1195
1196static bool isCallPromotable(CallInst *CI) {
1197 IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
1198 if (!II)
1199 return false;
1200
1201 switch (II->getIntrinsicID()) {
1202 case Intrinsic::memcpy:
1203 case Intrinsic::memmove:
1204 case Intrinsic::memset:
1205 case Intrinsic::lifetime_start:
1206 case Intrinsic::lifetime_end:
1207 case Intrinsic::invariant_start:
1208 case Intrinsic::invariant_end:
1209 case Intrinsic::launder_invariant_group:
1210 case Intrinsic::strip_invariant_group:
1211 case Intrinsic::objectsize:
1212 return true;
1213 default:
1214 return false;
1215 }
1216}
1217
1218bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1219 Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
1220 int OpIdx1) const {
1221 // Figure out which operand is the one we might not be promoting.
1222 Value *OtherOp = Inst->getOperand(OpIdx0);
1223 if (Val == OtherOp)
1224 OtherOp = Inst->getOperand(OpIdx1);
1225
1226 if (isa<ConstantPointerNull, ConstantAggregateZero>(OtherOp))
1227 return true;
1228
1229 // TODO: getUnderlyingObject will not work on a vector getelementptr
1230 Value *OtherObj = getUnderlyingObject(OtherOp);
1231 if (!isa<AllocaInst>(OtherObj))
1232 return false;
1233
1234 // TODO: We should be able to replace undefs with the right pointer type.
1235
1236 // TODO: If we know the other base object is another promotable
1237 // alloca, not necessarily this alloca, we can do this. The
1238 // important part is both must have the same address space at
1239 // the end.
1240 if (OtherObj != BaseAlloca) {
1241 LLVM_DEBUG(
1242 dbgs() << "Found a binary instruction with another alloca object\n");
1243 return false;
1244 }
1245
1246 return true;
1247}
1248
1249bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
1250 Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {
1251
1252 for (User *User : Val->users()) {
1253 if (is_contained(WorkList, User))
1254 continue;
1255
1256 if (CallInst *CI = dyn_cast<CallInst>(User)) {
1257 if (!isCallPromotable(CI))
1258 return false;
1259
1260 WorkList.push_back(User);
1261 continue;
1262 }
1263
1264 Instruction *UseInst = cast<Instruction>(User);
1265 if (UseInst->getOpcode() == Instruction::PtrToInt)
1266 return false;
1267
1268 if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
1269 if (LI->isVolatile())
1270 return false;
1271 continue;
1272 }
1273
1274 if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
1275 if (SI->isVolatile())
1276 return false;
1277
1278 // Reject if the stored value is not the pointer operand.
1279 if (SI->getPointerOperand() != Val)
1280 return false;
1281 continue;
1282 }
1283
1284 if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
1285 if (RMW->isVolatile())
1286 return false;
1287 continue;
1288 }
1289
1290 if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
1291 if (CAS->isVolatile())
1292 return false;
1293 continue;
1294 }
1295
1296 // Only promote a select if we know that the other select operand
1297 // is from another pointer that will also be promoted.
1298 if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
1299 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
1300 return false;
1301
1302 // May need to rewrite constant operands.
1303 WorkList.push_back(ICmp);
1304 continue;
1305 }
1306
1307 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
1308 // Be conservative if an address could be computed outside the bounds of
1309 // the alloca.
1310 if (!GEP->isInBounds())
1311 return false;
1312 } else if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
1313 // Only promote a select if we know that the other select operand is from
1314 // another pointer that will also be promoted.
1315 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
1316 return false;
1317 } else if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
1318 // Repeat for phis.
1319
1320 // TODO: Handle more complex cases. We should be able to replace loops
1321 // over arrays.
1322 switch (Phi->getNumIncomingValues()) {
1323 case 1:
1324 break;
1325 case 2:
1326 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
1327 return false;
1328 break;
1329 default:
1330 return false;
1331 }
1332 } else if (!isa<ExtractElementInst>(User)) {
1333 // Do not promote vector/aggregate type instructions. It is hard to track
1334 // their users.
1335
1336 // Do not promote addrspacecast.
1337 //
1338 // TODO: If we know the address is only observed through flat pointers, we
1339 // could still promote.
1340 return false;
1341 }
1342
1343 WorkList.push_back(User);
1344 if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
1345 return false;
1346 }
1347
1348 return true;
1349}
1350
1351bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
1352
1353 FunctionType *FTy = F.getFunctionType();
 1354 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
 1355
1356 // If the function has any arguments in the local address space, then it's
1357 // possible these arguments require the entire local memory space, so
1358 // we cannot use local memory in the pass.
1359 for (Type *ParamTy : FTy->params()) {
1360 PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
1361 if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1362 LocalMemLimit = 0;
1363 LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
1364 "local memory disabled.\n");
1365 return false;
1366 }
1367 }
1368
1369 LocalMemLimit = ST.getAddressableLocalMemorySize();
1370 if (LocalMemLimit == 0)
1371 return false;
1372
 1373 SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
 1374 SmallPtrSet<const Constant *, 8> VisitedConstants;
 1375 SmallVector<const Constant *, 8> Stack;
 1376
1377 auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
1378 for (const User *U : Val->users()) {
1379 if (const Instruction *Use = dyn_cast<Instruction>(U)) {
1380 if (Use->getParent()->getParent() == &F)
1381 return true;
1382 } else {
1383 const Constant *C = cast<Constant>(U);
1384 if (VisitedConstants.insert(C).second)
1385 Stack.push_back(C);
1386 }
1387 }
1388
1389 return false;
1390 };
1391
1392 for (GlobalVariable &GV : Mod->globals()) {
 1393 if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
 1394 continue;
1395
1396 if (visitUsers(&GV, &GV)) {
1397 UsedLDS.insert(&GV);
1398 Stack.clear();
1399 continue;
1400 }
1401
1402 // For any ConstantExpr uses, we need to recursively search the users until
1403 // we see a function.
1404 while (!Stack.empty()) {
1405 const Constant *C = Stack.pop_back_val();
1406 if (visitUsers(&GV, C)) {
1407 UsedLDS.insert(&GV);
1408 Stack.clear();
1409 break;
1410 }
1411 }
1412 }
1413
1414 const DataLayout &DL = Mod->getDataLayout();
1415 SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
1416 AllocatedSizes.reserve(UsedLDS.size());
1417
1418 for (const GlobalVariable *GV : UsedLDS) {
1419 Align Alignment =
1420 DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
1421 uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
1422
1423 // HIP uses an extern unsized array in local address space for dynamically
1424 // allocated shared memory. In that case, we have to disable the promotion.
1425 if (GV->hasExternalLinkage() && AllocSize == 0) {
1426 LocalMemLimit = 0;
1427 LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
1428 "local memory. Promoting to local memory "
1429 "disabled.\n");
1430 return false;
1431 }
1432
1433 AllocatedSizes.emplace_back(AllocSize, Alignment);
1434 }
1435
1436 // Sort to try to estimate the worst case alignment padding
1437 //
1438 // FIXME: We should really do something to fix the addresses to a more optimal
1439 // value instead
1440 llvm::sort(AllocatedSizes, llvm::less_second());
1441
1442 // Check how much local memory is being used by global objects
1443 CurrentLocalMemUsage = 0;
1444
 1445 // FIXME: Try to account for padding here. The real padding and address are
1446 // currently determined from the inverse order of uses in the function when
1447 // legalizing, which could also potentially change. We try to estimate the
1448 // worst case here, but we probably should fix the addresses earlier.
1449 for (auto Alloc : AllocatedSizes) {
1450 CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
1451 CurrentLocalMemUsage += Alloc.first;
1452 }
1453
1454 unsigned MaxOccupancy =
1455 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
1456 .second;
1457
1458 // Round up to the next tier of usage.
1459 unsigned MaxSizeWithWaveCount =
1460 ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
1461
1462 // Program may already use more LDS than is usable at maximum occupancy.
1463 if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1464 return false;
1465
1466 LocalMemLimit = MaxSizeWithWaveCount;
1467
1468 LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
1469 << " bytes of LDS\n"
1470 << " Rounding size to " << MaxSizeWithWaveCount
1471 << " with a maximum occupancy of " << MaxOccupancy << '\n'
1472 << " and " << (LocalMemLimit - CurrentLocalMemUsage)
1473 << " available for promotion\n");
1474
1475 return true;
1476}
1477
1478// FIXME: Should try to pick the most likely to be profitable allocas first.
1479bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
1480 bool SufficientLDS) {
1481 LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n');
1482
1483 if (DisablePromoteAllocaToLDS) {
1484 LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
1485 return false;
1486 }
1487
1488 const DataLayout &DL = Mod->getDataLayout();
1489 IRBuilder<> Builder(&I);
1490
1491 const Function &ContainingFunction = *I.getParent()->getParent();
1492 CallingConv::ID CC = ContainingFunction.getCallingConv();
1493
1494 // Don't promote the alloca to LDS for shader calling conventions as the work
1495 // item ID intrinsics are not supported for these calling conventions.
1496 // Furthermore not all LDS is available for some of the stages.
1497 switch (CC) {
 1498 case CallingConv::AMDGPU_KERNEL:
 1499 case CallingConv::SPIR_KERNEL:
 1500 break;
1501 default:
1502 LLVM_DEBUG(
1503 dbgs()
1504 << " promote alloca to LDS not supported with calling convention.\n");
1505 return false;
1506 }
1507
1508 // Not likely to have sufficient local memory for promotion.
1509 if (!SufficientLDS)
1510 return false;
1511
1512 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
1513 unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
1514
1515 Align Alignment =
1516 DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
1517
1518 // FIXME: This computed padding is likely wrong since it depends on inverse
1519 // usage order.
1520 //
 1521 // FIXME: It is also possible that if we're allowed to use all of the memory, we
 1522 // could end up using more than the maximum due to alignment padding.
1523
1524 uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
1525 uint32_t AllocSize =
1526 WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType());
1527 NewSize += AllocSize;
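 // Illustrative arithmetic (hypothetical kernel): promoting a 16-byte alloca
 // with a maximum flat workgroup size of 256 reserves 256 * 16 = 4096 bytes
 // of LDS on top of the aligned usage accumulated so far, since every
 // workitem needs its own copy of the former private memory.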
1528
1529 if (NewSize > LocalMemLimit) {
1530 LLVM_DEBUG(dbgs() << " " << AllocSize
1531 << " bytes of local memory not available to promote\n");
1532 return false;
1533 }
1534
1535 CurrentLocalMemUsage = NewSize;
1536
1537 std::vector<Value *> WorkList;
1538
1539 if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
1540 LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
1541 return false;
1542 }
1543
1544 LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1545
1546 Function *F = I.getParent()->getParent();
1547
1548 Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
 1549 GlobalVariable *GV = new GlobalVariable(
 1550 *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
 1551 Twine(F->getName()) + Twine('.') + I.getName(), nullptr,
 1552 GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
 1553 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
 1554 GV->setAlignment(I.getAlign());
1555
1556 Value *TCntY, *TCntZ;
1557
1558 std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
1559 Value *TIdX = getWorkitemID(Builder, 0);
1560 Value *TIdY = getWorkitemID(Builder, 1);
1561 Value *TIdZ = getWorkitemID(Builder, 2);
1562
1563 Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
1564 Tmp0 = Builder.CreateMul(Tmp0, TIdX);
1565 Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
1566 Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
1567 TID = Builder.CreateAdd(TID, TIdZ);
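 // The multiplies and adds above compute the flattened workitem index within
 // the workgroup, i.e. TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ,
 // which selects a disjoint slot of the shared array for each workitem. As a
 // purely illustrative example, in a 4x2x2 workgroup the workitem (1, 1, 0)
 // gets TID = 1 * 4 + 1 * 2 + 0 = 6.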
1568
1569 LLVMContext &Context = Mod->getContext();
1570 Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};
1571
1572 Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
1573 I.mutateType(Offset->getType());
1574 I.replaceAllUsesWith(Offset);
1575 I.eraseFromParent();
1576
1577 SmallVector<IntrinsicInst *> DeferredIntrs;
1578
 1579 PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
 1580
1581 for (Value *V : WorkList) {
1582 CallInst *Call = dyn_cast<CallInst>(V);
1583 if (!Call) {
1584 if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
1585 Value *LHS = CI->getOperand(0);
1586 Value *RHS = CI->getOperand(1);
1587
1588 Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
1589 if (isa<ConstantPointerNull, ConstantAggregateZero>(LHS))
1590 CI->setOperand(0, Constant::getNullValue(NewTy));
1591
1592 if (isa<ConstantPointerNull, ConstantAggregateZero>(RHS))
1593 CI->setOperand(1, Constant::getNullValue(NewTy));
1594
1595 continue;
1596 }
1597
1598 // The operand's value should be corrected on its own and we don't want to
1599 // touch the users.
1600 if (isa<AddrSpaceCastInst>(V))
1601 continue;
1602
1603 assert(V->getType()->isPtrOrPtrVectorTy());
1604
1605 Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
1606 V->mutateType(NewTy);
1607
1608 // Adjust the types of any constant operands.
1609 if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
1610 if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(1)))
1611 SI->setOperand(1, Constant::getNullValue(NewTy));
1612
1613 if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(2)))
1614 SI->setOperand(2, Constant::getNullValue(NewTy));
1615 } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
1616 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
1617 if (isa<ConstantPointerNull, ConstantAggregateZero>(
1618 Phi->getIncomingValue(I)))
1619 Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
1620 }
1621 }
1622
1623 continue;
1624 }
1625
1626 IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
1627 Builder.SetInsertPoint(Intr);
1628 switch (Intr->getIntrinsicID()) {
1629 case Intrinsic::lifetime_start:
1630 case Intrinsic::lifetime_end:
1631 // These intrinsics are for address space 0 only
1632 Intr->eraseFromParent();
1633 continue;
1634 case Intrinsic::memcpy:
1635 case Intrinsic::memmove:
 1636 // These have 2 pointer operands. In case the second pointer also needs
 1637 // to be replaced, we defer processing of these intrinsics until all
1638 // other values are processed.
1639 DeferredIntrs.push_back(Intr);
1640 continue;
1641 case Intrinsic::memset: {
1642 MemSetInst *MemSet = cast<MemSetInst>(Intr);
1643 Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
1644 MemSet->getLength(), MemSet->getDestAlign(),
1645 MemSet->isVolatile());
1646 Intr->eraseFromParent();
1647 continue;
1648 }
1649 case Intrinsic::invariant_start:
1650 case Intrinsic::invariant_end:
1651 case Intrinsic::launder_invariant_group:
1652 case Intrinsic::strip_invariant_group: {
 1653 SmallVector<Value *> Args;
 1654 if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
1655 Args.emplace_back(Intr->getArgOperand(0));
1656 } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
1657 Args.emplace_back(Intr->getArgOperand(0));
1658 Args.emplace_back(Intr->getArgOperand(1));
1659 }
1660 Args.emplace_back(Offset);
1661 Function *F = Intrinsic::getOrInsertDeclaration(
1662 Intr->getModule(), Intr->getIntrinsicID(), Offset->getType());
1663 CallInst *NewIntr =
1664 CallInst::Create(F, Args, Intr->getName(), Intr->getIterator());
1665 Intr->mutateType(NewIntr->getType());
1666 Intr->replaceAllUsesWith(NewIntr);
1667 Intr->eraseFromParent();
1668 continue;
1669 }
1670 case Intrinsic::objectsize: {
1671 Value *Src = Intr->getOperand(0);
1672
1673 CallInst *NewCall = Builder.CreateIntrinsic(
1674 Intrinsic::objectsize,
1675 {Intr->getType(), PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)},
1676 {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
1677 Intr->replaceAllUsesWith(NewCall);
1678 Intr->eraseFromParent();
1679 continue;
1680 }
1681 default:
1682 Intr->print(errs());
1683 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1684 }
1685 }
1686
1687 for (IntrinsicInst *Intr : DeferredIntrs) {
1688 Builder.SetInsertPoint(Intr);
1689 Intrinsic::ID ID = Intr->getIntrinsicID();
1690 assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1691
1692 MemTransferInst *MI = cast<MemTransferInst>(Intr);
1693 auto *B = Builder.CreateMemTransferInst(
1694 ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
1695 MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
1696
1697 for (unsigned I = 0; I != 2; ++I) {
1698 if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
1699 B->addDereferenceableParamAttr(I, Bytes);
1700 }
1701 }
1702
1703 Intr->eraseFromParent();
1704 }
1705
1706 return true;
1707}
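For reference, the arithmetic built above with CreateMul/CreateAdd linearizes the three workitem IDs into a single per-workgroup slot index, which then drives the in-bounds GEP on the new LDS global. Below is a minimal standalone C++ sketch of that indexing; the helper name and the sample workgroup size are illustrative assumptions, not part of this pass.

// Hypothetical sketch of the slot computation performed by the LDS path:
// TID = (TCntY * TCntZ) * TIdX + TCntZ * TIdY + TIdZ, so each workitem in a
// workgroup addresses its own element of the promoted per-workgroup array.
#include <cstddef>

constexpr std::size_t linearWorkitemID(std::size_t TIdX, std::size_t TIdY,
                                       std::size_t TIdZ, std::size_t TCntY,
                                       std::size_t TCntZ) {
  return (TCntY * TCntZ) * TIdX + TCntZ * TIdY + TIdZ;
}

// Example: workitem (1, 2, 3) in a workgroup with TCntY = 8 and TCntZ = 16
// occupies slot (8 * 16) * 1 + 16 * 2 + 3 = 163 of the promoted alloca.
static_assert(linearWorkitemID(1, 2, 3, 8, 16) == 163, "slot index");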