40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
48#define DEBUG_TYPE "amdgpu-promote-alloca"
static cl::opt<bool> DisablePromoteAllocaToVector(
    "disable-promote-alloca-to-vector",
    cl::desc("Disable promote alloca to vector"));

static cl::opt<bool> DisablePromoteAllocaToLDS(
    "disable-promote-alloca-to-lds",
    cl::desc("Disable promote alloca to LDS"));

static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"));

static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
    "amdgpu-promote-alloca-to-vector-max-regs",
    cl::desc(
        "Maximum vector size (in 32b registers) to use when promoting alloca"));

static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
    "amdgpu-promote-alloca-to-vector-vgpr-ratio",
    cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"));

static cl::opt<unsigned> LoopUserWeight(
    "promote-alloca-vector-loop-user-weight",
    cl::desc("The bonus weight of users of allocas within loop "
             "when sorting profitable allocas"));
class AMDGPUPromoteAllocaImpl {
private:
  const TargetMachine &TM;
  LoopInfo &LI;
  // ... (Mod, DL, CurrentLocalMemUsage, LocalMemLimit, MaxVGPRs elided) ...
  unsigned VGPRBudgetRatio;
  unsigned MaxVectorRegs;

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
                               std::vector<Value *> &WorkList) const;

  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *Use, int OpIdx0,
                                       int OpIdx1) const;

  // Check whether we have enough local memory for promotion.
  bool hasSufficientLocalMem(const Function &F);

  bool tryPromoteAllocaToVector(AllocaInst &I);
  bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);

  void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
  void setFunctionLimits(const Function &F);

public:
  AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
    const Triple &TT = TM.getTargetTriple();
    IsAMDGCN = TT.isAMDGCN();
  }

  bool run(Function &F, bool PromoteToLDS);
};
  bool runOnFunction(Function &F) override {
    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
      return AMDGPUPromoteAllocaImpl(
                 TPC->getTM<TargetMachine>(),
                 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
          .run(F, /*PromoteToLDS=*/true);
    return false;
  }
static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
                            const Function &F) {
  if (!TM.getTargetTriple().isAMDGCN())
    return 128;

  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is fully removed.
  if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
    DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();

  unsigned MaxVGPRs = ST.getMaxNumVGPRs(
      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
      DynamicVGPRBlockSize);

  // A non-entry function has only 32 caller-preserved registers. Do not
  // promote an alloca that would force spilling unless we know the function
  // will be inlined.
  if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
      !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    MaxVGPRs = std::min(MaxVGPRs, 32u);

  return MaxVGPRs;
}
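// Occupancy interaction, in brief: getMaxNumVGPRs() is queried at the lower
// bound of the waves-per-EU range for this function, and the fewer waves that
// must be resident per EU, the more VGPRs each wave may use. Heavy LDS usage
// (LDSBytes) thus indirectly loosens the VGPR ceiling used for the budget.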
char AMDGPUPromoteAlloca::ID = 0;

INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)
PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

FunctionPass *createAMDGPUPromoteAlloca() { return new AMDGPUPromoteAlloca(); }
static void collectAllocaUses(AllocaInst &Alloca,
                              SmallVectorImpl<Use *> &Uses) {
  SmallVector<Instruction *, 4> WorkList({&Alloca});
  while (!WorkList.empty()) {
    auto *Cur = WorkList.pop_back_val();
    for (auto &U : Cur->uses()) {
      Uses.push_back(&U);
      // GEPs of the alloca are themselves walked, so their users are
      // collected transitively.
      if (isa<GetElementPtrInst>(U.getUser()))
        WorkList.push_back(cast<Instruction>(U.getUser()));
    }
  }
}
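// Illustrative IR: for
//   %a = alloca [4 x float]
//   %p = getelementptr [4 x float], ptr %a, i32 0, i32 1
//   store float 1.0, ptr %p
// both the GEP and the store end up in Uses, because GEP users are re-queued
// and followed transitively.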
void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
    SmallVectorImpl<AllocaInst *> &Allocas) {
  DenseMap<AllocaInst *, unsigned> Scores;

  for (auto *Alloca : Allocas) {
    unsigned &Score = Scores[Alloca];
    // Increment score by one for each user plus a bonus for users within
    // loops.
    SmallVector<Use *, 8> Uses;
    collectAllocaUses(*Alloca, Uses);
    for (auto *U : Uses) {
      Instruction *Inst = cast<Instruction>(U->getUser());
      if (isa<GetElementPtrInst>(Inst))
        continue;
      unsigned UserScore =
          1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
      LLVM_DEBUG(dbgs() << "  [+" << UserScore << "]:\t" << *Inst << "\n");
      Score += UserScore;
    }
  }

  stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
    return Scores.at(A) > Scores.at(B);
  });

  LLVM_DEBUG({
    dbgs() << "Sorted Worklist:\n";
    for (auto *A : Allocas)
      dbgs() << "  " << *A << "\n";
  });
}
void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
  const int R600MaxVectorRegs = 16;
  MaxVectorRegs = F.getFnAttributeAsParsedInteger(
      "amdgpu-promote-alloca-to-vector-max-regs",
      IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
  if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
    MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
  VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
      "amdgpu-promote-alloca-to-vector-vgpr-ratio",
      PromoteAllocaToVectorVGPRRatio);
  if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
    VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
}
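// Precedence: the "amdgpu-promote-alloca-to-vector-*" function attributes
// supply the per-function defaults, but an explicit command-line occurrence
// of either option overrides the attribute; that is what the
// getNumOccurrences() checks implement.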
bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
  Mod = F.getParent();
  DL = &Mod->getDataLayout();

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
  if (!ST.isPromoteAllocaEnabled())
    return false;

  bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
  MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
  setFunctionLimits(F);

  unsigned VectorizationBudget =
      (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                  : (MaxVGPRs * 32)) /
      VGPRBudgetRatio;

  SmallVector<AllocaInst *, 16> Allocas;
  for (Instruction &I : F.getEntryBlock()) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
      if (!AI->isStaticAlloca() || AI->isArrayAllocation())
        continue;
      Allocas.push_back(AI);
    }
  }

  sortAllocasToPromote(Allocas);

  bool Changed = false;
  for (AllocaInst *AI : Allocas) {
    const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
    // First, check if we have enough budget to vectorize this alloca.
    if (AllocaCost <= VectorizationBudget) {
      // If we do, attempt vectorization; otherwise, fall through and try
      // promoting to LDS instead.
      if (tryPromoteAllocaToVector(*AI)) {
        Changed = true;
        assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
               "Underflow!");
        VectorizationBudget -= AllocaCost;
        LLVM_DEBUG(dbgs() << "  Remaining vectorization budget:"
                          << VectorizationBudget << "\n");
        continue;
      }
    } else {
      LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
                        << AllocaCost << ", budget:" << VectorizationBudget
                        << "): " << *AI << "\n");
    }

    if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
      Changed = true;
  }

  return Changed;
}
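// Budget walkthrough (hypothetical numbers): with a 2048-bit starting budget,
// promoting a [16 x i32] alloca (512 bits) leaves 1536 bits for the rest of
// the already-sorted worklist; an alloca that no longer fits falls through to
// the LDS path when PromoteToLDS is set.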
static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
                              const DataLayout &DL) {
  using namespace PatternMatch;
  // Only non-volatile memsets that cover the whole alloca are handled.
  const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
  return I->getOperand(0) == AI &&
         match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
}
static Value *calculateVectorIndex(
    Value *Ptr, const std::map<GetElementPtrInst *, WeakTrackingVH> &GEPIdx) {
  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
  if (!GEP)
    return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));

  auto I = GEPIdx.find(GEP);
  assert(I != GEPIdx.end() && "Must have entry for GEP!");

  Value *IndexValue = I->second;
  assert(IndexValue && "index value missing from GEP index map");
  return IndexValue;
}
static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
                               Type *VecElemTy, const DataLayout &DL,
                               SmallVectorImpl<Instruction *> &NewInsts) {
  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
  SmallMapVector<Value *, APInt, 2> VarOffsets;
  APInt ConstOffset(BW, 0);

  // Walk backwards through nested GEPs to collect both constant and variable
  // offsets, so that nested GEPs can be vectorized too.
  Value *CurPtr = GEP;
  while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
    if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
      return nullptr;
    CurPtr = CurGEP->getPointerOperand();
  }
  assert(CurPtr == Alloca && "GEP not based on alloca");

  unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
  if (VarOffsets.size() > 1)
    return nullptr;

  // The constant part of the offset must be an exact multiple of the element
  // size.
  APInt IndexQuot;
  uint64_t Rem;
  APInt::udivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
  if (Rem != 0)
    return nullptr;
  if (VarOffsets.size() == 0)
    return ConstantInt::get(GEP->getContext(), IndexQuot);

  // Same for the scale of the variable part.
  const auto &VarOffset = VarOffsets.front();
  APInt OffsetQuot;
  APInt::udivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
  if (Rem != 0 || OffsetQuot.isZero())
    return nullptr;

  Value *Offset = VarOffset.first;
  auto *OffsetType = dyn_cast<IntegerType>(Offset->getType());
  if (!OffsetType)
    return nullptr;

  if (!OffsetQuot.isOne()) {
    ConstantInt *ConstMul =
        ConstantInt::get(OffsetType, OffsetQuot.getZExtValue());
    Offset = BinaryOperator::CreateMul(Offset, ConstMul, "", GEP);
    if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
      NewInsts.push_back(NewInst);
  }
  if (ConstOffset.isZero())
    return Offset;

  ConstantInt *ConstIndex = ConstantInt::get(GEP->getContext(), IndexQuot);
  Value *IndexAdd = BinaryOperator::CreateAdd(Offset, ConstIndex, "", GEP);
  if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
    NewInsts.push_back(NewInst);
  return IndexAdd;
}
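// Worked example: with VecElemTy = i32 (VecElemSize = 4), a GEP whose offset
// decomposes to 4 * %i + 8 yields VarOffsets = {%i -> 4} and ConstOffset = 8.
// Both divide evenly by 4, so the computed vector index is %i + 2. An offset
// such as 4 * %i + 6 leaves a remainder, and the function returns nullptr,
// which later causes the alloca to be rejected for vectorization.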
static Value *promoteAllocaUserToVector(
    Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
    unsigned VecStoreSize, unsigned ElementSize,
    DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
    std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal,
    SmallVectorImpl<LoadInst *> &DeferredLoads) {
  // The InstSimplifyFolder can use the DataLayout to fold more aggressively.
  IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
                                        InstSimplifyFolder(DL));
  Builder.SetInsertPoint(Inst);

  Type *VecEltTy = VectorTy->getElementType();

  // If the current vector value is unknown, emit a placeholder load that is
  // resolved on the second pass.
  const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
    if (CurVal)
      return CurVal;
    LoadInst *Dummy =
        Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()),
                           "promotealloca.dummyload");
    DeferredLoads.push_back(Dummy);
    return Dummy;
  };

  // Cast a pointer (or pointer-vector) value to an equivalently sized integer
  // (or integer-vector) type of the same store size.
  const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
                                                   Type *PtrTy) -> Value * {
    assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
    const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
    if (!PtrTy->isVectorTy())
      return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
    const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
    assert((Size % NumPtrElts == 0) && "Vector size not divisible");
    Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
    return Builder.CreateBitOrPointerCast(
        Val, FixedVectorType::get(EltTy, NumPtrElts));
  };
  switch (Inst->getOpcode()) {
  case Instruction::Load: {
    // Loads can only be lowered once the current vector value is known;
    // otherwise defer them to the second pass.
    if (!CurVal) {
      DeferredLoads.push_back(cast<LoadInst>(Inst));
      return nullptr;
    }

    Value *Index = calculateVectorIndex(
        cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);

    // We're loading the full vector.
    Type *AccessTy = Inst->getType();
    TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
    if (Constant *CI = dyn_cast<Constant>(Index)) {
      if (CI->isZeroValue() && AccessSize == VecStoreSize) {
        if (AccessTy->isPtrOrPtrVectorTy())
          CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
        else if (CurVal->getType()->isPtrOrPtrVectorTy())
          CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
        Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
        Inst->replaceAllUsesWith(NewVal);
        return nullptr;
      }
    }

    // Loading a subvector.
    if (isa<FixedVectorType>(AccessTy)) {
      const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
      auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
      assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));

      Value *SubVec = PoisonValue::get(SubVecTy);
      for (unsigned K = 0; K < NumLoadedElts; ++K) {
        Value *CurIdx =
            Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
        SubVec = Builder.CreateInsertElement(
            SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
      }

      if (AccessTy->isPtrOrPtrVectorTy())
        SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
      else if (SubVecTy->isPtrOrPtrVectorTy())
        SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);

      SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
      Inst->replaceAllUsesWith(SubVec);
      return nullptr;
    }

    // We're loading one element.
    Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
    if (AccessTy != VecEltTy)
      ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);

    Inst->replaceAllUsesWith(ExtractElement);
    return nullptr;
  }
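  // Lowering sketch: a <2 x i32> load at dynamic index %i from an alloca
  // promoted to <8 x i32> becomes extractelements at %i and %i + 1 feeding
  // insertelements into a fresh <2 x i32>, followed by a bit(-or-pointer)cast
  // back to the original access type.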
  case Instruction::Store: {
    // Whether the current value is needed depends on whether the full vector
    // is overwritten or only part of it.
    StoreInst *SI = cast<StoreInst>(Inst);
    Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
    Value *Val = SI->getValueOperand();

    // We're storing the full vector.
    Type *AccessTy = Val->getType();
    TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
    if (Constant *CI = dyn_cast<Constant>(Index)) {
      if (CI->isZeroValue() && AccessSize == VecStoreSize) {
        if (AccessTy->isPtrOrPtrVectorTy())
          Val = CreateTempPtrIntCast(Val, AccessTy);
        else if (VectorTy->isPtrOrPtrVectorTy())
          Val = CreateTempPtrIntCast(Val, VectorTy);
        return Builder.CreateBitOrPointerCast(Val, VectorTy);
      }
    }

    // Storing a subvector.
    if (isa<FixedVectorType>(AccessTy)) {
      const unsigned NumWrittenElts =
          AccessSize / DL.getTypeStoreSize(VecEltTy);
      const unsigned NumVecElts = VectorTy->getNumElements();
      auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
      assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));

      if (SubVecTy->isPtrOrPtrVectorTy())
        Val = CreateTempPtrIntCast(Val, SubVecTy);
      else if (AccessTy->isPtrOrPtrVectorTy())
        Val = CreateTempPtrIntCast(Val, AccessTy);

      Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);

      Value *CurVec = GetOrLoadCurrentVectorValue();
      for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
           K < NumElts; ++K) {
        Value *CurIdx =
            Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
        CurVec = Builder.CreateInsertElement(
            CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
      }
      return CurVec;
    }

    // We're storing one element.
    if (Val->getType() != VecEltTy)
      Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
    return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
                                       Index);
  }
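  // Note the asymmetry with loads: store lowering returns the updated vector
  // value (instead of nullptr) so the caller can register it with the
  // SSAUpdater as the alloca's live value for the current block.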
  case Instruction::Call: {
    if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
      // Rewrite the destination slice of the vector from the source slice
      // using the pre-computed constant indices.
      ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
      unsigned NumCopied = Length->getZExtValue() / ElementSize;
      MemTransferInfo *TI = &TransferInfo[MTI];
      unsigned DestBegin = TI->DestIndex->getZExtValue();
      unsigned SrcBegin = TI->SrcIndex->getZExtValue();

      SmallVector<int> Mask;
      for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
        if (Idx >= DestBegin && Idx < DestBegin + NumCopied)
          Mask.push_back(SrcBegin++);
        else
          Mask.push_back(Idx);
      }

      return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);
    }

    if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
      // A supported memset covers the whole vector: splat the byte pattern.
      Value *Elt = MSI->getOperand(1);
      const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
      if (BytesPerElt > 1) {
        Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);

        // If the vector's element type is a pointer, first assemble an
        // integer, then use inttoptr.
        if (VecEltTy->isPointerTy()) {
          Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
          Elt = Builder.CreateBitCast(EltBytes, PtrInt);
          Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
        } else
          Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
      }

      return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
    }

    if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
      if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
        Intr->replaceAllUsesWith(
            Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
                            DL.getTypeAllocSize(VectorTy)));
        return nullptr;
      }
    }

    llvm_unreachable("Unsupported call when promoting alloca to vector");
  }

  default:
    llvm_unreachable("Unimplemented Instruction Type");
  }
}
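// Shuffle mask example: for <4 x i32>, a memcpy of two elements from index 0
// to index 2 produces the mask <0, 1, 0, 1>: lanes outside the destination
// range keep their own index, lanes inside it read from the source range.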
static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
                                  const DataLayout &DL) {
  if (isa<FixedVectorType>(AccessTy)) {
    TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
    // If the type size and the store size don't match, translating between
    // the accessed value and an extracted/inserted subvector would take more
    // than a bitcast.
    if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
      return false;
    return AccTS.isKnownMultipleOf(
        DL.getTypeStoreSize(VecTy->getElementType()));
  }
  return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(),
                                              AccessTy, DL);
}
// Iterates over a worklist that may contain several instructions from the
// same basic block; in-block ordering is handled here because SSAUpdater only
// handles cross-block references.
template <typename InstContainer>
static void forEachWorkListItem(const InstContainer &WorkList,
                                std::function<void(Instruction *)> Fn) {
  // ... (uses are bucketed into UsesByBlock by parent block) ...
  for (Instruction *User : WorkList) {
    BasicBlock *BB = User->getParent();
    auto &BlockUses = UsesByBlock[BB];

    // Already processed, skip.
    if (BlockUses.empty())
      continue;

    // Only user in the block: process it directly.
    if (BlockUses.size() == 1) {
      Fn(User);
      continue;
    }

    // Multiple users in the block: walk the block to visit them in order.
    for (Instruction &Inst : *BB) {
      if (!BlockUses.contains(&Inst))
        continue;
      Fn(&Inst);
    }
    BlockUses.clear();
  }
}
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
  LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');

  if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << "  Promote alloca to vector is disabled\n");
    return false;
  }

  Type *AllocaTy = Alloca.getAllocatedType();
  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
    uint64_t NumElems = 1;
    Type *ElemTy;
    do {
      // Flatten nested arrays into a single element count.
      NumElems *= ArrayTy->getNumElements();
      ElemTy = ArrayTy->getElementType();
    } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));

    // Check for an array of vectors.
    auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
    if (InnerVectorTy) {
      NumElems *= InnerVectorTy->getNumElements();
      ElemTy = InnerVectorTy->getElementType();
    }

    unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
    if (ElementSize > 0) {
      unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
      // Expand the vector if required to match the padding of the inner type
      // (i.e. odd-sized subvectors): the storage size of the new vector must
      // match that of the alloca for byte offsets and GEP computation to
      // remain correct.
      if (NumElems * ElementSize != AllocaSize)
        NumElems = AllocaSize / ElementSize;
      if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
        VectorTy = FixedVectorType::get(ElemTy, NumElems);
    }
  }

  if (!VectorTy) {
    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return false;
  }

  const unsigned MaxElements =
      (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());

  if (VectorTy->getNumElements() > MaxElements ||
      VectorTy->getNumElements() < 2) {
    LLVM_DEBUG(dbgs() << "  " << *VectorTy
                      << " has an unsupported number of elements\n");
    return false;
  }
  std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx;
  SmallVector<Instruction *> WorkList;
  SmallVector<Instruction *> UsersToRemove;
  SmallVector<Instruction *> DeferredInsts;
  SmallVector<Instruction *> NewGEPInsts;
  DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;

  // On rejection, clean up any index-computation instructions created so far.
  const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
    LLVM_DEBUG(dbgs() << "  Cannot promote alloca to vector: " << Msg << "\n"
                      << "    " << *Inst << "\n");
    for (auto *Inst : reverse(NewGEPInsts))
      Inst->eraseFromParent();
    return false;
  };

  LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
  Type *VecEltTy = VectorTy->getElementType();
  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
                         "does not match the type's size\n");
    return false;
  }
  unsigned ElementSize = ElementSizeInBits / 8;
  assert(ElementSize > 0);

  SmallVector<Use *, 8> Uses;
  collectAllocaUses(Alloca, Uses);
  for (auto *U : Uses) {
    Instruction *Inst = cast<Instruction>(U->getUser());

    if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
      // This is a store *of* the pointer, not a store *to* the pointer.
      if (isa<StoreInst>(Inst) &&
          U->getOperandNo() != StoreInst::getPointerOperandIndex())
        return RejectUser(Inst, "pointer is being stored");

      Type *AccessTy = getLoadStoreType(Inst);
      if (AccessTy->isAggregateType())
        return RejectUser(Inst, "unsupported load/store as aggregate");

      // Check that this is a simple access of a vector element.
      bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
                                          : cast<StoreInst>(Inst)->isSimple();
      if (!IsSimple)
        return RejectUser(Inst, "not a simple load or store");

      Ptr = Ptr->stripPointerCasts();

      // Alloca already accessed as a whole vector: nothing to compute.
      if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
                                DL->getTypeStoreSize(AccessTy)) {
        WorkList.push_back(Inst);
        continue;
      }

      if (!isSupportedAccessType(VectorTy, AccessTy, *DL))
        return RejectUser(Inst, "not a supported access type");

      WorkList.push_back(Inst);
      continue;
    }
    if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
      // If we can't compute a vector index from this GEP, then we can't
      // promote this alloca to vector.
      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
      if (!Index)
        return RejectUser(Inst, "cannot compute vector index for GEP");

      GEPVectorIdx[GEP] = Index;
      UsersToRemove.push_back(Inst);
      continue;
    }

    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
        MSI && isSupportedMemset(MSI, &Alloca, *DL)) {
      WorkList.push_back(Inst);
      continue;
    }

    if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
      if (TransferInst->isVolatile())
        return RejectUser(Inst, "mem transfer inst is volatile");

      ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
      if (!Len || (Len->getZExtValue() % ElementSize))
        return RejectUser(Inst, "mem transfer inst length is non-constant or "
                                "not a multiple of the vector element size");

      if (TransferInfo.try_emplace(TransferInst).second) {
        DeferredInsts.push_back(Inst);
        WorkList.push_back(Inst);
      }

      auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * {
        GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
        if (Ptr != &Alloca && !GEPVectorIdx.count(GEP))
          return nullptr;
        return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx));
      };

      unsigned OpNum = U->getOperandNo();
      MemTransferInfo *TI = &TransferInfo[TransferInst];
      if (OpNum == 0) {
        Value *Dest = TransferInst->getDest();
        ConstantInt *Index = getPointerIndexOfAlloca(Dest);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant dest index");
        TI->DestIndex = Index;
      } else {
        assert(OpNum == 1);
        Value *Src = TransferInst->getSource();
        ConstantInt *Index = getPointerIndexOfAlloca(Src);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant src index");
        TI->SrcIndex = Index;
      }
      continue;
    }

    if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
      if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
        WorkList.push_back(Inst);
        continue;
      }
    }

    // Ignore assume-like intrinsic users: they only mark the alloca as used.
    if (isAssumeLikeIntrinsic(Inst)) {
      if (!Inst->use_empty())
        return RejectUser(Inst, "assume-like intrinsic cannot have any users");
      UsersToRemove.push_back(Inst);
      continue;
    }

    // Look through casts that are only used by assume-like intrinsics.
    if (all_of(Inst->users(), [](User *U) {
          return isAssumeLikeIntrinsic(cast<Instruction>(U));
        })) {
      UsersToRemove.push_back(Inst);
      continue;
    }

    return RejectUser(Inst, "unhandled alloca user");
  }
  while (!DeferredInsts.empty()) {
    Instruction *Inst = DeferredInsts.pop_back_val();
    MemTransferInfo &Info = TransferInfo[cast<MemTransferInst>(Inst)];
    // Both indices must be constant to lower the mem transfer as a shuffle.
    if (!Info.SrcIndex || !Info.DestIndex)
      return RejectUser(
          Inst, "mem transfer inst is missing constant src and/or dst index");
  }

  LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                    << *VectorTy << '\n');
  const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy);

  SSAUpdater Updater;
  Updater.Initialize(VectorTy, "promotealloca");

  // Alloca memory starts out undefined (not poison): freeze a poison vector
  // to model the initial value.
  BasicBlock *EntryBB = Alloca.getParent();
  IRBuilder<> Builder(&Alloca);
  Value *AllocaInitValue = Builder.CreateFreeze(PoisonValue::get(VectorTy));
  AllocaInitValue->takeName(&Alloca);
  Updater.AddAvailableValue(EntryBB, AllocaInitValue);

  // First pass: handle the original worklist, deferring loads whose incoming
  // vector value is not yet known in their block.
  SmallVector<LoadInst *, 4> DeferredLoads;
  forEachWorkListItem(WorkList, [&](Instruction *I) {
    BasicBlock *BB = I->getParent();
    Value *Result = promoteAllocaUserToVector(
        I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
        Updater.FindValueForBlock(BB), DeferredLoads);
    if (Result)
      Updater.AddAvailableValue(BB, Result);
  });

  // Second pass: handle the deferred loads.
  forEachWorkListItem(DeferredLoads, [&](Instruction *I) {
    SmallVector<LoadInst *, 0> NewDLs;
    BasicBlock *BB = I->getParent();
    Value *Result = promoteAllocaUserToVector(
        I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
        Updater.GetValueInMiddleOfBlock(BB), NewDLs);
    if (Result)
      Updater.AddAvailableValue(BB, Result);
    assert(NewDLs.empty() && "No more deferred loads should be queued!");
  });

  // Delete all rewritten instructions; dummy loads added on the first pass
  // must be collected too.
  DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
  InstsToDelete.insert_range(DeferredLoads);
  for (Instruction *I : InstsToDelete) {
    assert(I->use_empty());
    I->eraseFromParent();
  }

  // Delete all the users that are known to be removable.
  for (Instruction *I : reverse(UsersToRemove)) {
    I->dropDroppableUses();
    assert(I->use_empty());
    I->eraseFromParent();
  }

  // The alloca should now be dead too.
  assert(Alloca.use_empty());
  Alloca.eraseFromParent();
  return true;
}
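// Two-pass rationale: the first pass can only consume vector values already
// defined in the same block, so loads whose incoming value crosses a block
// boundary are deferred; the second pass uses GetValueInMiddleOfBlock(),
// which lets the SSAUpdater insert the PHIs required to always produce a
// value.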
std::pair<Value *, Value *>
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
  Function &F = *Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  if (!IsAMDHSA) {
    // Pre-HSA path: query the local size via the r600 read intrinsics.
    Function *LocalSizeYFn = Intrinsic::getOrInsertDeclaration(
        Mod, Intrinsic::r600_read_local_size_y);
    Function *LocalSizeZFn = Intrinsic::getOrInsertDeclaration(
        Mod, Intrinsic::r600_read_local_size_z);
    CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn);
    CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn);

    ST.makeLIDRangeMetadata(LocalSizeY);
    ST.makeLIDRangeMetadata(LocalSizeZ);

    return std::pair(LocalSizeY, LocalSizeZ);
  }

  // On HSA, the workgroup size is read out of the hsa_kernel_dispatch_packet_t
  // reachable through the dispatch pointer, so the function may no longer
  // claim it does not need that pointer.
  F.removeFnAttr("amdgpu-no-dispatch-ptr");

  // ... (LoadXY / LoadZU load the packed workgroup_size_{x,y,z} fields out of
  // the dispatch packet) ...

  MDNode *MD = MDNode::get(Mod->getContext(), {});
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  ST.makeLIDRangeMetadata(LoadZU);

  // ... (Y is extracted from the combined XY load) ...
  return std::pair(Y, LoadZU);
}
Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
                                              unsigned N) {
  Function *F = Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
  StringRef AttrName;

  switch (N) {
  case 0:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
    AttrName = "amdgpu-no-workitem-id-x";
    break;
  case 1:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
    AttrName = "amdgpu-no-workitem-id-y";
    break;
  case 2:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
    AttrName = "amdgpu-no-workitem-id-z";
    break;
  default:
    llvm_unreachable("invalid dimension");
  }

  Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID);
  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
  ST.makeLIDRangeMetadata(CI);
  F->removeFnAttr(AttrName);

  return CI;
}
static bool isCallPromotable(CallInst *CI) {
  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
  if (!II)
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::memcpy:
  case Intrinsic::memmove:
  case Intrinsic::memset:
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group:
  case Intrinsic::objectsize:
    return true;
  default:
    return false;
  }
}
bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
    Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
    int OpIdx1) const {
  // Figure out which operand is the one we might not be promoting.
  Value *OtherOp = Inst->getOperand(OpIdx0);
  if (OtherOp == Val)
    OtherOp = Inst->getOperand(OpIdx1);

  if (isa<ConstantPointerNull, ConstantAggregateZero>(OtherOp))
    return true;

  Value *OtherObj = getUnderlyingObject(OtherOp);
  if (!isa<AllocaInst>(OtherObj))
    return false;

  if (OtherObj != BaseAlloca) {
    LLVM_DEBUG(
        dbgs() << "Found a binary instruction with another alloca object\n");
    return false;
  }

  return true;
}
bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
    Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {
  for (User *User : Val->users()) {
    if (is_contained(WorkList, User))
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      if (!isCallPromotable(CI))
        return false;

      WorkList.push_back(User);
      continue;
    }

    Instruction *UseInst = cast<Instruction>(User);
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
      if (LI->isVolatile())
        return false;
      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
      if (SI->isVolatile())
        return false;

      // Reject if the stored value is not the pointer operand.
      if (SI->getPointerOperand() != Val)
        return false;
      continue;
    }

    if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
      if (RMW->isVolatile())
        return false;
      continue;
    }

    if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
      if (CAS->isVolatile())
        return false;
      continue;
    }

    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
        return false;

      // May need to rewrite constant operands.
      WorkList.push_back(ICmp);
      continue;
    }

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return false;
    } else if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
      // Only promote a select if we know that the other select operand is
      // from another pointer that will also be promoted.
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
        return false;
    } else if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
      // Repeat for phis.
      switch (Phi->getNumIncomingValues()) {
      case 1:
        break;
      case 2:
        if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
          return false;
        break;
      default:
        return false;
      }
    } else if (!isa<ExtractElementInst>(User)) {
      // Do not promote vector/aggregate typed instructions or
      // addrspacecasts; their users are too hard to track.
      return false;
    }

    WorkList.push_back(User);
    if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
      return false;
  }

  return true;
}
bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
  FunctionType *FTy = F.getFunctionType();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so we
  // cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getAddressableLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  // Check how much local memory is already used by globals.
  SmallPtrSet<const Constant *, 8> VisitedConstants;
  SmallVector<const Constant *, 8> Stack;
  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;

  // Any use reached from an instruction of F counts as a use of the global.
  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
    for (const User *U : Val->users()) {
      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
        if (Use->getParent()->getParent() == &F)
          return true;
      } else {
        const Constant *C = cast<Constant>(U);
        if (VisitedConstants.insert(C).second)
          Stack.push_back(C);
      }
    }
    return false;
  };

  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    if (visitUsers(&GV, &GV)) {
      UsedLDS.insert(&GV);
      Stack.clear();
      continue;
    }

    // For ConstantExpr uses, recursively search the users until an
    // instruction is seen.
    while (!Stack.empty()) {
      const Constant *C = Stack.pop_back_val();
      if (visitUsers(&GV, C)) {
        UsedLDS.insert(&GV);
        Stack.clear();
        break;
      }
    }
  }

  const DataLayout &DL = Mod->getDataLayout();
  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
  AllocatedSizes.reserve(UsedLDS.size());

  for (const GlobalVariable *GV : UsedLDS) {
    Align Alignment =
        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());

    // HIP uses an extern unsized array in local address space for dynamically
    // allocated shared memory; in that case, promotion must be disabled.
    if (GV->hasExternalLinkage() && AllocSize == 0) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
                           "local memory. Promoting to local memory "
                           "disabled.\n");
      return false;
    }

    AllocatedSizes.emplace_back(AllocSize, Alignment);
  }

  // Sort to estimate the worst-case alignment padding.
  sort(AllocatedSizes, llvm::less_second());

  CurrentLocalMemUsage = 0;
  for (auto Alloc : AllocatedSizes) {
    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
    CurrentLocalMemUsage += Alloc.first;
  }

  unsigned MaxOccupancy =
      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
          .second;

  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount =
      ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);

  // The program may already use more LDS than is usable at max occupancy.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << "  Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");

  return true;
}
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
                                                    bool SufficientLDS) {
  LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n');

  if (DisablePromoteAllocaToLDS) {
    LLVM_DEBUG(dbgs() << "  Promote alloca to LDS is disabled\n");
    return false;
  }

  const DataLayout &DL = Mod->getDataLayout();
  IRBuilder<> Builder(&I);

  const Function &ContainingFunction = *I.getParent()->getParent();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  // Only kernel-like calling conventions are supported: the workitem-id
  // intrinsics used below are unavailable to the shader conventions, and not
  // all LDS is usable in some stages.
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return false;
  }

  // Not likely to have sufficient local memory for promotion.
  if (!SufficientLDS)
    return false;

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

  Align Alignment =
      DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());

  // Each workitem gets its own copy, so the global must be sized for the
  // whole workgroup.
  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  uint32_t AllocSize =
      WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType());
  NewSize += AllocSize;

  if (NewSize > LocalMemLimit) {
    LLVM_DEBUG(dbgs() << "  " << AllocSize
                      << " bytes of local memory not available to promote\n");
    return false;
  }

  CurrentLocalMemUsage = NewSize;

  std::vector<Value *> WorkList;
  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
    LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = I.getParent()->getParent();
  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
      Twine(F->getName()) + Twine('.') + I.getName(), nullptr,
      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  GV->setAlignment(I.getAlign());

  Value *TCntY, *TCntZ;
  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

  // Linearize the workitem id: ((TIdX * TCntY) + TIdY) * TCntZ + TIdZ.
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  LLVMContext &Context = Mod->getContext();
  Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};

  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();
  SmallVector<IntrinsicInst *> DeferredIntrs;
  Type *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);

  for (Value *V : WorkList) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        Value *LHS = CI->getOperand(0);
        Value *RHS = CI->getOperand(1);

        Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
        if (isa<ConstantPointerNull, ConstantAggregateZero>(LHS))
          CI->setOperand(0, Constant::getNullValue(NewTy));

        if (isa<ConstantPointerNull, ConstantAggregateZero>(RHS))
          CI->setOperand(1, Constant::getNullValue(NewTy));

        continue;
      }

      // The operand's value should be corrected on its own and we don't want
      // to touch the users.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      assert(V->getType()->isPtrOrPtrVectorTy());

      Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
      V->mutateType(NewTy);

      // Adjust the types of any constant operands.
      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
        if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(1)))
          SI->setOperand(1, Constant::getNullValue(NewTy));

        if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(2)))
          SI->setOperand(2, Constant::getNullValue(NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull, ConstantAggregateZero>(
                  Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
        }
      }

      continue;
    }
    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only.
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      // These have two pointer operands. If the second pointer also needs to
      // be replaced, processing is deferred until all other values have been
      // rewritten.
      DeferredIntrs.push_back(Intr);
      continue;
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getDestAlign(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group: {
      SmallVector<Value *> Args;
      if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
        Args.emplace_back(Intr->getArgOperand(0));
      } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
        Args.emplace_back(Intr->getArgOperand(0));
        Args.emplace_back(Intr->getArgOperand(1));
      }
      Args.emplace_back(Offset);

      Function *F = Intrinsic::getOrInsertDeclaration(
          Mod, Intr->getIntrinsicID(), Offset->getType());
      CallInst *NewIntr =
          CallInst::Create(F, Args, Intr->getName(), Intr->getIterator());
      Intr->replaceAllUsesWith(NewIntr);
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::objectsize: {
      Value *Src = Intr->getOperand(0);
      Function *ObjectSize = Intrinsic::getOrInsertDeclaration(
          Mod, Intrinsic::objectsize,
          {Intr->getType(),
           PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)});

      CallInst *NewCall = Builder.CreateCall(
          ObjectSize,
          {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->print(errs());
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }
  for (IntrinsicInst *Intr : DeferredIntrs) {
    Builder.SetInsertPoint(Intr);
    Intrinsic::ID ID = Intr->getIntrinsicID();
    assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);

    MemTransferInst *MI = cast<MemTransferInst>(Intr);
    auto *B = Builder.CreateMemTransferInst(
        ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
        MI->getSourceAlign(), MI->getLength(), MI->isVolatile());

    for (unsigned I = 0; I != 2; ++I) {
      if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I))
        B->addDereferenceableParamAttr(I, Bytes);
    }

    Intr->eraseFromParent();
  }

  return true;
}
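// Net effect of the LDS path: each workitem addresses its own slice of the
// [WorkGroupSize x <alloca type>] array through the linearized workitem ID
// computed above, preserving the per-thread semantics of the original
// private alloca while moving the storage into shared memory.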