#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsNVPTX.h"

#define DEBUG_TYPE "openmp-opt"
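// Command line options that gate or tune the individual OpenMP-Opt
// transformations below.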
70 "openmp-opt-disable",
cl::desc(
"Disable OpenMP specific optimizations."),
74 "openmp-opt-enable-merging",
80 cl::desc(
"Disable function internalization."),
91 "openmp-hide-memory-transfer-latency",
92 cl::desc(
"[WIP] Tries to hide the latency of host to device memory"
97 "openmp-opt-disable-deglobalization",
98 cl::desc(
"Disable OpenMP optimizations involving deglobalization."),
102 "openmp-opt-disable-spmdization",
103 cl::desc(
"Disable OpenMP optimizations involving SPMD-ization."),
107 "openmp-opt-disable-folding",
112 "openmp-opt-disable-state-machine-rewrite",
113 cl::desc(
"Disable OpenMP optimizations that replace the state machine."),
117 "openmp-opt-disable-barrier-elimination",
118 cl::desc(
"Disable OpenMP optimizations that eliminate barriers."),
122 "openmp-opt-print-module-after",
123 cl::desc(
"Print the current module after OpenMP optimizations."),
127 "openmp-opt-print-module-before",
128 cl::desc(
"Print the current module before OpenMP optimizations."),
132 "openmp-opt-inline-device",
143 cl::desc(
"Maximal number of attributor iterations."),
148 cl::desc(
"Maximum amount of shared memory to use."),
149 cl::init(std::numeric_limits<unsigned>::max()));
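// Statistics collected by this pass; reported when LLVM is built with
// statistics enabled.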
152 "Number of OpenMP runtime calls deduplicated");
154 "Number of OpenMP parallel regions deleted");
156 "Number of OpenMP runtime functions identified");
158 "Number of OpenMP runtime function uses identified");
160 "Number of OpenMP target region entry points (=kernels) identified");
162 "Number of non-OpenMP target region kernels identified");
164 "Number of OpenMP target region entry points (=kernels) executed in "
165 "SPMD-mode instead of generic-mode");
166STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
167 "Number of OpenMP target region entry points (=kernels) executed in "
168 "generic-mode without a state machines");
169STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
170 "Number of OpenMP target region entry points (=kernels) executed in "
171 "generic-mode with customized state machines with fallback");
172STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
173 "Number of OpenMP target region entry points (=kernels) executed in "
174 "generic-mode with customized state machines without fallback");
176 NumOpenMPParallelRegionsReplacedInGPUStateMachine,
177 "Number of OpenMP parallel regions replaced with ID in GPU state machines");
179 "Number of OpenMP parallel regions merged");
181 "Amount of memory pushed to shared memory");
182STATISTIC(NumBarriersEliminated,
"Number of redundant barriers eliminated");
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \
  constexpr const unsigned MEMBER##Idx = IDX;

#undef KERNEL_ENVIRONMENT_IDX

#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \
  constexpr const unsigned MEMBER##Idx = IDX;

#undef KERNEL_ENVIRONMENT_CONFIGURATION_IDX

#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE) \
  RETURNTYPE *get##MEMBER##FromKernelEnvironment(ConstantStruct *KernelEnvC) { \
    return cast<RETURNTYPE>(KernelEnvC->getAggregateElement(MEMBER##Idx)); \
  }

#undef KERNEL_ENVIRONMENT_GETTER

#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER) \
  ConstantInt *get##MEMBER##FromKernelEnvironment( \
      ConstantStruct *KernelEnvC) { \
    ConstantStruct *ConfigC = \
        getConfigurationFromKernelEnvironment(KernelEnvC); \
    return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(MEMBER##Idx)); \
  }

#undef KERNEL_ENVIRONMENT_CONFIGURATION_GETTER

constexpr const int InitKernelEnvironmentArgNo = 0;
struct AAHeapToShared;
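// Cache of OpenMP runtime functions, internal control variables, and
// OpenMPIRBuilder state shared by the pass and its abstract attributes.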
  OMPInformationCache(Module &M, AnalysisGetter &AG,
                      BumpPtrAllocator &Allocator, SetVector<Function *> *CGSCC,
                      bool OpenMPPostLink)
      : InformationCache(M, AG, Allocator, CGSCC), OMPBuilder(M),
        OpenMPPostLink(OpenMPPostLink) {

    const Triple T(OMPBuilder.M.getTargetTriple());
    switch (T.getArch()) {
    case Triple::nvptx:
    case Triple::nvptx64:
    case Triple::amdgcn:
      assert(OMPBuilder.Config.IsTargetDevice &&
             "OpenMP AMDGPU/NVPTX is only prepared to deal with device code.");
      OMPBuilder.Config.IsGPU = true;
      break;
    default:
      OMPBuilder.Config.IsGPU = false;
      break;
    }

    OMPBuilder.initialize();
    initializeRuntimeFunctions(M);
    initializeInternalControlVars();
  }
  struct InternalControlVarInfo {
    StringRef EnvVarName;
    ConstantInt *InitValue;
  };

  struct RuntimeFunctionInfo {
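    // Uses of the runtime function declaration, grouped by the function that
    // contains them, so transformations can visit and rewrite them per
    // function.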
    using UseVector = SmallVector<Use *, 16>;

    void clearUsesMap() { UsesMap.clear(); }

    operator bool() const { return Declaration; }

    UseVector &getOrCreateUseVector(Function *F) {
      std::shared_ptr<UseVector> &UV = UsesMap[F];
      if (!UV)
        UV = std::make_shared<UseVector>();
      return *UV;
    }

    const UseVector *getUseVector(Function &F) const {
      auto I = UsesMap.find(&F);
      if (I != UsesMap.end())
        return I->second.get();
      return nullptr;
    }

    size_t getNumFunctionsWithUses() const { return UsesMap.size(); }

    size_t getNumArgs() const { return ArgumentTypes.size(); }

    void foreachUse(SmallVectorImpl<Function *> &SCC,
                    function_ref<bool(Use &, Function &)> CB) {
      for (Function *F : SCC)
        foreachUse(CB, F);
    }

    void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
      SmallVector<unsigned, 8> ToBeDeleted;

      UseVector &UV = getOrCreateUseVector(F);

      while (!ToBeDeleted.empty()) {

    DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;

    decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
    decltype(UsesMap)::iterator end() { return UsesMap.end(); }
  };

  OpenMPIRBuilder OMPBuilder;

  EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
                  RuntimeFunction::OMPRTL___last>
      RFIs;

  DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;

  EnumeratedArray<InternalControlVarInfo, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVs;
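  // Populate the ICV descriptions (getter/setter runtime calls, environment
  // variable name, and initial value) from OMPKinds.def.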
  void initializeInternalControlVars() {
#define ICV_RT_SET(_Name, RTL) \
  { \
    auto &ICV = ICVs[_Name]; \
    ICV.Setter = RTL; \
  }
#define ICV_RT_GET(Name, RTL) \
  { \
    auto &ICV = ICVs[Name]; \
    ICV.Getter = RTL; \
  }
#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
  { \
    auto &ICV = ICVs[Enum]; \
    ICV.InitKind = Init; \
    ICV.EnvVarName = _EnvVarName; \
    switch (ICV.InitKind) { \
    case ICV_IMPLEMENTATION_DEFINED: \
      ICV.InitValue = nullptr; \
      break; \
    case ICV_ZERO: \
      ICV.InitValue = ConstantInt::get( \
          Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
      break; \
    case ICV_FALSE: \
      ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
      break; \
    } \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }
  static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
    if (F->getReturnType() != RTFRetType)
    if (F->arg_size() != RTFArgTypes.size())

    auto *RTFTyIt = RTFArgTypes.begin();
    for (Argument &Arg : F->args()) {
      if (Arg.getType() != *RTFTyIt)

  unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
    unsigned NumUses = 0;
    if (!RFI.Declaration)

    OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);

      NumOpenMPRuntimeFunctionsIdentified += 1;
      NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();

    for (Use &U : RFI.Declaration->uses()) {
        if (!CGSCC || CGSCC->empty() || CGSCC->contains(UserI->getFunction())) {
          RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
        RFI.getOrCreateUseVector(nullptr).push_back(&U);

    auto &RFI = RFIs[RTF];
    collectUses(RFI, false);

  void recollectUses() {
    for (int Idx = 0; Idx < RFIs.size(); ++Idx)

  void setCallingConvention(FunctionCallee Callee, CallInst *CI) {

      RuntimeFunctionInfo &RFI = RFIs[Fn];
      if (!RFI.Declaration || RFI.Declaration->isDeclaration())

  void initializeRuntimeFunctions(Module &M) {
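    // Helper macros instantiated for every runtime function listed in
    // OMPKinds.def: look up the declaration in the module, verify its type,
    // and record its uses.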
#define OMP_TYPE(VarName, ...) \
  Type *VarName = OMPBuilder.VarName; \

#define OMP_ARRAY_TYPE(VarName, ...) \
  ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
  PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
  (void)VarName##PtrTy;

#define OMP_FUNCTION_TYPE(VarName, ...) \
  FunctionType *VarName = OMPBuilder.VarName; \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \

#define OMP_STRUCT_TYPE(VarName, ...) \
  StructType *VarName = OMPBuilder.VarName; \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \

#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
    SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
    Function *F = M.getFunction(_Name); \
    RTLFunctions.insert(F); \
    if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
      RuntimeFunctionIDMap[F] = _Enum; \
      auto &RFI = RFIs[_Enum]; \
      RFI.IsVarArg = _IsVarArg; \
      RFI.ReturnType = OMPBuilder._ReturnType; \
      RFI.ArgumentTypes = std::move(ArgsTypes); \
      RFI.Declaration = F; \
      unsigned NumUses = collectUses(RFI); \
        dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
      if (RFI.Declaration) \
        dbgs() << TAG << "-> got " << NumUses << " uses in " \
               << RFI.getNumFunctionsWithUses() \
               << " different functions.\n"; \

#include "llvm/Frontend/OpenMP/OMPKinds.def"

    for (Function &F : M) {
      for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"})
        if (F.hasFnAttribute(Attribute::NoInline) &&
            F.getName().starts_with(Prefix) &&
            !F.hasFnAttribute(Attribute::OptimizeNone))
          F.removeFnAttr(Attribute::NoInline);
    }

  DenseSet<const Function *> RTLFunctions;

  bool OpenMPPostLink = false;

template <typename Ty, bool InsertInvalidates = true>
  bool contains(const Ty &Elem) const { return Set.contains(Elem); }
  bool insert(const Ty &Elem) {
    if (InsertInvalidates)
      BooleanState::indicatePessimisticFixpoint();
    return Set.insert(Elem);
  }

  const Ty &operator[](int Idx) const { return Set[Idx]; }
  bool operator==(const BooleanStateWithSetVector &RHS) const {
    return BooleanState::operator==(RHS) && Set == RHS.Set;
  }
  bool operator!=(const BooleanStateWithSetVector &RHS) const {
    return !(*this == RHS);
  }

  bool empty() const { return Set.empty(); }
  size_t size() const { return Set.size(); }

  BooleanStateWithSetVector &operator^=(const BooleanStateWithSetVector &RHS) {
    BooleanState::operator^=(RHS);
    Set.insert_range(RHS.Set);

  typename decltype(Set)::iterator begin() { return Set.begin(); }
  typename decltype(Set)::iterator end() { return Set.end(); }
  typename decltype(Set)::const_iterator begin() const { return Set.begin(); }
  typename decltype(Set)::const_iterator end() const { return Set.end(); }

template <typename Ty, bool InsertInvalidates = true>
using BooleanStateWithPtrSetVector =
    BooleanStateWithSetVector<Ty *, InsertInvalidates>;
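// State tracked per target region kernel: the parallel regions it may reach,
// SPMD compatibility, and the surrounding __kmpc_target_init/deinit calls.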
  bool IsAtFixpoint = false;

  BooleanStateWithPtrSetVector<CallBase, false> ReachedKnownParallelRegions;

  BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;

  BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;

  CallBase *KernelInitCB = nullptr;

  ConstantStruct *KernelEnvC = nullptr;

  CallBase *KernelDeinitCB = nullptr;

  bool IsKernelEntry = false;

  BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;

  BooleanStateWithSetVector<uint8_t> ParallelLevels;

  bool NestedParallelism = false;

  KernelInfoState() = default;
  KernelInfoState(bool BestState) {
      indicatePessimisticFixpoint();

  bool isValidState() const override { return true; }

  bool isAtFixpoint() const override { return IsAtFixpoint; }

    ParallelLevels.indicatePessimisticFixpoint();
    ReachingKernelEntries.indicatePessimisticFixpoint();
    SPMDCompatibilityTracker.indicatePessimisticFixpoint();
    ReachedKnownParallelRegions.indicatePessimisticFixpoint();
    ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
    NestedParallelism = true;
    return ChangeStatus::CHANGED;

    ParallelLevels.indicateOptimisticFixpoint();
    ReachingKernelEntries.indicateOptimisticFixpoint();
    SPMDCompatibilityTracker.indicateOptimisticFixpoint();
    ReachedKnownParallelRegions.indicateOptimisticFixpoint();
    ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
    return ChangeStatus::UNCHANGED;

  KernelInfoState &getAssumed() { return *this; }
  const KernelInfoState &getAssumed() const { return *this; }

    if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker)
    if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
    if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
    if (ReachingKernelEntries != RHS.ReachingKernelEntries)
    if (ParallelLevels != RHS.ParallelLevels)
    if (NestedParallelism != RHS.NestedParallelism)

  bool mayContainParallelRegion() {
    return !ReachedKnownParallelRegions.empty() ||
           !ReachedUnknownParallelRegions.empty();
  }

  static KernelInfoState getBestState() { return KernelInfoState(true); }

  static KernelInfoState getBestState(KernelInfoState &KIS) {
    return getBestState();
  }

  static KernelInfoState getWorstState() { return KernelInfoState(false); }

  KernelInfoState operator^=(const KernelInfoState &KIS) {
    if (KIS.KernelInitCB) {
      if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
      KernelInitCB = KIS.KernelInitCB;
    if (KIS.KernelDeinitCB) {
      if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
      KernelDeinitCB = KIS.KernelDeinitCB;
    if (KIS.KernelEnvC) {
      if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)
      KernelEnvC = KIS.KernelEnvC;
    SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
    ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
    ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
    NestedParallelism |= KIS.NestedParallelism;

  KernelInfoState operator&=(const KernelInfoState &KIS) {
    return (*this ^= KIS);
  }

  AllocaInst *Array = nullptr;

  SmallVector<Value *, 8> StoredValues;

  SmallVector<StoreInst *, 8> LastAccesses;

  OffloadArray() = default;

  bool initialize(AllocaInst &Array, Instruction &Before) {
    if (!Array.getAllocatedType()->isArrayTy())
    if (!getValues(Array, Before))
    this->Array = &Array;

  static const unsigned DeviceIDArgNum = 1;
  static const unsigned BasePtrsArgNum = 3;
  static const unsigned PtrsArgNum = 4;
  static const unsigned SizesArgNum = 5;

  bool getValues(AllocaInst &Array, Instruction &Before) {
    const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();
    StoredValues.assign(NumValues, nullptr);
    LastAccesses.assign(NumValues, nullptr);

    const DataLayout &DL = Array.getDataLayout();
    for (Instruction &I : *BB) {
        LastAccesses[Idx] = S;

    const unsigned NumValues = StoredValues.size();
    for (unsigned I = 0; I < NumValues; ++I) {
      if (!StoredValues[I] || !LastAccesses[I])

  using OptimizationRemarkGetter =
      function_ref<OptimizationRemarkEmitter &(Function *)>;
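  // OpenMPOpt drives all OpenMP-specific transformations over the functions of
  // the current SCC (or the whole module in a module pass).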
  OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
            OptimizationRemarkGetter OREGetter,
            OMPInformationCache &OMPInfoCache, Attributor &A)
      : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
        OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}

  bool remarksEnabled() {
    auto &Ctx = M.getContext();
    return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);
  }
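  // Run the attributor-based deductions first, then the scalar rewrites:
  // state machine rewriting, globalization analysis, parallel region deletion
  // and merging, runtime call deduplication, and memory transfer hiding.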
  bool run(bool IsModulePass) {
      Changed |= runAttributor(IsModulePass);

      OMPInfoCache.recollectUses();

      Changed |= rewriteDeviceCodeStateMachine();

      if (remarksEnabled())
        analysisGlobalization();

      Changed |= runAttributor(IsModulePass);

      OMPInfoCache.recollectUses();

      Changed |= deleteParallelRegions();

      Changed |= hideMemTransfersLatency();
      Changed |= deduplicateRuntimeCalls();
      if (mergeParallelRegions()) {
        deduplicateRuntimeCalls();

      if (OMPInfoCache.OpenMPPostLink)
        Changed |= removeRuntimeSymbols();

  void printICVs() const {
    for (Function *F : SCC) {
      for (auto ICV : ICVs) {
        auto ICVInfo = OMPInfoCache.ICVs[ICV];
        auto Remark = [&](OptimizationRemarkAnalysis ORA) {
          return ORA << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
                     << (ICVInfo.InitValue
                             ? toString(ICVInfo.InitValue->getValue(), 10, true)
                             : "IMPLEMENTATION_DEFINED");

  void printKernels() const {
    for (Function *F : SCC) {
      auto Remark = [&](OptimizationRemarkAnalysis ORA) {
        return ORA << "OpenMP GPU kernel "
                   << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";

  static CallInst *getCallIfRegularCall(
      Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {

  static CallInst *getCallIfRegularCall(
      Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
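  // Merge consecutive __kmpc_fork_call parallel regions in a basic block into
  // a single region; code between the original regions is wrapped in a master
  // region followed by a barrier so its semantics are preserved.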
  bool mergeParallelRegions() {
    const unsigned CallbackCalleeOperand = 2;
    const unsigned CallbackFirstArgOperand = 3;
    using InsertPointTy = OpenMPIRBuilder::InsertPointTy;

    OMPInformationCache::RuntimeFunctionInfo &RFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];

    if (!RFI.Declaration)

    OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
        OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
        OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],

    LoopInfo *LI = nullptr;
    DominatorTree *DT = nullptr;

    SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;

    BasicBlock *StartBB = nullptr, *EndBB = nullptr;
    auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
      BasicBlock *CGStartBB = CodeGenIP.getBlock();
          SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
      assert(StartBB != nullptr && "StartBB should not be null");
      assert(EndBB != nullptr && "EndBB should not be null");
      EndBB->getTerminator()->setSuccessor(0, CGEndBB);

    auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &,
                      Value &Inner, Value *&ReplacementValue) -> InsertPointTy {
      ReplacementValue = &Inner;

    auto FiniCB = [&](InsertPointTy CodeGenIP) { return Error::success(); };

    auto CreateSequentialRegion = [&](Function *OuterFn,
      BasicBlock *ParentBB = SeqStartI->getParent();
          SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
          SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");
             "Expected a different CFG");

      auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
        BasicBlock *CGStartBB = CodeGenIP.getBlock();
            SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
        assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
        assert(SeqEndBB != nullptr && "SeqEndBB should not be null");

      auto FiniCB = [&](InsertPointTy CodeGenIP) { return Error::success(); };

      for (Instruction &I : *SeqStartBB) {
        SmallPtrSet<Instruction *, 4> OutsideUsers;
        for (User *Usr : I.users()) {
            OutsideUsers.insert(&UsrI);

        if (OutsideUsers.empty())

        const DataLayout &DL = M.getDataLayout();
        AllocaInst *AllocaI = new AllocaInst(
            I.getType(), DL.getAllocaAddrSpace(), nullptr,

        new StoreInst(&I, AllocaI, SeqStartBB->getTerminator()->getIterator());

        for (Instruction *UsrI : OutsideUsers) {
          LoadInst *LoadI = new LoadInst(I.getType(), AllocaI,
                                         I.getName() + ".seq.output.load",

      OpenMPIRBuilder::LocationDescription Loc(
          InsertPointTy(ParentBB, ParentBB->end()), DL);
      OpenMPIRBuilder::InsertPointTy SeqAfterIP = cantFail(
          OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB));
          OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel));

    auto Merge = [&](const SmallVectorImpl<CallInst *> &MergableCIs,
      assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");

      auto Remark = [&](OptimizationRemark OR) {
        OR << "Parallel region merged with parallel region"
           << (MergableCIs.size() > 2 ? "s" : "") << " at ";
          if (CI != MergableCIs.back())

      Function *OriginalFn = BB->getParent();
                        << " parallel regions in " << OriginalFn->getName()

      EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
          SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);

      assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
      const DebugLoc DL = BB->getTerminator()->getDebugLoc();

      for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;
        CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),

      OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
      IRBuilder<>::InsertPoint AllocaIP(

      OpenMPIRBuilder::InsertPointTy AfterIP =
          cantFail(OMPInfoCache.OMPBuilder.createParallel(
              Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
              OMP_PROC_BIND_default, false));

      OMPInfoCache.OMPBuilder.finalize(OriginalFn);

      SmallVector<Value *, 8> Args;
      for (auto *CI : MergableCIs) {
        FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask;
        for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;

        for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
              U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);

        if (CI != MergableCIs.back()) {
          cantFail(OMPInfoCache.OMPBuilder.createBarrier(

      assert(OutlinedFn != OriginalFn && "Outlining failed");
      CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
      CGUpdater.reanalyzeFunction(*OriginalFn);

      NumOpenMPParallelRegionsMerged += MergableCIs.size();

      CallInst *CI = getCallIfRegularCall(U, &RFI);

    RFI.foreachUse(SCC, DetectPRsCB);

    for (auto &It : BB2PRMap) {
      auto &CIs = It.getSecond();

      auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
        if (I.isTerminator())

        if (IsBeforeMergableRegion) {
          if (!CalledFunction)
          for (const auto &RFI : UnmergableCallsInfo) {
            if (CalledFunction == RFI.Declaration)

      for (auto It = BB->begin(), End = BB->end(); It != End;) {
        if (CIs.count(&I)) {

        if (IsMergable(I, MergableCIs.empty()))
          for (; It != End; ++It) {
            if (CIs.count(&SkipI)) {
                              << " due to " << I << "\n");

        if (MergableCIs.size() > 1) {
          MergableCIsVector.push_back(MergableCIs);
                            << " parallel regions in block " << BB->getName()

        MergableCIs.clear();

      if (!MergableCIsVector.empty()) {
        for (auto &MergableCIs : MergableCIsVector)
          Merge(MergableCIs, BB);
        MergableCIsVector.clear();

    OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
    OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
    OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
    OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
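  // Delete __kmpc_fork_call invocations whose callback is read-only and
  // guaranteed to return, i.e. parallel regions without side effects.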
  bool deleteParallelRegions() {
    const unsigned CallbackCalleeOperand = 2;

    OMPInformationCache::RuntimeFunctionInfo &RFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];

    if (!RFI.Declaration)

      CallInst *CI = getCallIfRegularCall(U);
      if (!Fn->onlyReadsMemory())
      if (!Fn->hasFnAttribute(Attribute::WillReturn))

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "Removing parallel region with no side-effects.";

      ++NumOpenMPParallelRegionsDeleted;

    RFI.foreachUse(SCC, DeleteCallCB);
  bool deduplicateRuntimeCalls() {
        OMPRTL_omp_get_num_threads,
        OMPRTL_omp_in_parallel,
        OMPRTL_omp_get_cancellation,
        OMPRTL_omp_get_supported_active_levels,
        OMPRTL_omp_get_level,
        OMPRTL_omp_get_ancestor_thread_num,
        OMPRTL_omp_get_team_size,
        OMPRTL_omp_get_active_level,
        OMPRTL_omp_in_final,
        OMPRTL_omp_get_proc_bind,
        OMPRTL_omp_get_num_places,
        OMPRTL_omp_get_num_procs,
        OMPRTL_omp_get_place_num,
        OMPRTL_omp_get_partition_num_places,
        OMPRTL_omp_get_partition_place_nums};

    SmallSetVector<Value *, 16> GTIdArgs;
    collectGlobalThreadIdArguments(GTIdArgs);
                      << " global thread ID arguments\n");

    for (Function *F : SCC) {
      for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
        Changed |= deduplicateRuntimeCalls(
            *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);

      Value *GTIdArg = nullptr;
      for (Argument &Arg : F->args())
        if (GTIdArgs.count(&Arg)) {

      Changed |= deduplicateRuntimeCalls(
          *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
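  // Remove device runtime symbols (e.g. the RPC client handle) that are no
  // longer referenced once the device image has been linked.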
  bool removeRuntimeSymbols() {
    if (GlobalVariable *GV = M.getNamedGlobal("__llvm_rpc_client")) {
      if (GV->hasNUsesOrMore(1))

      GV->eraseFromParent();
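  // Split __tgt_target_data_begin_mapper calls into "issue" and "wait" pairs
  // so the data transfer can overlap with host computation.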
  bool hideMemTransfersLatency() {
    auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
      auto *RTCall = getCallIfRegularCall(U, &RFI);

      OffloadArray OffloadArrays[3];
      if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))

      LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));

      bool WasSplit = false;
      Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
      if (WaitMovementPoint)
        WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);

    if (OMPInfoCache.runtimeFnsAvailable(
            {OMPRTL___tgt_target_data_begin_mapper_issue,
             OMPRTL___tgt_target_data_begin_mapper_wait}))
      RFI.foreachUse(SCC, SplitMemTransfers);
  void analysisGlobalization() {
    auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];

    auto CheckGlobalization = [&](Use &U, Function &Decl) {
      if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
        auto Remark = [&](OptimizationRemarkMissed ORM) {
                     << "Found thread data sharing on the GPU. "
                     << "Expect degraded performance due to data globalization.";

    RFI.foreachUse(SCC, CheckGlobalization);

  bool getValuesInOffloadArrays(CallInst &RuntimeCall,
    assert(OAs.size() == 3 && "Need space for three offload arrays!");

    Value *BasePtrsArg =

    if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))

    if (!OAs[1].initialize(*PtrsArray, RuntimeCall))

    if (!OAs[2].initialize(*SizesArray, RuntimeCall))

    assert(OAs.size() == 3 && "There are three offload arrays to debug!");

    std::string ValuesStr;
    raw_string_ostream Printer(ValuesStr);
    std::string Separator = " --- ";

    for (auto *BP : OAs[0].StoredValues) {
    LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << ValuesStr << "\n");

    for (auto *P : OAs[1].StoredValues) {

    for (auto *S : OAs[2].StoredValues) {
    LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << ValuesStr << "\n");
  Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
    bool IsWorthIt = false;

    return RuntimeCall.getParent()->getTerminator();

  bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
                               Instruction &WaitMovementPoint) {
    auto &IRBuilder = OMPInfoCache.OMPBuilder;
    IRBuilder.Builder.SetInsertPoint(&Entry,
                                     Entry.getFirstNonPHIOrDbgOrAlloca());
        IRBuilder.AsyncInfo, nullptr, "handle");

    FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
        M, OMPRTL___tgt_target_data_begin_mapper_issue);

    SmallVector<Value *, 16> Args;
    for (auto &Arg : RuntimeCall.args())
      Args.push_back(Arg.get());
    Args.push_back(Handle);

    OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);

    FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
        M, OMPRTL___tgt_target_data_begin_mapper_wait);

    Value *WaitParams[2] = {
            OffloadArray::DeviceIDArgNum),
        WaitDecl, WaitParams, "", WaitMovementPoint.getIterator());
    OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite);
  static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                    bool GlobalOnly, bool &SingleChoice) {
    if (CurrentIdent == NextIdent)
      return CurrentIdent;

    SingleChoice = !CurrentIdent;

  getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
                                 Function &F, bool GlobalOnly) {
    bool SingleChoice = true;
    Value *Ident = nullptr;
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || &F != &Caller)
                                  true, SingleChoice);
    RFI.foreachUse(SCC, CombineIdentStruct);

    if (!Ident || !SingleChoice) {
      if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
        OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
            &F.getEntryBlock(), F.getEntryBlock().begin()));

      uint32_t SrcLocStrSize;
          OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
      Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);

  bool deduplicateRuntimeCalls(Function &F,
                               OMPInformationCache::RuntimeFunctionInfo &RFI,
                               Value *ReplVal = nullptr) {
    auto *UV = RFI.getUseVector(F);
    if (!UV || UV->size() + (ReplVal != nullptr) < 2)

        dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
               << (ReplVal ? " with an existing value\n" : "\n") << "\n");

           "Unexpected replacement value!");

    auto CanBeMoved = [this](CallBase &CB) {
      unsigned NumArgs = CB.arg_size();
      if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
      for (unsigned U = 1; U < NumArgs; ++U)

        OMPInfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(F);
    for (Use *U : *UV) {
      if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
        if (!CanBeMoved(*CI))

    assert(IP && "Expected insertion point!");

      Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,

      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || CI == ReplVal || &F != &Caller)

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "OpenMP runtime call "
                  << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated.";

      ++NumOpenMPRuntimeCallsDeduplicated;

    RFI.foreachUse(SCC, ReplaceAndDeleteCB);
  void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
    auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
      if (!F.hasLocalLinkage())
      for (Use &U : F.uses()) {
        if (CallInst *CI = getCallIfRegularCall(U)) {
          if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
              getCallIfRegularCall(
                  *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))

    auto AddUserArgs = [&](Value &GTId) {
      for (Use &U : GTId.uses())
          if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))

    OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];

    GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
      if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))

    for (unsigned U = 0; U < GTIdArgs.size(); ++U)
      AddUserArgs(*GTIdArgs[U]);

  DenseMap<Function *, std::optional<Kernel>> UniqueKernelMap;

  Kernel getUniqueKernelFor(Function &F);

  Kernel getUniqueKernelFor(Instruction &I) {
    return getUniqueKernelFor(*I.getFunction());
  }
  bool rewriteDeviceCodeStateMachine();

  template <typename RemarkKind, typename RemarkCallBack>
  void emitRemark(Instruction *I, StringRef RemarkName,
                  RemarkCallBack &&RemarkCB) const {
    auto &ORE = OREGetter(F);
          return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I))
                 << " [" << RemarkName << "]";
          [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I)); });

  template <typename RemarkKind, typename RemarkCallBack>
  void emitRemark(Function *F, StringRef RemarkName,
                  RemarkCallBack &&RemarkCB) const {
    auto &ORE = OREGetter(F);
          return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F))
                 << " [" << RemarkName << "]";
          [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)); });

  SmallVectorImpl<Function *> &SCC;

  CallGraphUpdater &CGUpdater;

  OptimizationRemarkGetter OREGetter;

  OMPInformationCache &OMPInfoCache;

  bool runAttributor(bool IsModulePass) {
    registerAAs(IsModulePass);
                      << " functions, result: " << Changed << ".\n");

    if (Changed == ChangeStatus::CHANGED)
      OMPInfoCache.invalidateAnalyses();

    return Changed == ChangeStatus::CHANGED;

  void registerAAs(bool IsModulePass);

  static void registerAAsForFunction(Attributor &A, const Function &F);

  if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() &&
      !OMPInfoCache.CGSCC->contains(&F))

  std::optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
    return *CachedKernel;

    return *CachedKernel;

  CachedKernel = nullptr;
  if (!F.hasLocalLinkage()) {
    auto Remark = [&](OptimizationRemarkAnalysis ORA) {
      return ORA << "Potentially unknown OpenMP target region caller.";

  auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
      if (Cmp->isEquality())
        return getUniqueKernelFor(*Cmp);
      if (CB->isCallee(&U))
        return getUniqueKernelFor(*CB);

      OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
          OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
      if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI))
        return getUniqueKernelFor(*CB);

  SmallPtrSet<Kernel, 2> PotentialKernels;
  OMPInformationCache::foreachUse(F, [&](const Use &U) {
    PotentialKernels.insert(GetUniqueKernelForUse(U));

  if (PotentialKernels.size() == 1)
    K = *PotentialKernels.begin();

  UniqueKernelMap[&F] = K;
bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
  OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
      OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];

  if (!KernelParallelRFI)

  for (Function *F : SCC) {

    bool UnknownUse = false;
    bool KernelParallelUse = false;
    unsigned NumDirectCalls = 0;

    OMPInformationCache::foreachUse(*F, [&](Use &U) {
      if (CB->isCallee(&U)) {

        ToBeReplacedStateMachineUses.push_back(&U);

          OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI);
      const unsigned int WrapperFunctionArgNo = 6;
      if (!KernelParallelUse && CI &&
        KernelParallelUse = true;
        ToBeReplacedStateMachineUses.push_back(&U);

    if (!KernelParallelUse)

    if (UnknownUse || NumDirectCalls != 1 ||
        ToBeReplacedStateMachineUses.size() > 2) {
      auto Remark = [&](OptimizationRemarkAnalysis ORA) {
        return ORA << "Parallel region is used in "
                   << (UnknownUse ? "unknown" : "unexpected")
                   << " ways. Will not attempt to rewrite the state machine.";

      auto Remark = [&](OptimizationRemarkAnalysis ORA) {
        return ORA << "Parallel region is not called from a unique kernel. "
                      "Will not attempt to rewrite the state machine.";

    Type *Int8Ty = Type::getInt8Ty(M.getContext());

    auto *ID = new GlobalVariable(

    for (Use *U : ToBeReplacedStateMachineUses)
          ID, U->get()->getType()));

    ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
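// Abstract attribute that tracks the value of an OpenMP internal control
// variable (ICV) so that getter calls can be replaced by a unique value.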
struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  bool isAssumedTracked() const { return getAssumed(); }

  bool isKnownTracked() const { return getAssumed(); }

  static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);

                                                    const Instruction *I,
                                                    Attributor &A) const {
    return std::nullopt;

  virtual std::optional<Value *>

  StringRef getName() const override { return "AAICVTracker"; }

  const char *getIdAddr() const override { return &ID; }

  static bool classof(const AbstractAttribute *AA) {

  static const char ID;
struct AAICVTrackerFunction : public AAICVTracker {
  AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  const std::string getAsStr(Attributor *) const override {
    return "ICVTrackerFunction";
  }

  void trackStatistics() const override {}

    return ChangeStatus::UNCHANGED;

                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

      auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];

      auto &ValuesMap = ICVReplacementValuesMap[ICV];
        CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);

        if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
          HasChanged = ChangeStatus::CHANGED;

        std::optional<Value *> ReplVal = getValueForCall(A, I, ICV);
        if (ReplVal && ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
          HasChanged = ChangeStatus::CHANGED;

      SetterRFI.foreachUse(TrackValues, F);

      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
                                UsedAssumedInformation,

      if (HasChanged == ChangeStatus::CHANGED)
        ValuesMap.try_emplace(Entry);

  std::optional<Value *> getValueForCall(Attributor &A, const Instruction &I,
    if (!CB || CB->hasFnAttr("no_openmp") ||
        CB->hasFnAttr("no_openmp_routines") ||
        CB->hasFnAttr("no_openmp_constructs"))
      return std::nullopt;

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
    auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
    Function *CalledFunction = CB->getCalledFunction();

    if (CalledFunction == nullptr)
    if (CalledFunction == GetterRFI.Declaration)
      return std::nullopt;
    if (CalledFunction == SetterRFI.Declaration) {
      if (ICVReplacementValuesMap[ICV].count(&I))
        return ICVReplacementValuesMap[ICV].lookup(&I);

    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(

    if (ICVTrackingAA->isAssumedTracked()) {
      std::optional<Value *> URV =
          ICVTrackingAA->getUniqueReplacementValue(ICV);

  std::optional<Value *>
    return std::nullopt;

                                            const Instruction *I,
                                            Attributor &A) const override {
    const auto &ValuesMap = ICVReplacementValuesMap[ICV];
    if (ValuesMap.count(I))
      return ValuesMap.lookup(I);

    SmallPtrSet<const Instruction *, 16> Visited;

    std::optional<Value *> ReplVal;

    while (!Worklist.empty()) {
      if (!Visited.insert(CurrInst).second)

      if (ValuesMap.count(CurrInst)) {
        std::optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
          ReplVal = NewReplVal;
          if (ReplVal != NewReplVal)

      std::optional<Value *> NewReplVal = getValueForCall(A, *CurrInst, ICV);
        ReplVal = NewReplVal;
        if (ReplVal != NewReplVal)

      if (CurrBB == I->getParent() && ReplVal)

        if (const Instruction *Terminator = Pred->getTerminator())
struct AAICVTrackerFunctionReturned : AAICVTracker {
  AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  const std::string getAsStr(Attributor *) const override {
    return "ICVTrackerFunctionReturned";
  }

  void trackStatistics() const override {}

    return ChangeStatus::UNCHANGED;

                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  std::optional<Value *>
    return ICVReplacementValuesMap[ICV];

    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(

    if (!ICVTrackingAA->isAssumedTracked())
      return indicatePessimisticFixpoint();

      std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
      std::optional<Value *> UniqueICVValue;

        std::optional<Value *> NewReplVal =
            ICVTrackingAA->getReplacementValue(ICV, &I, A);

        if (UniqueICVValue && UniqueICVValue != NewReplVal)

        UniqueICVValue = NewReplVal;

      bool UsedAssumedInformation = false;
      if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
                                     UsedAssumedInformation,
        UniqueICVValue = nullptr;

      if (UniqueICVValue == ReplVal)

      ReplVal = UniqueICVValue;
      Changed = ChangeStatus::CHANGED;

struct AAICVTrackerCallSite : AAICVTracker {
  AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

    assert(getAnchorScope() && "Expected anchor function");

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
      auto ICVInfo = OMPInfoCache.ICVs[ICV];
      auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
      if (Getter.Declaration == getAssociatedFunction()) {
        AssociatedICV = ICVInfo.Kind;

    indicatePessimisticFixpoint();

    if (!ReplVal || !*ReplVal)
      return ChangeStatus::UNCHANGED;

    A.deleteAfterManifest(*getCtxI());

    return ChangeStatus::CHANGED;

  const std::string getAsStr(Attributor *) const override {
    return "ICVTrackerCallSite";
  }

  void trackStatistics() const override {}

  std::optional<Value *> ReplVal;

    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(

    if (!ICVTrackingAA->isAssumedTracked())
      return indicatePessimisticFixpoint();

    std::optional<Value *> NewReplVal =
        ICVTrackingAA->getReplacementValue(AssociatedICV, getCtxI(), A);

    if (ReplVal == NewReplVal)
      return ChangeStatus::UNCHANGED;

    ReplVal = NewReplVal;
    return ChangeStatus::CHANGED;

  std::optional<Value *>

struct AAICVTrackerCallSiteReturned : AAICVTracker {
  AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  const std::string getAsStr(Attributor *) const override {
    return "ICVTrackerCallSiteReturned";
  }

  void trackStatistics() const override {}

    return ChangeStatus::UNCHANGED;

                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  std::optional<Value *>
    return ICVReplacementValuesMap[ICV];

    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
        DepClassTy::REQUIRED);

    if (!ICVTrackingAA->isAssumedTracked())
      return indicatePessimisticFixpoint();

      std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
      std::optional<Value *> NewReplVal =
          ICVTrackingAA->getUniqueReplacementValue(ICV);

      if (ReplVal == NewReplVal)

      ReplVal = NewReplVal;
      Changed = ChangeStatus::CHANGED;
static bool hasFunctionEndAsUniqueSuccessor(const BasicBlock *BB) {
  return hasFunctionEndAsUniqueSuccessor(Successor);
}
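// Execution-domain analysis for a function: computes which blocks are executed
// only by the initial thread and which regions are bracketed by aligned
// barriers, allowing redundant barriers, fences, and assumptions to be
// removed.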
struct AAExecutionDomainFunction : public AAExecutionDomain {
  AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)
      : AAExecutionDomain(IRP, A) {}

  ~AAExecutionDomainFunction() { delete RPOT; }

    assert(F && "Expected anchor function");
    RPOT = new ReversePostOrderTraversal<Function *>(F);

  const std::string getAsStr(Attributor *) const override {
    unsigned TotalBlocks = 0, InitialThreadBlocks = 0, AlignedBlocks = 0;
    for (auto &It : BEDMap) {
      InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;
      AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly &&
                       It.getSecond().IsReachingAlignedBarrierOnly;

    return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) + "/" +
           std::to_string(AlignedBlocks) + " of " +
           std::to_string(TotalBlocks) +
           " executed by initial thread / aligned";
  }

  void trackStatistics() const override {}

    for (const BasicBlock &BB : *getAnchorScope()) {
      if (!isExecutedByInitialThreadOnly(BB))
      dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " "
             << BB.getName() << " is executed by a single thread.\n";

    SmallPtrSet<CallBase *, 16> DeletedBarriers;
    auto HandleAlignedBarrier = [&](CallBase *CB) {
      const ExecutionDomainTy &ED = CB ? CEDMap[{CB, PRE}] : BEDMap[nullptr];
      if (!ED.IsReachedFromAlignedBarrierOnly ||
          ED.EncounteredNonLocalSideEffect)
      if (!ED.EncounteredAssumes.empty() && !A.isModulePass())

        DeletedBarriers.insert(CB);
        A.deleteAfterManifest(*CB);
        ++NumBarriersEliminated;
        Changed = ChangeStatus::CHANGED;
      } else if (!ED.AlignedBarriers.empty()) {
        Changed = ChangeStatus::CHANGED;
                                             ED.AlignedBarriers.end());
        SmallSetVector<CallBase *, 16> Visited;
        while (!Worklist.empty()) {
          if (!Visited.insert(LastCB))
          if (!hasFunctionEndAsUniqueSuccessor(LastCB->getParent()))
          if (!DeletedBarriers.count(LastCB)) {
            ++NumBarriersEliminated;
            A.deleteAfterManifest(*LastCB);

          const ExecutionDomainTy &LastED = CEDMap[{LastCB, PRE}];
          Worklist.append(LastED.AlignedBarriers.begin(),
                          LastED.AlignedBarriers.end());

      if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty()))
        for (auto *AssumeCB : ED.EncounteredAssumes)
          A.deleteAfterManifest(*AssumeCB);

    for (auto *CB : AlignedBarriers)
      HandleAlignedBarrier(CB);

    HandleAlignedBarrier(nullptr);

  bool isNoOpFence(const FenceInst &FI) const override {
    return getState().isValidState() && !NonNoOpFences.count(&FI);
  }

  mergeInPredecessorBarriersAndAssumptions(Attributor &A, ExecutionDomainTy &ED,
                                           const ExecutionDomainTy &PredED);

  bool mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED,
                          const ExecutionDomainTy &PredED,
                          bool InitialEdgeOnly = false);

  bool handleCallees(Attributor &A, ExecutionDomainTy &EntryBBED);

  bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override {
    if (!isValidState())
    assert(BB.getParent() == getAnchorScope() && "Block is out of scope!");
    return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;
  }

  bool isExecutedInAlignedRegion(Attributor &A,
                                 const Instruction &I) const override {
    assert(I.getFunction() == getAnchorScope() &&
           "Instruction is out of scope!");
    if (!isValidState())

    bool ForwardIsOk = true;
      if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB)))
      const auto &It = CEDMap.find({CB, PRE});
      if (It == CEDMap.end())
      if (!It->getSecond().IsReachingAlignedBarrierOnly)
        ForwardIsOk = false;

    if (!CurI && !BEDMap.lookup(I.getParent()).IsReachingAlignedBarrierOnly)
      ForwardIsOk = false;

      if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB)))
      const auto &It = CEDMap.find({CB, POST});
      if (It == CEDMap.end())
      if (It->getSecond().IsReachedFromAlignedBarrierOnly)

      return BEDMap.lookup(nullptr).IsReachedFromAlignedBarrierOnly;

      return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;

  ExecutionDomainTy getExecutionDomain(const BasicBlock &BB) const override {
           "No request should be made against an invalid state!");
    return BEDMap.lookup(&BB);
  }

  std::pair<ExecutionDomainTy, ExecutionDomainTy>
  getExecutionDomain(const CallBase &CB) const override {
           "No request should be made against an invalid state!");
    return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})};
  }

  ExecutionDomainTy getFunctionExecutionDomain() const override {
           "No request should be made against an invalid state!");
    return InterProceduralED;
  }

  static bool isInitialThreadOnlyEdge(Attributor &A, BranchInst *Edge,
                                      BasicBlock &SuccessorBB) {
    if (!Edge || !Edge->isConditional())
    if (Edge->getSuccessor(0) != &SuccessorBB)

    if (!Cmp || !Cmp->isTrueWhenEqual() || !Cmp->isEquality())

    if (C->isAllOnesValue()) {
      auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
      auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
      CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
      ConstantStruct *KernelEnvC =
      ConstantInt *ExecModeC =
          KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);

      if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)

      if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)

  ExecutionDomainTy InterProceduralED;

  DenseMap<const BasicBlock *, ExecutionDomainTy> BEDMap;
  DenseMap<PointerIntPair<const CallBase *, 1, Direction>, ExecutionDomainTy>
      CEDMap;
  SmallSetVector<CallBase *, 16> AlignedBarriers;

  ReversePostOrderTraversal<Function *> *RPOT = nullptr;

  static bool setAndRecord(bool &R, bool V) {

  SmallPtrSet<const FenceInst *, 8> NonNoOpFences;
void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
    Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED) {
  for (auto *EA : PredED.EncounteredAssumes)
    ED.addAssumeInst(A, *EA);

  for (auto *AB : PredED.AlignedBarriers)
    ED.addAlignedBarrier(A, *AB);
}

bool AAExecutionDomainFunction::mergeInPredecessor(
    Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED,
    bool InitialEdgeOnly) {

      setAndRecord(ED.IsExecutedByInitialThreadOnly,
                   InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&
                                       ED.IsExecutedByInitialThreadOnly));

  Changed |= setAndRecord(ED.IsReachedFromAlignedBarrierOnly,
                          ED.IsReachedFromAlignedBarrierOnly &&
                              PredED.IsReachedFromAlignedBarrierOnly);
  Changed |= setAndRecord(ED.EncounteredNonLocalSideEffect,
                          ED.EncounteredNonLocalSideEffect |
                              PredED.EncounteredNonLocalSideEffect);

  if (ED.IsReachedFromAlignedBarrierOnly)
    mergeInPredecessorBarriersAndAssumptions(A, ED, PredED);

    ED.clearAssumeInstAndAlignedBarriers();

bool AAExecutionDomainFunction::handleCallees(Attributor &A,
                                              ExecutionDomainTy &EntryBBED) {
  auto PredForCallSite = [&](AbstractCallSite ACS) {
    const auto *EDAA = A.getAAFor<AAExecutionDomain>(
        DepClassTy::OPTIONAL);
    if (!EDAA || !EDAA->getState().isValidState())
        EDAA->getExecutionDomain(*cast<CallBase>(ACS.getInstruction())));

  ExecutionDomainTy ExitED;
  bool AllCallSitesKnown;
  if (A.checkForAllCallSites(PredForCallSite, *this,
                             AllCallSitesKnown)) {
    for (const auto &[CSInED, CSOutED] : CallSiteEDs) {
      mergeInPredecessor(A, EntryBBED, CSInED);
      ExitED.IsReachingAlignedBarrierOnly &=
          CSOutED.IsReachingAlignedBarrierOnly;

      EntryBBED.IsExecutedByInitialThreadOnly = false;
      EntryBBED.IsReachedFromAlignedBarrierOnly = true;
      EntryBBED.EncounteredNonLocalSideEffect = false;
      ExitED.IsReachingAlignedBarrierOnly = false;

    EntryBBED.IsExecutedByInitialThreadOnly = false;
    EntryBBED.IsReachedFromAlignedBarrierOnly = false;
    EntryBBED.EncounteredNonLocalSideEffect = true;
    ExitED.IsReachingAlignedBarrierOnly = false;

  auto &FnED = BEDMap[nullptr];
  Changed |= setAndRecord(FnED.IsReachedFromAlignedBarrierOnly,
                          FnED.IsReachedFromAlignedBarrierOnly &
                              EntryBBED.IsReachedFromAlignedBarrierOnly);
  Changed |= setAndRecord(FnED.IsReachingAlignedBarrierOnly,
                          FnED.IsReachingAlignedBarrierOnly &
                              ExitED.IsReachingAlignedBarrierOnly);
  Changed |= setAndRecord(FnED.IsExecutedByInitialThreadOnly,
                          EntryBBED.IsExecutedByInitialThreadOnly);
ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {

  auto HandleAlignedBarrier = [&](CallBase &CB, ExecutionDomainTy &ED) {
    Changed |= AlignedBarriers.insert(&CB);

    auto &CallInED = CEDMap[{&CB, PRE}];
    Changed |= mergeInPredecessor(A, CallInED, ED);
    CallInED.IsReachingAlignedBarrierOnly = true;

    ED.EncounteredNonLocalSideEffect = false;
    ED.IsReachedFromAlignedBarrierOnly = true;

    ED.clearAssumeInstAndAlignedBarriers();
    ED.addAlignedBarrier(A, CB);
    auto &CallOutED = CEDMap[{&CB, POST}];
    Changed |= mergeInPredecessor(A, CallOutED, ED);

      A.getAAFor<AAIsDead>(*this, getIRPosition(), DepClassTy::OPTIONAL);

  for (auto &RIt : *RPOT) {
    bool IsEntryBB = &BB == &EntryBB;

    bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
    bool IsExplicitlyAligned = IsEntryBB && IsKernel;
    ExecutionDomainTy ED;
      if (LivenessAA && LivenessAA->isAssumedDead(&BB))
        if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB))
        bool InitialEdgeOnly = isInitialThreadOnlyEdge(
        mergeInPredecessor(A, ED, BEDMap[PredBB], InitialEdgeOnly);

    for (Instruction &I : BB) {
      bool UsedAssumedInformation;
      if (A.isAssumedDead(I, *this, LivenessAA, UsedAssumedInformation,
                          false, DepClassTy::OPTIONAL,

        ED.addAssumeInst(A, *AI);

        if (II->isAssumeLikeIntrinsic())

        if (!ED.EncounteredNonLocalSideEffect) {
          if (ED.IsReachedFromAlignedBarrierOnly)
          case AtomicOrdering::NotAtomic:
          case AtomicOrdering::Unordered:
          case AtomicOrdering::Monotonic:
          case AtomicOrdering::Acquire:
          case AtomicOrdering::Release:
          case AtomicOrdering::AcquireRelease:
          case AtomicOrdering::SequentiallyConsistent:
        NonNoOpFences.insert(FI);

      bool IsAlignedBarrier =
      AlignedBarrierLastInBlock &= IsNoSync;
      IsExplicitlyAligned &= IsNoSync;

      if (IsAlignedBarrier) {
        HandleAlignedBarrier(*CB, ED);
        AlignedBarrierLastInBlock = true;
        IsExplicitlyAligned = true;

        if (!ED.EncounteredNonLocalSideEffect &&
          ED.EncounteredNonLocalSideEffect = true;
        ED.IsReachedFromAlignedBarrierOnly = false;

        auto &CallInED = CEDMap[{CB, PRE}];
        Changed |= mergeInPredecessor(A, CallInED, ED);

        if (!IsNoSync && Callee && !Callee->isDeclaration()) {
          const auto *EDAA = A.getAAFor<AAExecutionDomain>(
          if (EDAA && EDAA->getState().isValidState()) {
            const auto &CalleeED = EDAA->getFunctionExecutionDomain();
            ED.IsReachedFromAlignedBarrierOnly =
                CalleeED.IsReachedFromAlignedBarrierOnly;
            AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
            if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
              ED.EncounteredNonLocalSideEffect |=
                  CalleeED.EncounteredNonLocalSideEffect;
              ED.EncounteredNonLocalSideEffect =
                  CalleeED.EncounteredNonLocalSideEffect;
            if (!CalleeED.IsReachingAlignedBarrierOnly) {
                  setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);
            if (CalleeED.IsReachedFromAlignedBarrierOnly)
              mergeInPredecessorBarriersAndAssumptions(A, ED, CalleeED);
            auto &CallOutED = CEDMap[{CB, POST}];
            Changed |= mergeInPredecessor(A, CallOutED, ED);

          ED.IsReachedFromAlignedBarrierOnly = false;
          Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);

        AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;

        auto &CallOutED = CEDMap[{CB, POST}];
        Changed |= mergeInPredecessor(A, CallOutED, ED);

      if (!I.mayHaveSideEffects() && !I.mayReadFromMemory())

        const auto *MemAA = A.getAAFor<AAMemoryLocation>(
        if (MemAA && MemAA->getState().isValidState() &&
            MemAA->checkForAllAccessesToMemoryKind(

      auto &InfoCache = A.getInfoCache();
      if (!I.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(I))

        if (LI->hasMetadata(LLVMContext::MD_invariant_load))

      if (!ED.EncounteredNonLocalSideEffect &&
        ED.EncounteredNonLocalSideEffect = true;

    bool IsEndAndNotReachingAlignedBarriersOnly = false;
        !BB.getTerminator()->getNumSuccessors()) {

      Changed |= mergeInPredecessor(A, InterProceduralED, ED);

      auto &FnED = BEDMap[nullptr];
      if (IsKernel && !IsExplicitlyAligned)
        FnED.IsReachingAlignedBarrierOnly = false;
      Changed |= mergeInPredecessor(A, FnED, ED);

      if (!FnED.IsReachingAlignedBarrierOnly) {
        IsEndAndNotReachingAlignedBarriersOnly = true;
        SyncInstWorklist.push_back(BB.getTerminator());
        auto &BBED = BEDMap[&BB];
        Changed |= setAndRecord(BBED.IsReachingAlignedBarrierOnly, false);

    ExecutionDomainTy &StoredED = BEDMap[&BB];
    ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly &
                                      !IsEndAndNotReachingAlignedBarriersOnly;

    if (ED.IsExecutedByInitialThreadOnly !=
            StoredED.IsExecutedByInitialThreadOnly ||
        ED.IsReachedFromAlignedBarrierOnly !=
            StoredED.IsReachedFromAlignedBarrierOnly ||
        ED.EncounteredNonLocalSideEffect !=
            StoredED.EncounteredNonLocalSideEffect)

    StoredED = std::move(ED);

  SmallSetVector<BasicBlock *, 16> Visited;
  while (!SyncInstWorklist.empty()) {
    bool HitAlignedBarrierOrKnownEnd = false;

      auto &CallOutED = CEDMap[{CB, POST}];
      Changed |= setAndRecord(CallOutED.IsReachingAlignedBarrierOnly, false);
      auto &CallInED = CEDMap[{CB, PRE}];
      HitAlignedBarrierOrKnownEnd =
          AlignedBarriers.count(CB) || !CallInED.IsReachingAlignedBarrierOnly;
      if (HitAlignedBarrierOrKnownEnd)
      Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);

    if (HitAlignedBarrierOrKnownEnd)

      if (LivenessAA && LivenessAA->isEdgeDead(PredBB, SyncBB))
      if (!Visited.insert(PredBB))
      auto &PredED = BEDMap[PredBB];
      if (setAndRecord(PredED.IsReachingAlignedBarrierOnly, false)) {
        SyncInstWorklist.push_back(PredBB->getTerminator());

    if (SyncBB != &EntryBB)
      setAndRecord(InterProceduralED.IsReachingAlignedBarrierOnly, false);

  return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
}
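// Abstract attribute that lowers __kmpc_alloc_shared/__kmpc_free_shared pairs
// of known constant size to statically allocated shared-memory buffers.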
3377struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> {
3378 using Base = StateWrapper<BooleanState, AbstractAttribute>;
3379 AAHeapToShared(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
3382 static AAHeapToShared &createForPosition(const IRPosition &IRP,
3386 virtual bool isAssumedHeapToShared(CallBase &CB) const = 0;
3390 virtual bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const = 0;
3393 StringRef getName() const override { return "AAHeapToShared"; }
3396 const char *getIdAddr() const override { return &ID; }
3400 static bool classof(const AbstractAttribute *AA) {
3405 static const char ID;
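// Function-level implementation of AAHeapToShared: MallocCalls collects the
// __kmpc_alloc_shared calls still considered eligible, PotentialRemovedFreeCalls
// the matching __kmpc_free_shared calls that become dead once the allocation is
// replaced by a global in AddressSpace::Shared.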
3408struct AAHeapToSharedFunction : public AAHeapToShared {
3409 AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A)
3410 : AAHeapToShared(IRP, A) {}
3412 const std::string getAsStr(Attributor *) const override {
3413 return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
3414 " malloc calls eligible.";
3418 void trackStatistics() const override {}
3422 void findPotentialRemovedFreeCalls(Attributor &A) {
3423 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3424 auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3426 PotentialRemovedFreeCalls.clear();
3428 for (CallBase *CB : MallocCalls) {
3430 for (auto *U : CB->users()) {
3432 if (C && C->getCalledFunction() == FreeRFI.Declaration)
3436 if (FreeCalls.size() != 1)
3439 PotentialRemovedFreeCalls.insert(FreeCalls.front());
3445 indicatePessimisticFixpoint();
3449 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3450 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3451 if (!RFI.Declaration)
3455 [](const IRPosition &, const AbstractAttribute *,
3456 bool &) -> std::optional<Value *> { return nullptr; };
3459 for (User *U : RFI.Declaration->users())
3463 MallocCalls.insert(CB);
3468 findPotentialRemovedFreeCalls(A);
3471 bool isAssumedHeapToShared(CallBase &CB) const override {
3472 return isValidState() && MallocCalls.count(&CB);
3475 bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const override {
3476 return isValidState() && PotentialRemovedFreeCalls.count(&CB);
3480 if (MallocCalls.empty())
3481 return ChangeStatus::UNCHANGED;
3483 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3484 auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3488 DepClassTy::OPTIONAL);
3491 for (CallBase *CB : MallocCalls) {
3493 if (HS && HS->isAssumedHeapToStack(*CB))
3498 for (auto *U : CB->users()) {
3500 if (C && C->getCalledFunction() == FreeCall.Declaration)
3503 if (FreeCalls.size() != 1)
3510 << " with shared memory."
3511 << " Shared memory usage is limited to "
3517 << " with " << AllocSize->getZExtValue()
3518 << " bytes of shared memory\n");
3523 Type *Int8Ty = Type::getInt8Ty(M->getContext());
3524 Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
3525 auto *SharedMem = new GlobalVariable(
3529 static_cast<unsigned>(AddressSpace::Shared));
3531 SharedMem, PointerType::getUnqual(M->getContext()));
3533 auto Remark = [&](OptimizationRemark OR) {
3534 return OR << "Replaced globalized variable with "
3535 << ore::NV("SharedMemory", AllocSize->getZExtValue())
3536 << (AllocSize->isOne() ? " byte " : " bytes ")
3537 << "of shared memory.";
3539 A.emitRemark<OptimizationRemark>(CB, "OMP111", Remark);
3541 MaybeAlign Alignment = CB->getRetAlign();
3543 "HeapToShared on allocation without alignment attribute");
3544 SharedMem->setAlignment(*Alignment);
3547 A.deleteAfterManifest(*CB);
3548 A.deleteAfterManifest(*FreeCalls.front());
3550 SharedMemoryUsed += AllocSize->getZExtValue();
3551 NumBytesMovedToSharedMemory = SharedMemoryUsed;
3552 Changed = ChangeStatus::CHANGED;
3559 if (MallocCalls.empty())
3560 return indicatePessimisticFixpoint();
3561 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3562 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3563 if (!RFI.Declaration)
3564 return ChangeStatus::UNCHANGED;
3568 auto NumMallocCalls = MallocCalls.size();
3571 for (User *U : RFI.Declaration->users()) {
3573 if (CB->getCaller() != F)
3575 if (!MallocCalls.count(CB))
3578 MallocCalls.remove(CB);
3581 const auto *ED = A.getAAFor<AAExecutionDomain>(
3583 if (!ED || !ED->isExecutedByInitialThreadOnly(*CB))
3584 MallocCalls.remove(CB);
3588 findPotentialRemovedFreeCalls(A);
3590 if (NumMallocCalls != MallocCalls.size())
3591 return ChangeStatus::CHANGED;
3593 return ChangeStatus::UNCHANGED;
3597 SmallSetVector<CallBase *, 4> MallocCalls;
3599 SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
3601 unsigned SharedMemoryUsed = 0;
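// AAKernelInfo holds the per-kernel state used below: SPMD compatibility,
// reached (known and unknown) parallel regions, reaching kernel entries,
// parallel levels, and whether nested parallelism may occur.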
3604struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
3605 using Base = StateWrapper<KernelInfoState, AbstractAttribute>;
3606 AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
3610 static bool requiresCalleeForCallBase() { return false; }
3613 void trackStatistics() const override {}
3616 const std::string getAsStr(Attributor *) const override {
3617 if (!isValidState())
3619 return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD"
3621 std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]"
3623 std::string(" #PRs: ") +
3624 (ReachedKnownParallelRegions.isValidState()
3625 ? std::to_string(ReachedKnownParallelRegions.size())
3627 ", #Unknown PRs: " +
3628 (ReachedUnknownParallelRegions.isValidState()
3629 ? std::to_string(ReachedUnknownParallelRegions.size())
3631 ", #Reaching Kernels: " +
3632 (ReachingKernelEntries.isValidState()
3633 ? std::to_string(ReachingKernelEntries.size())
3636 (ParallelLevels.isValidState()
3637 ? std::to_string(ParallelLevels.size())
3639 ", NestedPar: " + (NestedParallelism ? "yes" : "no");
3643 static AAKernelInfo &createForPosition(const IRPosition &IRP, Attributor &A);
3646 StringRef getName() const override { return "AAKernelInfo"; }
3649 const char *getIdAddr() const override { return &ID; }
3652 static bool classof(const AbstractAttribute *AA) {
3656 static const char ID;
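// Function-level AAKernelInfo. For kernel entry points it caches the single
// __kmpc_target_init / __kmpc_target_deinit calls and rewrites the constant
// kernel environment (exec mode, generic-state-machine flag, thread/team
// bounds) as the analysis refines its assumptions.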
3661struct AAKernelInfoFunction : AAKernelInfo {
3662 AAKernelInfoFunction(const IRPosition &IRP, Attributor &A)
3663 : AAKernelInfo(IRP, A) {}
3665 SmallPtrSet<Instruction *, 4> GuardedInstructions;
3667 SmallPtrSetImpl<Instruction *> &getGuardedInstructions() {
3668 return GuardedInstructions;
3671 void setConfigurationOfKernelEnvironment(ConstantStruct *ConfigC) {
3673 KernelEnvC, ConfigC, {KernelInfo::ConfigurationIdx});
3674 assert(NewKernelEnvC && "Failed to create new kernel environment");
3678#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER) \
3679 void set##MEMBER##OfKernelEnvironment(ConstantInt *NewVal) { \
3680 ConstantStruct *ConfigC = \
3681 KernelInfo::getConfigurationFromKernelEnvironment(KernelEnvC); \
3682 Constant *NewConfigC = ConstantFoldInsertValueInstruction( \
3683 ConfigC, NewVal, {KernelInfo::MEMBER##Idx}); \
3684 assert(NewConfigC && "Failed to create new configuration environment"); \
3685 setConfigurationOfKernelEnvironment(cast<ConstantStruct>(NewConfigC)); \
3696#undef KERNEL_ENVIRONMENT_CONFIGURATION_SETTER
3703 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3707 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
3708 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
3709 OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
3710 OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
3714 auto StoreCallBase = [](Use &U,
3715 OMPInformationCache::RuntimeFunctionInfo &RFI,
3717 CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
3719 "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
3721 "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
3727 StoreCallBase(U, InitRFI, KernelInitCB);
3731 DeinitRFI.foreachUse(
3733 StoreCallBase(U, DeinitRFI, KernelDeinitCB);
3739 if (!KernelInitCB || !KernelDeinitCB)
3743 ReachingKernelEntries.insert(Fn);
3744 IsKernelEntry = true;
3752 KernelConfigurationSimplifyCB =
3754 bool &UsedAssumedInformation) -> std::optional<Constant *> {
3755 if (!isAtFixpoint()) {
3758 UsedAssumedInformation = true;
3764 A.registerGlobalVariableSimplificationCallback(
3765 *KernelEnvGV, KernelConfigurationSimplifyCB);
3768 bool CanChangeToSPMD = OMPInfoCache.runtimeFnsAvailable(
3769 {OMPRTL___kmpc_get_hardware_thread_id_in_block,
3770 OMPRTL___kmpc_barrier_simple_spmd});
3774 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
3779 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3783 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3785 setExecModeOfKernelEnvironment(AssumedExecModeC);
3790 OpenMPIRBuilder::readThreadBoundsForKernel(T, *Fn);
3792 setMinThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinThreads));
3795 auto [MinTeams, MaxTeams] =
3796 OpenMPIRBuilder::readTeamBoundsForKernel(T, *Fn);
3798 setMinTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinTeams));
3800 setMaxTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MaxTeams));
3803 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(KernelEnvC);
3804 ConstantInt *AssumedMayUseNestedParallelismC = ConstantInt::get(
3806 setMayUseNestedParallelismOfKernelEnvironment(
3807 AssumedMayUseNestedParallelismC);
3811 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3814 ConstantInt::get(UseGenericStateMachineC->getIntegerType(), false);
3815 setUseGenericStateMachineOfKernelEnvironment(
3816 AssumedUseGenericStateMachineC);
3822 if (!OMPInfoCache.RFIs[RFKind].Declaration)
3824 A.registerVirtualUseCallback(*OMPInfoCache.RFIs[RFKind].Declaration, CB);
3828 auto AddDependence = [](Attributor &A, const AAKernelInfo *KI,
3845 if (SPMDCompatibilityTracker.isValidState())
3846 return AddDependence(A, this, QueryingAA);
3848 if (!ReachedKnownParallelRegions.isValidState())
3849 return AddDependence(A, this, QueryingAA);
3855 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block,
3856 CustomStateMachineUseCB);
3857 RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB);
3858 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic,
3859 CustomStateMachineUseCB);
3860 RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel,
3861 CustomStateMachineUseCB);
3862 RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel,
3863 CustomStateMachineUseCB);
3867 if (SPMDCompatibilityTracker.isAtFixpoint())
3874 if (!SPMDCompatibilityTracker.isValidState())
3875 return AddDependence(A, this, QueryingAA);
3878 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block,
3887 if (!SPMDCompatibilityTracker.isValidState())
3888 return AddDependence(A, this, QueryingAA);
3889 if (SPMDCompatibilityTracker.empty())
3890 return AddDependence(A, this, QueryingAA);
3891 if (!mayContainParallelRegion())
3892 return AddDependence(A, this, QueryingAA);
3895 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB);
3899 static std::string sanitizeForGlobalName(std::string S) {
3903 return !((C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
3904 (C >= '0' && C <= '9') || C == '_');
3915 if (!KernelInitCB || !KernelDeinitCB)
3916 return ChangeStatus::UNCHANGED;
3920 bool HasBuiltStateMachine = true;
3921 if (!changeToSPMDMode(A, Changed)) {
3923 HasBuiltStateMachine = buildCustomStateMachine(A, Changed);
3925 HasBuiltStateMachine = false;
3929 ConstantStruct *ExistingKernelEnvC =
3931 ConstantInt *OldUseGenericStateMachineVal =
3932 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3933 ExistingKernelEnvC);
3934 if (!HasBuiltStateMachine)
3935 setUseGenericStateMachineOfKernelEnvironment(
3936 OldUseGenericStateMachineVal);
3939 GlobalVariable *KernelEnvGV =
3943 Changed = ChangeStatus::CHANGED;
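// insertInstructionGuardsHelper wraps each run of SPMD-incompatible
// instructions in a guarded region: only the thread with hardware id 0 enters
// "region.guarded", values needed afterwards are broadcast through globals in
// AddressSpace::Shared, and all threads meet at __kmpc_barrier_simple_spmd.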
3949 void insertInstructionGuardsHelper(Attributor &A) {
3950 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3952 auto CreateGuardedRegion = [&](Instruction *RegionStartI,
3954 LoopInfo *LI = nullptr;
3955 DominatorTree *DT = nullptr;
3956 MemorySSAUpdater *MSU = nullptr;
3957 using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
3986 DT, LI, MSU, "region.guarded.end");
3989 MSU, "region.barrier");
3992 DT, LI, MSU, "region.exit");
3994 SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded");
3997 "Expected a different CFG");
4000 ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid");
4003 A.registerManifestAddedBasicBlock(*RegionEndBB);
4004 A.registerManifestAddedBasicBlock(*RegionBarrierBB);
4005 A.registerManifestAddedBasicBlock(*RegionExitBB);
4006 A.registerManifestAddedBasicBlock(*RegionStartBB);
4007 A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
4009 bool HasBroadcastValues = false;
4012 for (Instruction &I : *RegionStartBB) {
4014 for (Use &U : I.uses()) {
4020 if (OutsideUses.empty())
4023 HasBroadcastValues = true;
4027 auto *SharedMem = new GlobalVariable(
4028 M, I.getType(), false,
4030 sanitizeForGlobalName(
4031 (I.getName() + ".guarded.output.alloc").str()),
4033 static_cast<unsigned>(AddressSpace::Shared));
4036 new StoreInst(&I, SharedMem,
4039 LoadInst *LoadI = new LoadInst(
4040 I.getType(), SharedMem, I.getName() + ".guarded.output.load",
4044 for (Use *U : OutsideUses)
4045 A.changeUseAfterManifest(*U, *LoadI);
4048 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4053 OpenMPIRBuilder::LocationDescription Loc(
4054 InsertPointTy(ParentBB, ParentBB->end()), DL);
4055 OMPInfoCache.OMPBuilder.updateToLocation(Loc);
4056 uint32_t SrcLocStrSize;
4058 OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4060 OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4065 OpenMPIRBuilder::LocationDescription LocRegionCheckTid(
4066 InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->end()), DL);
4067 OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);
4068 FunctionCallee HardwareTidFn =
4069 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4070 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4072 OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
4074 OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);
4075 Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
4076 OMPInfoCache.OMPBuilder.Builder
4077 .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
4082 FunctionCallee BarrierFn =
4083 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4084 M, OMPRTL___kmpc_barrier_simple_spmd);
4085 OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
4088 OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid});
4090 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4093 if (HasBroadcastValues) {
4098 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4102 auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
4103 SmallPtrSet<BasicBlock *, 8> Visited;
4104 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
4106 if (!Visited.insert(BB).second)
4112 while (++IP != IPEnd) {
4113 if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
4116 if (OpenMPOpt::getCallIfRegularCall(*I, &AllocSharedRFI))
4118 if (!I->user_empty() || !SPMDCompatibilityTracker.contains(I)) {
4119 LastEffect = nullptr;
4126 for (auto &Reorder : Reorders)
4127 Reorder.first->moveBefore(Reorder.second->getIterator());
4132 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
4134 auto *CalleeAA = A.lookupAAFor<AAKernelInfo>(
4137 assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo");
4140 if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
4143 Instruction *GuardedRegionStart = nullptr, *GuardedRegionEnd = nullptr;
4144 for (Instruction &I : *BB) {
4147 if (SPMDCompatibilityTracker.contains(&I)) {
4148 CalleeAAFunction.getGuardedInstructions().insert(&I);
4149 if (GuardedRegionStart)
4150 GuardedRegionEnd = &I;
4152 GuardedRegionStart = GuardedRegionEnd = &I;
4159 if (GuardedRegionStart) {
4161 std::make_pair(GuardedRegionStart, GuardedRegionEnd));
4162 GuardedRegionStart = nullptr;
4163 GuardedRegionEnd = nullptr;
4168 for (auto &GR : GuardedRegions)
4169 CreateGuardedRegion(GR.first, GR.second);
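// forceSingleThreadPerWorkgroupHelper is the fallback used when the kernel has
// no parallel region to preserve: the whole user code is executed by the main
// thread only ("thread.is_main"), while the remaining threads skip to the
// return block.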
4172 void forceSingleThreadPerWorkgroupHelper(Attributor &A) {
4181 auto &Ctx = getAnchorValue().getContext();
4188 KernelInitCB->getNextNode(), "main.thread.user_code");
4193 A.registerManifestAddedBasicBlock(*InitBB);
4194 A.registerManifestAddedBasicBlock(*UserCodeBB);
4195 A.registerManifestAddedBasicBlock(*ReturnBB);
4204 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4205 FunctionCallee ThreadIdInBlockFn =
4206 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4207 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4210 CallInst *ThreadIdInBlock =
4212 OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);
4218 ConstantInt::get(ThreadIdInBlock->getType(), 0),
4219 "thread.is_main", InitBB);
4225 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4227 if (!SPMDCompatibilityTracker.isAssumed()) {
4228 for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
4229 if (!NonCompatibleI)
4234 if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
4237 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
4238 ORA << "Value has potential side effects preventing SPMD-mode "
4241 ORA << ". Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to "
4242 "the called function to override";
4246 A.emitRemark<OptimizationRemarkAnalysis>(NonCompatibleI, "OMP121",
4250 << *NonCompatibleI << "\n");
4262 Kernel = CB->getCaller();
4267 ConstantStruct *ExistingKernelEnvC =
4270 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4276 Changed = ChangeStatus::CHANGED;
4280 if (mayContainParallelRegion())
4281 insertInstructionGuardsHelper(A);
4283 forceSingleThreadPerWorkgroupHelper(A);
4288 "Initially non-SPMD kernel has SPMD exec mode!");
4289 setExecModeOfKernelEnvironment(
4293 ++NumOpenMPTargetRegionKernelsSPMD;
4295 auto Remark = [&](OptimizationRemark OR) {
4296 return OR << "Transformed generic-mode kernel to SPMD-mode.";
4298 A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP120", Remark);
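// buildCustomStateMachine replaces the generic-mode worker loop with a
// specialized one: workers wait on __kmpc_barrier_simple_generic, obtain the
// outlined parallel region through __kmpc_kernel_parallel, dispatch it via an
// if-cascade over the known regions (with an indirect-call fallback for
// unknown ones), and finish with __kmpc_kernel_end_parallel. See remarks
// OMP130, OMP131 and OMP132 below.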
4308 if (!ReachedKnownParallelRegions.isValidState())
4311 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4312 if (!OMPInfoCache.runtimeFnsAvailable(
4313 {OMPRTL___kmpc_get_hardware_num_threads_in_block,
4314 OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic,
4315 OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel}))
4318 ConstantStruct *ExistingKernelEnvC =
4325 ConstantInt *UseStateMachineC =
4326 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4327 ExistingKernelEnvC);
4328 ConstantInt *ModeC =
4329 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4334 if (UseStateMachineC->isZero() ||
4338 Changed = ChangeStatus::CHANGED;
4341 setUseGenericStateMachineOfKernelEnvironment(
4348 if (!mayContainParallelRegion()) {
4349 ++NumOpenMPTargetRegionKernelsWithoutStateMachine;
4351 auto Remark = [&](OptimizationRemark OR) {
4352 return OR << "Removing unused state machine from generic-mode kernel.";
4354 A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP130", Remark);
4360 if (ReachedUnknownParallelRegions.empty()) {
4361 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
4363 auto Remark = [&](OptimizationRemark OR) {
4364 return OR << "Rewriting generic-mode kernel with a customized state "
4367 A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP131", Remark);
4369 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
4371 auto Remark = [&](OptimizationRemarkAnalysis OR) {
4372 return OR << "Generic-mode kernel is executed with a customized state "
4373 "machine that requires a fallback.";
4375 A.emitRemark<OptimizationRemarkAnalysis>(KernelInitCB, "OMP132", Remark);
4378 for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
4379 if (!UnknownParallelRegionCB)
4381 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
4382 return ORA << "Call may contain unknown parallel regions. Use "
4383 << "`[[omp::assume(\"omp_no_parallelism\")]]` to "
4386 A.emitRemark<OptimizationRemarkAnalysis>(UnknownParallelRegionCB,
4421 auto &Ctx = getAnchorValue().getContext();
4425 BasicBlock *InitBB = KernelInitCB->getParent();
4427 KernelInitCB->getNextNode(), "thread.user_code.check");
4431 Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB);
4433 Ctx, "worker_state_machine.finished", Kernel, UserCodeEntryBB);
4435 Ctx, "worker_state_machine.is_active.check", Kernel, UserCodeEntryBB);
4438 Kernel, UserCodeEntryBB);
4441 Kernel, UserCodeEntryBB);
4443 Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB);
4444 A.registerManifestAddedBasicBlock(*InitBB);
4445 A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
4446 A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);
4447 A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
4448 A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
4449 A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
4450 A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);
4451 A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);
4452 A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);
4454 const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
4460 ConstantInt::get(KernelInitCB->getType(), -1),
4461 "thread.is_worker", InitBB);
4466 FunctionCallee BlockHwSizeFn =
4467 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4468 M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
4469 FunctionCallee WarpSizeFn =
4470 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4471 M, OMPRTL___kmpc_get_warp_size);
4472 CallInst *BlockHwSize =
4474 OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize);
4476 CallInst *WarpSize =
4478 OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize);
4481 BlockHwSize, WarpSize, "block.size", IsWorkerCheckBB);
4485 "thread.is_main_or_worker", IsWorkerCheckBB);
4488 IsMainOrWorker, IsWorkerCheckBB);
4491 const DataLayout &DL = M.getDataLayout();
4492 Type *VoidPtrTy = PointerType::getUnqual(Ctx);
4494 new AllocaInst(VoidPtrTy, DL.getAllocaAddrSpace(), nullptr,
4498 OMPInfoCache.OMPBuilder.updateToLocation(
4499 OpenMPIRBuilder::LocationDescription(
4500 IRBuilder<>::InsertPoint(StateMachineBeginBB,
4501 StateMachineBeginBB->end()),
4504 Value *Ident = KernelInfo::getIdentFromKernelEnvironment(KernelEnvC);
4505 Value *GTid = KernelInitCB;
4507 FunctionCallee BarrierFn =
4508 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4509 M, OMPRTL___kmpc_barrier_simple_generic);
4512 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4516 (unsigned int)AddressSpace::Generic) {
4517 WorkFnAI = new AddrSpaceCastInst(
4518 WorkFnAI, PointerType::get(Ctx, (unsigned int)AddressSpace::Generic),
4519 WorkFnAI->getName() + ".generic", StateMachineBeginBB);
4523 FunctionCallee KernelParallelFn =
4524 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4525 M, OMPRTL___kmpc_kernel_parallel);
4527 KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB);
4528 OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker);
4530 Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn",
4531 StateMachineBeginBB);
4534 FunctionType *ParallelRegionFnTy = FunctionType::get(
4535 Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)},
4541 StateMachineBeginBB);
4542 IsDone->setDebugLoc(DLoc);
4544 IsDone, StateMachineBeginBB)
4548 StateMachineDoneBarrierBB, IsActiveWorker,
4549 StateMachineIsActiveCheckBB)
4555 const unsigned int WrapperFunctionArgNo = 6;
4560 for (int I = 0, E = ReachedKnownParallelRegions.size(); I < E; ++I) {
4561 auto *CB = ReachedKnownParallelRegions[I];
4563 CB->getArgOperand(WrapperFunctionArgNo)->stripPointerCasts());
4565 Ctx, "worker_state_machine.parallel_region.execute", Kernel,
4566 StateMachineEndParallelBB);
4568 ->setDebugLoc(DLoc);
4574 Kernel, StateMachineEndParallelBB);
4575 A.registerManifestAddedBasicBlock(*PRExecuteBB);
4576 A.registerManifestAddedBasicBlock(*PRNextBB);
4581 if (I + 1 < E || !ReachedUnknownParallelRegions.empty()) {
4584 "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
4592 StateMachineIfCascadeCurrentBB)
4594 StateMachineIfCascadeCurrentBB = PRNextBB;
4600 if (!ReachedUnknownParallelRegions.empty()) {
4601 StateMachineIfCascadeCurrentBB->setName(
4602 "worker_state_machine.parallel_region.fallback.execute");
4604 StateMachineIfCascadeCurrentBB)
4605 ->setDebugLoc(DLoc);
4608 StateMachineIfCascadeCurrentBB)
4611 FunctionCallee EndParallelFn =
4612 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4613 M, OMPRTL___kmpc_kernel_end_parallel);
4614 CallInst *EndParallel =
4616 OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);
4622 ->setDebugLoc(DLoc);
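// updateImpl for the function-level AAKernelInfo: SPMD compatibility is
// re-derived from all read/write and call-like instructions, state is merged
// from reaching kernels and reached parallel regions, and UpdateKernelEnvCRAII
// keeps the constant kernel environment consistent with the current state.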
4632 KernelInfoState StateBefore = getState();
4638 struct UpdateKernelEnvCRAII {
4639 AAKernelInfoFunction &AA;
4641 UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {}
4643 ~UpdateKernelEnvCRAII() {
4647 ConstantStruct *ExistingKernelEnvC =
4650 if (!AA.isValidState()) {
4651 AA.KernelEnvC = ExistingKernelEnvC;
4655 if (!AA.ReachedKnownParallelRegions.isValidState())
4656 AA.setUseGenericStateMachineOfKernelEnvironment(
4657 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4658 ExistingKernelEnvC));
4660 if (!AA.SPMDCompatibilityTracker.isValidState())
4661 AA.setExecModeOfKernelEnvironment(
4662 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC));
4664 ConstantInt *MayUseNestedParallelismC =
4665 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(
4667 ConstantInt *NewMayUseNestedParallelismC = ConstantInt::get(
4668 MayUseNestedParallelismC->getIntegerType(), AA.NestedParallelism);
4669 AA.setMayUseNestedParallelismOfKernelEnvironment(
4670 NewMayUseNestedParallelismC);
4680 if (!I.mayWriteToMemory())
4683 const auto *UnderlyingObjsAA = A.getAAFor<AAUnderlyingObjects>(
4685 DepClassTy::OPTIONAL);
4686 auto *HS = A.getAAFor<AAHeapToStack>(
4688 DepClassTy::OPTIONAL);
4689 if (UnderlyingObjsAA &&
4690 UnderlyingObjsAA->forallUnderlyingObjects([&](Value &Obj) {
4691 if (AA::isAssumedThreadLocalObject(A, Obj, *this))
4695 auto *CB = dyn_cast<CallBase>(&Obj);
4696 return CB && HS && HS->isAssumedHeapToStack(*CB);
4702 SPMDCompatibilityTracker.insert(&I);
4706 bool UsedAssumedInformationInCheckRWInst = false;
4707 if (!SPMDCompatibilityTracker.isAtFixpoint())
4708 if (!A.checkForAllReadWriteInstructions(
4709 CheckRWInst, *this, UsedAssumedInformationInCheckRWInst))
4710 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4712 bool UsedAssumedInformationFromReachingKernels = false;
4713 if (!IsKernelEntry) {
4714 updateParallelLevels(A);
4716 bool AllReachingKernelsKnown = true;
4717 updateReachingKernelEntries(A, AllReachingKernelsKnown);
4718 UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
4720 if (!SPMDCompatibilityTracker.empty()) {
4721 if (!ParallelLevels.isValidState())
4722 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4723 else if (!ReachingKernelEntries.isValidState())
4724 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4730 for (auto *Kernel : ReachingKernelEntries) {
4731 auto *CBAA = A.getAAFor<AAKernelInfo>(
4733 if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() &&
4734 CBAA->SPMDCompatibilityTracker.isAssumed())
4738 if (!CBAA || !CBAA->SPMDCompatibilityTracker.isAtFixpoint())
4739 UsedAssumedInformationFromReachingKernels = true;
4741 if (SPMD != 0 && Generic != 0)
4742 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4748 bool AllParallelRegionStatesWereFixed = true;
4749 bool AllSPMDStatesWereFixed = true;
4752 auto *CBAA = A.getAAFor<AAKernelInfo>(
4756 getState() ^= CBAA->getState();
4757 AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint();
4758 AllParallelRegionStatesWereFixed &=
4759 CBAA->ReachedKnownParallelRegions.isAtFixpoint();
4760 AllParallelRegionStatesWereFixed &=
4761 CBAA->ReachedUnknownParallelRegions.isAtFixpoint();
4765 bool UsedAssumedInformationInCheckCallInst = false;
4766 if (!A.checkForAllCallLikeInstructions(
4767 CheckCallInst, *this, UsedAssumedInformationInCheckCallInst)) {
4769 << "Failed to visit all call-like instructions!\n";);
4770 return indicatePessimisticFixpoint();
4775 if (!UsedAssumedInformationInCheckCallInst &&
4776 AllParallelRegionStatesWereFixed) {
4777 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
4778 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
4783 if (!UsedAssumedInformationInCheckRWInst &&
4784 !UsedAssumedInformationInCheckCallInst &&
4785 !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)
4786 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
4788 return StateBefore == getState() ? ChangeStatus::UNCHANGED
4789 : ChangeStatus::CHANGED;
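// The two helpers below walk all call sites (Attributor::checkForAllCallSites)
// to propagate which kernel entries can reach this function and at which
// parallel level it may run; unknown call sites force a pessimistic fixpoint.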
4794 void updateReachingKernelEntries(Attributor &A,
4795 bool &AllReachingKernelsKnown) {
4796 auto PredCallSite = [&](AbstractCallSite ACS) {
4799 assert(Caller && "Caller is nullptr");
4801 auto *CAA = A.getOrCreateAAFor<AAKernelInfo>(
4803 if (CAA && CAA->ReachingKernelEntries.isValidState()) {
4804 ReachingKernelEntries ^= CAA->ReachingKernelEntries;
4810 ReachingKernelEntries.indicatePessimisticFixpoint();
4815 if (!A.checkForAllCallSites(PredCallSite, *this,
4817 AllReachingKernelsKnown))
4818 ReachingKernelEntries.indicatePessimisticFixpoint();
4822 void updateParallelLevels(Attributor &A) {
4823 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4824 OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
4825 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
4827 auto PredCallSite = [&](AbstractCallSite ACS) {
4830 assert(Caller && "Caller is nullptr");
4834 if (CAA && CAA->ParallelLevels.isValidState()) {
4840 if (Caller == Parallel51RFI.Declaration) {
4841 ParallelLevels.indicatePessimisticFixpoint();
4845 ParallelLevels ^= CAA->ParallelLevels;
4852 ParallelLevels.indicatePessimisticFixpoint();
4857 bool AllCallSitesKnown = true;
4858 if (!A.checkForAllCallSites(PredCallSite, *this,
4861 ParallelLevels.indicatePessimisticFixpoint();
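// Call-site variant of AAKernelInfo. Known OpenMP runtime calls are classified
// directly (e.g. __kmpc_parallel_51 records a parallel region, __kmpc_omp_task
// blocks SPMD-ization); calls to unknown or non-IPO-amendable callees are
// treated as potential unknown parallel regions unless annotated with
// "omp_no_openmp" or "omp_no_parallelism".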
4868struct AAKernelInfoCallSite : AAKernelInfo {
4869 AAKernelInfoCallSite(const IRPosition &IRP, Attributor &A)
4870 : AAKernelInfo(IRP, A) {}
4874 AAKernelInfo::initialize(A);
4877 auto *AssumptionAA = A.getAAFor<AAAssumptionInfo>(
4881 if (AssumptionAA && AssumptionAA->hasAssumption("ompx_spmd_amenable")) {
4882 indicateOptimisticFixpoint();
4890 indicateOptimisticFixpoint();
4899 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4900 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
4901 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
4903 if (!Callee || !A.isFunctionIPOAmendable(*Callee)) {
4907 if (!AssumptionAA ||
4908 !(AssumptionAA->hasAssumption("omp_no_openmp") ||
4909 AssumptionAA->hasAssumption("omp_no_parallelism")))
4910 ReachedUnknownParallelRegions.insert(&CB);
4914 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
4915 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4916 SPMDCompatibilityTracker.insert(&CB);
4921 indicateOptimisticFixpoint();
4927 if (NumCallees > 1) {
4928 indicatePessimisticFixpoint();
4935 case OMPRTL___kmpc_is_spmd_exec_mode:
4936 case OMPRTL___kmpc_distribute_static_fini:
4937 case OMPRTL___kmpc_for_static_fini:
4938 case OMPRTL___kmpc_global_thread_num:
4939 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
4940 case OMPRTL___kmpc_get_hardware_num_blocks:
4941 case OMPRTL___kmpc_single:
4942 case OMPRTL___kmpc_end_single:
4943 case OMPRTL___kmpc_master:
4944 case OMPRTL___kmpc_end_master:
4945 case OMPRTL___kmpc_barrier:
4946 case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
4947 case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
4948 case OMPRTL___kmpc_error:
4949 case OMPRTL___kmpc_flush:
4950 case OMPRTL___kmpc_get_hardware_thread_id_in_block:
4951 case OMPRTL___kmpc_get_warp_size:
4952 case OMPRTL_omp_get_thread_num:
4953 case OMPRTL_omp_get_num_threads:
4954 case OMPRTL_omp_get_max_threads:
4955 case OMPRTL_omp_in_parallel:
4956 case OMPRTL_omp_get_dynamic:
4957 case OMPRTL_omp_get_cancellation:
4958 case OMPRTL_omp_get_nested:
4959 case OMPRTL_omp_get_schedule:
4960 case OMPRTL_omp_get_thread_limit:
4961 case OMPRTL_omp_get_supported_active_levels:
4962 case OMPRTL_omp_get_max_active_levels:
4963 case OMPRTL_omp_get_level:
4964 case OMPRTL_omp_get_ancestor_thread_num:
4965 case OMPRTL_omp_get_team_size:
4966 case OMPRTL_omp_get_active_level:
4967 case OMPRTL_omp_in_final:
4968 case OMPRTL_omp_get_proc_bind:
4969 case OMPRTL_omp_get_num_places:
4970 case OMPRTL_omp_get_num_procs:
4971 case OMPRTL_omp_get_place_proc_ids:
4972 case OMPRTL_omp_get_place_num:
4973 case OMPRTL_omp_get_partition_num_places:
4974 case OMPRTL_omp_get_partition_place_nums:
4975 case OMPRTL_omp_get_wtime:
4977 case OMPRTL___kmpc_distribute_static_init_4:
4978 case OMPRTL___kmpc_distribute_static_init_4u:
4979 case OMPRTL___kmpc_distribute_static_init_8:
4980 case OMPRTL___kmpc_distribute_static_init_8u:
4981 case OMPRTL___kmpc_for_static_init_4:
4982 case OMPRTL___kmpc_for_static_init_4u:
4983 case OMPRTL___kmpc_for_static_init_8:
4984 case OMPRTL___kmpc_for_static_init_8u: {
4986 unsigned ScheduleArgOpNo = 2;
4987 auto *ScheduleTypeCI =
4989 unsigned ScheduleTypeVal =
4990 ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
4992 case OMPScheduleType::UnorderedStatic:
4993 case OMPScheduleType::UnorderedStaticChunked:
4994 case OMPScheduleType::OrderedDistribute:
4995 case OMPScheduleType::OrderedDistributeChunked:
4998 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4999 SPMDCompatibilityTracker.insert(&CB);
5003 case OMPRTL___kmpc_target_init:
5006 case OMPRTL___kmpc_target_deinit:
5007 KernelDeinitCB = &CB;
5009 case OMPRTL___kmpc_parallel_51:
5010 if (!handleParallel51(A, CB))
5011 indicatePessimisticFixpoint();
5013 case OMPRTL___kmpc_omp_task:
5015 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5016 SPMDCompatibilityTracker.insert(&CB);
5017 ReachedUnknownParallelRegions.insert(&CB);
5019 case OMPRTL___kmpc_alloc_shared:
5020 case OMPRTL___kmpc_free_shared:
5026 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5027 SPMDCompatibilityTracker.insert(&CB);
5033 indicateOptimisticFixpoint();
5037 A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::OPTIONAL);
5038 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5039 CheckCallee(getAssociatedFunction(), 1);
5042 const auto &OptimisticEdges = AACE->getOptimisticEdges();
5043 for (auto *Callee : OptimisticEdges) {
5044 CheckCallee(Callee, OptimisticEdges.size());
5055 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
5056 KernelInfoState StateBefore = getState();
5058 auto CheckCallee = [&](Function *F, int NumCallees) {
5059 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(F);
5063 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
5066 A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED);
5068 return indicatePessimisticFixpoint();
5069 if (getState() == FnAA->getState())
5070 return ChangeStatus::UNCHANGED;
5071 getState() = FnAA->getState();
5072 return ChangeStatus::CHANGED;
5075 return indicatePessimisticFixpoint();
5078 if (It->getSecond() == OMPRTL___kmpc_parallel_51) {
5079 if (!handleParallel51(A, CB))
5080 return indicatePessimisticFixpoint();
5081 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5082 : ChangeStatus::CHANGED;
5088 (It->getSecond() == OMPRTL___kmpc_alloc_shared ||
5089 It->getSecond() == OMPRTL___kmpc_free_shared) &&
5090 "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
5092 auto *HeapToStackAA = A.getAAFor<AAHeapToStack>(
5094 auto *HeapToSharedAA = A.getAAFor<AAHeapToShared>(
5102 case OMPRTL___kmpc_alloc_shared:
5103 if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) &&
5104 (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB)))
5105 SPMDCompatibilityTracker.insert(&CB);
5107 case OMPRTL___kmpc_free_shared:
5108 if ((!HeapToStackAA ||
5109 !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) &&
5111 !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB)))
5112 SPMDCompatibilityTracker.insert(&CB);
5115 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5116 SPMDCompatibilityTracker.insert(&CB);
5118 return ChangeStatus::CHANGED;
5122 A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::OPTIONAL);
5123 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5124 if (Function *F = getAssociatedFunction())
5127 const auto &OptimisticEdges = AACE->getOptimisticEdges();
5128 for (auto *Callee : OptimisticEdges) {
5129 CheckCallee(Callee, OptimisticEdges.size());
5135 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5136 : ChangeStatus::CHANGED;
5141 bool handleParallel51(Attributor &A, CallBase &CB) {
5142 const unsigned int NonWrapperFunctionArgNo = 5;
5143 const unsigned int WrapperFunctionArgNo = 6;
5144 auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
5145 ? NonWrapperFunctionArgNo
5146 : WrapperFunctionArgNo;
5150 if (!ParallelRegion)
5153 ReachedKnownParallelRegions.insert(&CB);
5155 auto *FnAA = A.getAAFor<AAKernelInfo>(
5157 NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
5158 !FnAA->ReachedKnownParallelRegions.empty() ||
5159 !FnAA->ReachedKnownParallelRegions.isValidState() ||
5160 !FnAA->ReachedUnknownParallelRegions.isValidState() ||
5161 !FnAA->ReachedUnknownParallelRegions.empty();
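// AAFoldRuntimeCall folds selected OpenMP runtime calls to constants once the
// properties of all kernels reaching the call site are known.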
5166struct AAFoldRuntimeCall
5167 : public StateWrapper<BooleanState, AbstractAttribute> {
5168 using Base = StateWrapper<BooleanState, AbstractAttribute>;
5170 AAFoldRuntimeCall(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
5173 void trackStatistics() const override {}
5176 static AAFoldRuntimeCall &createForPosition(const IRPosition &IRP,
5180 StringRef getName() const override { return "AAFoldRuntimeCall"; }
5183 const char *getIdAddr() const override { return &ID; }
5187 static bool classof(const AbstractAttribute *AA) {
5191 static const char ID;
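// Call-site-returned implementation: __kmpc_is_spmd_exec_mode and
// __kmpc_parallel_level are folded from the SPMD state of every reaching
// kernel, while the hardware-query calls are folded from matching kernel
// function attributes such as "omp_target_thread_limit".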
5194struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
5195 AAFoldRuntimeCallCallSiteReturned(const IRPosition &IRP, Attributor &A)
5196 : AAFoldRuntimeCall(IRP, A) {}
5199 const std::string getAsStr(Attributor *) const override {
5200 if (!isValidState())
5203 std::string Str("simplified value: ");
5205 if (!SimplifiedValue)
5206 return Str + std::string("none");
5208 if (!*SimplifiedValue)
5209 return Str + std::string("nullptr");
5212 return Str + std::to_string(CI->getSExtValue());
5214 return Str + std::string("unknown");
5219 indicatePessimisticFixpoint();
5223 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
5224 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
5225 assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&
5226 "Expected a known OpenMP runtime function");
5228 RFKind = It->getSecond();
5231 A.registerSimplificationCallback(
5233 [&](const IRPosition &IRP, const AbstractAttribute *AA,
5234 bool &UsedAssumedInformation) -> std::optional<Value *> {
5235 assert((isValidState() || SimplifiedValue == nullptr) &&
5236 "Unexpected invalid state!");
5238 if (!isAtFixpoint()) {
5239 UsedAssumedInformation = true;
5241 A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
5243 return SimplifiedValue;
5250 case OMPRTL___kmpc_is_spmd_exec_mode:
5253 case OMPRTL___kmpc_parallel_level:
5256 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
5257 Changed = Changed | foldKernelFnAttribute(A, "omp_target_thread_limit");
5259 case OMPRTL___kmpc_get_hardware_num_blocks:
5272 if (SimplifiedValue && *SimplifiedValue) {
5275 A.deleteAfterManifest(I);
5278 auto Remark = [&](OptimizationRemark OR) {
5280 return OR << "Replacing OpenMP runtime call "
5282 << ore::NV("FoldedValue", C->getZExtValue()) << ".";
5283 return OR << "Replacing OpenMP runtime call "
5288 A.emitRemark<OptimizationRemark>(CB, "OMP180", Remark);
5291 << **SimplifiedValue << "\n");
5293 Changed = ChangeStatus::CHANGED;
5300 SimplifiedValue = nullptr;
5301 return AAFoldRuntimeCall::indicatePessimisticFixpoint();
5307 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5309 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5310 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5311 auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5314 if (!CallerKernelInfoAA ||
5315 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5316 return indicatePessimisticFixpoint();
5318 for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5320 DepClassTy::REQUIRED);
5322 if (!AA || !AA->isValidState()) {
5323 SimplifiedValue = nullptr;
5324 return indicatePessimisticFixpoint();
5327 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5328 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5333 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5334 ++KnownNonSPMDCount;
5336 ++AssumedNonSPMDCount;
5340 if ((AssumedSPMDCount + KnownSPMDCount) &&
5341 (AssumedNonSPMDCount + KnownNonSPMDCount))
5342 return indicatePessimisticFixpoint();
5344 auto &Ctx = getAnchorValue().getContext();
5345 if (KnownSPMDCount || AssumedSPMDCount) {
5346 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5347 "Expected only SPMD kernels!");
5350 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true);
5351 } else if (KnownNonSPMDCount || AssumedNonSPMDCount) {
5352 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5353 "Expected only non-SPMD kernels!");
5356 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), false);
5361 assert(!SimplifiedValue && "SimplifiedValue should be none");
5364 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5365 : ChangeStatus::CHANGED;
5370 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5372 auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5375 if (!CallerKernelInfoAA ||
5376 !CallerKernelInfoAA->ParallelLevels.isValidState())
5377 return indicatePessimisticFixpoint();
5379 if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5380 return indicatePessimisticFixpoint();
5382 if (CallerKernelInfoAA->ReachingKernelEntries.empty()) {
5383 assert(!SimplifiedValue &&
5384 "SimplifiedValue should keep none at this point");
5385 return ChangeStatus::UNCHANGED;
5388 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5389 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5390 for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5392 DepClassTy::REQUIRED);
5393 if (!AA || !AA->SPMDCompatibilityTracker.isValidState())
5394 return indicatePessimisticFixpoint();
5396 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5397 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5402 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5403 ++KnownNonSPMDCount;
5405 ++AssumedNonSPMDCount;
5409 if ((AssumedSPMDCount + KnownSPMDCount) &&
5410 (AssumedNonSPMDCount + KnownNonSPMDCount))
5411 return indicatePessimisticFixpoint();
5413 auto &Ctx = getAnchorValue().getContext();
5417 if (AssumedSPMDCount || KnownSPMDCount) {
5418 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5419 "Expected only SPMD kernels!");
5420 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
5422 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5423 "Expected only non-SPMD kernels!");
5424 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 0);
5426 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5427 : ChangeStatus::CHANGED;
5430 ChangeStatus foldKernelFnAttribute(Attributor &A, llvm::StringRef Attr) {
5432 int32_t CurrentAttrValue = -1;
5433 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5435 auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5438 if (!CallerKernelInfoAA ||
5439 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5440 return indicatePessimisticFixpoint();
5443 for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5444 int32_t NextAttrVal = K->getFnAttributeAsParsedInteger(Attr, -1);
5446 if (NextAttrVal == -1 ||
5447 (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
5448 return indicatePessimisticFixpoint();
5449 CurrentAttrValue = NextAttrVal;
5452 if (CurrentAttrValue != -1) {
5453 auto &Ctx = getAnchorValue().getContext();
5455 ConstantInt::get(Type::getInt32Ty(Ctx), CurrentAttrValue);
5457 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5458 : ChangeStatus::CHANGED;
5464 std::optional<Value *> SimplifiedValue;
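// Registration glue: registerAAs seeds an AAKernelInfo for every
// __kmpc_target_init use, requests folding for the four runtime calls listed
// above, creates AAICVTracker attributes for ICV getter calls, and defers the
// generic per-function attributes to registerAAsForFunction.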
5474 auto &RFI = OMPInfoCache.RFIs[RF];
5475 RFI.foreachUse(SCC, [&](Use &U, Function &F) {
5476 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
5479 A.getOrCreateAAFor<AAFoldRuntimeCall>(
5481 DepClassTy::NONE, false,
5487void OpenMPOpt::registerAAs(bool IsModulePass) {
5497 A.getOrCreateAAFor<AAKernelInfo>(
5499 DepClassTy::NONE, false,
5503 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
5504 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
5505 InitRFI.foreachUse(SCC, CreateKernelInfoCB);
5507 registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
5508 registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
5509 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
5510 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
5515 for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
5518 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
5521 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
5528 A.getOrCreateAAFor<AAICVTracker>(CBPos);
5532 GetterRFI.foreachUse(SCC, CreateAA);
5541 for (auto *F : SCC) {
5542 if (F->isDeclaration())
5548 if (F->hasLocalLinkage()) {
5550 const auto *CB = dyn_cast<CallBase>(U.getUser());
5551 return CB && CB->isCallee(&U) &&
5552 A.isRunOn(const_cast<Function *>(CB->getCaller()));
5556 registerAAsForFunction(A, *F);
5560void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) {
5566 if (F.hasFnAttribute(Attribute::Convergent))
5571 bool UsedAssumedInformation = false;
5574 A.getOrCreateAAFor<AAAddressSpace>(
5580 A.getOrCreateAAFor<AAIndirectCallInfo>(
5585 A.getOrCreateAAFor<AAAddressSpace>(
5594 if (II->getIntrinsicID() == Intrinsic::assume) {
5595 A.getOrCreateAAFor<AAPotentialValues>(
5603const char AAICVTracker::ID = 0;
5604const char AAKernelInfo::ID = 0;
5606const char AAHeapToShared::ID = 0;
5607const char AAFoldRuntimeCall::ID = 0;
5609AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
5611 AAICVTracker *AA = nullptr;
5619 AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
5622 AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
5625 AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
5628 AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
5637 AAExecutionDomainFunction *AA = nullptr;
5647 "AAExecutionDomain can only be created for function position!");
5649 AA = new (A.Allocator) AAExecutionDomainFunction(IRP, A);
5656AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
5658 AAHeapToSharedFunction *AA = nullptr;
5668 "AAHeapToShared can only be created for function position!");
5670 AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A);
5677AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
5679 AAKernelInfo *AA = nullptr;
5689 AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A);
5692 AA = new (A.Allocator) AAKernelInfoFunction(IRP, A);
5699AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP,
5701 AAFoldRuntimeCall *AA = nullptr;
5710 llvm_unreachable("KernelInfo can only be created for call site position!");
5712 AA = new (A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP, A);
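// The remainder is the pass driver: candidate functions are collected and
// (optionally) internalized (remark OMP140), an Attributor run performs the
// fixpoint iteration, device functions may be force-inlined, and kernels as
// well as the "openmp"/"openmp-device" module flags are inspected for
// statistics.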
5733 if (Kernels.contains(&F))
5735 return !F.use_empty();
5742 return ORA << "Could not internalize function. "
5743 << "Some optimizations may not be possible. [OMP140]";
5755 if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F) &&
5759 } else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) {
5772 if (!F.isDeclaration() && !InternalizedMap.lookup(&F)) {
5774 Functions.insert(&F);
5792 OMPInformationCache InfoCache(M, AG, Allocator, nullptr, PostLink);
5794 unsigned MaxFixpointIterations =
5806 return F.hasFnAttribute("kernel");
5811 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
5817 if (!F.isDeclaration() && !Kernels.contains(&F) &&
5818 !F.hasFnAttribute(Attribute::NoInline))
5819 F.addFnAttr(Attribute::AlwaysInline);
5849 Module &M = *C.begin()->getFunction().getParent();
5871 OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
5872 &Functions, PostLink);
5874 unsigned MaxFixpointIterations =
5888 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
5889 bool Changed = OMPOpt.run(false);
5908 if (F.hasKernelCallingConv()) {
5913 ++NumOpenMPTargetRegionKernels;
5916 ++NumNonOpenMPTargetRegionKernels;
5923 Metadata *MD = M.getModuleFlag("openmp");
5931 Metadata *MD = M.getModuleFlag("openmp-device");
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Expand Atomic instructions
static cl::opt< unsigned > SetFixpointIterations("attributor-max-iterations", cl::Hidden, cl::desc("Maximal number of fixpoint iterations."), cl::init(32))
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file provides interfaces used to manipulate a call graph, regardless if it is a "old style" Call...
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
dxil pretty DXIL Metadata Pretty Printer
This file defines the DenseSet and SmallDenseSet classes.
This file defines an array type that can be indexed using scoped enum values.
static void emitRemark(const Function &F, OptimizationRemarkEmitter &ORE, bool Skip)
Loop::LoopBounds::Direction Direction
Machine Check Debug Module
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
This file defines constans and helpers used when dealing with OpenMP.
This file defines constans that will be used by both host and device compilation.
static constexpr auto TAG
static cl::opt< bool > HideMemoryTransferLatency("openmp-hide-memory-transfer-latency", cl::desc("[WIP] Tries to hide the latency of host to device memory" " transfers"), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptStateMachineRewrite("openmp-opt-disable-state-machine-rewrite", cl::desc("Disable OpenMP optimizations that replace the state machine."), cl::Hidden, cl::init(false))
static cl::opt< bool > EnableParallelRegionMerging("openmp-opt-enable-merging", cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleAfterOptimizations("openmp-opt-print-module-after", cl::desc("Print the current module after OpenMP optimizations."), cl::Hidden, cl::init(false))
#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER)
#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER)
static cl::opt< bool > PrintOpenMPKernels("openmp-print-gpu-kernels", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptFolding("openmp-opt-disable-folding", cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleBeforeOptimizations("openmp-opt-print-module-before", cl::desc("Print the current module before OpenMP optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden, cl::desc("Maximal number of attributor iterations."), cl::init(256))
static cl::opt< bool > DisableInternalization("openmp-opt-disable-internalization", cl::desc("Disable function internalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintICVValues("openmp-print-icv-values", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptimizations("openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden, cl::desc("Maximum amount of shared memory to use."), cl::init(std::numeric_limits< unsigned >::max()))
static cl::opt< bool > EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::desc("Enables more verbose remarks."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptDeglobalization("openmp-opt-disable-deglobalization", cl::desc("Disable OpenMP optimizations involving deglobalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptBarrierElimination("openmp-opt-disable-barrier-elimination", cl::desc("Disable OpenMP optimizations that eliminate barriers."), cl::Hidden, cl::init(false))
static cl::opt< bool > DeduceICVValues("openmp-deduce-icv-values", cl::init(false), cl::Hidden)
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE)
static cl::opt< bool > DisableOpenMPOptSPMDization("openmp-opt-disable-spmdization", cl::desc("Disable OpenMP optimizations involving SPMD-ization."), cl::Hidden, cl::init(false))
static cl::opt< bool > AlwaysInlineDeviceFunctions("openmp-opt-inline-device", cl::desc("Inline all applicable functions on the device."), cl::Hidden, cl::init(false))
FunctionAnalysisManager FAM
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static StringRef getName(Value *V)
std::pair< BasicBlock *, BasicBlock * > Edge
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static const int BlockSize
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, ArrayRef< StringLiteral > StandardNames)
Initialize the set of available library functions based on the specified target triple.
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
size_t size() const
size - Get the array size.
iterator begin()
Instruction iterator methods.
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
reverse_iterator rbegin()
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
InstListType::reverse_iterator reverse_iterator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
void setCallingConv(CallingConv::ID CC)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool doesNotAccessMemory(unsigned OpNo) const
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
bool isCallee(Value::const_user_iterator UI) const
Determine whether the passed iterator points to the callee operand's Use.
Value * getArgOperand(unsigned i) const
void setArgOperand(unsigned i, Value *v)
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
unsigned arg_size() const
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool isArgOperand(const Use *U) const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
LLVM_ABI Function * getCaller()
Helper to get the caller (the parent function).
Wrapper to unify "old style" CallGraph and "new style" LazyCallGraph.
void initialize(LazyCallGraph &LCG, LazyCallGraph::SCC &SCC, CGSCCAnalysisManager &AM, CGSCCUpdateResult &UR)
Initializers for usage outside of a CGSCC pass, inside a CGSCC pass in the old and new pass manager (...
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
@ ICMP_SLT
signed less than
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
This is the shared class of boolean and integer constants.
IntegerType * getIntegerType() const
Variant of the getType() method to always return an IntegerType, which reduces the amount of casting ...
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
This is an important base class in LLVM.
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
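As listed above, DenseMap::insert reports whether the key was newly added and lookup returns a default-constructed value for missing keys. A tiny sketch with placeholder functions OldFn and NewFn:
DenseMap<Function *, Function *> ReplacementMap;
auto [It, WasNew] = ReplacementMap.insert({OldFn, NewFn}); // WasNew is true on the first insert
Function *Mapped = ReplacementMap.lookup(OldFn);           // nullptr if OldFn were absent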
LLVM_ABI Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
static ErrorSuccess success()
Create a success value.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this fence instruction.
A proxy from a FunctionAnalysisManager to an SCC.
const BasicBlock & getEntryBlock() const
const BasicBlock & front() const
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Argument * getArg(unsigned i) const
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
bool hasLocalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
@ PrivateLinkage
Like Internal, but omit from symbol table.
@ InternalLinkage
Rename collisions when linking (static functions).
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
LLVM_ABI bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
LLVM_ABI bool mayReadFromMemory() const LLVM_READONLY
Return true if this instruction may read memory.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void setSuccessor(unsigned Idx, BasicBlock *BB)
Update the specified successor to point at the provided block.
A node in the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
LLVM_ABI StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
A Module instance is used to store all the information related to an LLVM module.
const Triple & getTargetTriple() const
Get the target triple which is a string describing the target host.
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
static ReturnInst * Create(LLVMContext &C, Value *retVal=nullptr, InsertPosition InsertBefore=nullptr)
A vector that has set insertion semantics.
size_type size() const
Determine the number of elements in the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
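A common pattern combining the SmallVector and SmallPtrSet entries above is a de-duplicated worklist. A sketch under the assumption of a placeholder seed function Root:
SmallVector<Function *> Worklist;
SmallPtrSet<Function *, 8> Visited;
auto Enqueue = [&](Function *F) {
  if (Visited.insert(F).second) // insert() tells us whether F was new
    Worklist.push_back(F);
};
Enqueue(Root);
while (!Worklist.empty()) {
  Function *F = Worklist.pop_back_val();
  // Enqueue the functions that contain calls using F.
  for (User *U : F->users())
    if (auto *CB = dyn_cast<CallBase>(U))
      Enqueue(CB->getCaller());
}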
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Triple - Helper class for working with autoconf configuration names.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Type * getType() const
All values are typed, get the type of this value.
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
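The Value entries above cover the usual replacement idiom; Old and New are placeholder values of matching type, not names from this file:
if (Old->getType() == New->getType()) {
  if (!New->hasName())
    New->setName(Old->getName());
  // Rewrite every use of Old to use New instead.
  Old->replaceAllUsesWith(New);
  assert(Old->use_empty() && "all uses should have been rewritten");
}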
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
GlobalVariable * getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB)
ConstantStruct * getKernelEnvironementFromKernelInitCB(CallBase *KernelInitCB)
Abstract Attribute helper functions.
LLVM_ABI bool isValidAtPosition(const ValueAndContext &VAC, InformationCache &InfoCache)
Return true if the value of VAC is valid at the position of VAC, that is a constant,...
LLVM_ABI bool isPotentiallyAffectedByBarrier(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is potentially affected by a barrier.
LLVM_ABI bool isNoSyncInst(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is a nosync instruction.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
E & operator^=(E &LHS, E RHS)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ BasicBlock
Various leaf nodes.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
constexpr uint64_t PointerSize
aarch64 pointer size.
bool isOpenMPDevice(Module &M)
Helper to determine if M is an OpenMP target offloading device module.
bool containsOpenMP(Module &M)
Helper to determine if M contains OpenMP.
InternalControlVar
IDs for all Internal Control Variables (ICVs).
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
KernelSet getDeviceKernels(Module &M)
Get OpenMP device kernels in M.
@ OMP_TGT_EXEC_MODE_GENERIC_SPMD
@ OMP_TGT_EXEC_MODE_GENERIC
SetVector< Kernel > KernelSet
Set of kernels in the module.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
bool isOpenMPKernel(Function &Fn)
Return true iff Fn is an OpenMP GPU kernel; Fn has the "kernel" attribute.
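The omp:: helpers above are typically used together. A minimal sketch over a placeholder Module M:
unsigned NumOpenMPKernels = 0;
if (omp::containsOpenMP(M) && omp::isOpenMPDevice(M))
  // getDeviceKernels returns a SetVector of kernel entry points (Function *).
  for (Function *Kernel : omp::getDeviceKernels(M))
    NumOpenMPKernels += omp::isOpenMPKernel(*Kernel);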
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< UseNode * > Use
friend class Instruction
Iterator for Instructions in a BasicBlock.
LLVM_ABI iterator begin() const
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool succ_empty(const Instruction *I)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool operator!=(uint64_t V1, const APInt &V2)
constexpr from_range_t from_range
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
AnalysisManager< LazyCallGraph::SCC, LazyCallGraph & > CGSCCAnalysisManager
The CGSCC analysis manager.
@ ThinLTOPostLink
ThinLTO postlink (backend compile) phase.
@ FullLTOPostLink
Full LTO postlink (backend compile) phase.
@ ThinLTOPreLink
ThinLTO prelink (summary) phase.
auto dyn_cast_or_null(const Y &Val)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
BumpPtrAllocatorImpl BumpPtrAllocator
The standard BumpPtrAllocator which just uses the default template parameters.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
ArrayRef(const T &OneElt) -> ArrayRef< T >
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
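The three casting helpers above differ only in how they fail; V is a placeholder Value*:
int64_t Known = 0;
if (isa<ConstantInt>(V))                        // type test only
  Known = cast<ConstantInt>(V)->getSExtValue(); // asserts on mismatch
if (auto *CI = dyn_cast<ConstantInt>(V))        // returns nullptr on mismatch
  Known = CI->getSExtValue();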
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
@ OPTIONAL
The target may be valid if the source is not.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
static LLVM_ABI AAExecutionDomain & createForPosition(const IRPosition &IRP, Attributor &A)
Create an abstract attribute view for the position IRP.
AAExecutionDomain(const IRPosition &IRP, Attributor &A)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
AccessKind
Simple enum to distinguish read/write/read-write accesses.
StateType::base_t MemoryLocationsKind
static LLVM_ABI bool isAlignedBarrier(const CallBase &CB, bool ExecutedAligned)
Helper function to determine if CB is an aligned (GPU) barrier.
Base struct for all "concrete attribute" deductions.
virtual const char * getIdAddr() const =0
This function should return the address of the ID of the AbstractAttribute.
An interface to query the internal state of an abstract attribute.
Wrapper for FunctionAnalysisManager.
Configuration for the Attributor.
std::function< void(Attributor &A, const Function &F)> InitializationCallback
Callback function to be invoked on internal functions marked live.
std::optional< unsigned > MaxFixpointIterations
Maximum number of iterations to run until fixpoint.
bool RewriteSignatures
Flag to determine if we rewrite function signatures.
OptimizationRemarkGetter OREGetter
IPOAmendableCBTy IPOAmendableCB
bool IsModulePass
Is the user of the Attributor a module pass or not.
bool DefaultInitializeLiveInternals
Flag to determine if we want to initialize all default AAs for an internal function marked live.
The fixpoint analysis framework that orchestrates the attribute deduction.
static LLVM_ABI bool isInternalizable(Function &F)
Returns true if the function F can be internalized.
std::function< std::optional< Value * >( const IRPosition &, const AbstractAttribute *, bool &)> SimplifictionCallbackTy
Register CB as a simplification callback.
std::function< std::optional< Constant * >( const GlobalVariable &, const AbstractAttribute *, bool &)> GlobalVariableSimplifictionCallbackTy
Register CB as a simplification callback.
std::function< bool(Attributor &, const AbstractAttribute *)> VirtualUseCallbackTy
static LLVM_ABI bool internalizeFunctions(SmallPtrSetImpl< Function * > &FnSet, DenseMap< Function *, Function * > &FnMap)
Make copies of each function in the set FnSet such that the copied version has internal linkage after...
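A sketch of how the two internalization helpers above fit together, assuming a placeholder range Candidates of Function pointers:
SmallPtrSet<Function *, 8> FnSet;
for (Function *F : Candidates)
  if (Attributor::isInternalizable(*F))
    FnSet.insert(F);
// Maps each original function to its internalized copy.
DenseMap<Function *, Function *> InternalizedMap;
Attributor::internalizeFunctions(FnSet, InternalizedMap);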
Simple wrapper for a single bit (boolean) state.
Support structure for SCC passes to communicate updates of the call graph back to the CGSCC pass manager...
Helper to describe and deal with positions in the LLVM-IR.
static const IRPosition callsite_returned(const CallBase &CB)
Create a position describing the returned value of CB.
static const IRPosition returned(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the returned value of F.
static const IRPosition value(const Value &V, const CallBaseContext *CBContext=nullptr)
Create a position describing the value of V.
static const IRPosition inst(const Instruction &I, const CallBaseContext *CBContext=nullptr)
Create a position describing the instruction I.
@ IRP_ARGUMENT
An attribute for a function argument.
@ IRP_RETURNED
An attribute for the function return value.
@ IRP_CALL_SITE
An attribute for a call site (function scope).
@ IRP_CALL_SITE_RETURNED
An attribute for a call site return value.
@ IRP_FUNCTION
An attribute for a function (scope).
@ IRP_FLOAT
A position that is not associated with a spot suitable for attributes.
@ IRP_CALL_SITE_ARGUMENT
An attribute for a call site argument.
@ IRP_INVALID
An invalid position.
static const IRPosition function(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the function scope of F.
Kind getPositionKind() const
Return the associated position kind.
static const IRPosition callsite_function(const CallBase &CB)
Create a position describing the function scope of CB.
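The IRPosition factories above are how the Attributor keys its abstract attributes; F and CB below are a placeholder Function and CallBase:
const IRPosition FnPos    = IRPosition::function(F);
const IRPosition RetPos   = IRPosition::returned(F);
const IRPosition CSRetPos = IRPosition::callsite_returned(CB);
assert(FnPos.getPositionKind() == IRPosition::IRP_FUNCTION);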
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...