#define DEBUG_TYPE "amdgpu-split-module"

static cl::opt<unsigned> MaxDepth(
    "amdgpu-module-splitting-max-depth",
    cl::desc(
        "maximum search depth. 0 forces a greedy approach. "
        "warning: the algorithm is up to O(2^N), where N is the max depth."));

static cl::opt<float> LargeFnFactor(
    "amdgpu-module-splitting-large-threshold", cl::Hidden,
    cl::desc(
        "when max depth is reached and we can no longer branch out, this "
        "value determines if a function is worth merging into an already "
        "existing partition to reduce code duplication. This is a factor "
        "of the ideal partition size, e.g. 2.0 means we consider the "
        "function for merging if its cost (including its callees) is 2x the "
        "size of an ideal partition."));

static cl::opt<float> LargeFnOverlapForMerge(
    "amdgpu-module-splitting-merge-threshold", cl::Hidden,
    cl::desc(
        "when a function is considered for merging into a partition that "
        "already contains some of its callees, do the merge if at least "
        "n% of the code it can reach is already present inside the "
        "partition; e.g. 0.7 means only merge >70%"));

static cl::opt<bool> NoExternalizeGlobals(
    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
    cl::desc(
        "disables externalization of global variables with local linkage; "
        "may cause globals to be duplicated, which increases binary size"));

static cl::opt<bool> NoExternalizeOnAddrTaken(
    "amdgpu-module-splitting-no-externalize-address-taken", cl::Hidden,
    cl::desc(
        "disables externalization of functions whose addresses are taken"));

static cl::opt<std::string>
    ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
                       cl::Hidden,
                       cl::desc("output file to write out the dotgraph "
                                "representation of the input module"));

static cl::opt<std::string> PartitionSummariesOutput(
    "amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
    cl::desc("output file to write out a summary of "
             "the partitions created for each module"));

static cl::opt<bool>
    UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
                cl::desc("use a lock file so only one process in the system "
                         "can run this pass at once. useful to avoid mangled "
                         "debug output in multithreaded environments."));

static cl::opt<bool>
    DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
                        cl::Hidden,
                        cl::desc("print all proposals received and whether "
                                 "they were rejected or accepted"));
struct SplitModuleTimer : NamedRegionTimer {
  SplitModuleTimer(StringRef Name, StringRef Desc)
      : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting",
                         TimePassesIsEnabled) {}
};

using CostType = InstructionCost::CostType;
using FunctionsCostMap = DenseMap<const Function *, CostType>;
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
static constexpr unsigned InvalidPID = -1;
/// Formats \p Num / \p Dem as a percentage with two decimals. A zero
/// denominator is replaced with 1 to avoid dividing by zero.
static auto formatRatioOf(CostType Num, CostType Dem) {
  CostType DemOr1 = Dem ? Dem : 1;
  return format("%0.2f", (static_cast<double>(Num) / DemOr1) * 100);
}
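
// For instance, formatRatioOf(/*Num=*/250, /*Dem=*/1000) renders as "25.00",
// and formatRatioOf(5, 0) falls back to dividing by 1, rendering as "500.00".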
static bool isNonCopyable(const Function &F) {
  return F.hasExternalLinkage() || !F.isDefinitionExact() || /*...*/;
}

/// Externalizes \p GV so it stays visible from every partition: local linkage
/// is promoted to external with hidden visibility, and unnamed values are
/// given a name so they stay consistent across the cloned modules.
static void externalize(GlobalValue &GV) {
  if (GV.hasLocalLinkage()) {
    GV.setLinkage(GlobalValue::ExternalLinkage);
    GV.setVisibility(GlobalValue::HiddenVisibility);
  }

  // Unnamed entities must be named consistently between modules. setName will
  // give a distinct name to each such entity.
  if (!GV.hasName())
    GV.setName("__llvmsplit_unnamed");
}
static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
                                       FunctionsCostMap &CostMap) {
  SplitModuleTimer SMT("calculateFunctionCosts", "cost analysis");

  LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n");
  CostType ModuleCost = 0;
  [[maybe_unused]] CostType KernelCost = 0;

  for (auto &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    CostType FnCost = 0;
    const auto &TTI = GetTTI(Fn);
    for (const auto &BB : Fn) {
      for (const auto &I : BB) {
        auto Cost =
            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
        // Assume a small non-zero cost when TTI cannot compute one.
        CostType CostVal = Cost.isValid() ? *Cost.getValue() : 1;
        assert((FnCost + CostVal) >= FnCost && "Overflow!");
        FnCost += CostVal;
      }
    }

    CostMap[&Fn] = FnCost;
    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
    ModuleCost += FnCost;

    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv()))
      KernelCost += FnCost;
  }

  LLVM_DEBUG({
    const CostType FnCost = ModuleCost - KernelCost;
    dbgs() << " - total module cost is " << ModuleCost << ". kernels cost "
           << KernelCost << " ("
           << format("%0.2f", (float(KernelCost) / ModuleCost) * 100)
           << "% of the module), functions cost " << FnCost << " ("
           << format("%0.2f", (float(FnCost) / ModuleCost) * 100)
           << "% of the module)\n";
  });

  return ModuleCost;
}
/// \returns true if \p F can be called indirectly, and therefore must be
/// treated as a potential callee of every indirect call site.
static bool canBeIndirectlyCalled(const Function &F) {
  if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    return false;
  return !F.hasLocalLinkage() ||
         F.hasAddressTaken(/*PutOffender=*/nullptr, /*...*/);
}
/// Graph representation of the input module: one node per function
/// definition, one edge per (direct or indirect) call between two of them.
class SplitGraph {
public:
  class Node;

  enum class EdgeKind : uint8_t {
    DirectCall,
    IndirectCall,
  };

  /// An edge between two nodes. Edges are owned by the graph and allocated in
  /// \ref EdgesPool.
  struct Edge {
    Edge(Node *Src, Node *Dst, EdgeKind Kind)
        : Src(Src), Dst(Dst), Kind(Kind) {}

    Node *Src = nullptr;
    Node *Dst = nullptr;
    EdgeKind Kind = EdgeKind::DirectCall;
  };

  using EdgesVec = SmallVector<const Edge *, 0>;
  using edges_iterator = EdgesVec::const_iterator;
  using nodes_iterator = const Node *const *;

  SplitGraph(const Module &M, const FunctionsCostMap &CostMap,
             CostType ModuleCost)
      : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {}

  void buildGraph(CallGraph &CG);

  bool verifyGraph() const;

  bool empty() const { return Nodes.empty(); }
  iterator_range<nodes_iterator> nodes() const {
    return {Nodes.begin(), Nodes.end()};
  }
  const Node &getNode(unsigned ID) const { return *Nodes[ID]; }

  unsigned getNumNodes() const { return Nodes.size(); }
  BitVector createNodesBitVector() const { return BitVector(Nodes.size()); }

  const Module &getModule() const { return M; }

  CostType getModuleCost() const { return ModuleCost; }
  CostType getCost(const Function &F) const { return CostMap.at(&F); }

  /// \returns the aggregated cost of all nodes whose IDs are set in \p BV.
  CostType calculateCost(const BitVector &BV) const;

private:
  /// Retrieves the node for \p GV in \p Cache, creating it if it does not
  /// exist yet.
  Node &getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                const GlobalValue &GV);

  const Edge &createEdge(Node &Src, Node &Dst, EdgeKind EK);

  const Module &M;
  const FunctionsCostMap &CostMap;
  CostType ModuleCost;

  SpecificBumpPtrAllocator<Node> NodesPool;

  // Edges are trivially destructible, so a plain BumpPtrAllocator can own
  // them without ever running destructors.
  BumpPtrAllocator EdgesPool;
  static_assert(
      std::is_trivially_destructible_v<Edge>,
      "Edge must be trivially destructible to use the BumpPtrAllocator");

  SmallVector<Node *> Nodes;
};
class SplitGraph::Node {
  friend class SplitGraph;

public:
  Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost,
       bool IsNonCopyable)
      : ID(ID), GV(GV), IndividualCost(IndividualCost),
        IsNonCopyable(IsNonCopyable), IsEntryFnCC(false), IsGraphEntry(false) {
    if (const auto *Fn = dyn_cast<Function>(&GV))
      IsEntryFnCC = AMDGPU::isEntryFunctionCC(Fn->getCallingConv());
  }

  unsigned getID() const { return ID; }
  const Function &getFunction() const { return cast<Function>(GV); }
  CostType getIndividualCost() const { return IndividualCost; }
  bool isNonCopyable() const { return IsNonCopyable; }
  bool isEntryFunctionCC() const { return IsEntryFnCC; }
  bool isGraphEntryPoint() const { return IsGraphEntry; }
  StringRef getName() const { return GV.getName(); }

  bool hasAnyIncomingEdges() const { return IncomingEdges.size(); }
  bool hasAnyIncomingEdgesOfKind(EdgeKind EK) const {
    return any_of(IncomingEdges, [&](const auto *E) { return E->Kind == EK; });
  }

  bool hasAnyOutgoingEdges() const { return OutgoingEdges.size(); }
  bool hasAnyOutgoingEdgesOfKind(EdgeKind EK) const {
    return any_of(OutgoingEdges, [&](const auto *E) { return E->Kind == EK; });
  }

  const EdgesVec &incoming_edges() const { return IncomingEdges; }
  const EdgesVec &outgoing_edges() const { return OutgoingEdges; }

  /// Indirect calls are only followed when collecting the dependencies of
  /// entry functions (kernels); all other nodes treat them as opaque.
  bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); }

  /// Visits this node and all of its dependencies, i.e. every node reachable
  /// through outgoing edges.
  void visitAllDependencies(std::function<void(const Node &)> Visitor) const;

  /// Sets the bits of \p BV for this node and all of its dependencies.
  void getDependencies(BitVector &BV) const {
    visitAllDependencies([&](const Node &N) { BV.set(N.getID()); });
  }

  void markAsGraphEntry() { IsGraphEntry = true; }

private:
  unsigned ID;
  const GlobalValue &GV;
  CostType IndividualCost;
  bool IsNonCopyable : 1;
  bool IsEntryFnCC : 1;
  bool IsGraphEntry : 1;

  EdgesVec IncomingEdges;
  EdgesVec OutgoingEdges;
};
void SplitGraph::Node::visitAllDependencies(
    std::function<void(const Node &)> Visitor) const {
  const bool FollowIndirect = shouldFollowIndirectCalls();

  DenseSet<const Node *> Seen;
  SmallVector<const Node *, 8> WorkList({this});
  while (!WorkList.empty()) {
    const Node *CurN = WorkList.pop_back_val();
    if (auto [It, Inserted] = Seen.insert(CurN); !Inserted)
      continue;

    Visitor(*CurN);

    for (const Edge *E : CurN->outgoing_edges()) {
      if (!FollowIndirect && E->Kind == EdgeKind::IndirectCall)
        continue;
      WorkList.push_back(E->Dst);
    }
  }
}
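
// Sketch of how this traversal is typically consumed (it mirrors
// getDependencies above): collect the IDs of everything a kernel can reach so
// that its whole call tree can be placed in the same partition.
//
//   BitVector Reachable = SG.createNodesBitVector();
//   KernelNode->visitAllDependencies(
//       [&](const SplitGraph::Node &Dep) { Reachable.set(Dep.getID()); });
//
// "KernelNode" is a placeholder for any graph entry point.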
/// If \p I is a call with !callees metadata, collects the possible callees
/// into \p Callees.
/// \returns true if the metadata was present and fully describes the call's
/// targets, false otherwise.
static bool handleCalleesMD(const Instruction &I,
                            SetVector<Function *> &Callees) {
  auto *MD = I.getMetadata(LLVMContext::MD_callees);
  if (!MD)
    return false;

  for (const auto &Op : MD->operands()) {
    Function *Callee = mdconst::extract_or_null<Function>(Op);
    if (!Callee)
      return false;
    Callees.insert(Callee);
  }

  return true;
}
void SplitGraph::buildGraph(CallGraph &CG) {
  SplitModuleTimer SMT("buildGraph", "graph construction");
  LLVM_DEBUG(
      dbgs()
      << "[build graph] constructing graph representation of the input\n");

  // Create one node per function definition and an edge for every direct
  // call. Functions that contain indirect calls, and functions that can be
  // the target of an indirect call, are collected along the way.
  DenseMap<const GlobalValue *, Node *> Cache;
  SmallVector<const Function *> FnsWithIndirectCalls, IndirectlyCallableFns;
  for (const Function &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    SetVector<const Function *> DirectCallees;
    bool CallsExternal = false;
    for (auto &CGEntry : *CG[&Fn]) {
      auto *CGNode = CGEntry.second;
      if (auto *Callee = CGNode->getFunction()) {
        if (!Callee->isDeclaration())
          DirectCallees.insert(Callee);
      } else if (CGNode == CG.getCallsExternalNode())
        CallsExternal = true;
    }

    // The callgraph is incomplete if the function calls external code; look
    // at the instructions to figure out what the missing callees could be.
    if (CallsExternal) {
      LLVM_DEBUG(dbgs() << "  [!] callgraph is incomplete for ";
                 Fn.printAsOperand(dbgs());
                 dbgs() << " - analyzing function\n");

      SetVector<Function *> KnownCallees;
      bool HasUnknownIndirectCall = false;
      for (const Instruction &Inst : instructions(Fn)) {
        const auto *CB = dyn_cast<CallBase>(&Inst);
        if (!CB || CB->getCalledFunction())
          continue;

        if (CB->isInlineAsm()) {
          LLVM_DEBUG(dbgs() << "    found inline assembly\n");
          continue;
        }

        if (handleCalleesMD(Inst, KnownCallees))
          continue;

        // A call site without usable !callees metadata makes the whole list
        // of known callees unreliable.
        KnownCallees.clear();
        HasUnknownIndirectCall = true;
        break;
      }

      if (HasUnknownIndirectCall) {
        LLVM_DEBUG(dbgs() << "    indirect call found\n");
        FnsWithIndirectCalls.push_back(&Fn);
      } else if (!KnownCallees.empty())
        DirectCallees.insert_range(KnownCallees);
    }

    Node &N = getNode(Cache, Fn);
    for (const auto *Callee : DirectCallees)
      createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall);

    if (canBeIndirectlyCalled(Fn))
      IndirectlyCallableFns.push_back(&Fn);
  }

  // Add an indirect-call edge from every function that contains an indirect
  // call to every function that can be indirectly called.
  for (const Function *Fn : FnsWithIndirectCalls) {
    for (const Function *Candidate : IndirectlyCallableFns) {
      Node &Src = getNode(Cache, *Fn);
      Node &Dst = getNode(Cache, *Candidate);
      createEdge(Src, Dst, EdgeKind::IndirectCall);
    }
  }

  // Determine the graph's entry points: all kernels (entry function CCs), and
  // any function that cannot be reached from a kernel through direct calls.
  SmallVector<Node *, 16> CandidateEntryPoints;
  BitVector NodesReachableByKernels = createNodesBitVector();
  for (Node *N : Nodes) {
    if (N->isEntryFunctionCC()) {
      N->markAsGraphEntry();
      N->getDependencies(NodesReachableByKernels);
    } else if (!N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall))
      CandidateEntryPoints.push_back(N);
  }

  for (Node *N : CandidateEntryPoints) {
    if (!NodesReachableByKernels.test(N->getID()))
      N->markAsGraphEntry();
  }
}
bool SplitGraph::verifyGraph() const {
  unsigned ExpectedID = 0;
  // Check every node.
  DenseSet<const Node *> SeenNodes;
  DenseSet<const Function *> SeenFunctionNodes;
  for (const Node *N : Nodes) {
    if (N->getID() != (ExpectedID++)) {
      errs() << "Node IDs are incorrect!\n";
      return false;
    }

    if (!SeenNodes.insert(N).second) {
      errs() << "Node seen more than once!\n";
      return false;
    }

    if (&getNode(N->getID()) != N) {
      errs() << "getNode doesn't return the right node\n";
      return false;
    }

    for (const Edge *E : N->IncomingEdges) {
      if (!E->Src || !E->Dst || (E->Dst != N) ||
          (find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) {
        errs() << "ill-formed incoming edges\n";
        return false;
      }
    }

    for (const Edge *E : N->OutgoingEdges) {
      if (!E->Src || !E->Dst || (E->Src != N) ||
          (find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) {
        errs() << "ill-formed outgoing edges\n";
        return false;
      }
    }

    const Function &Fn = N->getFunction();
    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
      if (N->hasAnyIncomingEdges()) {
        errs() << "Kernels cannot have incoming edges\n";
        return false;
      }
    }

    if (Fn.isDeclaration()) {
      errs() << "declarations shouldn't have nodes!\n";
      return false;
    }

    auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
    if (!Inserted) {
      errs() << "one function has multiple nodes!\n";
      return false;
    }
  }

  if (ExpectedID != Nodes.size()) {
    errs() << "Node IDs out of sync!\n";
    return false;
  }

  if (createNodesBitVector().size() != getNumNodes()) {
    errs() << "nodes bit vector doesn't have the right size!\n";
    return false;
  }

  // Collect all nodes reachable from the graph's entry points, then verify
  // that every function definition has a node and every node is reachable.
  BitVector BV = createNodesBitVector();
  for (const Node *N : Nodes) {
    if (N->isGraphEntryPoint())
      N->getDependencies(BV);
  }

  for (const auto &Fn : M) {
    if (!Fn.isDeclaration()) {
      if (!SeenFunctionNodes.contains(&Fn)) {
        errs() << "Fn has no associated node in the graph!\n";
        return false;
      }
    }
  }

  if (!BV.all()) {
    errs() << "not all nodes are reachable through the graph's entry points!\n";
    return false;
  }

  return true;
}
CostType SplitGraph::calculateCost(const BitVector &BV) const {
  CostType Cost = 0;
  for (unsigned NodeID : BV.set_bits())
    Cost += getNode(NodeID).getIndividualCost();
  return Cost;
}
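
// Worked example: if BV has bits 0, 3 and 7 set, and those nodes have
// individual costs 10, 25 and 5, calculateCost(BV) returns 40. Because each
// partition is a bit vector, a callee shared by several entry points in the
// same partition is only counted once there.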
SplitGraph::Node &
SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                    const GlobalValue &GV) {
  auto &N = Cache[&GV];
  if (N)
    return *N;

  CostType Cost = 0;
  bool NonCopyable = false;
  if (const Function *Fn = dyn_cast<Function>(&GV)) {
    NonCopyable = isNonCopyable(*Fn);
    Cost = CostMap.at(Fn);
  }
  N = new (NodesPool.Allocate()) Node(Nodes.size(), GV, Cost, NonCopyable);
  Nodes.push_back(N);
  return *N;
}
const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst,
                                               EdgeKind EK) {
  const Edge *E = new (EdgesPool.Allocate<Edge>(1)) Edge(&Src, &Dst, EK);
  Src.OutgoingEdges.push_back(E);
  Dst.IncomingEdges.push_back(E);
  return *E;
}
/// A (possibly partial) proposal on how to split the module: one bit vector
/// of node IDs, and its cost, per partition.
class SplitProposal {
public:
  SplitProposal(const SplitGraph &SG, unsigned MaxPartitions) : SG(&SG) {
    Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()});
  }

  void setName(StringRef NewName) { Name = NewName; }
  StringRef getName() const { return Name; }

  const BitVector &operator[](unsigned PID) const {
    return Partitions[PID].second;
  }

  void add(unsigned PID, const BitVector &BV) {
    Partitions[PID].second |= BV;
    updateScore(PID);
  }

  void print(raw_ostream &OS) const;
  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }

  /// Finds the cheapest partition, i.e. the one with the lowest cost so far.
  /// Ties are broken in favor of the highest partition number.
  unsigned findCheapestPartition() const;

  /// Calculates the CodeSizeScore and BottleneckScore of this proposal.
  void calculateScores();

  /// Checks that every node of the graph is covered by this proposal.
  void verifyCompleteness() const;

  /// Aggregated cost of the proposal relative to the original module; only
  /// available after \ref calculateScores was called.
  double getCodeSizeScore() const { return CodeSizeScore; }

  /// Cost of the most expensive partition relative to the original module;
  /// only available after \ref calculateScores was called.
  double getBottleneckScore() const { return BottleneckScore; }

private:
  void updateScore(unsigned PID) {
    TotalCost = 0;
    for (auto &[PCost, Nodes] : Partitions) {
      PCost = SG->calculateCost(Nodes);
      TotalCost += PCost;
    }
  }

  double CodeSizeScore = 0.0;
  double BottleneckScore = 0.0;

  /// Aggregated cost of all partitions.
  CostType TotalCost = 0;

  const SplitGraph *SG = nullptr;
  std::string Name;

  std::vector<std::pair<CostType, BitVector>> Partitions;
};
void SplitProposal::print(raw_ostream &OS) const {
  OS << "[proposal] " << Name << ", total cost:" << TotalCost
     << ", code size score:" << format("%0.3f", CodeSizeScore)
     << ", bottleneck score:" << format("%0.3f", BottleneckScore) << '\n';
  for (const auto &[PID, Part] : enumerate(Partitions)) {
    const auto &[Cost, NodeIDs] = Part;
    OS << "  - P" << PID << " nodes:" << NodeIDs.count() << " cost: " << Cost
       << '|' << formatRatioOf(Cost, SG->getModuleCost()) << "%\n";
  }
}
unsigned SplitProposal::findCheapestPartition() const {
  assert(!Partitions.empty());
  CostType CurCost = std::numeric_limits<CostType>::max();
  unsigned CurPID = InvalidPID;
  for (const auto &[Idx, Part] : enumerate(Partitions)) {
    if (Part.first <= CurCost) {
      CurPID = Idx;
      CurCost = Part.first;
    }
  }
  assert(CurPID != InvalidPID);
  return CurPID;
}
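
// Worked example: with partition costs {P0: 40, P1: 25, P2: 25}, the "<="
// comparison above keeps overwriting the candidate on ties, so the highest
// tied partition number wins and P2 is returned.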
void SplitProposal::calculateScores() {
  if (Partitions.empty())
    return;

  CostType LargestPCost = 0;
  for (auto &[PCost, Nodes] : Partitions) {
    if (PCost > LargestPCost)
      LargestPCost = PCost;
  }

  CostType ModuleCost = SG->getModuleCost();
  CodeSizeScore = double(TotalCost) / ModuleCost;
  assert(CodeSizeScore >= 0.0);

  BottleneckScore = double(LargestPCost) / ModuleCost;

  // Round both scores up to two decimals.
  CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0;
  BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0;
}
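
// Worked example (numbers invented): say the module cost is 100 and a
// proposal produces four partitions costing 30, 30, 25 and 25 (total 110,
// because some callees were duplicated). Then CodeSizeScore = 110 / 100 =
// 1.10 (10% duplication) and BottleneckScore = 30 / 100 = 0.30 (the largest
// partition is 30% of the module). Both are rounded up to two decimals by the
// std::ceil trick above.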
void SplitProposal::verifyCompleteness() const {
  if (Partitions.empty())
    return;

  BitVector Result = Partitions[0].second;
  for (const auto &P : drop_begin(Partitions))
    Result |= P.second;
  assert(Result.all() && "some nodes are missing from this proposal!");
}
/// Recursive search for a good splitting proposal: worklist entries (clusters
/// of nodes) are assigned to partitions one by one, branching out on the
/// interesting alternatives and submitting every complete proposal.
class RecursiveSearchSplitting {
public:
  using SubmitProposalFn = function_ref<void(SplitProposal)>;

  RecursiveSearchSplitting(const SplitGraph &SG, unsigned NumParts,
                           SubmitProposalFn SubmitProposal);

  void run();

private:
  struct WorkListEntry {
    WorkListEntry(const BitVector &BV) : Cluster(BV) {}

    unsigned NumNonEntryNodes = 0;
    CostType TotalCost = 0;
    CostType CostExcludingGraphEntryPoints = 0;
    BitVector Cluster;
  };

  /// Groups the graph's entry points into clusters and sorts the worklist.
  void setupWorkList();

  /// Assigns the worklist entry at \p Idx (and all following entries) to a
  /// partition of \p SP, recursing when multiple choices look interesting.
  void pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP);

  /// \returns the partition of \p SP that already contains the largest share
  /// of \p Entry's nodes, along with the cost of that shared portion.
  std::pair<unsigned, CostType>
  findMostSimilarPartition(const WorkListEntry &Entry, const SplitProposal &SP);

  const SplitGraph &SG;
  unsigned NumParts;
  SubmitProposalFn SubmitProposal;

  /// Clusters whose cost exceeds this threshold are considered "large" and
  /// may be merged into an overlapping partition instead of being duplicated.
  CostType LargeClusterThreshold = 0;
  unsigned NumProposalsSubmitted = 0;
  SmallVector<WorkListEntry> WorkList;
};
RecursiveSearchSplitting::RecursiveSearchSplitting(
    const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal)
    : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) {
  // The search is exponential in the depth, so refuse unreasonable values
  // outright.
  if (MaxDepth > 16)
    report_fatal_error("[amdgpu-split-module] search depth of " +
                       Twine(MaxDepth) + " is too high!");

  LargeClusterThreshold =
      (LargeFnFactor != 0.0)
          ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor))
          : std::numeric_limits<CostType>::max();
  LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at "
                    << LargeClusterThreshold << "\n");
}
void RecursiveSearchSplitting::run() {
  {
    SplitModuleTimer SMT("recursive_search_prepare", "preparing worklist");
    setupWorkList();
  }

  {
    SplitModuleTimer SMT("recursive_search_pick", "partitioning");
    SplitProposal SP(SG, NumParts);
    pickPartition(0, 0, SP);
  }
}
void RecursiveSearchSplitting::setupWorkList() {
  // Merge graph entry points that share a non-copyable dependency into a
  // single worklist entry, so the shared dependency is never duplicated.
  EquivalenceClasses<unsigned> NodeEC;
  for (const SplitGraph::Node *N : SG.nodes()) {
    if (!N->isGraphEntryPoint())
      continue;

    NodeEC.insert(N->getID());
    N->visitAllDependencies([&](const SplitGraph::Node &Dep) {
      if (&Dep != N && Dep.isNonCopyable())
        NodeEC.unionSets(N->getID(), Dep.getID());
    });
  }

  for (const auto &Node : NodeEC) {
    if (!Node->isLeader())
      continue;

    BitVector Cluster = SG.createNodesBitVector();
    for (unsigned M : NodeEC.members(*Node)) {
      const SplitGraph::Node &N = SG.getNode(M);
      if (N.isGraphEntryPoint())
        N.getDependencies(Cluster);
    }
    WorkList.emplace_back(std::move(Cluster));
  }

  // Calculate the costs of each worklist entry.
  for (WorkListEntry &Entry : WorkList) {
    for (unsigned NodeID : Entry.Cluster.set_bits()) {
      const SplitGraph::Node &N = SG.getNode(NodeID);
      const CostType Cost = N.getIndividualCost();

      Entry.TotalCost += Cost;
      if (!N.isGraphEntryPoint()) {
        Entry.CostExcludingGraphEntryPoints += Cost;
        ++Entry.NumNonEntryNodes;
      }
    }
  }

  // Sort the worklist from most to least expensive entry, with deterministic
  // tie-breakers.
  stable_sort(WorkList, [](const WorkListEntry &A, const WorkListEntry &B) {
    if (A.TotalCost != B.TotalCost)
      return A.TotalCost > B.TotalCost;

    if (A.CostExcludingGraphEntryPoints != B.CostExcludingGraphEntryPoints)
      return A.CostExcludingGraphEntryPoints > B.CostExcludingGraphEntryPoints;

    if (A.NumNonEntryNodes != B.NumNonEntryNodes)
      return A.NumNonEntryNodes > B.NumNonEntryNodes;

    return A.Cluster.count() > B.Cluster.count();
  });

  LLVM_DEBUG({
    dbgs() << "[recursive search] worklist:\n";
    for (const auto &[Idx, Entry] : enumerate(WorkList)) {
      dbgs() << "  - [" << Idx << "]: ";
      for (unsigned NodeID : Entry.Cluster.set_bits())
        dbgs() << NodeID << " ";
      dbgs() << "(total_cost:" << Entry.TotalCost
             << ", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints
             << ")\n";
    }
  });
}
void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
                                             SplitProposal SP) {
  while (Idx < WorkList.size()) {
    // Step 1: Determine candidate PIDs.
    const WorkListEntry &Entry = WorkList[Idx];
    const BitVector &Cluster = Entry.Cluster;

    // The default choice is load balancing: assign to the least pressured
    // partition.
    const unsigned CheapestPID = SP.findCheapestPartition();
    assert(CheapestPID != InvalidPID);

    // The alternative is the partition that already contains the most of this
    // cluster's dependencies.
    const auto [MostSimilarPID, SimilarDepsCost] =
        findMostSimilarPartition(Entry, SP);

    // Step 2: Pick a single PID, or leave SinglePIDToTry invalid to branch
    // out and explore both candidates.
    unsigned SinglePIDToTry = InvalidPID;
    if (MostSimilarPID == InvalidPID)
      SinglePIDToTry = CheapestPID;
    else if (MostSimilarPID == CheapestPID)
      SinglePIDToTry = CheapestPID;
    else if (Depth >= MaxDepth) {
      if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) {
        // Large cluster: merge it into the most similar partition if enough
        // of its code is already present there.
        assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints);
        const double Ratio = static_cast<double>(SimilarDepsCost) /
                             Entry.CostExcludingGraphEntryPoints;
        assert(Ratio >= 0.0 && Ratio <= 1.0);
        if (Ratio > LargeFnOverlapForMerge)
          SinglePIDToTry = MostSimilarPID;
      } else
        SinglePIDToTry = CheapestPID;
    }

    // Step 3: Either assign the cluster and keep iterating, or branch out.
    if (SinglePIDToTry != InvalidPID) {
      LLVM_DEBUG(dbgs() << Idx << "=P" << SinglePIDToTry << ' ');
      SP.add(SinglePIDToTry, Cluster);
      ++Idx;
      continue;
    }

    assert(MostSimilarPID != InvalidPID);

    // Explore the load-balancing candidate.
    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs() << " [lb] " << Idx << "=P" << CheapestPID << "? ");
      BranchSP.add(CheapestPID, Cluster);
      pickPartition(Depth + 1, Idx + 1, BranchSP);
    }

    // Explore the most-similar candidate.
    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs() << " [ms] " << Idx << "=P" << MostSimilarPID << "? ");
      BranchSP.add(MostSimilarPID, Cluster);
      pickPartition(Depth + 1, Idx + 1, BranchSP);
    }

    return;
  }

  assert(Idx == WorkList.size());
  assert(NumProposalsSubmitted <= (2u << MaxDepth) &&
         "Search got out of bounds?");
  SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" +
             std::to_string(NumProposalsSubmitted++));
  SubmitProposal(SP);
}
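
// Shape of the search: the proposal is only forked when the cheapest and the
// most similar partition disagree and the depth budget allows it, so the
// number of submitted proposals stays within the 2^(MaxDepth+1) bound
// asserted above. With -amdgpu-module-splitting-max-depth=0 the walk above
// degenerates into a purely greedy assignment.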
std::pair<unsigned, CostType>
RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry,
                                                   const SplitProposal &SP) {
  // Only non-entry nodes can already be present in another partition.
  if (!Entry.NumNonEntryNodes)
    return {InvalidPID, 0};

  unsigned ChosenPID = InvalidPID;
  CostType ChosenCost = 0;
  for (unsigned PID = 0; PID < NumParts; ++PID) {
    BitVector BV = SP[PID];
    BV &= Entry.Cluster;

    // Skip partitions that share nothing with this cluster.
    if (BV.none())
      continue;

    const CostType Cost = SG.calculateCost(BV);

    if (ChosenPID == InvalidPID || ChosenCost < Cost ||
        (ChosenCost == Cost && PID > ChosenPID)) {
      ChosenPID = PID;
      ChosenCost = Cost;
    }
  }

  return {ChosenPID, ChosenCost};
}
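
// Worked example: if a cluster's non-entry nodes cost 200 in total, and the
// part of it already present in P0 costs 40 while the part in P2 costs 120,
// then {2, 120} is returned. Back in pickPartition, the overlap ratio
// 120 / 200 = 0.6 is what gets compared against LargeFnOverlapForMerge.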
/// Helper for GraphTraits: maps an edge to its destination node.
const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) {
  return E->Dst;
}

using SplitGraphEdgeDstIterator =
    mapped_iterator<SplitGraph::edges_iterator, decltype(&mapEdgeToDst)>;

template <> struct GraphTraits<SplitGraph> {
  using NodeRef = const SplitGraph::Node *;
  using EdgeRef = const SplitGraph::Edge *;
  using nodes_iterator = SplitGraph::nodes_iterator;
  using ChildIteratorType = SplitGraphEdgeDstIterator;
  using ChildEdgeIteratorType = SplitGraph::edges_iterator;

  static NodeRef getEntryNode(NodeRef N) { return N; }

  static ChildIteratorType child_begin(NodeRef Ref) {
    return {Ref->outgoing_edges().begin(), mapEdgeToDst};
  }
  static ChildIteratorType child_end(NodeRef Ref) {
    return {Ref->outgoing_edges().end(), mapEdgeToDst};
  }

  static nodes_iterator nodes_begin(const SplitGraph &G) {
    return G.nodes().begin();
  }
  static nodes_iterator nodes_end(const SplitGraph &G) {
    return G.nodes().end();
  }
};

template <> struct DOTGraphTraits<SplitGraph> : public DefaultDOTGraphTraits {
  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  static std::string getGraphName(const SplitGraph &SG) {
    return SG.getModule().getName().str();
  }

  std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG) {
    return N->getName().str();
  }

  static std::string getNodeDescription(const SplitGraph::Node *N,
                                        const SplitGraph &SG) {
    std::string Result;
    if (N->isEntryFunctionCC())
      Result += "entry-fn-cc ";
    if (N->isNonCopyable())
      Result += "non-copyable ";
    Result += "cost:" + std::to_string(N->getIndividualCost());
    return Result;
  }

  static std::string getNodeAttributes(const SplitGraph::Node *N,
                                       const SplitGraph &SG) {
    return N->hasAnyIncomingEdges() ? "" : "color=\"red\"";
  }

  static std::string getEdgeAttributes(const SplitGraph::Node *N,
                                       SplitGraphEdgeDstIterator EI,
                                       const SplitGraph &SG) {
    switch ((*EI.getCurrent())->Kind) {
    case SplitGraph::EdgeKind::DirectCall:
      return "";
    case SplitGraph::EdgeKind::IndirectCall:
      return "style=\"dashed\"";
    }
    llvm_unreachable("Unknown SplitGraph::EdgeKind!");
  }
};
/// \returns true if \p GV should be conservatively imported into every
/// partition: global variables with local linkage, and aliases.
static bool needsConservativeImport(const GlobalValue *GV) {
  if (const auto *Var = dyn_cast<GlobalVariable>(GV))
    return Var->hasLocalLinkage();
  return isa<GlobalAlias>(GV);
}
static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M,
                                  unsigned PartCost, unsigned ModuleCost) {
  OS << "*** Partition P" << N << " ***\n";

  for (const auto &Fn : M) {
    if (!Fn.isDeclaration())
      OS << " - [function] " << Fn.getName() << "\n";
  }

  for (const auto &GV : M.globals()) {
    if (GV.hasInitializer())
      OS << " - [global] " << GV.getName() << "\n";
  }

  OS << "Partition contains " << formatRatioOf(PartCost, ModuleCost)
     << "% of the source\n";
}
static void evaluateProposal(SplitProposal &Best, SplitProposal New) {
  SplitModuleTimer SMT("proposal_evaluation", "proposal ranking algorithm");

  New.verifyCompleteness();
  if (DebugProposalSearch)
    New.print(dbgs());

  const double CurBScore = Best.getBottleneckScore();
  const double CurCSScore = Best.getCodeSizeScore();
  const double NewBScore = New.getBottleneckScore();
  const double NewCSScore = New.getCodeSizeScore();

  // The bottleneck score is the most important one; the code size score only
  // breaks exact ties.
  bool IsBest = false;
  if (NewBScore < CurBScore)
    IsBest = true;
  else if (NewBScore == CurBScore)
    IsBest = (NewCSScore < CurCSScore);

  if (IsBest)
    Best = std::move(New);

  if (DebugProposalSearch) {
    if (IsBest)
      dbgs() << "[search] new best proposal!\n";
    else
      dbgs() << "[search] discarding - not profitable\n";
  }
}
/// Clones the whole module into a single output, used when no splitting is
/// possible or beneficial.
static std::unique_ptr<Module> cloneAll(const Module &M) {
  ValueToValueMapTy VMap;
  return CloneModule(M, VMap, [&](const GlobalValue *GV) { return true; });
}
/// Writes the dotgraph representation of \p SG to the file requested through
/// -amdgpu-module-splitting-print-module-dotcfg, if any.
static void writeDOTGraph(const SplitGraph &SG) {
  if (ModuleDotCfgOutput.empty())
    return;

  std::error_code EC;
  raw_fd_ostream OS(ModuleDotCfgOutput, EC);
  if (EC) {
    errs() << "[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput
           << "' - DOTGraph will not be printed\n";
    return;
  }
  WriteGraph(OS, SG, /*ShortNames=*/false, SG.getModule().getName());
}
static void splitAMDGPUModule(
    GetTTIFn GetTTI, Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
  // Externalize local functions whose address is taken (unless disabled), so
  // indirect call targets resolve to a single definition across partitions.
  if (!NoExternalizeOnAddrTaken) {
    for (auto &Fn : M) {
      if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) {
        LLVM_DEBUG(dbgs() << "[externalize] "; Fn.printAsOperand(dbgs());
                   dbgs() << " because its address is taken\n");
        externalize(Fn);
      }
    }
  }

  // Externalize local global variables (unless disabled) so their
  // initializers do not have to be duplicated in every partition.
  if (!NoExternalizeGlobals) {
    for (auto &GV : M.globals()) {
      if (GV.hasLocalLinkage())
        LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n');
      externalize(GV);
    }
  }

  FunctionsCostMap FnCosts;
  const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);

  CallGraph CG(M);
  SplitGraph SG(M, FnCosts, ModuleCost);
  SG.buildGraph(CG);

  if (SG.empty()) {
    LLVM_DEBUG(
        dbgs()
        << "[!] no nodes in graph, input is empty - no splitting possible\n");
    ModuleCallback(cloneAll(M));
    return;
  }

  LLVM_DEBUG({
    dbgs() << "[graph] nodes:\n";
    for (const SplitGraph::Node *N : SG.nodes()) {
      dbgs() << "  - [" << N->getID() << "]: " << N->getName() << " "
             << (N->isGraphEntryPoint() ? "(entry)" : "") << " "
             << (N->isNonCopyable() ? "(noncopyable)" : "") << "\n";
    }
  });

  writeDOTGraph(SG);

  std::optional<SplitProposal> Proposal;
  const auto EvaluateProposal = [&](SplitProposal SP) {
    SP.calculateScores();
    if (!Proposal)
      Proposal = std::move(SP);
    else
      evaluateProposal(*Proposal, std::move(SP));
  };

  RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run();
  LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: "
                                  << Proposal->getName() << "\n";);

  if (!Proposal) {
    LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n");
    ModuleCallback(cloneAll(M));
    return;
  }

  std::optional<raw_fd_ostream> SummariesOS;
  if (!PartitionSummariesOutput.empty()) {
    std::error_code EC;
    SummariesOS.emplace(PartitionSummariesOutput, EC);
    if (EC)
      errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
             << "' - Partition summaries will not be printed\n";
  }

  // The first partition that gets created imports all globals; every other
  // partition only imports what it conservatively needs.
  bool ImportAllGVs = true;

  for (unsigned PID = 0; PID < NumParts; ++PID) {
    SplitModuleTimer SMT2("modules_creation",
                          "creating modules for each partition");

    DenseSet<const Function *> FnsInPart;
    for (unsigned NodeID : (*Proposal)[PID].set_bits())
      FnsInPart.insert(&SG.getNode(NodeID).getFunction());

    // Skip empty partitions.
    if (FnsInPart.empty()) {
      LLVM_DEBUG(dbgs() << "P" << PID << " is empty, not creating module\n");
      continue;
    }

    ValueToValueMapTy VMap;
    CostType PartCost = 0;
    std::unique_ptr<Module> MPart(
        CloneModule(M, VMap, [&](const GlobalValue *GV) {
          // Functions go into their assigned partition; other globals are
          // imported everywhere (first partition) or conservatively.
          if (const auto *Fn = dyn_cast<Function>(GV)) {
            if (FnsInPart.contains(Fn)) {
              PartCost += SG.getCost(*Fn);
              return true;
            }
            return false;
          }
          return ImportAllGVs || needsConservativeImport(GV);
        }));
    ImportAllGVs = false;

    // Remove conservatively imported globals that ended up unused.
    for (auto &GV : make_early_inc_range(MPart->global_values())) {
      if (needsConservativeImport(&GV) && GV.use_empty())
        GV.eraseFromParent();
    }

    if (SummariesOS)
      printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost);

    LLVM_DEBUG(
        printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost));

    ModuleCallback(std::move(MPart));
  }
}
PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
                                             ModuleAnalysisManager &MAM) {
  SplitModuleTimer SMT(
      "total", "total pass runtime (incl. potentially waiting for lockfile)");

  FunctionAnalysisManager &FAM =
      MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  if (UseLockFile) {
    // Serialize execution of this pass across processes by taking a lock file
    // in the system's temporary directory, so debug output from concurrent
    // compilations is not interleaved.
    /* ... lock file creation and acquisition ... */
    if (/*...*/) {
      LLVM_DEBUG(
          dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
                    "output may be mangled by other processes\n");
    } else if (!Owned) {
      // Another process owns the lock; wait for it to be released before
      // proceeding.
      /* ... */
      LLVM_DEBUG(
          dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
                    "output may be mangled by other processes\n");
    }

    splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
    return PreservedAnalyses::none();
  }

  splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
  return PreservedAnalyses::none();
}