1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
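//
// Illustrative sketch (the ids and values are placeholders, not from a real
// profile): an allocation call carries !memprof metadata with one MIB per
// profiled context plus !callsite metadata for its own inlined frame(s), e.g.
//
//   %p = call ptr @_Znam(i64 10), !memprof !0, !callsite !5
//   !0 = !{!1, !3}
//   !1 = !{!2, !"notcold"}   ; MIB: call stack !2, behavior "notcold"
//   !2 = !{i64 123, i64 456}
//   !3 = !{!4, !"cold"}      ; MIB: call stack !4, behavior "cold"
//   !4 = !{i64 123, i64 789}
//   !5 = !{i64 123}
//
// After cloning exposes the cold context, the allocation call in the cold
// clone is annotated (e.g. with a "memprof"="cold" attribute) for the later
// transformation to a hot/cold allocator interface.
//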
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/Module.h"
40#include "llvm/Pass.h"
44#include "llvm/Support/SHA1.h"
46#include "llvm/Transforms/IPO.h"
50#include <deque>
51#include <sstream>
52#include <unordered_map>
53#include <vector>
54using namespace llvm;
55using namespace llvm::memprof;
56
57#define DEBUG_TYPE "memprof-context-disambiguation"
58
59STATISTIC(FunctionClonesAnalysis,
60 "Number of function clones created during whole program analysis");
61STATISTIC(FunctionClonesThinBackend,
62 "Number of function clones created during ThinLTO backend");
63STATISTIC(FunctionsClonedThinBackend,
64 "Number of functions that had clones created during ThinLTO backend");
65STATISTIC(
66 FunctionCloneDuplicatesThinBackend,
67 "Number of function clone duplicates detected during ThinLTO backend");
68STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
69 "cloned) during whole program analysis");
70STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
71 "during whole program analysis");
72STATISTIC(AllocTypeNotColdThinBackend,
73 "Number of not cold static allocations (possibly cloned) during "
74 "ThinLTO backend");
75STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
76 "(possibly cloned) during ThinLTO backend");
77STATISTIC(OrigAllocsThinBackend,
78 "Number of original (not cloned) allocations with memprof profiles "
79 "during ThinLTO backend");
80STATISTIC(
81 AllocVersionsThinBackend,
82 "Number of allocation versions (including clones) during ThinLTO backend");
83STATISTIC(MaxAllocVersionsThinBackend,
84 "Maximum number of allocation versions created for an original "
85 "allocation during ThinLTO backend");
86STATISTIC(UnclonableAllocsThinBackend,
87 "Number of unclonable ambigous allocations during ThinLTO backend");
88STATISTIC(RemovedEdgesWithMismatchedCallees,
89 "Number of edges removed due to mismatched callees (profiled vs IR)");
90STATISTIC(FoundProfiledCalleeCount,
91 "Number of profiled callees found via tail calls");
92STATISTIC(FoundProfiledCalleeDepth,
93 "Aggregate depth of profiled callees found via tail calls");
94STATISTIC(FoundProfiledCalleeMaxDepth,
95 "Maximum depth of profiled callees found via tail calls");
96STATISTIC(FoundProfiledCalleeNonUniquelyCount,
97 "Number of profiled callees found via multiple tail call chains");
98STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
99STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
100STATISTIC(NonNewMergedNodes, "Number of non new nodes used during merging");
101STATISTIC(MissingAllocForContextId,
102 "Number of missing alloc nodes for context ids");
103STATISTIC(SkippedCallsCloning,
104 "Number of calls skipped during cloning due to unexpected operand");
105STATISTIC(MismatchedCloneAssignments,
106 "Number of callsites assigned to call multiple non-matching clones");
107STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
108STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
109STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
110
112 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
113 cl::value_desc("filename"),
114 cl::desc("Specify the path prefix of the MemProf dot files."));
115
116static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
117 cl::Hidden,
118 cl::desc("Export graph to dot files."));
119
120// TODO: Remove this option once new handling is validated more widely.
122 "memprof-merge-iteration", cl::init(true), cl::Hidden,
123 cl::desc("Iteratively apply merging on a node to catch new callers"));
124
125// How much of the graph to export to dot.
126enum DotScope {
127 All, // The full CCG graph.
128 Alloc, // Only contexts for the specified allocation.
129 Context, // Only the specified context.
130};
131
133 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
136 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
138 "Export only nodes with contexts feeding given "
139 "-memprof-dot-alloc-id"),
140 clEnumValN(DotScope::Context, "context",
141 "Export only nodes with given -memprof-dot-context-id")));
142
143static cl::opt<unsigned>
144 AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
145 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
146 "or to highlight if -memprof-dot-scope=all"));
147
149 "memprof-dot-context-id", cl::init(0), cl::Hidden,
150 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
151 "highlight otherwise"));
152
153static cl::opt<bool>
154 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
155 cl::desc("Dump CallingContextGraph to stdout after each stage."));
156
157static cl::opt<bool>
158 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
159 cl::desc("Perform verification checks on CallingContextGraph."));
160
161static cl::opt<bool>
162 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
163 cl::desc("Perform frequent verification checks on nodes."));
164
166 "memprof-import-summary",
167 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
168 cl::Hidden);
169
170static cl::opt<unsigned>
171 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
172 cl::Hidden,
173 cl::desc("Max depth to recursively search for missing "
174 "frames through tail calls."));
175
176// Optionally enable cloning of callsites involved with recursive cycles
178 "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
179 cl::desc("Allow cloning of callsites involved in recursive cycles"));
180
182 "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
183 cl::desc("Allow cloning of contexts through recursive cycles"));
184
185// Generally this is needed for correct assignment of allocation clones to
186// function clones, however, allow it to be disabled for debugging while the
187// functionality is new and being tested more widely.
188static cl::opt<bool>
189 MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
190 cl::desc("Merge clones before assigning functions"));
191
192// When disabled, try to detect and prevent cloning of recursive contexts.
193// This is only necessary until we support cloning through recursive cycles.
194// Leave on by default for now, as disabling requires a bit of compile time
195// overhead and doesn't affect correctness; it will just inflate the cold
196// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
197static cl::opt<bool> AllowRecursiveContexts(
198 "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
199 cl::desc("Allow cloning of contexts having recursive cycles"));
200
201// Set the minimum absolute count threshold for allowing inlining of indirect
202// calls promoted during cloning.
204 "memprof-icp-noinline-threshold", cl::init(2), cl::Hidden,
205 cl::desc("Minimum absolute count for promoted target to be inlinable"));
206
207namespace llvm {
209 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
210 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
211
212// Indicate we are linking with an allocator that supports hot/cold operator
213// new interfaces.
215 "supports-hot-cold-new", cl::init(false), cl::Hidden,
216 cl::desc("Linking with hot/cold operator new interfaces"));
217
219 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
220 cl::desc(
221 "Require target function definition when promoting indirect calls"));
222
223extern cl::opt<bool> MemProfReportHintedSizes;
224extern cl::opt<unsigned> MinClonedColdBytePercent;
225
226} // namespace llvm
227
228namespace {
229/// CRTP base for graphs built from either IR or ThinLTO summary index.
230///
231/// The graph represents the call contexts in all memprof metadata on allocation
232/// calls, with nodes for the allocations themselves, as well as for the calls
233/// in each context. The graph is initially built from the allocation memprof
234/// metadata (or summary) MIBs. It is then updated to match calls that carry
235/// callsite metadata onto those nodes, reflecting any inlining performed on
236/// those calls.
237///
238/// Each MIB (representing an allocation's call context with allocation
239/// behavior) is assigned a unique context id during the graph build. The edges
240/// and nodes in the graph are decorated with the context ids they carry. This
241/// is used to correctly update the graph when cloning is performed so that we
242/// can uniquify the context for a single (possibly cloned) allocation.
243template <typename DerivedCCG, typename FuncTy, typename CallTy>
244class CallsiteContextGraph {
245public:
246 CallsiteContextGraph() = default;
247 CallsiteContextGraph(const CallsiteContextGraph &) = default;
248 CallsiteContextGraph(CallsiteContextGraph &&) = default;
249
250 /// Main entry point to perform analysis and transformations on graph.
251 bool process();
252
253 /// Perform cloning on the graph necessary to uniquely identify the allocation
254 /// behavior of an allocation based on its context.
255 void identifyClones();
256
257 /// Assign callsite clones to functions, cloning functions as needed to
258 /// accommodate the combinations of their callsite clones reached by callers.
259 /// For regular LTO this clones functions and callsites in the IR, but for
260 /// ThinLTO the cloning decisions are noted in the summaries and later applied
261 /// in applyImport.
262 bool assignFunctions();
263
264 void dump() const;
265 void print(raw_ostream &OS) const;
266 void printTotalSizes(raw_ostream &OS) const;
267
268 friend raw_ostream &operator<<(raw_ostream &OS,
269 const CallsiteContextGraph &CCG) {
270 CCG.print(OS);
271 return OS;
272 }
273
274 friend struct GraphTraits<
275 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
276 friend struct DOTGraphTraits<
277 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
278
279 void exportToDot(std::string Label) const;
280
281 /// Represents a function clone via FuncTy pointer and clone number pair.
282 struct FuncInfo final
283 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
284 using Base = std::pair<FuncTy *, unsigned>;
285 FuncInfo(const Base &B) : Base(B) {}
286 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
287 explicit operator bool() const { return this->first != nullptr; }
288 FuncTy *func() const { return this->first; }
289 unsigned cloneNo() const { return this->second; }
290 };
291
292 /// Represents a callsite clone via CallTy and clone number pair.
293 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
294 using Base = std::pair<CallTy, unsigned>;
295 CallInfo(const Base &B) : Base(B) {}
296 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
297 : Base(Call, CloneNo) {}
298 explicit operator bool() const { return (bool)this->first; }
299 CallTy call() const { return this->first; }
300 unsigned cloneNo() const { return this->second; }
301 void setCloneNo(unsigned N) { this->second = N; }
302 void print(raw_ostream &OS) const {
303 if (!operator bool()) {
304 assert(!cloneNo());
305 OS << "null Call";
306 return;
307 }
308 call()->print(OS);
309 OS << "\t(clone " << cloneNo() << ")";
310 }
311 void dump() const {
312 print(dbgs());
313 dbgs() << "\n";
314 }
315 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
316 Call.print(OS);
317 return OS;
318 }
319 };
320
321 struct ContextEdge;
322
323 /// Node in the Callsite Context Graph
324 struct ContextNode {
325 // Assigned to nodes as they are created, useful for debugging.
326 unsigned NodeId = 0;
327
328 // Keep this for now since in the IR case, where we have an Instruction*,
329 // it is not as immediately discoverable. Used for printing richer
330 // information when dumping the graph.
331 bool IsAllocation;
332
333 // Keeps track of when the Call was reset to null because there was
334 // recursion.
335 bool Recursive = false;
336
337 // This will be formed by ORing together the AllocationType enum values
338 // for contexts including this node.
339 uint8_t AllocTypes = 0;
340
341 // The corresponding allocation or interior call. This is the primary call
342 // for which we have created this node.
343 CallInfo Call;
344
345 // List of other calls that can be treated the same as the primary call
346 // through cloning. I.e. located in the same function and have the same
347 // (possibly pruned) stack ids. They will be updated the same way as the
348 // primary call when assigning to function clones.
349 SmallVector<CallInfo, 0> MatchingCalls;
350
351 // For alloc nodes this is a unique id assigned when constructed, and for
352 // callsite stack nodes it is the original stack id when the node is
353 // constructed from the memprof MIB metadata on the alloc nodes. Note that
354 // this is only used when matching callsite metadata onto the stack nodes
355 // created when processing the allocation memprof MIBs, and for labeling
356 // nodes in the dot graph. Therefore we don't bother to assign a value for
357 // clones.
358 uint64_t OrigStackOrAllocId = 0;
359
360 // Edges to all callees in the profiled call stacks.
361 // TODO: Should this be a map (from Callee node) for more efficient lookup?
362 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
363
364 // Edges to all callers in the profiled call stacks.
365 // TODO: Should this be a map (from Caller node) for more efficient lookup?
366 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
367
368 // Returns true if we also need to look at the caller edges when determining
369 // the node context ids and allocation type.
370 bool useCallerEdgesForContextInfo() const {
371 // Typically if the callee edges are empty either the caller edges are
372 // also empty, or this is an allocation (leaf node). However, if we are
373 // allowing recursive callsites and contexts this will be violated for
374 // incompletely cloned recursive cycles.
375 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
376 (AllowRecursiveCallsites && AllowRecursiveContexts));
377 // When cloning for a recursive context, we might be in the
378 // midst of cloning for a recurrence and have moved context ids off of a
379 // caller edge onto the clone but not yet off of the incoming caller
380 // (back) edge. If we don't look at those we miss the fact that this node
381 // still has context ids of interest.
382 return IsAllocation || CloneRecursiveContexts;
383 }
384
385 // Compute the context ids for this node from the union of its edge context
386 // ids.
387 DenseSet<uint32_t> getContextIds() const {
388 unsigned Count = 0;
389 // Compute the number of ids for reserve below. In general we only need to
390 // look at one set of edges, typically the callee edges, since other than
391 // allocations and in some cases during recursion cloning, all the context
392 // ids on the callers should also flow out via callee edges.
393 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
394 Count += Edge->getContextIds().size();
395 DenseSet<uint32_t> ContextIds;
396 ContextIds.reserve(Count);
397 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
398 CalleeEdges, useCallerEdgesForContextInfo()
399 ? CallerEdges
400 : std::vector<std::shared_ptr<ContextEdge>>());
401 for (const auto &Edge : Edges)
402 ContextIds.insert_range(Edge->getContextIds());
403 return ContextIds;
404 }
405
406 // Compute the allocation type for this node from the OR of its edge
407 // allocation types.
408 uint8_t computeAllocType() const {
409 uint8_t BothTypes =
410 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
411 uint8_t AllocType = (uint8_t)AllocationType::None;
412 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
413 CalleeEdges, useCallerEdgesForContextInfo()
414 ? CallerEdges
415 : std::vector<std::shared_ptr<ContextEdge>>());
416 for (const auto &Edge : Edges) {
417 AllocType |= Edge->AllocTypes;
418 // Bail early if alloc type reached both, no further refinement.
419 if (AllocType == BothTypes)
420 return AllocType;
421 }
422 return AllocType;
423 }
424
425 // The context ids set for this node is empty if its edge context ids are
426 // also all empty.
427 bool emptyContextIds() const {
428 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
429 CalleeEdges, useCallerEdgesForContextInfo()
430 ? CallerEdges
431 : std::vector<std::shared_ptr<ContextEdge>>());
432 for (const auto &Edge : Edges) {
433 if (!Edge->getContextIds().empty())
434 return false;
435 }
436 return true;
437 }
438
439 // List of clones of this ContextNode, initially empty.
440 std::vector<ContextNode *> Clones;
441
442 // If a clone, points to the original uncloned node.
443 ContextNode *CloneOf = nullptr;
444
445 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
446
447 ContextNode(bool IsAllocation, CallInfo C)
448 : IsAllocation(IsAllocation), Call(C) {}
449
450 void addClone(ContextNode *Clone) {
451 if (CloneOf) {
452 CloneOf->Clones.push_back(Clone);
453 Clone->CloneOf = CloneOf;
454 } else {
455 Clones.push_back(Clone);
456 assert(!Clone->CloneOf);
457 Clone->CloneOf = this;
458 }
459 }
460
461 ContextNode *getOrigNode() {
462 if (!CloneOf)
463 return this;
464 return CloneOf;
465 }
466
467 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
468 unsigned int ContextId);
469
470 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
471 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
472 void eraseCalleeEdge(const ContextEdge *Edge);
473 void eraseCallerEdge(const ContextEdge *Edge);
474
475 void setCall(CallInfo C) { Call = C; }
476
477 bool hasCall() const { return (bool)Call.call(); }
478
479 void printCall(raw_ostream &OS) const { Call.print(OS); }
480
481 // True if this node was effectively removed from the graph, in which case
482 // it should have an allocation type of None and empty context ids.
483 bool isRemoved() const {
484 // Typically if the callee edges are empty either the caller edges are
485 // also empty, or this is an allocation (leaf node). However, if we are
486 // allowing recursive callsites and contexts this will be violated for
487 // incompletely cloned recursive cycles.
488 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
489 (AllocTypes == (uint8_t)AllocationType::None) ==
490 emptyContextIds());
491 return AllocTypes == (uint8_t)AllocationType::None;
492 }
493
494 void dump() const;
495 void print(raw_ostream &OS) const;
496
497 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
498 Node.print(OS);
499 return OS;
500 }
501 };
502
503 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
504 /// callee.
505 struct ContextEdge {
506 ContextNode *Callee;
507 ContextNode *Caller;
508
509 // This will be formed by ORing together the AllocationType enum values
510 // for contexts including this edge.
511 uint8_t AllocTypes = 0;
512
513 // Set just before initiating cloning when cloning of recursive contexts is
514 // enabled. Used to defer cloning of backedges until we have done cloning of
515 // the callee node for non-backedge caller edges. This exposes cloning
516 // opportunities through the backedge of the cycle.
517 // TODO: Note that this is not updated during cloning, and it is unclear
518 // whether that would be needed.
519 bool IsBackedge = false;
520
521 // The set of IDs for contexts including this edge.
522 DenseSet<uint32_t> ContextIds;
523
524 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
525 DenseSet<uint32_t> ContextIds)
526 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
527 ContextIds(std::move(ContextIds)) {}
528
529 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
530
531 // Helper to clear the fields of this edge when we are removing it from the
532 // graph.
533 inline void clear() {
534 ContextIds.clear();
535 AllocTypes = (uint8_t)AllocationType::None;
536 Caller = nullptr;
537 Callee = nullptr;
538 }
539
540 // Check if edge was removed from the graph. This is useful while iterating
541 // over a copy of edge lists when performing operations that mutate the
542 // graph in ways that might remove one of the edges.
543 inline bool isRemoved() const {
544 if (Callee || Caller)
545 return false;
546 // Any edges that have been removed from the graph but are still in a
547 // shared_ptr somewhere should have all fields null'ed out by clear()
548 // above.
549 assert(AllocTypes == (uint8_t)AllocationType::None);
550 assert(ContextIds.empty());
551 return true;
552 }
553
554 void dump() const;
555 void print(raw_ostream &OS) const;
556
557 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
558 Edge.print(OS);
559 return OS;
560 }
561 };
562
563 /// Helpers to remove edges that have allocation type None (due to not
564 /// carrying any context ids) after transformations.
565 void removeNoneTypeCalleeEdges(ContextNode *Node);
566 void removeNoneTypeCallerEdges(ContextNode *Node);
567 void
568 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
570
571protected:
572 /// Get a list of nodes corresponding to the stack ids in the given callsite
573 /// context.
574 template <class NodeT, class IteratorT>
575 std::vector<uint64_t>
576 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
577
578 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
579 /// metadata (or summary).
580 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
581
582 /// Adds nodes for the given MIB stack ids.
583 template <class NodeT, class IteratorT>
584 void addStackNodesForMIB(ContextNode *AllocNode,
585 CallStack<NodeT, IteratorT> &StackContext,
586 CallStack<NodeT, IteratorT> &CallsiteContext,
587 AllocationType AllocType,
588 ArrayRef<ContextTotalSize> ContextSizeInfo);
589
590 /// Matches all callsite metadata (or summary) to the nodes created for
591 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
592 /// inlining performed on those callsite instructions.
593 void updateStackNodes();
594
595 /// Update graph to conservatively handle any callsite stack nodes that target
596 /// multiple different callee target functions.
597 void handleCallsitesWithMultipleTargets();
598
599 /// Mark backedges via the standard DFS based backedge algorithm.
600 void markBackedges();
601
602 /// Merge clones generated during cloning for different allocations but that
603 /// are called by the same caller node, to ensure proper function assignment.
604 void mergeClones();
605
606 // Try to partition calls on the given node (already placed into the AllCalls
607 // array) by callee function, creating new copies of Node as needed to hold
608 // calls with different callees, and moving the callee edges appropriately.
609 // Returns true if partitioning was successful.
610 bool partitionCallsByCallee(
611 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
612 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
613
614 /// Save lists of calls with MemProf metadata in each function, for faster
615 /// iteration.
616 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
617
618 /// Map from callsite node to the enclosing caller function.
619 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
620
621 // When exporting to dot, and an allocation id is specified, contains the
622 // context ids on that allocation.
623 DenseSet<uint32_t> DotAllocContextIds;
624
625private:
626 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
627
628 // Structure to keep track of information for each call as we are matching
629 // non-allocation callsites onto context nodes created from the allocation
630 // call metadata / summary contexts.
631 struct CallContextInfo {
632 // The callsite we're trying to match.
633 CallTy Call;
634 // The callsites stack ids that have a context node in the graph.
635 std::vector<uint64_t> StackIds;
636 // The function containing this callsite.
637 const FuncTy *Func;
638 // Initially empty, if needed this will be updated to contain the context
639 // ids for use in a new context node created for this callsite.
640 DenseSet<uint32_t> ContextIds;
641 };
642
643 /// Helper to remove edge from graph, updating edge iterator if it is provided
644 /// (in which case CalleeIter indicates which edge list is being iterated).
645 /// This will also perform the necessary clearing of the ContextEdge members
646 /// to enable later checking if the edge has been removed (since we may have
647 /// other copies of the shared_ptr in existence, and in fact rely on this to
648 /// enable removal while iterating over a copy of a node's edge list).
649 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
650 bool CalleeIter = true);
651
652 /// Assigns the given Node to calls at or inlined into the location with
653 /// the Node's stack id, after post order traversing and processing its
654 /// caller nodes. Uses the call information recorded in the given
655 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
656 /// as needed. Called by updateStackNodes which sets up the given
657 /// StackIdToMatchingCalls map.
658 void assignStackNodesPostOrder(
659 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
660 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
661 DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
662
663 /// Duplicates the given set of context ids, updating the provided
664 /// map from each original id with the newly generated context ids,
665 /// and returning the new duplicated id set.
666 DenseSet<uint32_t> duplicateContextIds(
667 const DenseSet<uint32_t> &StackSequenceContextIds,
668 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
669
670 /// Propagates all duplicated context ids across the graph.
671 void propagateDuplicateContextIds(
672 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
673
674 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
675 /// else to its callers. Also updates OrigNode's edges to remove any context
676 /// ids moved to the newly created edge.
677 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
678 bool TowardsCallee,
679 DenseSet<uint32_t> RemainingContextIds);
680
681 /// Get the stack id corresponding to the given Id or Index (for IR this will
682 /// return itself, for a summary index this will return the id recorded in the
683 /// index for that stack id index value).
684 uint64_t getStackId(uint64_t IdOrIndex) const {
685 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
686 }
687
688 /// Returns true if the given call targets the callee of the given edge, or if
689 /// we were able to identify the call chain through intermediate tail calls.
690 /// In the latter case new context nodes are added to the graph for the
691 /// identified tail calls, and their synthesized nodes are added to
692 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
693 /// the updated edges and to prepare it for an increment in the caller.
694 bool
695 calleesMatch(CallTy Call, EdgeIter &EI,
696 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
697
698 // Return the callee function of the given call, or nullptr if it can't be
699 // determined
700 const FuncTy *getCalleeFunc(CallTy Call) {
701 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
702 }
703
704 /// Returns true if the given call targets the given function, or if we were
705 /// able to identify the call chain through intermediate tail calls (in which
706 /// case FoundCalleeChain will be populated).
707 bool calleeMatchesFunc(
708 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
709 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
710 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
711 Call, Func, CallerFunc, FoundCalleeChain);
712 }
713
714 /// Returns true if both call instructions have the same callee.
715 bool sameCallee(CallTy Call1, CallTy Call2) {
716 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
717 }
718
719 /// Get a list of nodes corresponding to the stack ids in the given
720 /// callsite's context.
721 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
722 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
723 Call);
724 }
725
726 /// Get the last stack id in the context for callsite.
727 uint64_t getLastStackId(CallTy Call) {
728 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
729 }
730
731 /// Update the allocation call to record type of allocated memory.
732 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
733 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
734 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
735 }
736
737 /// Get the AllocationType assigned to the given allocation instruction clone.
738 AllocationType getAllocationCallType(const CallInfo &Call) const {
739 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
740 }
741
742 /// Update non-allocation call to invoke (possibly cloned) function
743 /// CalleeFunc.
744 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
745 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
746 }
747
748 /// Clone the given function for the given callsite, recording mapping of all
749 /// of the function's tracked calls to their new versions in the CallMap.
750 /// Assigns new clones to clone number CloneNo.
751 FuncInfo cloneFunctionForCallsite(
752 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
753 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
754 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
755 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
756 }
757
758 /// Gets a label to use in the dot graph for the given call clone in the given
759 /// function.
760 std::string getLabel(const FuncTy *Func, const CallTy Call,
761 unsigned CloneNo) const {
762 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
763 }
764
765 // Create and return a new ContextNode.
766 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
767 CallInfo C = CallInfo()) {
768 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
769 auto *NewNode = NodeOwner.back().get();
770 if (F)
771 NodeToCallingFunc[NewNode] = F;
772 NewNode->NodeId = NodeOwner.size();
773 return NewNode;
774 }
775
776 /// Helpers to find the node corresponding to the given call or stackid.
777 ContextNode *getNodeForInst(const CallInfo &C);
778 ContextNode *getNodeForAlloc(const CallInfo &C);
779 ContextNode *getNodeForStackId(uint64_t StackId);
780
781 /// Computes the alloc type corresponding to the given context ids, by
782 /// unioning their recorded alloc types.
783 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
784
785 /// Returns the allocation type of the intersection of the contexts of two
786 /// nodes (based on their provided context id sets), optimized for the case
787 /// when Node1Ids is smaller than Node2Ids.
788 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
789 const DenseSet<uint32_t> &Node2Ids) const;
790
791 /// Returns the allocation type of the intersection of the contexts of two
792 /// nodes (based on their provided context id sets).
793 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
794 const DenseSet<uint32_t> &Node2Ids) const;
795
796 /// Create a clone of Edge's callee and move Edge to that new callee node,
797 /// performing the necessary context id and allocation type updates.
798 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
799 /// moved to an edge to the new callee.
800 ContextNode *
801 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
802 DenseSet<uint32_t> ContextIdsToMove = {});
803
804 /// Change the callee of Edge to existing callee clone NewCallee, performing
805 /// the necessary context id and allocation type updates.
806 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
807 /// moved to an edge to the new callee.
808 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
809 ContextNode *NewCallee,
810 bool NewClone = false,
811 DenseSet<uint32_t> ContextIdsToMove = {});
812
813 /// Change the caller of the edge at the given callee edge iterator to be
814 /// NewCaller, performing the necessary context id and allocation type
815 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
816 /// a simplified version of it as we always move the given edge and all of its
817 /// context ids.
818 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
819 ContextNode *NewCaller);
820
821 /// Recursive helper for marking backedges via DFS.
822 void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
823 DenseSet<const ContextNode *> &CurrentStack);
824
825 /// Recursive helper for merging clones.
826 void
827 mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
828 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
829 /// Main worker for merging callee clones for a given node.
830 void mergeNodeCalleeClones(
831 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
832 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
833 /// Helper to find other callers of the given set of callee edges that can
834 /// share the same callee merge node.
835 void findOtherCallersToShareMerge(
836 ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
837 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
838 DenseSet<ContextNode *> &OtherCallersToShareMerge);
839
840 /// Recursively perform cloning on the graph for the given Node and its
841 /// callers, in order to uniquely identify the allocation behavior of an
842 /// allocation given its context. The context ids of the allocation being
843 /// processed are given in AllocContextIds.
844 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
845 const DenseSet<uint32_t> &AllocContextIds);
846
847 /// Map from each context ID to the AllocationType assigned to that context.
848 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
849
850 /// Map from each contextID to the profiled full contexts and their total
851 /// sizes (there may be more than one due to context trimming),
852 /// optionally populated when requested (via MemProfReportHintedSizes or
853 /// MinClonedColdBytePercent).
854 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
855
856 /// Identifies the context node created for a stack id when adding the MIB
857 /// contexts to the graph. This is used to locate the context nodes when
858 /// trying to assign the corresponding callsites with those stack ids to these
859 /// nodes.
860 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
861
862 /// Maps to track the calls to their corresponding nodes in the graph.
863 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
864 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
865
866 /// Owner of all ContextNode unique_ptrs.
867 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
868
869 /// Perform sanity checks on graph when requested.
870 void check() const;
871
872 /// Keeps track of the last unique context id assigned.
873 unsigned int LastContextId = 0;
874};
875
876template <typename DerivedCCG, typename FuncTy, typename CallTy>
877using ContextNode =
878 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
879template <typename DerivedCCG, typename FuncTy, typename CallTy>
880using ContextEdge =
881 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
882template <typename DerivedCCG, typename FuncTy, typename CallTy>
883using FuncInfo =
884 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
885template <typename DerivedCCG, typename FuncTy, typename CallTy>
886using CallInfo =
887 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
888
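// Minimal sketch of the CRTP dispatch used by CallsiteContextGraph (for
// exposition only; GraphBase/IRGraph are hypothetical names, the real derived
// classes follow below). The base class casts `this` to the derived type so
// IR- or index-specific queries resolve statically, without virtual calls:
//
//   template <typename Derived> struct GraphBase {
//     uint64_t stackId(uint64_t IdOrIndex) const {
//       return static_cast<const Derived *>(this)->getStackId(IdOrIndex);
//     }
//   };
//   struct IRGraph : GraphBase<IRGraph> {
//     // For IR the value already is the full stack id, so this is identity.
//     uint64_t getStackId(uint64_t IdOrIndex) const { return IdOrIndex; }
//   };
//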
889/// CRTP derived class for graphs built from IR (regular LTO).
890class ModuleCallsiteContextGraph
891 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
892 Instruction *> {
893public:
894 ModuleCallsiteContextGraph(
895 Module &M,
896 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
897
898private:
899 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
900 Instruction *>;
901
902 uint64_t getStackId(uint64_t IdOrIndex) const;
903 const Function *getCalleeFunc(Instruction *Call);
904 bool calleeMatchesFunc(
905 Instruction *Call, const Function *Func, const Function *CallerFunc,
906 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
907 bool sameCallee(Instruction *Call1, Instruction *Call2);
908 bool findProfiledCalleeThroughTailCalls(
909 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
910 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
911 bool &FoundMultipleCalleeChains);
912 uint64_t getLastStackId(Instruction *Call);
913 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
914 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
915 AllocationType getAllocationCallType(const CallInfo &Call) const;
916 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
917 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
918 Instruction *>::FuncInfo
919 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
920 DenseMap<CallInfo, CallInfo> &CallMap,
921 std::vector<CallInfo> &CallsWithMetadataInFunc,
922 unsigned CloneNo);
923 std::string getLabel(const Function *Func, const Instruction *Call,
924 unsigned CloneNo) const;
925
926 const Module &Mod;
927 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
928};
929
930/// Represents a call in the summary index graph, which can either be an
931/// allocation or an interior callsite node in an allocation's context.
932/// Holds a pointer to the corresponding data structure in the index.
933struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
934 IndexCall() : PointerUnion() {}
935 IndexCall(std::nullptr_t) : IndexCall() {}
936 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
937 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
938 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
939
940 IndexCall *operator->() { return this; }
941
942 void print(raw_ostream &OS) const {
943 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
944 if (auto *AI = dyn_cast_if_present<AllocInfo *>(Base)) {
945 OS << *AI;
946 } else {
947 auto *CI = dyn_cast_if_present<CallsiteInfo *>(Base);
948 assert(CI);
949 OS << *CI;
950 }
951 }
952};
953} // namespace
954
955namespace llvm {
956template <> struct simplify_type<IndexCall> {
957 using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
958 static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
959};
960template <> struct simplify_type<const IndexCall> {
961 using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
962 static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
963};
964} // namespace llvm
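
// Illustrative note (sketch only): the simplify_type specializations above
// are what let the usual cast machinery be applied directly to an IndexCall,
// e.g. in code like:
//   IndexCall Call = ...;
//   if (auto *Alloc = dyn_cast_if_present<AllocInfo *>(Call)) {
//     // Allocation summary node.
//   } else if (auto *CS = dyn_cast_if_present<CallsiteInfo *>(Call)) {
//     // Interior callsite summary node.
//   }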
965
966namespace {
967/// CRTP derived class for graphs built from summary index (ThinLTO).
968class IndexCallsiteContextGraph
969 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
970 IndexCall> {
971public:
972 IndexCallsiteContextGraph(
973 ModuleSummaryIndex &Index,
974 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
975 isPrevailing);
976
977 ~IndexCallsiteContextGraph() {
978 // Now that we are done with the graph it is safe to add the new
979 // CallsiteInfo structs to the function summary vectors. The graph nodes
980 // point into locations within these vectors, so we don't want to add them
981 // any earlier.
982 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
983 auto *FS = I.first;
984 for (auto &Callsite : I.second)
985 FS->addCallsite(*Callsite.second);
986 }
987 }
988
989private:
990 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
991 IndexCall>;
992
993 uint64_t getStackId(uint64_t IdOrIndex) const;
994 const FunctionSummary *getCalleeFunc(IndexCall &Call);
995 bool calleeMatchesFunc(
996 IndexCall &Call, const FunctionSummary *Func,
997 const FunctionSummary *CallerFunc,
998 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
999 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
1000 bool findProfiledCalleeThroughTailCalls(
1001 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1002 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1003 bool &FoundMultipleCalleeChains);
1004 uint64_t getLastStackId(IndexCall &Call);
1005 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
1006 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
1007 AllocationType getAllocationCallType(const CallInfo &Call) const;
1008 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
1009 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1010 IndexCall>::FuncInfo
1011 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
1012 DenseMap<CallInfo, CallInfo> &CallMap,
1013 std::vector<CallInfo> &CallsWithMetadataInFunc,
1014 unsigned CloneNo);
1015 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
1016 unsigned CloneNo) const;
1017
1018 // Saves mapping from function summaries containing memprof records back to
1019 // their VI, for use in checking and debugging.
1020 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1021
1022 const ModuleSummaryIndex &Index;
1023 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1024 isPrevailing;
1025
1026 // Saves/owns the callsite info structures synthesized for missing tail call
1027 // frames that we discover while building the graph.
1028 // It maps from the summary of the function making the tail call, to a map
1029 // of callee ValueInfo to corresponding synthesized callsite info.
1030 std::unordered_map<FunctionSummary *,
1031 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1032 FunctionCalleesToSynthesizedCallsiteInfos;
1033};
1034} // namespace
1035
1036namespace llvm {
1037template <>
1038struct DenseMapInfo<typename CallsiteContextGraph<
1039 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
1040 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
1041template <>
1042struct DenseMapInfo<typename CallsiteContextGraph<
1043 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
1044 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
1045template <>
1046struct DenseMapInfo<IndexCall>
1047 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1048} // end namespace llvm
1049
1050namespace {
1051
1052// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1053// type we should actually use on the corresponding allocation.
1054// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1055// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1056// from NotCold.
1057AllocationType allocTypeToUse(uint8_t AllocTypes) {
1058 assert(AllocTypes != (uint8_t)AllocationType::None);
1059 if (AllocTypes ==
1060 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1061 return AllocationType::NotCold;
1062 else
1063 return (AllocationType)AllocTypes;
1064}
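
// Illustrative usage sketch (not part of the pass logic): with both bits set,
// allocTypeToUse conservatively falls back to NotCold, while a pure Cold or
// NotCold value is returned unchanged.
//   uint8_t Both =
//       (uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold;
//   assert(allocTypeToUse(Both) == AllocationType::NotCold);
//   assert(allocTypeToUse((uint8_t)AllocationType::Cold) ==
//          AllocationType::Cold);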
1065
1066// Helper to check if the alloc types for all edges recorded in the
1067// InAllocTypes vector match the alloc types for all edges in the Edges
1068// vector.
1069template <typename DerivedCCG, typename FuncTy, typename CallTy>
1070bool allocTypesMatch(
1071 const std::vector<uint8_t> &InAllocTypes,
1072 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1073 &Edges) {
1074 // This should be called only when the InAllocTypes vector was computed for
1075 // this set of Edges. Make sure the sizes are the same.
1076 assert(InAllocTypes.size() == Edges.size());
1077 return std::equal(
1078 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1079 [](const uint8_t &l,
1080 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1081 // Can share if one of the edges is None type - don't
1082 // care about the type along that edge as it doesn't
1083 // exist for those context ids.
1084 if (l == (uint8_t)AllocationType::None ||
1085 r->AllocTypes == (uint8_t)AllocationType::None)
1086 return true;
1087 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
1088 });
1089}
1090
1091// Helper to check if the alloc types for all edges recorded in the
1092// InAllocTypes vector match the alloc types for callee edges in the given
1093// clone. Because the InAllocTypes were computed from the original node's callee
1094// edges, and other cloning could have happened after this clone was created, we
1095// need to find the matching clone callee edge, which may or may not exist.
1096template <typename DerivedCCG, typename FuncTy, typename CallTy>
1097bool allocTypesMatchClone(
1098 const std::vector<uint8_t> &InAllocTypes,
1099 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1100 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1101 assert(Node);
1102 // InAllocTypes should have been computed for the original node's callee
1103 // edges.
1104 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1105 // First create a map of the clone callee edge callees to the edge alloc type.
1106 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
1107 EdgeCalleeMap;
1108 for (const auto &E : Clone->CalleeEdges) {
1109 assert(!EdgeCalleeMap.contains(E->Callee));
1110 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1111 }
1112 // Next, walk the original node's callees, and look for the corresponding
1113 // clone edge to that callee.
1114 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1115 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1116 // Not found is ok, we will simply add an edge if we use this clone.
1117 if (Iter == EdgeCalleeMap.end())
1118 continue;
1119 // Can share if one of the edges is None type - don't
1120 // care about the type along that edge as it doesn't
1121 // exist for those context ids.
1122 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1123 Iter->second == (uint8_t)AllocationType::None)
1124 continue;
1125 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1126 return false;
1127 }
1128 return true;
1129}
1130
1131} // end anonymous namespace
1132
1133template <typename DerivedCCG, typename FuncTy, typename CallTy>
1134typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1135CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1136 const CallInfo &C) {
1137 ContextNode *Node = getNodeForAlloc(C);
1138 if (Node)
1139 return Node;
1140
1141 return NonAllocationCallToContextNodeMap.lookup(C);
1142}
1143
1144template <typename DerivedCCG, typename FuncTy, typename CallTy>
1145typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1146CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1147 const CallInfo &C) {
1148 return AllocationCallToContextNodeMap.lookup(C);
1149}
1150
1151template <typename DerivedCCG, typename FuncTy, typename CallTy>
1152typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1153CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1154 uint64_t StackId) {
1155 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1156 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1157 return StackEntryNode->second;
1158 return nullptr;
1159}
1160
1161template <typename DerivedCCG, typename FuncTy, typename CallTy>
1162void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1163 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1164 unsigned int ContextId) {
1165 for (auto &Edge : CallerEdges) {
1166 if (Edge->Caller == Caller) {
1167 Edge->AllocTypes |= (uint8_t)AllocType;
1168 Edge->getContextIds().insert(ContextId);
1169 return;
1170 }
1171 }
1172 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1173 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1174 CallerEdges.push_back(Edge);
1175 Caller->CalleeEdges.push_back(Edge);
1176}
1177
1178template <typename DerivedCCG, typename FuncTy, typename CallTy>
1179void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1180 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1181 assert(!EI || (*EI)->get() == Edge);
1182 assert(!Edge->isRemoved());
1183 // Save the Caller and Callee pointers so we can erase Edge from their edge
1184 // lists after clearing Edge below. We do the clearing first in case it is
1185 // destructed after removing from the edge lists (if those were the last
1186 // shared_ptr references to Edge).
1187 auto *Callee = Edge->Callee;
1188 auto *Caller = Edge->Caller;
1189
1190 // Make sure the edge fields are cleared out so we can properly detect
1191 // removed edges if Edge is not destructed because there is still a shared_ptr
1192 // reference.
1193 Edge->clear();
1194
1195#ifndef NDEBUG
1196 auto CalleeCallerCount = Callee->CallerEdges.size();
1197 auto CallerCalleeCount = Caller->CalleeEdges.size();
1198#endif
1199 if (!EI) {
1200 Callee->eraseCallerEdge(Edge);
1201 Caller->eraseCalleeEdge(Edge);
1202 } else if (CalleeIter) {
1203 Callee->eraseCallerEdge(Edge);
1204 *EI = Caller->CalleeEdges.erase(*EI);
1205 } else {
1206 Caller->eraseCalleeEdge(Edge);
1207 *EI = Callee->CallerEdges.erase(*EI);
1208 }
1209 assert(Callee->CallerEdges.size() < CalleeCallerCount);
1210 assert(Caller->CalleeEdges.size() < CallerCalleeCount);
1211}
1212
1213template <typename DerivedCCG, typename FuncTy, typename CallTy>
1214void CallsiteContextGraph<
1215 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1216 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1217 auto Edge = *EI;
1218 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1219 assert(Edge->ContextIds.empty());
1220 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1221 } else
1222 ++EI;
1223 }
1224}
1225
1226template <typename DerivedCCG, typename FuncTy, typename CallTy>
1227void CallsiteContextGraph<
1228 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1229 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1230 auto Edge = *EI;
1231 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1232 assert(Edge->ContextIds.empty());
1233 Edge->Caller->eraseCalleeEdge(Edge.get());
1234 EI = Node->CallerEdges.erase(EI);
1235 } else
1236 ++EI;
1237 }
1238}
1239
1240template <typename DerivedCCG, typename FuncTy, typename CallTy>
1241typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1242CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1243 findEdgeFromCallee(const ContextNode *Callee) {
1244 for (const auto &Edge : CalleeEdges)
1245 if (Edge->Callee == Callee)
1246 return Edge.get();
1247 return nullptr;
1248}
1249
1250template <typename DerivedCCG, typename FuncTy, typename CallTy>
1251typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1252CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1253 findEdgeFromCaller(const ContextNode *Caller) {
1254 for (const auto &Edge : CallerEdges)
1255 if (Edge->Caller == Caller)
1256 return Edge.get();
1257 return nullptr;
1258}
1259
1260template <typename DerivedCCG, typename FuncTy, typename CallTy>
1261void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1262 eraseCalleeEdge(const ContextEdge *Edge) {
1263 auto EI = llvm::find_if(
1264 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1265 return CalleeEdge.get() == Edge;
1266 });
1267 assert(EI != CalleeEdges.end());
1268 CalleeEdges.erase(EI);
1269}
1270
1271template <typename DerivedCCG, typename FuncTy, typename CallTy>
1272void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1273 eraseCallerEdge(const ContextEdge *Edge) {
1274 auto EI = llvm::find_if(
1275 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1276 return CallerEdge.get() == Edge;
1277 });
1278 assert(EI != CallerEdges.end());
1279 CallerEdges.erase(EI);
1280}
1281
1282template <typename DerivedCCG, typename FuncTy, typename CallTy>
1283uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1284 DenseSet<uint32_t> &ContextIds) const {
1285 uint8_t BothTypes =
1286 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1287 uint8_t AllocType = (uint8_t)AllocationType::None;
1288 for (auto Id : ContextIds) {
1289 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1290 // Bail early if alloc type reached both, no further refinement.
1291 if (AllocType == BothTypes)
1292 return AllocType;
1293 }
1294 return AllocType;
1295}
1296
1297template <typename DerivedCCG, typename FuncTy, typename CallTy>
1298uint8_t
1299CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1300 const DenseSet<uint32_t> &Node1Ids,
1301 const DenseSet<uint32_t> &Node2Ids) const {
1302 uint8_t BothTypes =
1303 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1304 uint8_t AllocType = (uint8_t)AllocationType::None;
1305 for (auto Id : Node1Ids) {
1306 if (!Node2Ids.count(Id))
1307 continue;
1308 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1309 // Bail early if alloc type reached both, no further refinement.
1310 if (AllocType == BothTypes)
1311 return AllocType;
1312 }
1313 return AllocType;
1314}
1315
1316template <typename DerivedCCG, typename FuncTy, typename CallTy>
1317uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1318 const DenseSet<uint32_t> &Node1Ids,
1319 const DenseSet<uint32_t> &Node2Ids) const {
1320 if (Node1Ids.size() < Node2Ids.size())
1321 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1322 else
1323 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1324}
1325
1326template <typename DerivedCCG, typename FuncTy, typename CallTy>
1327typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1328CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1329 CallInfo Call, const FuncTy *F) {
1330 assert(!getNodeForAlloc(Call));
1331 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1332 AllocationCallToContextNodeMap[Call] = AllocNode;
1333 // Use LastContextId as a unique id for MIB allocation nodes.
1334 AllocNode->OrigStackOrAllocId = LastContextId;
1335 // Alloc type should be updated as we add in the MIBs. We should assert
1336 // afterwards that it is not still None.
1337 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1338
1339 return AllocNode;
1340}
1341
1342static std::string getAllocTypeString(uint8_t AllocTypes) {
1343 if (!AllocTypes)
1344 return "None";
1345 std::string Str;
1346 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1347 Str += "NotCold";
1348 if (AllocTypes & (uint8_t)AllocationType::Cold)
1349 Str += "Cold";
1350 return Str;
1351}
1352
1353template <typename DerivedCCG, typename FuncTy, typename CallTy>
1354template <class NodeT, class IteratorT>
1355void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1356 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1357 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1358 ArrayRef<ContextTotalSize> ContextSizeInfo) {
1359 // Treat the hot alloc type as NotCold until the disambiguation for "hot"
1360 // is done.
1361 if (AllocType == AllocationType::Hot)
1362 AllocType = AllocationType::NotCold;
1363
1364 ContextIdToAllocationType[++LastContextId] = AllocType;
1365
1366 if (!ContextSizeInfo.empty()) {
1367 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1368 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1369 }
1370
1371 // Update alloc type and context ids for this MIB.
1372 AllocNode->AllocTypes |= (uint8_t)AllocType;
1373
1374 // Now add or update nodes for each stack id in alloc's context.
1375 // Later when processing the stack ids on non-alloc callsites we will adjust
1376 // for any inlining in the context.
1377 ContextNode *PrevNode = AllocNode;
1378 // Look for recursion (direct recursion should have been collapsed by
1379 // module summary analysis, here we should just be detecting mutual
1380 // recursion). Mark these nodes so we don't try to clone.
1381 SmallSet<uint64_t, 8> StackIdSet;
1382 // Skip any on the allocation call (inlining).
1383 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1384 ContextIter != StackContext.end(); ++ContextIter) {
1385 auto StackId = getStackId(*ContextIter);
1386 ContextNode *StackNode = getNodeForStackId(StackId);
1387 if (!StackNode) {
1388 StackNode = createNewNode(/*IsAllocation=*/false);
1389 StackEntryIdToContextNodeMap[StackId] = StackNode;
1390 StackNode->OrigStackOrAllocId = StackId;
1391 }
1392 // Marking a node recursive will prevent its cloning completely, even for
1393 // non-recursive contexts flowing through it.
1394 if (!AllowRecursiveCallsites) {
1395 auto Ins = StackIdSet.insert(StackId);
1396 if (!Ins.second)
1397 StackNode->Recursive = true;
1398 }
1399 StackNode->AllocTypes |= (uint8_t)AllocType;
1400 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1401 PrevNode = StackNode;
1402 }
1403}
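// Standalone illustrative sketch (not part of the original source): the
// recursion detection above amounts to noticing a repeated stack id within a
// single context's frame sequence; a hypothetical walk over plain ids.
#include <cassert>
#include <cstdint>
#include <unordered_set>
#include <vector>

namespace recursion_sketch {
// Return the set of stack ids that repeat within one context, i.e. the frames
// that would be marked Recursive (and therefore excluded from cloning).
inline std::unordered_set<uint64_t>
findRecursiveFrames(const std::vector<uint64_t> &ContextStackIds) {
  std::unordered_set<uint64_t> Seen, Recursive;
  for (uint64_t Id : ContextStackIds)
    if (!Seen.insert(Id).second)
      Recursive.insert(Id);
  return Recursive;
}

inline void demo() {
  // Frames A(1) -> B(2) -> A(1) -> alloc: frame 1 participates in recursion.
  auto R = findRecursiveFrames({1, 2, 1});
  assert(R.count(1) && !R.count(2));
}
} // namespace recursion_sketch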
1404
1405template <typename DerivedCCG, typename FuncTy, typename CallTy>
1406DenseSet<uint32_t>
1407CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1408 const DenseSet<uint32_t> &StackSequenceContextIds,
1409 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1410 DenseSet<uint32_t> NewContextIds;
1411 for (auto OldId : StackSequenceContextIds) {
1412 NewContextIds.insert(++LastContextId);
1413 OldToNewContextIds[OldId].insert(LastContextId);
1414 assert(ContextIdToAllocationType.count(OldId));
1415 // The new context has the same allocation type as original.
1416 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1417 if (DotAllocContextIds.contains(OldId))
1418 DotAllocContextIds.insert(LastContextId);
1419 }
1420 return NewContextIds;
1421}
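// Standalone illustrative sketch (not part of the original source): the
// duplication above mints a fresh id per original id and records the mapping
// so the new ids can later be propagated along caller edges; simplified here
// with plain containers and a hypothetical NextId counter standing in for
// LastContextId.
#include <cassert>
#include <cstdint>
#include <unordered_map>
#include <unordered_set>

namespace duplicate_ids_sketch {
inline std::unordered_set<uint32_t> duplicateIds(
    const std::unordered_set<uint32_t> &OldIds,
    std::unordered_map<uint32_t, std::unordered_set<uint32_t>> &OldToNew,
    uint32_t &NextId) {
  std::unordered_set<uint32_t> NewIds;
  for (uint32_t OldId : OldIds) {
    uint32_t NewId = ++NextId;     // fresh id, analogous to ++LastContextId
    NewIds.insert(NewId);
    OldToNew[OldId].insert(NewId); // remember the correspondence
  }
  return NewIds;
}

inline void demo() {
  std::unordered_map<uint32_t, std::unordered_set<uint32_t>> OldToNew;
  uint32_t NextId = 10;
  auto NewIds = duplicateIds({1, 2}, OldToNew, NextId);
  assert(NewIds.size() == 2 && OldToNew[1].size() == 1 && OldToNew[2].size() == 1);
}
} // namespace duplicate_ids_sketch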
1422
1423template <typename DerivedCCG, typename FuncTy, typename CallTy>
1424void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1425 propagateDuplicateContextIds(
1426 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1427 // Build a set of duplicated context ids corresponding to the input id set.
1428 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1429 DenseSet<uint32_t> NewIds;
1430 for (auto Id : ContextIds)
1431 if (auto NewId = OldToNewContextIds.find(Id);
1432 NewId != OldToNewContextIds.end())
1433 NewIds.insert_range(NewId->second);
1434 return NewIds;
1435 };
1436
1437 // Recursively update context ids sets along caller edges.
1438 auto UpdateCallers = [&](ContextNode *Node,
1439 DenseSet<const ContextEdge *> &Visited,
1440 auto &&UpdateCallers) -> void {
1441 for (const auto &Edge : Node->CallerEdges) {
1442 auto Inserted = Visited.insert(Edge.get());
1443 if (!Inserted.second)
1444 continue;
1445 ContextNode *NextNode = Edge->Caller;
1446 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1447 // Only need to recursively iterate to NextNode via this caller edge if
1448 // it resulted in any added ids to NextNode.
1449 if (!NewIdsToAdd.empty()) {
1450 Edge->getContextIds().insert_range(NewIdsToAdd);
1451 UpdateCallers(NextNode, Visited, UpdateCallers);
1452 }
1453 }
1454 };
1455
1456 DenseSet<const ContextEdge *> Visited;
1457 for (auto &Entry : AllocationCallToContextNodeMap) {
1458 auto *Node = Entry.second;
1459 UpdateCallers(Node, Visited, UpdateCallers);
1460 }
1461}
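// Standalone illustrative sketch (not part of the original source): the
// propagation above walks caller edges upward from each allocation, adding
// duplicated ids wherever the originals already appear; a toy graph with
// hypothetical Node/Edge structs shows the same recursion.
#include <cassert>
#include <cstdint>
#include <unordered_map>
#include <unordered_set>
#include <vector>

namespace propagate_ids_sketch {
struct Edge;
struct Node {
  std::vector<Edge *> CallerEdges;
};
struct Edge {
  Node *Caller = nullptr;
  std::unordered_set<uint32_t> ContextIds;
};

using OldToNewMap = std::unordered_map<uint32_t, std::unordered_set<uint32_t>>;

// Add duplicated ids to each caller edge whose id set contains the originals,
// recursing to the caller only when something was actually added.
inline void propagate(Node *N, const OldToNewMap &OldToNew,
                      std::unordered_set<Edge *> &Visited) {
  for (Edge *E : N->CallerEdges) {
    if (!Visited.insert(E).second)
      continue;
    std::unordered_set<uint32_t> ToAdd;
    for (uint32_t Id : E->ContextIds) {
      auto It = OldToNew.find(Id);
      if (It != OldToNew.end())
        ToAdd.insert(It->second.begin(), It->second.end());
    }
    if (!ToAdd.empty()) {
      E->ContextIds.insert(ToAdd.begin(), ToAdd.end());
      propagate(E->Caller, OldToNew, Visited);
    }
  }
}

inline void demo() {
  Node Alloc, Caller;
  Edge E{&Caller, {1}};
  Alloc.CallerEdges.push_back(&E);
  std::unordered_set<Edge *> Visited;
  propagate(&Alloc, {{1, {7, 8}}}, Visited); // id 1 was duplicated into 7 and 8
  assert(E.ContextIds.count(7) && E.ContextIds.count(8));
}
} // namespace propagate_ids_sketch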
1462
1463template <typename DerivedCCG, typename FuncTy, typename CallTy>
1464void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1465 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1466 // This must be passed by value to make a copy since it will be adjusted
1467 // as ids are moved.
1468 DenseSet<uint32_t> RemainingContextIds) {
1469 auto &OrigEdges =
1470 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1471 DenseSet<uint32_t> RecursiveContextIds;
1472 DenseSet<uint32_t> AllCallerContextIds;
1473 if (AllowRecursiveCallsites) {
1474 // Identify which context ids are recursive, which is needed to properly
1475 // update the RemainingContextIds set. The relevant recursive context ids
1476 // are those that are in multiple edges.
1477 for (auto &CE : OrigEdges) {
1478 AllCallerContextIds.reserve(CE->getContextIds().size());
1479 for (auto Id : CE->getContextIds())
1480 if (!AllCallerContextIds.insert(Id).second)
1481 RecursiveContextIds.insert(Id);
1482 }
1483 }
1484 // Increment iterator in loop so that we can remove edges as needed.
1485 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1486 auto Edge = *EI;
1487 DenseSet<uint32_t> NewEdgeContextIds;
1488 DenseSet<uint32_t> NotFoundContextIds;
1489 // Remove any matching context ids from Edge, returning the set that was
1490 // found and removed; these become the new edge's context ids. Also update
1491 // the remaining (not found) ids.
1492 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1493 NotFoundContextIds);
1494 // Update the remaining context ids set for the later edges. This is a
1495 // compile time optimization.
1496 if (RecursiveContextIds.empty()) {
1497 // No recursive ids, so all of the previously remaining context ids that
1498 // were not seen on this edge are the new remaining set.
1499 RemainingContextIds.swap(NotFoundContextIds);
1500 } else {
1501 // Keep the recursive ids in the remaining set as we expect to see those
1502 // on another edge. We can remove the non-recursive remaining ids that
1503 // were seen on this edge, however. We already have the set of remaining
1504 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1505 // non-recursive and only remove those. Note that despite the higher
1506 // overhead of updating the remaining context ids set when recursion
1507 // handling is enabled, it was found to be at worst performance neutral
1508 // and in one case a clear win.
1509 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1510 set_difference(NewEdgeContextIds, RecursiveContextIds);
1511 set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
1512 }
1513 // If no matching context ids for this edge, skip it.
1514 if (NewEdgeContextIds.empty()) {
1515 ++EI;
1516 continue;
1517 }
1518 if (TowardsCallee) {
1519 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1520 auto NewEdge = std::make_shared<ContextEdge>(
1521 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1522 NewNode->CalleeEdges.push_back(NewEdge);
1523 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1524 } else {
1525 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1526 auto NewEdge = std::make_shared<ContextEdge>(
1527 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1528 NewNode->CallerEdges.push_back(NewEdge);
1529 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1530 }
1531 // Remove old edge if context ids empty.
1532 if (Edge->getContextIds().empty()) {
1533 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1534 continue;
1535 }
1536 ++EI;
1537 }
1538}
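// Standalone illustrative sketch (not part of the original source): the edge
// splitting above relies on a subtract-with-outputs set operation; its assumed
// semantics (remove matching ids from the edge, reporting which were found and
// which were not) reimplemented with std::set for clarity.
#include <cassert>
#include <cstdint>
#include <set>

namespace set_split_sketch {
// Remove every element of ToRemove found in S, collecting the removed
// elements in Found and the elements of ToRemove not present in NotFound.
inline void subtractWithOutputs(std::set<uint32_t> &S,
                                const std::set<uint32_t> &ToRemove,
                                std::set<uint32_t> &Found,
                                std::set<uint32_t> &NotFound) {
  for (uint32_t V : ToRemove) {
    if (S.erase(V))
      Found.insert(V);
    else
      NotFound.insert(V);
  }
}

inline void demo() {
  std::set<uint32_t> EdgeIds = {1, 2, 3};
  std::set<uint32_t> Remaining = {2, 3, 4};
  std::set<uint32_t> NewEdgeIds, NotFound;
  subtractWithOutputs(EdgeIds, Remaining, NewEdgeIds, NotFound);
  assert((EdgeIds == std::set<uint32_t>{1}));       // ids left on the old edge
  assert((NewEdgeIds == std::set<uint32_t>{2, 3})); // ids moved to the new edge
  assert((NotFound == std::set<uint32_t>{4}));      // still expected on later edges
}
} // namespace set_split_sketch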
1539
1540template <typename DerivedCCG, typename FuncTy, typename CallTy>
1541static void checkEdge(
1542 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1543 // Confirm that alloc type is not None and that we have at least one context
1544 // id.
1545 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1546 assert(!Edge->ContextIds.empty());
1547}
1548
1549template <typename DerivedCCG, typename FuncTy, typename CallTy>
1550static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1551 bool CheckEdges = true) {
1552 if (Node->isRemoved())
1553 return;
1554#ifndef NDEBUG
1555 // Compute node's context ids once for use in asserts.
1556 auto NodeContextIds = Node->getContextIds();
1557#endif
1558 // Node's context ids should be the union of both its callee and caller edge
1559 // context ids.
1560 if (Node->CallerEdges.size()) {
1561 DenseSet<uint32_t> CallerEdgeContextIds(
1562 Node->CallerEdges.front()->ContextIds);
1563 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1564 if (CheckEdges)
1565 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1566 set_union(CallerEdgeContextIds, Edge->ContextIds);
1567 }
1568 // Node can have more context ids than callers if some contexts terminate at
1569 // node and some are longer. If we are allowing recursive callsites and
1570 // contexts this will be violated for incompletely cloned recursive cycles,
1571 // so skip the checking in that case.
1572 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1573 NodeContextIds == CallerEdgeContextIds ||
1574 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1575 }
1576 if (Node->CalleeEdges.size()) {
1577 DenseSet<uint32_t> CalleeEdgeContextIds(
1578 Node->CalleeEdges.front()->ContextIds);
1579 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1580 if (CheckEdges)
1581 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1582 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1583 }
1584 // If we are allowing recursive callsites and contexts this will be violated
1585 // for incompletely cloned recursive cycles, so skip the checking in that
1586 // case.
1587 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1588 NodeContextIds == CalleeEdgeContextIds);
1589 }
1590 // FIXME: Since this checking is only invoked under an option, we should
1591 // change the error checking from using assert to something that will trigger
1592 // an error on a release build.
1593#ifndef NDEBUG
1594 // Make sure we don't end up with duplicate edges between the same caller and
1595 // callee.
1596 DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1597 for (const auto &E : Node->CalleeEdges)
1598 NodeSet.insert(E->Callee);
1599 assert(NodeSet.size() == Node->CalleeEdges.size());
1600#endif
1601}
1602
1603template <typename DerivedCCG, typename FuncTy, typename CallTy>
1604void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1605 assignStackNodesPostOrder(
1606 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1607 DenseMap<uint64_t, std::vector<CallContextInfo>>
1608 &StackIdToMatchingCalls,
1609 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1610 auto Inserted = Visited.insert(Node);
1611 if (!Inserted.second)
1612 return;
1613 // Post order traversal. Iterate over a copy since we may add nodes and
1614 // therefore new callers during the recursive call, invalidating any
1615 // iterator over the original edge vector. We don't need to process these
1616 // new nodes as they were already processed on creation.
1617 auto CallerEdges = Node->CallerEdges;
1618 for (auto &Edge : CallerEdges) {
1619 // Skip any that have been removed during the recursion.
1620 if (Edge->isRemoved()) {
1621 assert(!is_contained(Node->CallerEdges, Edge));
1622 continue;
1623 }
1624 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1625 CallToMatchingCall);
1626 }
1627
1628 // If this node's stack id is in the map, update the graph to contain new
1629 // nodes representing any inlining at interior callsites. Note we move the
1630 // associated context ids over to the new nodes.
1631
1632 // Ignore this node if it is for an allocation or we didn't record any
1633 // stack id lists ending at it.
1634 if (Node->IsAllocation ||
1635 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1636 return;
1637
1638 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1639 // Handle the simple case first. A single call with a single stack id.
1640 // In this case there is no need to create any new context nodes, simply
1641 // assign the context node for stack id to this Call.
1642 if (Calls.size() == 1) {
1643 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1644 if (Ids.size() == 1) {
1645 assert(SavedContextIds.empty());
1646 // It should be this Node
1647 assert(Node == getNodeForStackId(Ids[0]));
1648 if (Node->Recursive)
1649 return;
1650 Node->setCall(Call);
1651 NonAllocationCallToContextNodeMap[Call] = Node;
1652 NodeToCallingFunc[Node] = Func;
1653 return;
1654 }
1655 }
1656
1657#ifndef NDEBUG
1658 // Find the node for the last stack id, which should be the same
1659 // across all calls recorded for this id, and is this node's id.
1660 uint64_t LastId = Node->OrigStackOrAllocId;
1661 ContextNode *LastNode = getNodeForStackId(LastId);
1662 // We should only have kept stack ids that had nodes.
1663 assert(LastNode);
1664 assert(LastNode == Node);
1665#else
1666 ContextNode *LastNode = Node;
1667#endif
1668
1669 // Compute the last node's context ids once, as it is shared by all calls in
1670 // this entry.
1671 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1672
1673 [[maybe_unused]] bool PrevIterCreatedNode = false;
1674 bool CreatedNode = false;
1675 for (unsigned I = 0; I < Calls.size();
1676 I++, PrevIterCreatedNode = CreatedNode) {
1677 CreatedNode = false;
1678 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1679 // Skip any for which we didn't assign any ids, these don't get a node in
1680 // the graph.
1681 if (SavedContextIds.empty()) {
1682 // If this call has a matching call (located in the same function and
1683 // having the same stack ids), simply add it to the context node created
1684 // for its matching call earlier. These can be treated the same through
1685 // cloning and get updated at the same time.
1686 if (!CallToMatchingCall.contains(Call))
1687 continue;
1688 auto MatchingCall = CallToMatchingCall[Call];
1689 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1690 // This should only happen if we had a prior iteration, and it didn't
1691 // create a node because of the below recomputation of context ids
1692 // finding none remaining and continuing early.
1693 assert(I > 0 && !PrevIterCreatedNode);
1694 continue;
1695 }
1696 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1697 Call);
1698 continue;
1699 }
1700
1701 assert(LastId == Ids.back());
1702
1703 // Recompute the context ids for this stack id sequence (the
1704 // intersection of the context ids of the corresponding nodes).
1705 // Start with the ids we saved in the map for this call, which could be
1706 // duplicated context ids. We have to recompute as we might have had
1707 // overlap between the saved context ids for different last nodes, and
1708 // removed them already during the post order traversal.
1709 set_intersect(SavedContextIds, LastNodeContextIds);
1710 ContextNode *PrevNode = LastNode;
1711 bool Skip = false;
1712 // Iterate backwards through the stack Ids, starting after the last Id
1713 // in the list, which was handled once outside for all Calls.
1714 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1715 auto Id = *IdIter;
1716 ContextNode *CurNode = getNodeForStackId(Id);
1717 // We should only have kept stack ids that had nodes and weren't
1718 // recursive.
1719 assert(CurNode);
1720 assert(!CurNode->Recursive);
1721
1722 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1723 if (!Edge) {
1724 Skip = true;
1725 break;
1726 }
1727 PrevNode = CurNode;
1728
1729 // Update the context ids, which is the intersection of the ids along
1730 // all edges in the sequence.
1731 set_intersect(SavedContextIds, Edge->getContextIds());
1732
1733 // If we now have no context ids for clone, skip this call.
1734 if (SavedContextIds.empty()) {
1735 Skip = true;
1736 break;
1737 }
1738 }
1739 if (Skip)
1740 continue;
1741
1742 // Create new context node.
1743 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1744 NonAllocationCallToContextNodeMap[Call] = NewNode;
1745 CreatedNode = true;
1746 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1747
1748 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1749 assert(FirstNode);
1750
1751 // Connect to callees of innermost stack frame in inlined call chain.
1752 // This updates context ids for FirstNode's callees to reflect those
1753 // moved to NewNode.
1754 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1755
1756 // Connect to callers of outermost stack frame in inlined call chain.
1757 // This updates context ids for LastNode's callers to reflect those
1758 // moved to NewNode.
1759 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1760
1761 // Now we need to remove context ids from edges/nodes between First and
1762 // Last Node.
1763 PrevNode = nullptr;
1764 for (auto Id : Ids) {
1765 ContextNode *CurNode = getNodeForStackId(Id);
1766 // We should only have kept stack ids that had nodes.
1767 assert(CurNode);
1768
1769 // Remove the context ids moved to NewNode from CurNode, and the
1770 // edge from the prior node.
1771 if (PrevNode) {
1772 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1773 // If the sequence contained recursion, we might have already removed
1774 // some edges during the connectNewNode calls above.
1775 if (!PrevEdge) {
1776 PrevNode = CurNode;
1777 continue;
1778 }
1779 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1780 if (PrevEdge->getContextIds().empty())
1781 removeEdgeFromGraph(PrevEdge);
1782 }
1783 // Since we update the edges from leaf to tail, only look at the callee
1784 // edges. This isn't an alloc node, so if there are no callee edges, the
1785 // alloc type is None.
1786 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1787 ? (uint8_t)AllocationType::None
1788 : CurNode->computeAllocType();
1789 PrevNode = CurNode;
1790 }
1791 if (VerifyNodes) {
1792 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1793 for (auto Id : Ids) {
1794 ContextNode *CurNode = getNodeForStackId(Id);
1795 // We should only have kept stack ids that had nodes.
1796 assert(CurNode);
1797 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1798 }
1799 }
1800 }
1801}
1802
1803template <typename DerivedCCG, typename FuncTy, typename CallTy>
1804void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1805 // Map of stack id to all calls with that as the last (outermost caller)
1806 // callsite id that has a context node (some might not due to pruning
1807 // performed during matching of the allocation profile contexts).
1808 // The CallContextInfo contains the Call and a list of its stack ids with
1809 // ContextNodes, the function containing Call, and the set of context ids
1810 // the analysis will eventually identify for use in any new node created
1811 // for that callsite.
1812 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1813 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1814 for (auto &Call : CallsWithMetadata) {
1815 // Ignore allocations, already handled.
1816 if (AllocationCallToContextNodeMap.count(Call))
1817 continue;
1818 auto StackIdsWithContextNodes =
1819 getStackIdsWithContextNodesForCall(Call.call());
1820 // If there were no nodes created for MIBs on allocs (maybe this was in
1821 // the unambiguous part of the MIB stack that was pruned), ignore.
1822 if (StackIdsWithContextNodes.empty())
1823 continue;
1824 // Otherwise, record this Call along with the list of ids for the last
1825 // (outermost caller) stack id with a node.
1826 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1827 {Call.call(), StackIdsWithContextNodes, Func, {}});
1828 }
1829 }
1830
1831 // First make a pass through all stack ids that correspond to a call,
1832 // as identified in the above loop. Compute the context ids corresponding to
1833 // each of these calls when they correspond to multiple stack ids due to
1834 // inlining. Perform any duplication of context ids required when
1835 // there is more than one call with the same stack ids. Their (possibly newly
1836 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1837 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1838 // Save a map from each call to any that are found to match it. I.e. located
1839 // in the same function and have the same (possibly pruned) stack ids. We use
1840 // this to avoid creating extra graph nodes as they can be treated the same.
1841 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1842 for (auto &It : StackIdToMatchingCalls) {
1843 auto &Calls = It.getSecond();
1844 // Skip single calls with a single stack id. These don't need a new node.
1845 if (Calls.size() == 1) {
1846 auto &Ids = Calls[0].StackIds;
1847 if (Ids.size() == 1)
1848 continue;
1849 }
1850 // In order to do the best and maximal matching of inlined calls to context
1851 // node sequences we will sort the vectors of stack ids in descending order
1852 // of length, and within each length, lexicographically by stack id. The
1853 // latter is so that we can specially handle calls that have identical stack
1854 // id sequences (either due to cloning or artificially because of the MIB
1855 // context pruning). Those with the same Ids are then sorted by function to
1856 // facilitate efficiently mapping them to the same context node.
1857 // Because the functions are pointers, to ensure a stable sort first assign
1858 // each function pointer to its first index in the Calls array, and then use
1859 // that to sort by.
1860 DenseMap<const FuncTy *, unsigned> FuncToIndex;
1861 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1862 FuncToIndex.insert({CallCtxInfo.Func, Idx});
1863 llvm::stable_sort(
1864 Calls,
1865 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1866 return A.StackIds.size() > B.StackIds.size() ||
1867 (A.StackIds.size() == B.StackIds.size() &&
1868 (A.StackIds < B.StackIds ||
1869 (A.StackIds == B.StackIds &&
1870 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1871 });
1872
1873 // Find the node for the last stack id, which should be the same
1874 // across all calls recorded for this id, and is the id for this
1875 // entry in the StackIdToMatchingCalls map.
1876 uint64_t LastId = It.getFirst();
1877 ContextNode *LastNode = getNodeForStackId(LastId);
1878 // We should only have kept stack ids that had nodes.
1879 assert(LastNode);
1880
1881 if (LastNode->Recursive)
1882 continue;
1883
1884 // Initialize the context ids with the last node's. We will subsequently
1885 // refine the context ids by computing the intersection along all edges.
1886 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1887 assert(!LastNodeContextIds.empty());
1888
1889#ifndef NDEBUG
1890 // Save the set of functions seen for a particular set of the same stack
1891 // ids. This is used to ensure that they have been correctly sorted to be
1892 // adjacent in the Calls list, since we rely on that to efficiently place
1893 // all such matching calls onto the same context node.
1894 DenseSet<const FuncTy *> MatchingIdsFuncSet;
1895#endif
1896
1897 for (unsigned I = 0; I < Calls.size(); I++) {
1898 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1899 assert(SavedContextIds.empty());
1900 assert(LastId == Ids.back());
1901
1902#ifndef NDEBUG
1903 // If this call has a different set of ids than the last one, clear the
1904 // set used to ensure they are sorted properly.
1905 if (I > 0 && Ids != Calls[I - 1].StackIds)
1906 MatchingIdsFuncSet.clear();
1907#endif
1908
1909 // First compute the context ids for this stack id sequence (the
1910 // intersection of the context ids of the corresponding nodes).
1911 // Start with the remaining saved ids for the last node.
1912 assert(!LastNodeContextIds.empty());
1913 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1914
1915 ContextNode *PrevNode = LastNode;
1916 ContextNode *CurNode = LastNode;
1917 bool Skip = false;
1918
1919 // Iterate backwards through the stack Ids, starting after the last Id
1920 // in the list, which was handled once outside for all Calls.
1921 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1922 auto Id = *IdIter;
1923 CurNode = getNodeForStackId(Id);
1924 // We should only have kept stack ids that had nodes.
1925 assert(CurNode);
1926
1927 if (CurNode->Recursive) {
1928 Skip = true;
1929 break;
1930 }
1931
1932 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1933 // If there is no edge then the nodes belong to different MIB contexts,
1934 // and we should skip this inlined context sequence. For example, this
1935 // particular inlined context may include stack ids A->B, and we may
1936 // indeed have nodes for both A and B, but it is possible that they were
1937 // never profiled in sequence in a single MIB for any allocation (i.e.
1938 // we might have profiled an allocation that involves the callsite A,
1939 // but through a different one of its callee callsites, and we might
1940 // have profiled an allocation that involves callsite B, but reached
1941 // from a different caller callsite).
1942 if (!Edge) {
1943 Skip = true;
1944 break;
1945 }
1946 PrevNode = CurNode;
1947
1948 // Update the context ids, which is the intersection of the ids along
1949 // all edges in the sequence.
1950 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1951
1952 // If we now have no context ids for clone, skip this call.
1953 if (StackSequenceContextIds.empty()) {
1954 Skip = true;
1955 break;
1956 }
1957 }
1958 if (Skip)
1959 continue;
1960
1961 // If some of this call's stack ids did not have corresponding nodes (due
1962 // to pruning), don't include any context ids for contexts that extend
1963 // beyond these nodes. Otherwise we would be matching part of unrelated /
1964 // not fully matching stack contexts. To do this, subtract any context ids
1965 // found in caller nodes of the last node found above.
1966 if (Ids.back() != getLastStackId(Call)) {
1967 for (const auto &PE : LastNode->CallerEdges) {
1968 set_subtract(StackSequenceContextIds, PE->getContextIds());
1969 if (StackSequenceContextIds.empty())
1970 break;
1971 }
1972 // If we now have no context ids for clone, skip this call.
1973 if (StackSequenceContextIds.empty())
1974 continue;
1975 }
1976
1977#ifndef NDEBUG
1978 // If the prior call had the same stack ids this set would not be empty.
1979 // Check if we already have a call that "matches" because it is located
1980 // in the same function. If the Calls list was sorted properly we should
1981 // not encounter this situation as all such entries should be adjacent
1982 // and processed in bulk further below.
1983 assert(!MatchingIdsFuncSet.contains(Func));
1984
1985 MatchingIdsFuncSet.insert(Func);
1986#endif
1987
1988 // Check if the next set of stack ids is the same (since the Calls vector
1989 // of tuples is sorted by the stack ids we can just look at the next one).
1990 // If so, save them in the CallToMatchingCall map so that they get
1991 // assigned to the same context node, and skip them.
1992 bool DuplicateContextIds = false;
1993 for (unsigned J = I + 1; J < Calls.size(); J++) {
1994 auto &CallCtxInfo = Calls[J];
1995 auto &NextIds = CallCtxInfo.StackIds;
1996 if (NextIds != Ids)
1997 break;
1998 auto *NextFunc = CallCtxInfo.Func;
1999 if (NextFunc != Func) {
2000 // We have another Call with the same ids but that cannot share this
2001 // node, must duplicate ids for it.
2002 DuplicateContextIds = true;
2003 break;
2004 }
2005 auto &NextCall = CallCtxInfo.Call;
2006 CallToMatchingCall[NextCall] = Call;
2007 // Update I so that it gets incremented correctly to skip this call.
2008 I = J;
2009 }
2010
2011 // If we don't have duplicate context ids, then we can assign all the
2012 // context ids computed for the original node sequence to this call.
2013 // If there are duplicate calls with the same stack ids then we synthesize
2014 // new context ids that are duplicates of the originals. These are
2015 // assigned to SavedContextIds, which is a reference into the map entry
2016 // for this call, allowing us to access these ids later on.
2017 OldToNewContextIds.reserve(OldToNewContextIds.size() +
2018 StackSequenceContextIds.size());
2019 SavedContextIds =
2020 DuplicateContextIds
2021 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2022 : StackSequenceContextIds;
2023 assert(!SavedContextIds.empty());
2024
2025 if (!DuplicateContextIds) {
2026 // Update saved last node's context ids to remove those that are
2027 // assigned to other calls, so that it is ready for the next call at
2028 // this stack id.
2029 set_subtract(LastNodeContextIds, StackSequenceContextIds);
2030 if (LastNodeContextIds.empty())
2031 break;
2032 }
2033 }
2034 }
2035
2036 // Propagate the duplicate context ids over the graph.
2037 propagateDuplicateContextIds(OldToNewContextIds);
2038
2039 if (VerifyCCG)
2040 check();
2041
2042 // Now perform a post-order traversal over the graph, starting with the
2043 // allocation nodes, essentially processing nodes from callers to callees.
2044 // For any that contains an id in the map, update the graph to contain new
2045 // nodes representing any inlining at interior callsites. Note we move the
2046 // associated context ids over to the new nodes.
2047 DenseSet<const ContextNode *> Visited;
2048 for (auto &Entry : AllocationCallToContextNodeMap)
2049 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
2050 CallToMatchingCall);
2051 if (VerifyCCG)
2052 check();
2053}
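// Standalone illustrative sketch (not part of the original source): the
// ordering used when sorting Calls above, shown on a hypothetical trimmed-down
// record type -- longer stack id sequences first, then lexicographically by the
// ids, then by each function's first-appearance index for determinism.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

namespace call_sort_sketch {
struct CallRec {
  std::vector<uint64_t> StackIds;
  std::string Func; // stand-in for the FuncTy* in the real code
};

inline void sortCalls(std::vector<CallRec> &Calls) {
  // Assign each function its first index so the pointer-identity ordering in
  // the real code becomes a stable, deterministic key here.
  std::unordered_map<std::string, size_t> FuncToIndex;
  for (size_t I = 0; I < Calls.size(); ++I)
    FuncToIndex.emplace(Calls[I].Func, I);
  std::stable_sort(Calls.begin(), Calls.end(),
                   [&](const CallRec &A, const CallRec &B) {
                     return A.StackIds.size() > B.StackIds.size() ||
                            (A.StackIds.size() == B.StackIds.size() &&
                             (A.StackIds < B.StackIds ||
                              (A.StackIds == B.StackIds &&
                               FuncToIndex[A.Func] < FuncToIndex[B.Func])));
                   });
}

inline void demo() {
  std::vector<CallRec> Calls = {{{5}, "g"}, {{1, 2}, "f"}, {{1, 2}, "f"}};
  sortCalls(Calls);
  assert(Calls.front().StackIds.size() == 2); // longest id sequences come first
}
} // namespace call_sort_sketch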
2054
2055uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2056 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2057 Call->getMetadata(LLVMContext::MD_callsite));
2058 return CallsiteContext.back();
2059}
2060
2061uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2062 assert(isa<CallsiteInfo *>(Call));
2063 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2064 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2065 // Need to convert index into stack id.
2066 return Index.getStackIdAtIndex(CallsiteContext.back());
2067}
2068
2069static const std::string MemProfCloneSuffix = ".memprof.";
2070
2071static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2072 // We use CloneNo == 0 to refer to the original version, which doesn't get
2073 // renamed with a suffix.
2074 if (!CloneNo)
2075 return Base.str();
2076 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2077}
2078
2079static bool isMemProfClone(const Function &F) {
2080 return F.getName().contains(MemProfCloneSuffix);
2081}
2082
2083// Return the clone number of the given function by extracting it from the
2084// memprof suffix. Assumes the caller has already confirmed it is a memprof
2085// clone.
2086static unsigned getMemProfCloneNum(const Function &F) {
2087 assert(isMemProfClone(F));
2088 auto Pos = F.getName().find_last_of('.');
2089 assert(Pos > 0);
2090 unsigned CloneNo;
2091 bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
2092 assert(!Err);
2093 (void)Err;
2094 return CloneNo;
2095}
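// Standalone illustrative sketch (not part of the original source): the
// ".memprof.<N>" clone naming scheme used above, reimplemented with std::string
// for illustration ("foo" and the helper names are hypothetical).
#include <cassert>
#include <string>

namespace clone_name_sketch {
const std::string CloneSuffix = ".memprof.";

// Clone 0 keeps the original name; clone N gets the ".memprof.<N>" suffix.
inline std::string cloneName(const std::string &Base, unsigned CloneNo) {
  return CloneNo == 0 ? Base : Base + CloneSuffix + std::to_string(CloneNo);
}

// Parse the clone number back out of a suffixed name (assumes the name is a
// clone, mirroring the precondition of getMemProfCloneNum above).
inline unsigned cloneNum(const std::string &Name) {
  auto Pos = Name.find_last_of('.');
  return static_cast<unsigned>(std::stoul(Name.substr(Pos + 1)));
}

inline void demo() {
  assert(cloneName("foo", 0) == "foo");           // original keeps its name
  assert(cloneName("foo", 2) == "foo.memprof.2"); // clones get a suffix
  assert(cloneNum("foo.memprof.2") == 2);         // suffix parses back
}
} // namespace clone_name_sketch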
2096
2097std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2098 const Instruction *Call,
2099 unsigned CloneNo) const {
2100 return (Twine(Call->getFunction()->getName()) + " -> " +
2101 cast<CallBase>(Call)->getCalledFunction()->getName())
2102 .str();
2103}
2104
2105std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2106 const IndexCall &Call,
2107 unsigned CloneNo) const {
2108 auto VI = FSToVIMap.find(Func);
2109 assert(VI != FSToVIMap.end());
2110 std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo);
2111 if (isa<AllocInfo *>(Call))
2112 return CallerName + " -> alloc";
2113 else {
2114 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
2115 return CallerName + " -> " +
2116 getMemProfFuncName(Callsite->Callee.name(),
2117 Callsite->Clones[CloneNo]);
2118 }
2119}
2120
2121std::vector<uint64_t>
2122ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2123 Instruction *Call) {
2124 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2125 Call->getMetadata(LLVMContext::MD_callsite));
2126 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2127 CallsiteContext);
2128}
2129
2130std::vector<uint64_t>
2131IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2132 assert(isa<CallsiteInfo *>(Call));
2133 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2134 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2135 return getStackIdsWithContextNodes<CallsiteInfo,
2136 SmallVector<unsigned>::const_iterator>(
2137 CallsiteContext);
2138}
2139
2140template <typename DerivedCCG, typename FuncTy, typename CallTy>
2141template <class NodeT, class IteratorT>
2142std::vector<uint64_t>
2143CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2144 CallStack<NodeT, IteratorT> &CallsiteContext) {
2145 std::vector<uint64_t> StackIds;
2146 for (auto IdOrIndex : CallsiteContext) {
2147 auto StackId = getStackId(IdOrIndex);
2148 ContextNode *Node = getNodeForStackId(StackId);
2149 if (!Node)
2150 break;
2151 StackIds.push_back(StackId);
2152 }
2153 return StackIds;
2154}
2155
2156ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2157 Module &M,
2158 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2159 : Mod(M), OREGetter(OREGetter) {
2160 for (auto &F : M) {
2161 std::vector<CallInfo> CallsWithMetadata;
2162 for (auto &BB : F) {
2163 for (auto &I : BB) {
2164 if (!isa<CallBase>(I))
2165 continue;
2166 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
2167 CallsWithMetadata.push_back(&I);
2168 auto *AllocNode = addAllocNode(&I, &F);
2169 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
2170 assert(CallsiteMD);
2171 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2172 // Add all of the MIBs and their stack nodes.
2173 for (auto &MDOp : MemProfMD->operands()) {
2174 auto *MIBMD = cast<const MDNode>(MDOp);
2175 std::vector<ContextTotalSize> ContextSizeInfo;
2176 // Collect the context size information if it exists.
2177 if (MIBMD->getNumOperands() > 2) {
2178 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2179 MDNode *ContextSizePair =
2180 dyn_cast<MDNode>(MIBMD->getOperand(I));
2181 assert(ContextSizePair->getNumOperands() == 2);
2182 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2183 ContextSizePair->getOperand(0))
2184 ->getZExtValue();
2185 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2186 ContextSizePair->getOperand(1))
2187 ->getZExtValue();
2188 ContextSizeInfo.push_back({FullStackId, TotalSize});
2189 }
2190 }
2191 MDNode *StackNode = getMIBStackNode(MIBMD);
2192 assert(StackNode);
2193 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2194 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2195 AllocNode, StackContext, CallsiteContext,
2196 getMIBAllocType(MIBMD), ContextSizeInfo);
2197 }
2198 // If exporting the graph to dot and an allocation id of interest was
2199 // specified, record all the context ids for this allocation node.
2200 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2201 DotAllocContextIds = AllocNode->getContextIds();
2202 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2203 // Memprof and callsite metadata on memory allocations no longer
2204 // needed.
2205 I.setMetadata(LLVMContext::MD_memprof, nullptr);
2206 I.setMetadata(LLVMContext::MD_callsite, nullptr);
2207 }
2208 // For callsite metadata, add to list for this function for later use.
2209 else if (I.getMetadata(LLVMContext::MD_callsite)) {
2210 CallsWithMetadata.push_back(&I);
2211 }
2212 }
2213 }
2214 if (!CallsWithMetadata.empty())
2215 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2216 }
2217
2218 if (DumpCCG) {
2219 dbgs() << "CCG before updating call stack chains:\n";
2220 dbgs() << *this;
2221 }
2222
2223 if (ExportToDot)
2224 exportToDot("prestackupdate");
2225
2226 updateStackNodes();
2227
2228 if (ExportToDot)
2229 exportToDot("poststackupdate");
2230
2231 handleCallsitesWithMultipleTargets();
2232
2233 markBackedges();
2234
2235 // Strip off remaining callsite metadata, no longer needed.
2236 for (auto &FuncEntry : FuncToCallsWithMetadata)
2237 for (auto &Call : FuncEntry.second)
2238 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2239}
2240
2241IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2242 ModuleSummaryIndex &Index,
2243 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2244 isPrevailing)
2245 : Index(Index), isPrevailing(isPrevailing) {
2246 for (auto &I : Index) {
2247 auto VI = Index.getValueInfo(I);
2248 for (auto &S : VI.getSummaryList()) {
2249 // We should only add the prevailing nodes. Otherwise we may try to clone
2250 // in a weak copy that won't be linked (and may be different than the
2251 // prevailing version).
2252 // We only keep the memprof summary on the prevailing copy now when
2253 // building the combined index, as a space optimization, however don't
2254 // rely on this optimization. The linker doesn't resolve local linkage
2255 // values so don't check whether those are prevailing.
2256 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2257 !isPrevailing(VI.getGUID(), S.get()))
2258 continue;
2259 auto *FS = dyn_cast<FunctionSummary>(S.get());
2260 if (!FS)
2261 continue;
2262 std::vector<CallInfo> CallsWithMetadata;
2263 if (!FS->allocs().empty()) {
2264 for (auto &AN : FS->mutableAllocs()) {
2265 // This can happen because of recursion elimination handling that
2266 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2267 // We still added them to the summary because we need to be able to
2268 // correlate properly in applyImport in the backends.
2269 if (AN.MIBs.empty())
2270 continue;
2271 IndexCall AllocCall(&AN);
2272 CallsWithMetadata.push_back(AllocCall);
2273 auto *AllocNode = addAllocNode(AllocCall, FS);
2274 // Pass an empty CallStack to the CallsiteContext (second)
2275 // parameter, since for ThinLTO we already collapsed out the inlined
2276 // stack ids on the allocation call during ModuleSummaryAnalysis.
2277 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2278 EmptyContext;
2279 unsigned I = 0;
2280 assert(AN.ContextSizeInfos.empty() ||
2281 AN.ContextSizeInfos.size() == AN.MIBs.size());
2282 // Now add all of the MIBs and their stack nodes.
2283 for (auto &MIB : AN.MIBs) {
2284 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2285 StackContext(&MIB);
2286 std::vector<ContextTotalSize> ContextSizeInfo;
2287 if (!AN.ContextSizeInfos.empty()) {
2288 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2289 ContextSizeInfo.push_back({FullStackId, TotalSize});
2290 }
2291 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2292 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2293 ContextSizeInfo);
2294 I++;
2295 }
2296 // If exporting the graph to dot and an allocation id of interest was
2297 // specified, record all the context ids for this allocation node.
2298 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2299 DotAllocContextIds = AllocNode->getContextIds();
2300 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2301 // Initialize version 0 on the summary alloc node to the current alloc
2302 // type, unless it has both types, in which case make it default, so
2303 // that if we aren't able to clone, the original version always ends up
2304 // with the default allocation behavior.
2305 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2306 }
2307 }
2308 // For callsite metadata, add to list for this function for later use.
2309 if (!FS->callsites().empty())
2310 for (auto &SN : FS->mutableCallsites()) {
2311 IndexCall StackNodeCall(&SN);
2312 CallsWithMetadata.push_back(StackNodeCall);
2313 }
2314
2315 if (!CallsWithMetadata.empty())
2316 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2317
2318 if (!FS->allocs().empty() || !FS->callsites().empty())
2319 FSToVIMap[FS] = VI;
2320 }
2321 }
2322
2323 if (DumpCCG) {
2324 dbgs() << "CCG before updating call stack chains:\n";
2325 dbgs() << *this;
2326 }
2327
2328 if (ExportToDot)
2329 exportToDot("prestackupdate");
2330
2331 updateStackNodes();
2332
2333 if (ExportToDot)
2334 exportToDot("poststackupdate");
2335
2336 handleCallsitesWithMultipleTargets();
2337
2338 markBackedges();
2339}
2340
2341template <typename DerivedCCG, typename FuncTy, typename CallTy>
2342void CallsiteContextGraph<DerivedCCG, FuncTy,
2343 CallTy>::handleCallsitesWithMultipleTargets() {
2344 // Look for and workaround callsites that call multiple functions.
2345 // This can happen for indirect calls, which need better handling, and in
2346 // more rare cases (e.g. macro expansion).
2347 // TODO: To fix this for indirect calls we will want to perform speculative
2348 // devirtualization using either the normal PGO info with ICP, or using the
2349 // information in the profiled MemProf contexts. We can do this prior to
2350 // this transformation for regular LTO, and for ThinLTO we can simulate that
2351 // effect in the summary and perform the actual speculative devirtualization
2352 // while cloning in the ThinLTO backend.
2353
2354 // Keep track of the new nodes synthesized for discovered tail calls missing
2355 // from the profiled contexts.
2356 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2357
2358 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2359 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2360 auto *Node = Entry.second;
2361 assert(Node->Clones.empty());
2362 // Check all node callees and see if in the same function.
2363 // We need to check all of the calls recorded in this Node, because in some
2364 // cases we may have had multiple calls with the same debug info calling
2365 // different callees. This can happen, for example, when an object is
2366 // constructed in the parameter list - the destructor call of the object has
2367 // the same debug info (line/col) as the call the object was passed to.
2368 // Here we will prune any that don't match all callee nodes.
2369 std::vector<CallInfo> AllCalls;
2370 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2371 AllCalls.push_back(Node->Call);
2372 llvm::append_range(AllCalls, Node->MatchingCalls);
2373
2374 // First see if we can partition the calls by callee function, creating new
2375 // nodes to host each set of calls calling the same callees. This is
2376 // necessary to support indirect calls with ThinLTO, for which we
2377 // synthesized CallsiteInfo records for each target. They will all have the
2378 // same callsite stack ids and would be sharing a context node at this
2379 // point. We need to perform separate cloning for each, which will be
2380 // applied along with speculative devirtualization in the ThinLTO backends
2381 // as needed. Note this does not currently support looking through tail
2382 // calls, it is unclear if we need that for indirect call targets.
2383 // First partition calls by callee func. Map indexed by func, value is
2384 // struct with list of matching calls, assigned node.
2385 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2386 continue;
2387
2388 auto It = AllCalls.begin();
2389 // Iterate through the calls until we find the first that matches.
2390 for (; It != AllCalls.end(); ++It) {
2391 auto ThisCall = *It;
2392 bool Match = true;
2393 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2394 ++EI) {
2395 auto Edge = *EI;
2396 if (!Edge->Callee->hasCall())
2397 continue;
2398 assert(NodeToCallingFunc.count(Edge->Callee));
2399 // Check if the called function matches that of the callee node.
2400 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2401 Match = false;
2402 break;
2403 }
2404 }
2405 // Found a call that matches the callee nodes, we can quit now.
2406 if (Match) {
2407 // If the first match is not the primary call on the Node, update it
2408 // now. We will update the list of matching calls further below.
2409 if (Node->Call != ThisCall) {
2410 Node->setCall(ThisCall);
2411 // We need to update the NonAllocationCallToContextNodeMap, but don't
2412 // want to do this during iteration over that map, so save the calls
2413 // that need updated entries.
2414 NewCallToNode.push_back({ThisCall, Node});
2415 }
2416 break;
2417 }
2418 }
2419 // We will update this list below (or leave it cleared if there was no
2420 // match found above).
2421 Node->MatchingCalls.clear();
2422 // If we hit the end of the AllCalls vector, no call matching the callee
2423 // nodes was found, clear the call information in the node.
2424 if (It == AllCalls.end()) {
2425 RemovedEdgesWithMismatchedCallees++;
2426 // Work around by setting Node to have a null call, so it gets
2427 // skipped during cloning. Otherwise assignFunctions will assert
2428 // because its data structures are not designed to handle this case.
2429 Node->setCall(CallInfo());
2430 continue;
2431 }
2432 // Now add back any matching calls that call the same function as the
2433 // matching primary call on Node.
2434 for (++It; It != AllCalls.end(); ++It) {
2435 auto ThisCall = *It;
2436 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2437 continue;
2438 Node->MatchingCalls.push_back(ThisCall);
2439 }
2440 }
2441
2442 // Remove all mismatched nodes identified in the above loop from the node map
2443 // (checking whether they have a null call which is set above). For a
2444 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2445 // to do the removal via remove_if than by individually erasing entries above.
2446 // Also remove any entries if we updated the node's primary call above.
2447 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2448 return !it.second->hasCall() || it.second->Call != it.first;
2449 });
2450
2451 // Add entries for any new primary calls recorded above.
2452 for (auto &[Call, Node] : NewCallToNode)
2453 NonAllocationCallToContextNodeMap[Call] = Node;
2454
2455 // Add the new nodes after the above loop so that the iteration is not
2456 // invalidated.
2457 for (auto &[Call, Node] : TailCallToContextNodeMap)
2458 NonAllocationCallToContextNodeMap[Call] = Node;
2459}
2460
2461template <typename DerivedCCG, typename FuncTy, typename CallTy>
2462bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2463 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2464 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2465 // Struct to keep track of all the calls having the same callee function,
2466 // and the node we eventually assign to them. Eventually we will record the
2467 // context node assigned to this group of calls.
2468 struct CallsWithSameCallee {
2469 std::vector<CallInfo> Calls;
2470 ContextNode *Node = nullptr;
2471 };
2472
2473 // First partition calls by callee function. Build map from each function
2474 // to the list of matching calls.
2475 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2476 for (auto ThisCall : AllCalls) {
2477 auto *F = getCalleeFunc(ThisCall.call());
2478 if (F)
2479 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2480 }
2481
2482 // Next, walk through all callee edges. For each callee node, get its
2483 // containing function and see if it was recorded in the above map (meaning we
2484 // have at least one matching call). Build another map from each callee node
2485 // with a matching call to the structure instance created above containing all
2486 // the calls.
2487 DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2488 for (const auto &Edge : Node->CalleeEdges) {
2489 if (!Edge->Callee->hasCall())
2490 continue;
2491 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2492 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2493 CalleeNodeToCallInfo[Edge->Callee] =
2494 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2495 }
2496
2497 // If there are no entries in the second map, then there were no matching
2498 // calls/callees; nothing to do here. Return so we can go to the handling that
2499 // looks through tail calls.
2500 if (CalleeNodeToCallInfo.empty())
2501 return false;
2502
2503 // Walk through all callee edges again. Any and all callee edges that didn't
2504 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2505 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2506 // ignored during cloning. If it is in the map, then we use the node recorded
2507 // in that entry (creating it if needed), and move the callee edge to it.
2508 // The first callee will use the original node instead of creating a new one.
2509 // Note that any of the original calls on this node (in AllCalls) that didn't
2510 // have a callee function automatically get dropped from the node as part of
2511 // this process.
2512 ContextNode *UnmatchedCalleesNode = nullptr;
2513 // Track whether we already assigned original node to a callee.
2514 bool UsedOrigNode = false;
2515 assert(NodeToCallingFunc[Node]);
2516 // Iterate over a copy of Node's callee edges, since we may need to remove
2517 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2518 // makes it less error-prone.
2519 auto CalleeEdges = Node->CalleeEdges;
2520 for (auto &Edge : CalleeEdges) {
2521 if (!Edge->Callee->hasCall())
2522 continue;
2523
2524 // Will be updated below to point to whatever (caller) node this callee edge
2525 // should be moved to.
2526 ContextNode *CallerNodeToUse = nullptr;
2527
2528 // Handle the case where there were no matching calls first. Move this
2529 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2530 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2531 if (!UnmatchedCalleesNode)
2532 UnmatchedCalleesNode =
2533 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2534 CallerNodeToUse = UnmatchedCalleesNode;
2535 } else {
2536 // Look up the information recorded for this callee node, and use the
2537 // recorded caller node (creating it if needed).
2538 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2539 if (!Info->Node) {
2540 // If we haven't assigned any callees to the original node use it.
2541 if (!UsedOrigNode) {
2542 Info->Node = Node;
2543 // Clear the set of matching calls which will be updated below.
2544 Node->MatchingCalls.clear();
2545 UsedOrigNode = true;
2546 } else
2547 Info->Node =
2548 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2549 assert(!Info->Calls.empty());
2550 // The first call becomes the primary call for this caller node, and the
2551 // rest go in the matching calls list.
2552 Info->Node->setCall(Info->Calls.front());
2553 llvm::append_range(Info->Node->MatchingCalls,
2554 llvm::drop_begin(Info->Calls));
2555 // Save the primary call to node correspondence so that we can update
2556 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2557 // caller of this function.
2558 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2559 }
2560 CallerNodeToUse = Info->Node;
2561 }
2562
2563 // Don't need to move the edge if we are using the original node.
2564 if (CallerNodeToUse == Node)
2565 continue;
2566
2567 moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2568 }
2569 // Now that we are done moving edges, clean up any caller edges that ended
2570 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2571 // caller edges from Node are replicated onto the new callers, and it
2572 // simplifies the handling to leave them until we have moved all
2573 // edges/context ids.
2574 for (auto &I : CalleeNodeToCallInfo)
2575 removeNoneTypeCallerEdges(I.second->Node);
2576 if (UnmatchedCalleesNode)
2577 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2578 removeNoneTypeCallerEdges(Node);
2579
2580 return true;
2581}
2582
2583uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2584 // In the Module (IR) case this is already the Id.
2585 return IdOrIndex;
2586}
2587
2588uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2589 // In the Index case this is an index into the stack id list in the summary
2590 // index, convert it to an Id.
2591 return Index.getStackIdAtIndex(IdOrIndex);
2592}
2593
2594template <typename DerivedCCG, typename FuncTy, typename CallTy>
2595bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2596 CallTy Call, EdgeIter &EI,
2597 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2598 auto Edge = *EI;
2599 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2600 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2601 // Will be populated in order of callee to caller if we find a chain of tail
2602 // calls between the profiled caller and callee.
2603 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2604 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2605 FoundCalleeChain))
2606 return false;
2607
2608 // The usual case where the profiled callee matches that of the IR/summary.
2609 if (FoundCalleeChain.empty())
2610 return true;
2611
2612 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2613 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2614 // If there is already an edge between these nodes, simply update it and
2615 // return.
2616 if (CurEdge) {
2617 CurEdge->ContextIds.insert_range(Edge->ContextIds);
2618 CurEdge->AllocTypes |= Edge->AllocTypes;
2619 return;
2620 }
2621 // Otherwise, create a new edge and insert it into the caller and callee
2622 // lists.
2623 auto NewEdge = std::make_shared<ContextEdge>(
2624 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2625 Callee->CallerEdges.push_back(NewEdge);
2626 if (Caller == Edge->Caller) {
2627 // If we are inserting the new edge into the current edge's caller, insert
2628 // the new edge before the current iterator position, and then increment
2629 // back to the current edge.
2630 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2631 ++EI;
2632 assert(*EI == Edge &&
2633 "Iterator position not restored after insert and increment");
2634 } else
2635 Caller->CalleeEdges.push_back(NewEdge);
2636 };
2637
2638 // Create new nodes for each found callee and connect in between the profiled
2639 // caller and callee.
2640 auto *CurCalleeNode = Edge->Callee;
2641 for (auto &[NewCall, Func] : FoundCalleeChain) {
2642 ContextNode *NewNode = nullptr;
2643 // First check if we have already synthesized a node for this tail call.
2644 if (TailCallToContextNodeMap.count(NewCall)) {
2645 NewNode = TailCallToContextNodeMap[NewCall];
2646 NewNode->AllocTypes |= Edge->AllocTypes;
2647 } else {
2648 FuncToCallsWithMetadata[Func].push_back({NewCall});
2649 // Create Node and record node info.
2650 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2651 TailCallToContextNodeMap[NewCall] = NewNode;
2652 NewNode->AllocTypes = Edge->AllocTypes;
2653 }
2654
2655 // Hook up node to its callee node
2656 AddEdge(NewNode, CurCalleeNode);
2657
2658 CurCalleeNode = NewNode;
2659 }
2660
2661 // Hook up edge's original caller to new callee node.
2662 AddEdge(Edge->Caller, CurCalleeNode);
2663
2664#ifndef NDEBUG
2665 // Save this because Edge's fields get cleared below when removed.
2666 auto *Caller = Edge->Caller;
2667#endif
2668
2669 // Remove old edge
2670 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2671
2672 // To simplify the increment of EI in the caller, subtract one from EI.
2673 // In the final AddEdge call we would have either added a new callee edge,
2674 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2675 // that there is at least one callee edge.
2676 assert(!Caller->CalleeEdges.empty());
2677 --EI;
2678
2679 return true;
2680}
2681
2682bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2683 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2684 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2685 bool &FoundMultipleCalleeChains) {
2686 // Stop recursive search if we have already explored the maximum specified
2687 // depth.
2688 if (Depth > TailCallSearchDepth)
2689 return false;
2690
2691 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2692 FoundCalleeChain.push_back({Callsite, F});
2693 };
2694
2695 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2696 if (!CalleeFunc) {
2697 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2698 assert(Alias);
2699 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2700 assert(CalleeFunc);
2701 }
2702
2703 // Look for tail calls in this function, and check if they either call the
2704 // profiled callee directly, or indirectly (via a recursive search).
2705 // Only succeed if there is a single unique tail call chain found between the
2706 // profiled caller and callee, otherwise we could perform incorrect cloning.
2707 bool FoundSingleCalleeChain = false;
2708 for (auto &BB : *CalleeFunc) {
2709 for (auto &I : BB) {
2710 auto *CB = dyn_cast<CallBase>(&I);
2711 if (!CB || !CB->isTailCall())
2712 continue;
2713 auto *CalledValue = CB->getCalledOperand();
2714 auto *CalledFunction = CB->getCalledFunction();
2715 if (CalledValue && !CalledFunction) {
2716 CalledValue = CalledValue->stripPointerCasts();
2717 // Stripping pointer casts can reveal a called function.
2718 CalledFunction = dyn_cast<Function>(CalledValue);
2719 }
2720 // Check if this is an alias to a function. If so, get the
2721 // called aliasee for the checks below.
2722 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2723 assert(!CalledFunction &&
2724 "Expected null called function in callsite for alias");
2725 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2726 }
2727 if (!CalledFunction)
2728 continue;
2729 if (CalledFunction == ProfiledCallee) {
2730 if (FoundSingleCalleeChain) {
2731 FoundMultipleCalleeChains = true;
2732 return false;
2733 }
2734 FoundSingleCalleeChain = true;
2735 FoundProfiledCalleeCount++;
2736 FoundProfiledCalleeDepth += Depth;
2737 if (Depth > FoundProfiledCalleeMaxDepth)
2738 FoundProfiledCalleeMaxDepth = Depth;
2739 SaveCallsiteInfo(&I, CalleeFunc);
2740 } else if (findProfiledCalleeThroughTailCalls(
2741 ProfiledCallee, CalledFunction, Depth + 1,
2742 FoundCalleeChain, FoundMultipleCalleeChains)) {
2743 // findProfiledCalleeThroughTailCalls should not have returned
2744 // true if FoundMultipleCalleeChains.
2745 assert(!FoundMultipleCalleeChains);
2746 if (FoundSingleCalleeChain) {
2747 FoundMultipleCalleeChains = true;
2748 return false;
2749 }
2750 FoundSingleCalleeChain = true;
2751 SaveCallsiteInfo(&I, CalleeFunc);
2752 } else if (FoundMultipleCalleeChains)
2753 return false;
2754 }
2755 }
2756
2757 return FoundSingleCalleeChain;
2758}
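
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this file): the same "unique chain only"
// depth-limited search on a toy call graph keyed by function name. The graph
// representation, MaxSearchDepth value and names are hypothetical; only the
// control flow mirrors the function above: give up past the depth limit, fail
// as soon as a second chain reaches the target, and otherwise record the chain
// on the way back out of the recursion.
#include <map>
#include <string>
#include <vector>

namespace tail_call_sketch {

using CallGraph = std::map<std::string, std::vector<std::string>>;

bool findUniqueChain(const CallGraph &G, const std::string &Target,
                     const std::string &Cur, unsigned Depth,
                     std::vector<std::string> &Chain, bool &FoundMultiple) {
  constexpr unsigned MaxSearchDepth = 5; // hypothetical limit
  if (Depth > MaxSearchDepth)
    return false;
  auto It = G.find(Cur);
  if (It == G.end())
    return false;
  bool FoundSingle = false;
  for (const std::string &Callee : It->second) {
    if (Callee == Target) {
      if (FoundSingle) { // a second chain makes the result ambiguous
        FoundMultiple = true;
        return false;
      }
      FoundSingle = true;
      Chain.push_back(Cur);
    } else if (findUniqueChain(G, Target, Callee, Depth + 1, Chain,
                               FoundMultiple)) {
      if (FoundSingle) {
        FoundMultiple = true;
        return false;
      }
      FoundSingle = true;
      Chain.push_back(Cur);
    } else if (FoundMultiple) {
      return false; // propagate ambiguity from the recursive call
    }
  }
  return FoundSingle;
}

} // namespace tail_call_sketch
// ---------------------------------------------------------------------------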
2759
2760const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2761 auto *CB = dyn_cast<CallBase>(Call);
2762 if (!CB->getCalledOperand() || CB->isIndirectCall())
2763 return nullptr;
2764 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2765 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2766 if (Alias)
2767 return dyn_cast<Function>(Alias->getAliasee());
2768 return dyn_cast<Function>(CalleeVal);
2769}
2770
2771bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2772 Instruction *Call, const Function *Func, const Function *CallerFunc,
2773 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2774 auto *CB = dyn_cast<CallBase>(Call);
2775 if (!CB->getCalledOperand() || CB->isIndirectCall())
2776 return false;
2777 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2778 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2779 if (CalleeFunc == Func)
2780 return true;
2781 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2782 if (Alias && Alias->getAliasee() == Func)
2783 return true;
2784
2785 // Recursively search for the profiled callee through tail calls starting with
2786 // the actual Callee. The discovered tail call chain is saved in
2787 // FoundCalleeChain, and we will fixup the graph to include these callsites
2788 // after returning.
2789 // FIXME: We will currently redo the same recursive walk if we find the same
2790 // mismatched callee from another callsite. We can improve this with more
2791 // bookkeeping of the created chain of new nodes for each mismatch.
2792 unsigned Depth = 1;
2793 bool FoundMultipleCalleeChains = false;
2794 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2795 FoundCalleeChain,
2796 FoundMultipleCalleeChains)) {
2797 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2798 << Func->getName() << " from " << CallerFunc->getName()
2799 << " that actually called " << CalleeVal->getName()
2800 << (FoundMultipleCalleeChains
2801 ? " (found multiple possible chains)"
2802 : "")
2803 << "\n");
2804 if (FoundMultipleCalleeChains)
2805 FoundProfiledCalleeNonUniquelyCount++;
2806 return false;
2807 }
2808
2809 return true;
2810}
2811
2812bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2813 Instruction *Call2) {
2814 auto *CB1 = cast<CallBase>(Call1);
2815 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2816 return false;
2817 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2818 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2819 auto *CB2 = cast<CallBase>(Call2);
2820 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2821 return false;
2822 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2823 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2824 return CalleeFunc1 == CalleeFunc2;
2825}
2826
2827bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2828 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2829 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2830 bool &FoundMultipleCalleeChains) {
2831 // Stop recursive search if we have already explored the maximum specified
2832 // depth.
2833 if (Depth > TailCallSearchDepth)
2834 return false;
2835
2836 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2837 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2838 // been synthesized.
2839 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2840 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2841 // StackIds is empty (we don't have debug info available in the index for
2842 // these callsites)
2843 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2844 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2845 CallsiteInfo *NewCallsiteInfo =
2846 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2847 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2848 };
2849
2850 // Look for tail calls in this function, and check if they either call the
2851 // profiled callee directly, or indirectly (via a recursive search).
2852 // Only succeed if there is a single unique tail call chain found between the
2853 // profiled caller and callee, otherwise we could perform incorrect cloning.
2854 bool FoundSingleCalleeChain = false;
2855 for (auto &S : CurCallee.getSummaryList()) {
2856 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2857 !isPrevailing(CurCallee.getGUID(), S.get()))
2858 continue;
2859 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2860 if (!FS)
2861 continue;
2862 auto FSVI = CurCallee;
2863 auto *AS = dyn_cast<AliasSummary>(S.get());
2864 if (AS)
2865 FSVI = AS->getAliaseeVI();
2866 for (auto &CallEdge : FS->calls()) {
2867 if (!CallEdge.second.hasTailCall())
2868 continue;
2869 if (CallEdge.first == ProfiledCallee) {
2870 if (FoundSingleCalleeChain) {
2871 FoundMultipleCalleeChains = true;
2872 return false;
2873 }
2874 FoundSingleCalleeChain = true;
2875 FoundProfiledCalleeCount++;
2876 FoundProfiledCalleeDepth += Depth;
2877 if (Depth > FoundProfiledCalleeMaxDepth)
2878 FoundProfiledCalleeMaxDepth = Depth;
2879 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2880 // Add FS to FSToVIMap in case it isn't already there.
2881 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2882 FSToVIMap[FS] = FSVI;
2883 } else if (findProfiledCalleeThroughTailCalls(
2884 ProfiledCallee, CallEdge.first, Depth + 1,
2885 FoundCalleeChain, FoundMultipleCalleeChains)) {
2886 // findProfiledCalleeThroughTailCalls should not have returned
2887 // true if FoundMultipleCalleeChains.
2888 assert(!FoundMultipleCalleeChains);
2889 if (FoundSingleCalleeChain) {
2890 FoundMultipleCalleeChains = true;
2891 return false;
2892 }
2893 FoundSingleCalleeChain = true;
2894 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2895 // Add FS to FSToVIMap in case it isn't already there.
2896 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2897 FSToVIMap[FS] = FSVI;
2898 } else if (FoundMultipleCalleeChains)
2899 return false;
2900 }
2901 }
2902
2903 return FoundSingleCalleeChain;
2904}
2905
2906const FunctionSummary *
2907IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2908 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2909 if (Callee.getSummaryList().empty())
2910 return nullptr;
2911 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2912}
2913
2914bool IndexCallsiteContextGraph::calleeMatchesFunc(
2915 IndexCall &Call, const FunctionSummary *Func,
2916 const FunctionSummary *CallerFunc,
2917 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2918 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2919 // If there is no summary list then this is a call to an externally defined
2920 // symbol.
2921 AliasSummary *Alias =
2922 Callee.getSummaryList().empty()
2923 ? nullptr
2924 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2925 assert(FSToVIMap.count(Func));
2926 auto FuncVI = FSToVIMap[Func];
2927 if (Callee == FuncVI ||
2928 // If callee is an alias, check the aliasee, since only function
2929 // summary base objects will contain the stack node summaries and thus
2930 // get a context node.
2931 (Alias && Alias->getAliaseeVI() == FuncVI))
2932 return true;
2933
2934 // Recursively search for the profiled callee through tail calls starting with
2935 // the actual Callee. The discovered tail call chain is saved in
2936 // FoundCalleeChain, and we will fixup the graph to include these callsites
2937 // after returning.
2938 // FIXME: We will currently redo the same recursive walk if we find the same
2939 // mismatched callee from another callsite. We can improve this with more
2940 // bookkeeping of the created chain of new nodes for each mismatch.
2941 unsigned Depth = 1;
2942 bool FoundMultipleCalleeChains = false;
2943 if (!findProfiledCalleeThroughTailCalls(
2944 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2945 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2946 << " from " << FSToVIMap[CallerFunc]
2947 << " that actually called " << Callee
2948 << (FoundMultipleCalleeChains
2949 ? " (found multiple possible chains)"
2950 : "")
2951 << "\n");
2952 if (FoundMultipleCalleeChains)
2953 FoundProfiledCalleeNonUniquelyCount++;
2954 return false;
2955 }
2956
2957 return true;
2958}
2959
2960bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2961 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
2962 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
2963 return Callee1 == Callee2;
2964}
2965
2966template <typename DerivedCCG, typename FuncTy, typename CallTy>
2967void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2968 const {
2969 print(dbgs());
2970 dbgs() << "\n";
2971}
2972
2973template <typename DerivedCCG, typename FuncTy, typename CallTy>
2974void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2975 raw_ostream &OS) const {
2976 OS << "Node " << this << "\n";
2977 OS << "\t";
2978 printCall(OS);
2979 if (Recursive)
2980 OS << " (recursive)";
2981 OS << "\n";
2982 if (!MatchingCalls.empty()) {
2983 OS << "\tMatchingCalls:\n";
2984 for (auto &MatchingCall : MatchingCalls) {
2985 OS << "\t";
2986 MatchingCall.print(OS);
2987 OS << "\n";
2988 }
2989 }
2990 OS << "\tNodeId: " << NodeId << "\n";
2991 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2992 OS << "\tContextIds:";
2993 // Make a copy of the computed context ids that we can sort for stability.
2994 auto ContextIds = getContextIds();
2995 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2996 std::sort(SortedIds.begin(), SortedIds.end());
2997 for (auto Id : SortedIds)
2998 OS << " " << Id;
2999 OS << "\n";
3000 OS << "\tCalleeEdges:\n";
3001 for (auto &Edge : CalleeEdges)
3002 OS << "\t\t" << *Edge << " (Callee NodeId: " << Edge->Callee->NodeId
3003 << ")\n";
3004 OS << "\tCallerEdges:\n";
3005 for (auto &Edge : CallerEdges)
3006 OS << "\t\t" << *Edge << " (Caller NodeId: " << Edge->Caller->NodeId
3007 << ")\n";
3008 if (!Clones.empty()) {
3009 OS << "\tClones: ";
3010 bool First = true;
3011 for (auto *C : Clones) {
3012 if (!First)
3013 OS << ", ";
3014 First = false;
3015 OS << C << " NodeId: " << C->NodeId;
3016 }
3017 OS << "\n";
3018 } else if (CloneOf) {
3019 OS << "\tClone of " << CloneOf << " NodeId: " << CloneOf->NodeId << "\n";
3020 }
3021}
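
// For reference, print() above emits output of roughly the following shape.
// The values below are made up for illustration; the exact fields and their
// order are defined entirely by the code above (e.g. "(recursive)" and the
// MatchingCalls/Clones sections appear only when applicable).
//
//   Node 0x55d0c2a8e2b0
//           <call or allocation description from printCall()>
//           NodeId: 42
//           AllocTypes: Cold
//           ContextIds: 1 5 9
//           CalleeEdges:
//                   <edge> (Callee NodeId: 7)
//           CallerEdges:
//                   <edge> (Caller NodeId: 3)
//           Clones: 0x55d0c2a8e9f0 NodeId: 43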
3022
3023template <typename DerivedCCG, typename FuncTy, typename CallTy>
3024void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
3025 const {
3026 print(dbgs());
3027 dbgs() << "\n";
3028}
3029
3030template <typename DerivedCCG, typename FuncTy, typename CallTy>
3031void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
3032 raw_ostream &OS) const {
3033 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
3034 << (IsBackedge ? " (BE)" : "")
3035 << " AllocTypes: " << getAllocTypeString(AllocTypes);
3036 OS << " ContextIds:";
3037 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3038 std::sort(SortedIds.begin(), SortedIds.end());
3039 for (auto Id : SortedIds)
3040 OS << " " << Id;
3041}
3042
3043template <typename DerivedCCG, typename FuncTy, typename CallTy>
3044void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3045 print(dbgs());
3046}
3047
3048template <typename DerivedCCG, typename FuncTy, typename CallTy>
3049void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3050 raw_ostream &OS) const {
3051 OS << "Callsite Context Graph:\n";
3052 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3053 for (const auto Node : nodes<GraphType>(this)) {
3054 if (Node->isRemoved())
3055 continue;
3056 Node->print(OS);
3057 OS << "\n";
3058 }
3059}
3060
3061template <typename DerivedCCG, typename FuncTy, typename CallTy>
3062void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3063 raw_ostream &OS) const {
3064 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3065 for (const auto Node : nodes<GraphType>(this)) {
3066 if (Node->isRemoved())
3067 continue;
3068 if (!Node->IsAllocation)
3069 continue;
3070 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3071 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3072 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3073 std::sort(SortedIds.begin(), SortedIds.end());
3074 for (auto Id : SortedIds) {
3075 auto TypeI = ContextIdToAllocationType.find(Id);
3076 assert(TypeI != ContextIdToAllocationType.end());
3077 auto CSI = ContextIdToContextSizeInfos.find(Id);
3078 if (CSI != ContextIdToContextSizeInfos.end()) {
3079 for (auto &Info : CSI->second) {
3080 OS << "MemProf hinting: "
3081 << getAllocTypeString((uint8_t)TypeI->second)
3082 << " full allocation context " << Info.FullStackId
3083 << " with total size " << Info.TotalSize << " is "
3084 << getAllocTypeString(Node->AllocTypes) << " after cloning";
3085 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3086 OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
3087 << " due to cold byte percent";
3088 // Print the internal context id to aid debugging and visualization.
3089 OS << " (context id " << Id << ")";
3090 OS << "\n";
3091 }
3092 }
3093 }
3094 }
3095}
3096
3097template <typename DerivedCCG, typename FuncTy, typename CallTy>
3098void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3099 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3100 for (const auto Node : nodes<GraphType>(this)) {
3101 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3102 for (auto &Edge : Node->CallerEdges)
3103 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3104 }
3105}
3106
3107template <typename DerivedCCG, typename FuncTy, typename CallTy>
3108struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
3109 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3110 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
3111
3112 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
3113 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
3114
3115 using nodes_iterator =
3116 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
3117 decltype(&getNode)>;
3118
3119 static nodes_iterator nodes_begin(GraphType G) {
3120 return nodes_iterator(G->NodeOwner.begin(), &getNode);
3121 }
3122
3123 static nodes_iterator nodes_end(GraphType G) {
3124 return nodes_iterator(G->NodeOwner.end(), &getNode);
3125 }
3126
3127 static NodeRef getEntryNode(GraphType G) {
3128 return G->NodeOwner.begin()->get();
3129 }
3130
3131 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
3132 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
3133 GetCallee(const EdgePtrTy &P) {
3134 return P->Callee;
3135 }
3136
3137 using ChildIteratorType =
3138 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
3139 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
3140 decltype(&GetCallee)>;
3141
3142 static ChildIteratorType child_begin(NodeRef N) {
3143 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
3144 }
3145
3146 static ChildIteratorType child_end(NodeRef N) {
3147 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
3148 }
3149};
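
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this file): the minimal GraphTraits surface
// that generic LLVM graph utilities (e.g. llvm::depth_first) rely on, shown
// for a hypothetical adjacency-list node type. The toy::Node type is made up;
// getEntryNode/child_begin/child_end are the standard GraphTraits entry points
// that the (richer) specialization above also provides.
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/GraphTraits.h"
#include <vector>

namespace toy {
struct Node {
  std::vector<Node *> Succs;
};
} // namespace toy

namespace llvm {
template <> struct GraphTraits<toy::Node *> {
  using NodeRef = toy::Node *;
  using ChildIteratorType = std::vector<toy::Node *>::iterator;
  static NodeRef getEntryNode(toy::Node *N) { return N; }
  static ChildIteratorType child_begin(NodeRef N) { return N->Succs.begin(); }
  static ChildIteratorType child_end(NodeRef N) { return N->Succs.end(); }
};
} // namespace llvm

// Usage, e.g.:  for (toy::Node *N : llvm::depth_first(&Root)) { ... }
// where Root is a toy::Node whose Succs pointers form the graph.
// ---------------------------------------------------------------------------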
3150
3151template <typename DerivedCCG, typename FuncTy, typename CallTy>
3152struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
3153 : public DefaultDOTGraphTraits {
3154 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
3155 // If the user requested the full graph to be exported, but provided an
3156 // allocation id, or if the user gave a context id and requested more than
3157 // just a specific context to be exported, note that highlighting is
3158 // enabled.
3159 DoHighlight =
3160 (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
3161 (ContextIdForDot.getNumOccurrences() &&
3162 DotGraphScope != DotScope::Context);
3163 }
3164
3165 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3166 using GTraits = GraphTraits<GraphType>;
3167 using NodeRef = typename GTraits::NodeRef;
3168 using ChildIteratorType = typename GTraits::ChildIteratorType;
3169
3170 static std::string getNodeLabel(NodeRef Node, GraphType G) {
3171 std::string LabelString =
3172 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
3173 Twine(Node->OrigStackOrAllocId) + " NodeId: " + Twine(Node->NodeId))
3174 .str();
3175 LabelString += "\n";
3176 if (Node->hasCall()) {
3177 auto Func = G->NodeToCallingFunc.find(Node);
3178 assert(Func != G->NodeToCallingFunc.end());
3179 LabelString +=
3180 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
3181 } else {
3182 LabelString += "null call";
3183 if (Node->Recursive)
3184 LabelString += " (recursive)";
3185 else
3186 LabelString += " (external)";
3187 }
3188 return LabelString;
3189 }
3190
3191 static std::string getNodeAttributes(NodeRef Node, GraphType G) {
3192 auto ContextIds = Node->getContextIds();
3193 // If highlighting enabled, see if this node contains any of the context ids
3194 // of interest. If so, it will use a different color and a larger fontsize
3195 // (which makes the node larger as well).
3196 bool Highlight = false;
3197 if (DoHighlight) {
3198 assert(ContextIdForDot.getNumOccurrences() ||
3199 AllocIdForDot.getNumOccurrences());
3200 if (ContextIdForDot.getNumOccurrences())
3201 Highlight = ContextIds.contains(ContextIdForDot);
3202 else
3203 Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
3204 }
3205 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
3206 getContextIds(ContextIds) + "\"")
3207 .str();
3208 // Default fontsize is 14
3209 if (Highlight)
3210 AttributeString += ",fontsize=\"30\"";
3211 AttributeString +=
3212 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"")
3213 .str();
3214 if (Node->CloneOf) {
3215 AttributeString += ",color=\"blue\"";
3216 AttributeString += ",style=\"filled,bold,dashed\"";
3217 } else
3218 AttributeString += ",style=\"filled\"";
3219 return AttributeString;
3220 }
3221
3222 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
3223 GraphType G) {
3224 auto &Edge = *(ChildIter.getCurrent());
3225 // If highlighting enabled, see if this edge contains any of the context ids
3226 // of interest. If so, it will use a different color and a heavier arrow
3227 // size and weight (the larger weight makes the highlighted path
3228 // straighter).
3229 bool Highlight = false;
3230 if (DoHighlight) {
3231 assert(ContextIdForDot.getNumOccurrences() ||
3232 AllocIdForDot.getNumOccurrences());
3233 if (ContextIdForDot.getNumOccurrences())
3234 Highlight = Edge->ContextIds.contains(ContextIdForDot);
3235 else
3236 Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
3237 }
3238 auto Color = getColor(Edge->AllocTypes, Highlight);
3239 std::string AttributeString =
3240 (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
3241 // fillcolor is the arrow head and color is the line
3242 Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
3243 "\"")
3244 .str();
3245 if (Edge->IsBackedge)
3246 AttributeString += ",style=\"dotted\"";
3247 // Default penwidth and weight are both 1.
3248 if (Highlight)
3249 AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
3250 return AttributeString;
3251 }
3252
3253 // Since the NodeOwners list includes nodes that are no longer connected to
3254 // the graph, skip them here.
3255 static bool isNodeHidden(NodeRef Node, GraphType G) {
3256 if (Node->isRemoved())
3257 return true;
3258 // If a scope smaller than the full graph was requested, see if this node
3259 // contains any of the context ids of interest.
3260 if (DotGraphScope == DotScope::Alloc)
3261 return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
3262 if (DotGraphScope == DotScope::Context)
3263 return !Node->getContextIds().contains(ContextIdForDot);
3264 return false;
3265 }
3266
3267private:
3268 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3269 std::string IdString = "ContextIds:";
3270 if (ContextIds.size() < 100) {
3271 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3272 std::sort(SortedIds.begin(), SortedIds.end());
3273 for (auto Id : SortedIds)
3274 IdString += (" " + Twine(Id)).str();
3275 } else {
3276 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3277 }
3278 return IdString;
3279 }
3280
3281 static std::string getColor(uint8_t AllocTypes, bool Highlight) {
3282 // If DoHighlight is not enabled, we want to use the highlight colors for
3283 // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
3284 // both compatible with the color scheme before highlighting was supported,
3285 // and for the NotCold+Cold color the non-highlight color is a bit more
3286 // readable.
3287 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3288 // Color "brown1" actually looks like a lighter red.
3289 return !DoHighlight || Highlight ? "brown1" : "lightpink";
3290 if (AllocTypes == (uint8_t)AllocationType::Cold)
3291 return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
3292 if (AllocTypes ==
3293 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3294 return Highlight ? "magenta" : "mediumorchid1";
3295 return "gray";
3296 }
3297
3298 static std::string getNodeId(NodeRef Node) {
3299 std::stringstream SStream;
3300 SStream << std::hex << "N0x" << (unsigned long long)Node;
3301 std::string Result = SStream.str();
3302 return Result;
3303 }
3304
3305 // True if we should highlight a specific context or allocation's contexts in
3306 // the emitted graph.
3307 static bool DoHighlight;
3308};
3309
3310template <typename DerivedCCG, typename FuncTy, typename CallTy>
3311bool DOTGraphTraits<
3312 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
3313 false;
3314
3315template <typename DerivedCCG, typename FuncTy, typename CallTy>
3316void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3317 std::string Label) const {
3318 WriteGraph(this, "", false, Label,
3319 DotFilePathPrefix + "ccg." + Label + ".dot");
3320}
3321
3322template <typename DerivedCCG, typename FuncTy, typename CallTy>
3323typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3324CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3325 const std::shared_ptr<ContextEdge> &Edge,
3326 DenseSet<uint32_t> ContextIdsToMove) {
3327 ContextNode *Node = Edge->Callee;
3328 assert(NodeToCallingFunc.count(Node));
3329 ContextNode *Clone =
3330 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3331 Node->addClone(Clone);
3332 Clone->MatchingCalls = Node->MatchingCalls;
3333 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3334 ContextIdsToMove);
3335 return Clone;
3336}
3337
3338template <typename DerivedCCG, typename FuncTy, typename CallTy>
3339void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3340 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3341 ContextNode *NewCallee, bool NewClone,
3342 DenseSet<uint32_t> ContextIdsToMove) {
3343 // NewCallee and Edge's current callee must be clones of the same original
3344 // node (Edge's current callee may be the original node too).
3345 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3346
3347 bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3348
3349 ContextNode *OldCallee = Edge->Callee;
3350
3351 // We might already have an edge to the new callee from earlier cloning for a
3352 // different allocation. If one exists we will reuse it.
3353 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3354
3355 // Callers will pass an empty ContextIdsToMove set when they want to move the
3356 // edge. Copy in Edge's ids for simplicity.
3357 if (ContextIdsToMove.empty())
3358 ContextIdsToMove = Edge->getContextIds();
3359
3360 // If we are moving all of Edge's ids, then just move the whole Edge.
3361 // Otherwise only move the specified subset, to a new edge if needed.
3362 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3363 // First, update the alloc types on New Callee from Edge.
3364 // Do this before we potentially clear Edge's fields below!
3365 NewCallee->AllocTypes |= Edge->AllocTypes;
3366 // Moving the whole Edge.
3367 if (ExistingEdgeToNewCallee) {
3368 // Since we already have an edge to NewCallee, simply move the ids
3369 // onto it, and remove the existing Edge.
3370 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3371 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3372 assert(Edge->ContextIds == ContextIdsToMove);
3373 removeEdgeFromGraph(Edge.get());
3374 } else {
3375 // Otherwise just reconnect Edge to NewCallee.
3376 Edge->Callee = NewCallee;
3377 NewCallee->CallerEdges.push_back(Edge);
3378 // Remove it from callee where it was previously connected.
3379 OldCallee->eraseCallerEdge(Edge.get());
3380 // Don't need to update Edge's context ids since we are simply
3381 // reconnecting it.
3382 }
3383 } else {
3384 // Only moving a subset of Edge's ids.
3385 // Compute the alloc type of the subset of ids being moved.
3386 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3387 if (ExistingEdgeToNewCallee) {
3388 // Since we already have an edge to NewCallee, simply move the ids
3389 // onto it.
3390 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3391 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3392 } else {
3393 // Otherwise, create a new edge to NewCallee for the ids being moved.
3394 auto NewEdge = std::make_shared<ContextEdge>(
3395 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3396 Edge->Caller->CalleeEdges.push_back(NewEdge);
3397 NewCallee->CallerEdges.push_back(NewEdge);
3398 }
3399 // In either case, need to update the alloc types on NewCallee, and remove
3400 // those ids and update the alloc type on the original Edge.
3401 NewCallee->AllocTypes |= CallerEdgeAllocType;
3402 set_subtract(Edge->ContextIds, ContextIdsToMove);
3403 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3404 }
3405 // Now walk the old callee node's callee edges and move Edge's context ids
3406 // over to the corresponding edge into the clone (which is created here if
3407 // this is a newly created clone).
3408 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3409 ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3410 // If this is a direct recursion edge, use NewCallee (the clone) as the
3411 // callee as well, so that any edge updated/created here is also direct
3412 // recursive.
3413 if (CalleeToUse == OldCallee) {
3414 // If this is a recursive edge, see if we already moved a recursive edge
3415 // (which would have to have been this one) - if we were only moving a
3416 // subset of context ids it would still be on OldCallee.
3417 if (EdgeIsRecursive) {
3418 assert(OldCalleeEdge == Edge);
3419 continue;
3420 }
3421 CalleeToUse = NewCallee;
3422 }
3423 // The context ids moving to the new callee are the subset of this edge's
3424 // context ids and the context ids on the caller edge being moved.
3425 DenseSet<uint32_t> EdgeContextIdsToMove =
3426 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3427 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3428 OldCalleeEdge->AllocTypes =
3429 computeAllocType(OldCalleeEdge->getContextIds());
3430 if (!NewClone) {
3431 // Update context ids / alloc type on corresponding edge to NewCallee.
3432 // There is a chance this may not exist if we are reusing an existing
3433 // clone, specifically during function assignment, where we would have
3434 // removed none type edges after creating the clone. If we can't find
3435 // a corresponding edge there, fall through to the cloning below.
3436 if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3437 NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3438 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3439 continue;
3440 }
3441 }
3442 auto NewEdge = std::make_shared<ContextEdge>(
3443 CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
3444 EdgeContextIdsToMove);
3445 NewCallee->CalleeEdges.push_back(NewEdge);
3446 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3447 }
3448 // Recompute the node alloc type now that its callee edges have been
3449 // updated (since we will compute from those edges).
3450 OldCallee->AllocTypes = OldCallee->computeAllocType();
3451 // OldCallee alloc type should be None iff its context id set is now empty.
3452 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3453 OldCallee->emptyContextIds());
3454 if (VerifyCCG) {
3455 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3456 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3457 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3458 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3459 /*CheckEdges=*/false);
3460 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3461 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3462 /*CheckEdges=*/false);
3463 }
3464}
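
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this file): the context-id bookkeeping done
// above when only a subset of an edge's ids moves to another edge. std::set
// stands in for DenseSet, SimpleEdge for ContextEdge, and recomputeAllocType()
// is a toy placeholder for computeAllocType(); all of these names are
// hypothetical.
#include <algorithm>
#include <cstdint>
#include <iterator>
#include <set>

namespace move_ids_sketch {

struct SimpleEdge {
  std::set<uint32_t> ContextIds;
  uint8_t AllocTypes = 0;
};

// Toy placeholder: the real code derives the type mask from the allocation
// behavior recorded for each context id.
uint8_t recomputeAllocType(const std::set<uint32_t> &Ids) {
  return Ids.empty() ? 0 : 1;
}

// Move IdsToMove from Src onto Dst, shrink Src to what remains, and refresh
// both alloc-type summaries, mirroring the "subset of ids" branch above.
void moveSubset(SimpleEdge &Src, SimpleEdge &Dst,
                const std::set<uint32_t> &IdsToMove) {
  Dst.ContextIds.insert(IdsToMove.begin(), IdsToMove.end());
  Dst.AllocTypes |= recomputeAllocType(IdsToMove);

  std::set<uint32_t> Remaining;
  std::set_difference(Src.ContextIds.begin(), Src.ContextIds.end(),
                      IdsToMove.begin(), IdsToMove.end(),
                      std::inserter(Remaining, Remaining.end()));
  Src.ContextIds = std::move(Remaining);
  Src.AllocTypes = recomputeAllocType(Src.ContextIds);
}

} // namespace move_ids_sketch
// ---------------------------------------------------------------------------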
3465
3466template <typename DerivedCCG, typename FuncTy, typename CallTy>
3467void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3468 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3469 ContextNode *NewCaller) {
3470 auto *OldCallee = Edge->Callee;
3471 auto *NewCallee = OldCallee;
3472 // If this edge was direct recursive, make any new/updated edge also direct
3473 // recursive to NewCaller.
3474 bool Recursive = Edge->Caller == Edge->Callee;
3475 if (Recursive)
3476 NewCallee = NewCaller;
3477
3478 ContextNode *OldCaller = Edge->Caller;
3479 OldCaller->eraseCalleeEdge(Edge.get());
3480
3481 // We might already have an edge to the new caller. If one exists we will
3482 // reuse it.
3483 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3484
3485 if (ExistingEdgeToNewCaller) {
3486 // Since we already have an edge to NewCaller, simply move the ids
3487 // onto it, and remove the existing Edge.
3488 ExistingEdgeToNewCaller->getContextIds().insert_range(
3489 Edge->getContextIds());
3490 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3491 Edge->ContextIds.clear();
3492 Edge->AllocTypes = (uint8_t)AllocationType::None;
3493 OldCallee->eraseCallerEdge(Edge.get());
3494 } else {
3495 // Otherwise just reconnect Edge to NewCaller.
3496 Edge->Caller = NewCaller;
3497 NewCaller->CalleeEdges.push_back(Edge);
3498 if (Recursive) {
3499 assert(NewCallee == NewCaller);
3500 // In the case of (direct) recursive edges, we update the callee as well
3501 // so that it becomes recursive on the new caller.
3502 Edge->Callee = NewCallee;
3503 NewCallee->CallerEdges.push_back(Edge);
3504 OldCallee->eraseCallerEdge(Edge.get());
3505 }
3506 // Don't need to update Edge's context ids since we are simply
3507 // reconnecting it.
3508 }
3509 // In either case, need to update the alloc types on New Caller.
3510 NewCaller->AllocTypes |= Edge->AllocTypes;
3511
3512 // Now walk the old caller node's caller edges and move Edge's context ids
3513 // over to the corresponding edge into the node (which is created here if
3514 // this is a newly created node). We can tell whether this is a newly created
3515 // node by seeing if it has any caller edges yet.
3516#ifndef NDEBUG
3517 bool IsNewNode = NewCaller->CallerEdges.empty();
3518#endif
3519 // If we just moved a direct recursive edge, presumably its context ids should
3520 // also flow out of OldCaller via some other non-recursive callee edge. We
3521 // don't want to remove the recursive context ids from other caller edges yet,
3522 // otherwise the context ids get into an inconsistent state on OldCaller.
3523 // We will update these context ids on the non-recursive caller edge when and
3524 // if they are updated on the non-recursive callee.
3525 if (!Recursive) {
3526 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3527 auto OldCallerCaller = OldCallerEdge->Caller;
3528 // The context ids moving to the new caller are the subset of this edge's
3529 // context ids and the context ids on the callee edge being moved.
3530 DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3531 OldCallerEdge->getContextIds(), Edge->getContextIds());
3532 if (OldCaller == OldCallerCaller) {
3533 OldCallerCaller = NewCaller;
3534 // Don't actually move this one. The caller will move it directly via a
3535 // call to this function with this as the Edge if it is appropriate to
3536 // move to a different node that has a matching callee (itself).
3537 continue;
3538 }
3539 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3540 OldCallerEdge->AllocTypes =
3541 computeAllocType(OldCallerEdge->getContextIds());
3542 // In this function we expect that any pre-existing node already has edges
3543 // from the same callers as the old node. That should be true in the
3544 // current use case, where we will remove None-type edges after copying
3545 // over all caller edges from the callee.
3546 auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3547 // Since we would have skipped caller edges when moving a direct recursive
3548 // edge, this may not hold true when recursive handling is enabled.
3549 assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3550 if (ExistingCallerEdge) {
3551 ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3552 ExistingCallerEdge->AllocTypes |=
3553 computeAllocType(EdgeContextIdsToMove);
3554 continue;
3555 }
3556 auto NewEdge = std::make_shared<ContextEdge>(
3557 NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
3558 EdgeContextIdsToMove);
3559 NewCaller->CallerEdges.push_back(NewEdge);
3560 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3561 }
3562 }
3563 // Recompute the node alloc type now that its caller edges have been
3564 // updated (since we will compute from those edges).
3565 OldCaller->AllocTypes = OldCaller->computeAllocType();
3566 // OldCaller alloc type should be None iff its context id set is now empty.
3567 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3568 OldCaller->emptyContextIds());
3569 if (VerifyCCG) {
3570 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3571 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3572 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3573 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3574 /*CheckEdges=*/false);
3575 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3576 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3577 /*CheckEdges=*/false);
3578 }
3579}
3580
3581template <typename DerivedCCG, typename FuncTy, typename CallTy>
3582void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3583 recursivelyRemoveNoneTypeCalleeEdges(
3584 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3585 auto Inserted = Visited.insert(Node);
3586 if (!Inserted.second)
3587 return;
3588
3589 removeNoneTypeCalleeEdges(Node);
3590
3591 for (auto *Clone : Node->Clones)
3592 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3593
3594 // The recursive call may remove some of this Node's caller edges.
3595 // Iterate over a copy and skip any that were removed.
3596 auto CallerEdges = Node->CallerEdges;
3597 for (auto &Edge : CallerEdges) {
3598 // Skip any that have been removed by an earlier recursive call.
3599 if (Edge->isRemoved()) {
3600 assert(!is_contained(Node->CallerEdges, Edge));
3601 continue;
3602 }
3603 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3604 }
3605}
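
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this file): the "iterate over a copy and
// skip entries removed underneath us" pattern used above, on a toy container
// of shared_ptr edges. The Edge type and visit() callback are hypothetical
// stand-ins for the graph edges and the recursive cleanup.
#include <memory>
#include <vector>

namespace iterate_copy_sketch {

struct Edge {
  bool Removed = false;
  bool isRemoved() const { return Removed; }
};

// Stand-in for work that may remove edges (here it just flags one).
void visit(const std::shared_ptr<Edge> &E) { E->Removed = true; }

void walk(std::vector<std::shared_ptr<Edge>> &Edges) {
  // Copy first: visit() may mutate Edges, which would invalidate iterators
  // into the original vector.
  auto Snapshot = Edges;
  for (auto &E : Snapshot) {
    if (E->isRemoved())
      continue; // removed by an earlier visit() in this loop
    visit(E);
  }
}

} // namespace iterate_copy_sketch
// ---------------------------------------------------------------------------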
3606
3607// This is the standard DFS based backedge discovery algorithm.
3608template <typename DerivedCCG, typename FuncTy, typename CallTy>
3609void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3610 // If we are cloning recursive contexts, find and mark backedges from all root
3611 // callers, using the typical DFS based backedge analysis.
3612 if (!CloneRecursiveContexts)
3613 return;
3614 DenseSet<const ContextNode *> Visited;
3615 DenseSet<const ContextNode *> CurrentStack;
3616 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3617 auto *Node = Entry.second;
3618 if (Node->isRemoved())
3619 continue;
3620 // It is a root if it doesn't have callers.
3621 if (!Node->CallerEdges.empty())
3622 continue;
3623 markBackedges(Node, Visited, CurrentStack);
3624 assert(CurrentStack.empty());
3625 }
3626}
3627
3628// Recursive helper for above markBackedges method.
3629template <typename DerivedCCG, typename FuncTy, typename CallTy>
3630void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3631 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3632 DenseSet<const ContextNode *> &CurrentStack) {
3633 auto I = Visited.insert(Node);
3634 // We should only call this for unvisited nodes.
3635 assert(I.second);
3636 (void)I;
3637 for (auto &CalleeEdge : Node->CalleeEdges) {
3638 auto *Callee = CalleeEdge->Callee;
3639 if (Visited.count(Callee)) {
3640 // Since this was already visited we need to check if it is currently on
3641 // the recursive stack in which case it is a backedge.
3642 if (CurrentStack.count(Callee))
3643 CalleeEdge->IsBackedge = true;
3644 continue;
3645 }
3646 CurrentStack.insert(Callee);
3647 markBackedges(Callee, Visited, CurrentStack);
3648 CurrentStack.erase(Callee);
3649 }
3650}
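
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this file): the same DFS backedge marking
// on a toy graph, with std containers in place of the graph classes above. An
// edge is a backedge exactly when its target has been visited and is still on
// the current DFS stack. The Node/Edge types are hypothetical stand-ins.
#include <set>
#include <vector>

namespace backedge_sketch {

struct Node;

struct Edge {
  Node *Callee = nullptr;
  bool IsBackedge = false;
};

struct Node {
  std::vector<Edge> CalleeEdges;
};

void markBackedges(Node *N, std::set<const Node *> &Visited,
                   std::set<const Node *> &CurrentStack) {
  Visited.insert(N);
  for (Edge &E : N->CalleeEdges) {
    if (Visited.count(E.Callee)) {
      // Already visited: a backedge iff the target is on the DFS stack.
      if (CurrentStack.count(E.Callee))
        E.IsBackedge = true;
      continue;
    }
    CurrentStack.insert(E.Callee);
    markBackedges(E.Callee, Visited, CurrentStack);
    CurrentStack.erase(E.Callee);
  }
}

} // namespace backedge_sketch
// ---------------------------------------------------------------------------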
3651
3652template <typename DerivedCCG, typename FuncTy, typename CallTy>
3653void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3654 DenseSet<const ContextNode *> Visited;
3655 for (auto &Entry : AllocationCallToContextNodeMap) {
3656 Visited.clear();
3657 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3658 }
3659 Visited.clear();
3660 for (auto &Entry : AllocationCallToContextNodeMap)
3661 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3662 if (VerifyCCG)
3663 check();
3664}
3665
3666// helper function to check an AllocType is cold or notcold or both.
3667 bool checkColdOrNotCold(uint8_t AllocType) {
3668 return (AllocType == (uint8_t)AllocationType::Cold) ||
3669 (AllocType == (uint8_t)AllocationType::NotCold) ||
3670 (AllocType ==
3671 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3672 }
3673
3674template <typename DerivedCCG, typename FuncTy, typename CallTy>
3675void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3676 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3677 const DenseSet<uint32_t> &AllocContextIds) {
3678 if (VerifyNodes)
3679 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3680 assert(!Node->CloneOf);
3681
3682 // If Node has a null call, then either it wasn't found in the module (regular
3683 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3684 // cloning (e.g. recursion, calls multiple targets, etc).
3685 // Do this here so that we don't try to recursively clone callers below, which
3686 // isn't useful at least for this node.
3687 if (!Node->hasCall())
3688 return;
3689
3690 // No need to look at any callers if allocation type already unambiguous.
3691 if (hasSingleAllocType(Node->AllocTypes))
3692 return;
3693
3694#ifndef NDEBUG
3695 auto Insert =
3696#endif
3697 Visited.insert(Node);
3698 // We should not have visited this node yet.
3699 assert(Insert.second);
3700 // The recursive call to identifyClones may delete the current edge from the
3701 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3702 // in an iterator and having the recursive call erase from it. Other edges may
3703 // also get removed during the recursion, which will have null Callee and
3704 // Caller pointers (and are deleted later), so we skip those below.
3705 {
3706 auto CallerEdges = Node->CallerEdges;
3707 for (auto &Edge : CallerEdges) {
3708 // Skip any that have been removed by an earlier recursive call.
3709 if (Edge->isRemoved()) {
3710 assert(!is_contained(Node->CallerEdges, Edge));
3711 continue;
3712 }
3713 // Defer backedges. See comments further below where these edges are
3714 // handled during the cloning of this Node.
3715 if (Edge->IsBackedge) {
3716 // We should only mark these if cloning recursive contexts, where we
3717 // need to do this deferral.
3718 assert(CloneRecursiveContexts);
3719 continue;
3720 }
3721 // Ignore any caller we previously visited via another edge.
3722 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3723 identifyClones(Edge->Caller, Visited, AllocContextIds);
3724 }
3725 }
3726 }
3727
3728 // Check if we reached an unambiguous call or have only a single caller.
3729 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3730 return;
3731
3732 // We need to clone.
3733
3734 // Try to keep the original version as alloc type NotCold. This will make
3735 // cases with indirect calls or any other situation with an unknown call to
3736 // the original function get the default behavior. We do this by sorting the
3737 // CallerEdges of the Node we will clone by alloc type.
3738 //
3739 // Give NotCold edge the lowest sort priority so those edges are at the end of
3740 // the caller edges vector, and stay on the original version (since the below
3741 // code clones greedily until it finds all remaining edges have the same type
3742 // and leaves the remaining ones on the original Node).
3743 //
3744 // We shouldn't actually have any None type edges, so the sorting priority for
3745 // that is arbitrary, and we assert in that case below.
3746 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3747 /*Cold*/ 1,
3748 /*NotColdCold*/ 2};
3749 llvm::stable_sort(Node->CallerEdges,
3750 [&](const std::shared_ptr<ContextEdge> &A,
3751 const std::shared_ptr<ContextEdge> &B) {
3752 // Nodes with non-empty context ids should be sorted
3753 // before those with empty context ids.
3754 if (A->ContextIds.empty())
3755 // Either B ContextIds are non-empty (in which case we
3756 // should return false because B < A), or B ContextIds
3757 // are empty, in which case they are equal, and we
3758 // should maintain the original relative ordering.
3759 return false;
3760 if (B->ContextIds.empty())
3761 return true;
3762
3763 if (A->AllocTypes == B->AllocTypes)
3764 // Use the first context id for each edge as a
3765 // tie-breaker.
3766 return *A->ContextIds.begin() < *B->ContextIds.begin();
3767 return AllocTypeCloningPriority[A->AllocTypes] <
3768 AllocTypeCloningPriority[B->AllocTypes];
3769 });
3770
3771 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3772
3773 DenseSet<uint32_t> RecursiveContextIds;
3775 // If we are allowing recursive callsites, but have also disabled recursive
3776 // contexts, look for context ids that show up in multiple caller edges.
3777 if (AllowRecursiveCallsites && !CloneRecursiveContexts) {
3778 DenseSet<uint32_t> AllCallerContextIds;
3779 for (auto &CE : Node->CallerEdges) {
3780 // Resize to the largest set of caller context ids, since we know the
3781 // final set will be at least that large.
3782 AllCallerContextIds.reserve(CE->getContextIds().size());
3783 for (auto Id : CE->getContextIds())
3784 if (!AllCallerContextIds.insert(Id).second)
3785 RecursiveContextIds.insert(Id);
3786 }
3787 }
3788
3789 // Iterate until we find no more opportunities for disambiguating the alloc
3790 // types via cloning. In most cases this loop will terminate once the Node
3791 // has a single allocation type, in which case no more cloning is needed.
3792 // Iterate over a copy of Node's caller edges, since we may need to remove
3793 // edges in the moveEdgeTo* methods, and this simplifies the handling and
3794 // makes it less error-prone.
3795 auto CallerEdges = Node->CallerEdges;
3796 for (auto &CallerEdge : CallerEdges) {
3797 // Skip any that have been removed by an earlier recursive call.
3798 if (CallerEdge->isRemoved()) {
3799 assert(!is_contained(Node->CallerEdges, CallerEdge));
3800 continue;
3801 }
3802 assert(CallerEdge->Callee == Node);
3803
3804 // See if cloning the prior caller edge left this node with a single alloc
3805 // type or a single caller. In that case no more cloning of Node is needed.
3806 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3807 break;
3808
3809 // If the caller was not successfully matched to a call in the IR/summary,
3810 // there is no point in trying to clone for it as we can't update that call.
3811 if (!CallerEdge->Caller->hasCall())
3812 continue;
3813
3814 // Only need to process the ids along this edge pertaining to the given
3815 // allocation.
3816 auto CallerEdgeContextsForAlloc =
3817 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3818 if (!RecursiveContextIds.empty())
3819 CallerEdgeContextsForAlloc =
3820 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
3821 if (CallerEdgeContextsForAlloc.empty())
3822 continue;
3823
3824 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3825
3826 // Compute the node callee edge alloc types corresponding to the context ids
3827 // for this caller edge.
3828 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3829 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
3830 for (auto &CalleeEdge : Node->CalleeEdges)
3831 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3832 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3833
3834 // Don't clone if doing so will not disambiguate any alloc types amongst
3835 // caller edges (including the callee edges that would be cloned).
3836 // Otherwise we will simply move all edges to the clone.
3837 //
3838 // First check if by cloning we will disambiguate the caller allocation
3839 // type from node's allocation type. Query allocTypeToUse so that we don't
3840 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3841 // neither of these should be None type.
3842 //
3843 // Then check if by cloning node at least one of the callee edges will be
3844 // disambiguated by splitting out different context ids.
3845 //
3846 // However, always do the cloning if this is a backedge, in which case we
3847 // have not yet cloned along this caller edge.
3848 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3849 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3850 if (!CallerEdge->IsBackedge &&
3851 allocTypeToUse(CallerAllocTypeForAlloc) ==
3852 allocTypeToUse(Node->AllocTypes) &&
3853 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3854 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
3855 continue;
3856 }
3857
3858 if (CallerEdge->IsBackedge) {
3859 // We should only mark these if cloning recursive contexts, where we
3860 // need to do this deferral.
3861 assert(CloneRecursiveContexts);
3862 DeferredBackedges++;
3863 }
3864
3865 // If this is a backedge, we now do recursive cloning starting from its
3866 // caller since we may have moved unambiguous caller contexts to a clone
3867 // of this Node in a previous iteration of the current loop, giving more
3868 // opportunity for cloning through the backedge. Because we sorted the
3869 // caller edges earlier so that cold caller edges are first, we would have
3870 // visited and cloned this node for any unambiguously cold non-recursive
3871 // callers before any ambiguous backedge callers. Note that we don't do this
3872 // if the caller is already cloned or visited during cloning (e.g. via a
3873 // different context path from the allocation).
3874 // TODO: Can we do better in the case where the caller was already visited?
3875 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
3876 !Visited.count(CallerEdge->Caller)) {
3877 const auto OrigIdCount = CallerEdge->getContextIds().size();
3878 // Now do the recursive cloning of this backedge's caller, which was
3879 // deferred earlier.
3880 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
3881 removeNoneTypeCalleeEdges(CallerEdge->Caller);
3882 // See if the recursive call to identifyClones moved the context ids to a
3883 // new edge from this node to a clone of caller, and switch to looking at
3884 // that new edge so that we clone Node for the new caller clone.
3885 bool UpdatedEdge = false;
3886 if (OrigIdCount > CallerEdge->getContextIds().size()) {
3887 for (auto E : Node->CallerEdges) {
3888 // Only interested in clones of the current edge's caller.
3889 if (E->Caller->CloneOf != CallerEdge->Caller)
3890 continue;
3891 // See if this edge contains any of the context ids originally on the
3892 // current caller edge.
3893 auto CallerEdgeContextsForAllocNew =
3894 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
3895 if (CallerEdgeContextsForAllocNew.empty())
3896 continue;
3897 // Make sure we don't pick a previously existing caller edge of this
3898 // Node, which would be processed on a different iteration of the
3899 // outer loop over the saved CallerEdges.
3900 if (llvm::is_contained(CallerEdges, E))
3901 continue;
3902 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
3903 // are updated further below for all cases where we just invoked
3904 // identifyClones recursively.
3905 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
3906 CallerEdge = E;
3907 UpdatedEdge = true;
3908 break;
3909 }
3910 }
3911 // If cloning removed this edge (and we didn't update it to a new edge
3912 // above), we're done with this edge. It's possible we moved all of the
3913 // context ids to an existing clone, in which case there's no need to do
3914 // further processing for them.
3915 if (CallerEdge->isRemoved())
3916 continue;
3917
3918 // Now we need to update the information used for the cloning decisions
3919 // further below, as we may have modified edges and their context ids.
3920
3921 // Note if we changed the CallerEdge above we would have already updated
3922 // the context ids.
3923 if (!UpdatedEdge) {
3924 CallerEdgeContextsForAlloc = set_intersection(
3925 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
3926 if (CallerEdgeContextsForAlloc.empty())
3927 continue;
3928 }
3929 // Update the other information that depends on the edges and on the now
3930 // updated CallerEdgeContextsForAlloc.
3931 CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3932 CalleeEdgeAllocTypesForCallerEdge.clear();
3933 for (auto &CalleeEdge : Node->CalleeEdges) {
3934 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3935 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3936 }
3937 }
3938
3939 // First see if we can use an existing clone. Check each clone and its
3940 // callee edges for matching alloc types.
3941 ContextNode *Clone = nullptr;
3942 for (auto *CurClone : Node->Clones) {
3943 if (allocTypeToUse(CurClone->AllocTypes) !=
3944 allocTypeToUse(CallerAllocTypeForAlloc))
3945 continue;
3946
3947 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3948 hasSingleAllocType(CallerAllocTypeForAlloc);
3949 // The above check should mean that if both have single alloc types that
3950 // they should be equal.
3951 assert(!BothSingleAlloc ||
3952 CurClone->AllocTypes == CallerAllocTypeForAlloc);
3953
3954 // If either both have a single alloc type (which are the same), or if the
3955 // clone's callee edges have the same alloc types as those for the current
3956 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3957 // then we can reuse this clone.
3958 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3959 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3960 Clone = CurClone;
3961 break;
3962 }
3963 }
3964
3965 // The edge iterator is adjusted when we move the CallerEdge to the clone.
3966 if (Clone)
3967 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
3968 CallerEdgeContextsForAlloc);
3969 else
3970 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
3971
3972 // Sanity check that no alloc types on clone or its edges are None.
3973 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3974 }
3975
3976 // We should still have some context ids on the original Node.
3977 assert(!Node->emptyContextIds());
3978
3979 // Sanity check that no alloc types on node or edges are None.
3980 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3981
3982 if (VerifyNodes)
3983 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3984}
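
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this file): the effect of the
// AllocTypeCloningPriority ordering used above, on a plain vector of alloc
// type masks. The index encoding (0=None, 1=NotCold, 2=Cold, 3=NotCold|Cold)
// follows the comments on the priority array above; the real code sorts whole
// caller edges and additionally breaks ties on the first context id.
#include <algorithm>
#include <cstdint>
#include <vector>

namespace clone_sort_sketch {

// Lower priority value sorts earlier. NotCold gets the largest value so those
// edges end up last and stay attached to the original (uncloned) node.
inline void sortForCloning(std::vector<uint8_t> &AllocTypes) {
  static const unsigned Priority[] = {/*None*/ 3, /*NotCold*/ 4, /*Cold*/ 1,
                                      /*NotColdCold*/ 2};
  std::stable_sort(AllocTypes.begin(), AllocTypes.end(),
                   [](uint8_t A, uint8_t B) {
                     return Priority[A] < Priority[B];
                   });
}

} // namespace clone_sort_sketch
// ---------------------------------------------------------------------------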
3985
3986void ModuleCallsiteContextGraph::updateAllocationCall(
3987 CallInfo &Call, AllocationType AllocType) {
3988 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
3989 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
3990 "memprof", AllocTypeString);
3991 cast<CallBase>(Call.call())->addFnAttr(A);
3992 OREGetter(Call.call()->getFunction())
3993 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3994 << ore::NV("AllocationCall", Call.call()) << " in clone "
3995 << ore::NV("Caller", Call.call()->getFunction())
3996 << " marked with memprof allocation attribute "
3997 << ore::NV("Attribute", AllocTypeString));
3998}
3999
4000void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
4001 AllocationType AllocType) {
4002 auto *AI = cast<AllocInfo *>(Call.call());
4003 assert(AI);
4004 assert(AI->Versions.size() > Call.cloneNo());
4005 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
4006}
4007
4008AllocationType
4009ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4010 const auto *CB = cast<CallBase>(Call.call());
4011 if (!CB->getAttributes().hasFnAttr("memprof"))
4012 return AllocationType::None;
4013 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4014 ? AllocationType::Cold
4015 : AllocationType::NotCold;
4016}
4017
4018AllocationType
4019IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4020 const auto *AI = cast<AllocInfo *>(Call.call());
4021 assert(AI->Versions.size() > Call.cloneNo());
4022 return (AllocationType)AI->Versions[Call.cloneNo()];
4023}
4024
4025void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4026 FuncInfo CalleeFunc) {
4027 auto *CurF = getCalleeFunc(CallerCall.call());
4028 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4029 if (isMemProfClone(*CurF)) {
4030 // If we already assigned this callsite to call a specific non-default
4031 // clone (i.e. not the original function which is clone 0), ensure that we
4032 // aren't trying to now update it to call a different clone, which is
4033 // indicative of a bug in the graph or function assignment.
4034 auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
4035 if (CurCalleeCloneNo != NewCalleeCloneNo) {
4036 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4037 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4038 << "\n");
4039 MismatchedCloneAssignments++;
4040 }
4041 }
4042 if (NewCalleeCloneNo > 0)
4043 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
4044 OREGetter(CallerCall.call()->getFunction())
4045 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
4046 << ore::NV("Call", CallerCall.call()) << " in clone "
4047 << ore::NV("Caller", CallerCall.call()->getFunction())
4048 << " assigned to call function clone "
4049 << ore::NV("Callee", CalleeFunc.func()));
4050}
4051
4052void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4053 FuncInfo CalleeFunc) {
4054 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
4055 assert(CI &&
4056 "Caller cannot be an allocation which should not have profiled calls");
4057 assert(CI->Clones.size() > CallerCall.cloneNo());
4058 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4059 auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
4060 // If we already assigned this callsite to call a specific non-default
4061 // clone (i.e. not the original function which is clone 0), ensure that we
4062 // aren't trying to now update it to call a different clone, which is
4063 // indicative of a bug in the graph or function assignment.
4064 if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
4065 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4066 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4067 << "\n");
4068 MismatchedCloneAssignments++;
4069 }
4070 CurCalleeCloneNo = NewCalleeCloneNo;
4071}
4072
4073// Update the debug information attached to NewFunc to use the clone Name. Note
4074// this needs to be done both for any existing DISubprogram for the definition
4075// and for any separate declaration DISubprogram.
4076static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
4077 assert(Name == NewFunc->getName());
4078 auto *SP = NewFunc->getSubprogram();
4079 if (!SP)
4080 return;
4081 auto *MDName = MDString::get(NewFunc->getParent()->getContext(), Name);
4082 SP->replaceLinkageName(MDName);
4083 DISubprogram *Decl = SP->getDeclaration();
4084 if (!Decl)
4085 return;
4086 TempDISubprogram NewDecl = Decl->clone();
4087 NewDecl->replaceLinkageName(MDName);
4088 SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl)));
4089}
4090
4091CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4092 Instruction *>::FuncInfo
4093ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4094 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4095 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4096 // Use existing LLVM facilities for cloning and obtaining Call in clone
4097 ValueToValueMapTy VMap;
4098 auto *NewFunc = CloneFunction(Func.func(), VMap);
4099 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4100 assert(!Func.func()->getParent()->getFunction(Name));
4101 NewFunc->setName(Name);
4102 updateSubprogramLinkageName(NewFunc, Name);
4103 for (auto &Inst : CallsWithMetadataInFunc) {
4104 // This map always has the initial version in it.
4105 assert(Inst.cloneNo() == 0);
4106 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4107 }
4108 OREGetter(Func.func())
4109 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4110 << "created clone " << ore::NV("NewFunction", NewFunc));
4111 return {NewFunc, CloneNo};
4112}
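// For illustration: getMemProfFuncName appends a ".memprof.<N>" style suffix
// for clone number N, so cloning "foo" for clone 1 would produce a new
// function named roughly "foo.memprof.1", with its debug info linkage name
// updated to match via updateSubprogramLinkageName above.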
4113
4114CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4115 IndexCall>::FuncInfo
4116IndexCallsiteContextGraph::cloneFunctionForCallsite(
4117 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4118 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4119 // Check how many clones we have of Call (and therefore function).
4120 // The next clone number is the current size of versions array.
4121 // Confirm this matches the CloneNo provided by the caller, which is based on
4122 // the number of function clones we have.
4123 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4124 ? cast<AllocInfo *>(Call.call())->Versions.size()
4125 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4126 // Walk all the instructions in this function. Create a new version for
4127 // each (by adding an entry to the Versions/Clones summary array), and copy
4128 // over the version being called for the function clone being cloned here.
4129 // Additionally, add an entry to the CallMap for the new function clone,
4130 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4131 // to the new call clone.
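// For example, if an AllocInfo summary currently has Versions = [Cold] and we
// are creating function clone 1, the loop below appends a None entry to give
// Versions = [Cold, None]; the entry for the new clone is filled in later by
// updateAllocationCall. CallsiteInfo Clones entries grow the same way and are
// filled in later by updateCall.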
4132 for (auto &Inst : CallsWithMetadataInFunc) {
4133 // This map always has the initial version in it.
4134 assert(Inst.cloneNo() == 0);
4135 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4136 assert(AI->Versions.size() == CloneNo);
4137 // We assign the allocation type later (in updateAllocationCall), just add
4138 // an entry for it here.
4139 AI->Versions.push_back(0);
4140 } else {
4141 auto *CI = cast<CallsiteInfo *>(Inst.call());
4142 assert(CI && CI->Clones.size() == CloneNo);
4143 // We assign the clone number later (in updateCall), just add an entry for
4144 // it here.
4145 CI->Clones.push_back(0);
4146 }
4147 CallMap[Inst] = {Inst.call(), CloneNo};
4148 }
4149 return {Func.func(), CloneNo};
4150}
4151
4152// We perform cloning for each allocation node separately. However, this
4153// sometimes results in a situation where the same node calls multiple
4154// clones of the same callee, created for different allocations. This
4155// causes issues when assigning functions to these clones, as each node can
4156// in reality only call a single callee clone.
4157//
4158// To address this, before assigning functions, merge callee clone nodes as
4159// needed using a post order traversal from the allocations. We attempt to
4160// use existing clones as the merge node when legal, and to share them
4161// among callers with the same properties (callers calling the same set of
4162// callee clone nodes for the same allocations).
4163//
4164// Without this fix, in some cases incorrect function assignment will lead
4165// to calling the wrong allocation clone.
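// As a small illustrative example: suppose callsite node C ended up with two
// callee edges, one to clone D' (created while cloning for allocation A1) and
// one to clone D'' (created for allocation A2), where D' and D'' are clones of
// the same original callsite D. Since the single call in C can only target one
// function clone, D' and D'' must be merged (or one of them reused as the
// merge node) before function assignment so that C has a single callee clone.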
4166template <typename DerivedCCG, typename FuncTy, typename CallTy>
4167void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4168 if (!MergeClones)
4169 return;
4170
4171 // Generate a map from context id to the associated allocation node for use
4172 // when merging clones.
4173 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4174 for (auto &Entry : AllocationCallToContextNodeMap) {
4175 auto *Node = Entry.second;
4176 for (auto Id : Node->getContextIds())
4177 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4178 for (auto *Clone : Node->Clones) {
4179 for (auto Id : Clone->getContextIds())
4180 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4181 }
4182 }
4183
4184 // Post order traversal starting from allocations to ensure each callsite
4185 // calls a single clone of its callee. Callee nodes that are clones of each
4186 // other are merged (via new merge nodes if needed) to achieve this.
4187 DenseSet<const ContextNode *> Visited;
4188 for (auto &Entry : AllocationCallToContextNodeMap) {
4189 auto *Node = Entry.second;
4190
4191 mergeClones(Node, Visited, ContextIdToAllocationNode);
4192
4193 // Make a copy so the recursive post order traversal that may create new
4194 // clones doesn't mess up iteration. Note that the recursive traversal
4195 // itself does not call mergeClones on any of these nodes, which are all
4196 // (clones of) allocations.
4197 auto Clones = Node->Clones;
4198 for (auto *Clone : Clones)
4199 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4200 }
4201
4202 if (DumpCCG) {
4203 dbgs() << "CCG after merging:\n";
4204 dbgs() << *this;
4205 }
4206 if (ExportToDot)
4207 exportToDot("aftermerge");
4208
4209 if (VerifyCCG) {
4210 check();
4211 }
4212}
4213
4214// Recursive helper for above mergeClones method.
4215template <typename DerivedCCG, typename FuncTy, typename CallTy>
4216void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4217 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4218 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4219 auto Inserted = Visited.insert(Node);
4220 if (!Inserted.second)
4221 return;
4222
4223 // Iteratively perform merging on this node to handle new caller nodes created
4224 // during the recursive traversal. We could do something more elegant such as
4225 // maintain a worklist, but this is a simple approach that doesn't cause a
4226 // measureable compile time effect, as most nodes don't have many caller
4227 // edges to check.
4228 bool FoundUnvisited = true;
4229 unsigned Iters = 0;
4230 while (FoundUnvisited) {
4231 Iters++;
4232 FoundUnvisited = false;
4233 // Make a copy since the recursive call may move a caller edge to a new
4234 // callee, messing up the iterator.
4235 auto CallerEdges = Node->CallerEdges;
4236 for (auto CallerEdge : CallerEdges) {
4237 // Skip any caller edge moved onto a different callee during recursion.
4238 if (CallerEdge->Callee != Node)
4239 continue;
4240 // If we found an unvisited caller, note that we should check the caller
4241 // edges again as mergeClones may add or change caller nodes.
4242 if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
4243 FoundUnvisited = true;
4244 mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4245 }
4246 }
4247
4248 TotalMergeInvokes++;
4249 TotalMergeIters += Iters;
4250 if (Iters > MaxMergeIters)
4251 MaxMergeIters = Iters;
4252
4253 // Merge for this node after we handle its callers.
4254 mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4255}
4256
4257template <typename DerivedCCG, typename FuncTy, typename CallTy>
4258void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4259 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4260 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4261 // Ignore Node if we moved all of its contexts to clones.
4262 if (Node->emptyContextIds())
4263 return;
4264
4265 // First identify groups of clones among Node's callee edges, by building
4266 // a map from each callee base node to the associated callee edges from Node.
4267 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4268 OrigNodeToCloneEdges;
4269 for (const auto &E : Node->CalleeEdges) {
4270 auto *Callee = E->Callee;
4271 if (!Callee->CloneOf && Callee->Clones.empty())
4272 continue;
4273 ContextNode *Base = Callee->getOrigNode();
4274 OrigNodeToCloneEdges[Base].push_back(E);
4275 }
4276
4277 // Helper for callee edge sorting below. Return true if A's callee has fewer
4278 // caller edges than B, or if A is a clone and B is not, or if A's first
4279 // context id is smaller than B's.
4280 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4281 const std::shared_ptr<ContextEdge> &B) {
4282 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4283 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4284 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4285 return true;
4286 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4287 return false;
4288 // Use the first context id for each edge as a
4289 // tie-breaker.
4290 return *A->ContextIds.begin() < *B->ContextIds.begin();
4291 };
4292
4293 // Process each set of callee clones called by Node, performing the needed
4294 // merging.
4295 for (auto Entry : OrigNodeToCloneEdges) {
4296 // CalleeEdges is the set of edges from Node reaching callees that are
4297 // mutual clones of each other.
4298 auto &CalleeEdges = Entry.second;
4299 auto NumCalleeClones = CalleeEdges.size();
4300 // A single edge means there is no merging needed.
4301 if (NumCalleeClones == 1)
4302 continue;
4303 // Sort the CalleeEdges calling this group of clones in ascending order of
4304 // their caller edge counts, putting the original non-clone node first in
4305 // cases of a tie. This simplifies finding an existing node to use as the
4306 // merge node.
4307 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4308
4309 /// Find other callers of the given set of callee edges that can
4310 /// share the same callee merge node. See the comments at this method
4311 /// definition for details.
4312 DenseSet<ContextNode *> OtherCallersToShareMerge;
4313 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4314 OtherCallersToShareMerge);
4315
4316 // Now do the actual merging. Identify existing or create a new MergeNode
4317 // during the first iteration. Move each callee over, along with edges from
4318 // other callers we've determined above can share the same merge node.
4319 ContextNode *MergeNode = nullptr;
4320 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4321 for (auto CalleeEdge : CalleeEdges) {
4322 auto *OrigCallee = CalleeEdge->Callee;
4323 // If we don't have a MergeNode yet (only happens on the first iteration,
4324 // as a new one will be created when we go to move the first callee edge
4325 // over as needed), see if we can use this callee.
4326 if (!MergeNode) {
4327 // If there are no other callers, simply use this callee.
4328 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4329 MergeNode = OrigCallee;
4330 NonNewMergedNodes++;
4331 continue;
4332 }
4333 // Otherwise, if we have identified other caller nodes that can share
4334 // the merge node with Node, see if all of OrigCallee's callers are
4335 // going to share the same merge node. In that case we can use callee
4336 // (since all of its callers would move to the new merge node).
4337 if (!OtherCallersToShareMerge.empty()) {
4338 bool MoveAllCallerEdges = true;
4339 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4340 if (CalleeCallerE == CalleeEdge)
4341 continue;
4342 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4343 MoveAllCallerEdges = false;
4344 break;
4345 }
4346 }
4347 // If we are going to move all callers over, we can use this callee as
4348 // the MergeNode.
4349 if (MoveAllCallerEdges) {
4350 MergeNode = OrigCallee;
4351 NonNewMergedNodes++;
4352 continue;
4353 }
4354 }
4355 }
4356 // Move this callee edge, creating a new merge node if necessary.
4357 if (MergeNode) {
4358 assert(MergeNode != OrigCallee);
4359 moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
4360 /*NewClone*/ false);
4361 } else {
4362 MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
4363 NewMergedNodes++;
4364 }
4365 // Now move all identified edges from other callers over to the merge node
4366 // as well.
4367 if (!OtherCallersToShareMerge.empty()) {
4368 // Make and iterate over a copy of OrigCallee's caller edges because
4369 // some of these will be moved off of the OrigCallee and that would mess
4370 // up the iteration from OrigCallee.
4371 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4372 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4373 if (CalleeCallerE == CalleeEdge)
4374 continue;
4375 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4376 continue;
4377 CallerToMoveCount[CalleeCallerE->Caller]++;
4378 moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
4379 /*NewClone*/ false);
4380 }
4381 }
4382 removeNoneTypeCalleeEdges(OrigCallee);
4383 removeNoneTypeCalleeEdges(MergeNode);
4384 }
4385 }
4386}
4387
4388// Look for other nodes that have edges to the same set of callee
4389// clones as the current Node. Those can share the eventual merge node
4390// (reducing cloning and binary size overhead) iff:
4391// - they have edges to the same set of callee clones
4392// - each callee edge reaches a subset of the same allocations as Node's
4393// corresponding edge to the same callee clone.
4394// The second requirement is to ensure that we don't undo any of the
4395// necessary cloning to distinguish contexts with different allocation
4396// behavior.
4397// FIXME: This is somewhat conservative, as we really just need to ensure
4398// that they don't reach the same allocations as contexts on edges from Node
4399// going to any of the *other* callee clones being merged. However, that
4400// requires more tracking and checking to get right.
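// For example, if caller nodes C1 (the current Node) and C2 both have edges to
// the same pair of callee clones D' and D'', and every context on C2's edges
// reaches a subset of the allocations reached by the corresponding edge from
// C1, then C2 can be redirected to the merge node created for C1's edges
// instead of getting its own merge clone, reducing the number of new nodes and
// the eventual code size growth.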
4401template <typename DerivedCCG, typename FuncTy, typename CallTy>
4402void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
4403 findOtherCallersToShareMerge(
4404 ContextNode *Node,
4405 std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
4406 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
4407 DenseSet<ContextNode *> &OtherCallersToShareMerge) {
4408 auto NumCalleeClones = CalleeEdges.size();
4409 // This map counts how many edges to the same callee clone exist for other
4410 // caller nodes of each callee clone.
4411 DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
4412 // Counts the number of other caller nodes that have edges to all callee
4413 // clones that don't violate the allocation context checking.
4414 unsigned PossibleOtherCallerNodes = 0;
4415
4416 // We only need to look at other Caller nodes if the first callee edge has
4417 // multiple callers (recall they are sorted in ascending order above).
4418 if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
4419 return;
4420
4421 // For each callee edge:
4422 // - Collect the count of other caller nodes calling the same callees.
4423 // - Collect the alloc nodes reached by contexts on each callee edge.
4424 DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
4425 for (auto CalleeEdge : CalleeEdges) {
4426 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4427 // For each other caller of the same callee, increment the count of
4428 // edges reaching the same callee clone.
4429 for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
4430 if (CalleeCallerEdges->Caller == Node) {
4431 assert(CalleeCallerEdges == CalleeEdge);
4432 continue;
4433 }
4434 OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
4435 // If this caller edge now reaches all of the same callee clones,
4436 // increment the count of candidate other caller nodes.
4437 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
4438 NumCalleeClones)
4439 PossibleOtherCallerNodes++;
4440 }
4441 // Collect the alloc nodes reached by contexts on each callee edge, for
4442 // later analysis.
4443 for (auto Id : CalleeEdge->getContextIds()) {
4444 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4445 if (!Alloc) {
4446 // FIXME: unclear why this happens occasionally; presumably due to
4447 // imperfect graph updates, possibly involving recursion.
4448 MissingAllocForContextId++;
4449 continue;
4450 }
4451 CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
4452 }
4453 }
4454
4455 // Now walk the callee edges again, and make sure that for each candidate
4456 // caller node all of its edges to the callees reach the same allocs (or
4457 // a subset) as those along the corresponding callee edge from Node.
4458 for (auto CalleeEdge : CalleeEdges) {
4459 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4460 // Stop if we do not have any (more) candidate other caller nodes.
4461 if (!PossibleOtherCallerNodes)
4462 break;
4463 auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
4464 // Check each other caller of this callee clone.
4465 for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
4466 // Not interested in the callee edge from Node itself.
4467 if (CalleeCallerE == CalleeEdge)
4468 continue;
4469 // Skip any callers that didn't have callee edges to all the same
4470 // callee clones.
4471 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
4472 NumCalleeClones)
4473 continue;
4474 // Make sure that each context along edge from candidate caller node
4475 // reaches an allocation also reached by this callee edge from Node.
4476 for (auto Id : CalleeCallerE->getContextIds()) {
4477 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4478 if (!Alloc)
4479 continue;
4480 // If not, simply reset the map entry to 0 so caller is ignored, and
4481 // reduce the count of candidate other caller nodes.
4482 if (!CurCalleeAllocNodes.contains(Alloc)) {
4483 OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
4484 PossibleOtherCallerNodes--;
4485 break;
4486 }
4487 }
4488 }
4489 }
4490
4491 if (!PossibleOtherCallerNodes)
4492 return;
4493
4494 // Build the set of other caller nodes that can use the same callee merge
4495 // node.
4496 for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
4497 if (Count != NumCalleeClones)
4498 continue;
4499 OtherCallersToShareMerge.insert(OtherCaller);
4500 }
4501}
4502
4503// This method assigns cloned callsites to functions, cloning the functions as
4504// needed. The assignment is greedy and proceeds roughly as follows:
4505//
4506// For each function Func:
4507// For each call with graph Node having clones:
4508// Initialize ClonesWorklist to Node and its clones
4509// Initialize NodeCloneCount to 0
4510// While ClonesWorklist is not empty:
4511// Clone = pop front ClonesWorklist
4512// NodeCloneCount++
4513// If Func has been cloned less than NodeCloneCount times:
4514// If NodeCloneCount is 1:
4515// Assign Clone to original Func
4516// Continue
4517// Create a new function clone
4518// If other callers not assigned to call a function clone yet:
4519// Assign them to call new function clone
4520// Continue
4521// Assign any other caller calling the cloned version to new clone
4522//
4523// For each caller of Clone:
4524// If caller is assigned to call a specific function clone:
4525// If we cannot assign Clone to that function clone:
4526// Create new callsite Clone NewClone
4527// Add NewClone to ClonesWorklist
4528// Continue
4529// Assign Clone to existing caller's called function clone
4530// Else:
4531// If Clone not already assigned to a function clone:
4532// Assign to first function clone without assignment
4533// Assign caller to selected function clone
4534// For each call with graph Node having clones:
4535// If number func clones > number call's callsite Node clones:
4536// Record func CallInfo clones without Node clone in UnassignedCallClones
4537// For callsite Nodes in DFS order from allocations:
4538// If IsAllocation:
4539// Update allocation with alloc type
4540// Else:
4541 // For Call, all MatchingCalls, and associated UnassignedCallClones:
4542// Update call to call recorded callee clone
4543//
4544template <typename DerivedCCG, typename FuncTy, typename CallTy>
4545bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4546 bool Changed = false;
4547
4548 mergeClones();
4549
4550 // Keep track of the assignment of nodes (callsites) to function clones they
4551 // call.
4552 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4553
4554 // Update caller node to call function version CalleeFunc, by recording the
4555 // assignment in CallsiteToCalleeFuncCloneMap.
4556 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4557 const FuncInfo &CalleeFunc) {
4558 assert(Caller->hasCall());
4559 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4560 };
4561
4562 // Information for a single clone of this Func.
4563 struct FuncCloneInfo {
4564 // The function clone.
4565 FuncInfo FuncClone;
4566 // Remappings of each call of interest (from original uncloned call to the
4567 // corresponding cloned call in this function clone).
4568 DenseMap<CallInfo, CallInfo> CallMap;
4569 };
4570
4571 // Map to keep track of information needed to update calls in function clones
4572 // when their corresponding callsite node was not itself cloned for that
4573 // function clone. Because of call context pruning (i.e. we only keep as much
4574 // caller information as needed to distinguish hot vs cold), we may not have
4575 // caller edges coming to each callsite node from all possible function
4576 // callers. A function clone may get created for other callsites in the
4577 // function for which there are caller edges that were not pruned. Any other
4578 // callsites in that function clone, which were not themselves cloned for
4579 // that function clone, should get updated the same way as the corresponding
4580 // callsite in the original function (which may call a clone of its callee).
4581 //
4582 // We build this map after completing function cloning for each function, so
4583 // that we can record the information from its call maps before they are
4584 // destructed. The map will be used as we update calls to update any still
4585 // unassigned call clones. Note that we may create new node clones as we clone
4586 // other functions, so later on we check which node clones were still not
4587 // created. To this end, the inner map is a map from function clone number to
4588 // the list of calls cloned for that function (can be more than one due to the
4589 // Node's MatchingCalls array).
4590 //
4591 // The alternative is creating new callsite clone nodes below as we clone the
4592 // function, but that is trickier to get right and likely adds more overhead.
4593 //
4594 // Inner map is a std::map so sorted by key (clone number), in order to get
4595 // ordered remarks in the full LTO case.
4596 DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
4597 UnassignedCallClones;
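// For example, if foo was cloned twice for some other callsite but callsite
// node N in foo only needed one clone, the calls corresponding to N in the
// second function clone's CallMap are recorded here under that clone number,
// so the later call-update walk can point them at the same callee clone as N's
// original call rather than leaving them targeting the default callee.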
4598
4599 // Walk all functions for which we saw calls with memprof metadata, and handle
4600 // cloning for each of its calls.
4601 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4602 FuncInfo OrigFunc(Func);
4603 // Map from each clone number of OrigFunc to information about that function
4604 // clone (the function clone FuncInfo and call remappings). The index into
4605 // the vector is the clone number, as function clones are created and
4606 // numbered sequentially.
4607 std::vector<FuncCloneInfo> FuncCloneInfos;
4608 for (auto &Call : CallsWithMetadata) {
4609 ContextNode *Node = getNodeForInst(Call);
4610 // Skip call if we do not have a node for it (all uses of its stack ids
4611 // were either on inlined chains or pruned from the MIBs), or if we did
4612 // not create any clones for it.
4613 if (!Node || Node->Clones.empty())
4614 continue;
4615 assert(Node->hasCall() &&
4616 "Not having a call should have prevented cloning");
4617
4618 // Track the assignment of function clones to clones of the current
4619 // callsite Node being handled.
4620 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4621
4622 // Assign callsite version CallsiteClone to function version FuncClone,
4623 // and also assign (possibly cloned) Call to CallsiteClone.
4624 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4625 CallInfo &Call,
4626 ContextNode *CallsiteClone,
4627 bool IsAlloc) {
4628 // Record the clone of callsite node assigned to this function clone.
4629 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4630
4631 assert(FuncCloneInfos.size() > FuncClone.cloneNo());
4632 DenseMap<CallInfo, CallInfo> &CallMap =
4633 FuncCloneInfos[FuncClone.cloneNo()].CallMap;
4634 CallInfo CallClone(Call);
4635 if (auto It = CallMap.find(Call); It != CallMap.end())
4636 CallClone = It->second;
4637 CallsiteClone->setCall(CallClone);
4638 // Need to do the same for all matching calls.
4639 for (auto &MatchingCall : Node->MatchingCalls) {
4640 CallInfo CallClone(MatchingCall);
4641 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4642 CallClone = It->second;
4643 // Updates the call in the list.
4644 MatchingCall = CallClone;
4645 }
4646 };
4647
4648 // Invokes moveEdgeToNewCalleeClone which creates a new clone, and then
4649 // performs the necessary fixups (removing none type edges, and
4650 // importantly, propagating any function call assignment of the original
4651 // node to the new clone).
4652 auto MoveEdgeToNewCalleeCloneAndSetUp =
4653 [&](const std::shared_ptr<ContextEdge> &Edge) {
4654 ContextNode *OrigCallee = Edge->Callee;
4655 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4656 removeNoneTypeCalleeEdges(NewClone);
4657 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4658 // If the original Callee was already assigned to call a specific
4659 // function version, make sure its new clone is assigned to call
4660 // that same function clone.
4661 if (CallsiteToCalleeFuncCloneMap.count(OrigCallee))
4662 RecordCalleeFuncOfCallsite(
4663 NewClone, CallsiteToCalleeFuncCloneMap[OrigCallee]);
4664 return NewClone;
4665 };
4666
4667 // Keep track of the clones of callsite Node that need to be assigned to
4668 // function clones. This list may be expanded in the loop body below if we
4669 // find additional cloning is required.
4670 std::deque<ContextNode *> ClonesWorklist;
4671 // Ignore original Node if we moved all of its contexts to clones.
4672 if (!Node->emptyContextIds())
4673 ClonesWorklist.push_back(Node);
4674 llvm::append_range(ClonesWorklist, Node->Clones);
4675
4676 // Now walk through all of the clones of this callsite Node that we need,
4677 // and determine the assignment to a corresponding clone of the current
4678 // function (creating new function clones as needed).
4679 unsigned NodeCloneCount = 0;
4680 while (!ClonesWorklist.empty()) {
4681 ContextNode *Clone = ClonesWorklist.front();
4682 ClonesWorklist.pop_front();
4683 NodeCloneCount++;
4684 if (VerifyNodes)
4685 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4686
4687 // Need to create a new function clone if we have more callsite clones
4688 // than existing function clones, which would have been assigned to an
4689 // earlier clone in the list (we assign callsite clones to function
4690 // clones greedily).
4691 if (FuncCloneInfos.size() < NodeCloneCount) {
4692 // If this is the first callsite copy, assign to original function.
4693 if (NodeCloneCount == 1) {
4694 // Since FuncCloneInfos is empty in this case, no clones have
4695 // been created for this function yet, and no callers should have
4696 // been assigned a function clone for this callee node yet.
4697 assert(llvm::none_of(
4698 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4699 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4700 }));
4701 // Initialize with empty call map, assign Clone to original function
4702 // and its callers, and skip to the next clone.
4703 FuncCloneInfos.push_back(
4704 {OrigFunc, DenseMap<CallInfo, CallInfo>()});
4705 AssignCallsiteCloneToFuncClone(
4706 OrigFunc, Call, Clone,
4707 AllocationCallToContextNodeMap.count(Call));
4708 for (auto &CE : Clone->CallerEdges) {
4709 // Ignore any caller that does not have a recorded callsite Call.
4710 if (!CE->Caller->hasCall())
4711 continue;
4712 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
4713 }
4714 continue;
4715 }
4716
4717 // First locate which copy of OrigFunc to clone again. If a caller
4718 // of this callsite clone was already assigned to call a particular
4719 // function clone, we need to redirect all of those callers to the
4720 // new function clone, and update their other callees within this
4721 // function.
4722 FuncInfo PreviousAssignedFuncClone;
4723 auto EI = llvm::find_if(
4724 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4725 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4726 });
4727 bool CallerAssignedToCloneOfFunc = false;
4728 if (EI != Clone->CallerEdges.end()) {
4729 const std::shared_ptr<ContextEdge> &Edge = *EI;
4730 PreviousAssignedFuncClone =
4731 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4732 CallerAssignedToCloneOfFunc = true;
4733 }
4734
4735 // Clone function and save it along with the CallInfo map created
4736 // during cloning in the FuncCloneInfos.
4737 DenseMap<CallInfo, CallInfo> NewCallMap;
4738 unsigned CloneNo = FuncCloneInfos.size();
4739 assert(CloneNo > 0 && "Clone 0 is the original function, which "
4740 "should already exist in the map");
4741 FuncInfo NewFuncClone = cloneFunctionForCallsite(
4742 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
4743 FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
4744 FunctionClonesAnalysis++;
4745 Changed = true;
4746
4747 // If no caller callsites were already assigned to a clone of this
4748 // function, we can simply assign this clone to the new func clone
4749 // and update all callers to it, then skip to the next clone.
4750 if (!CallerAssignedToCloneOfFunc) {
4751 AssignCallsiteCloneToFuncClone(
4752 NewFuncClone, Call, Clone,
4753 AllocationCallToContextNodeMap.count(Call));
4754 for (auto &CE : Clone->CallerEdges) {
4755 // Ignore any caller that does not have a recorded callsite Call.
4756 if (!CE->Caller->hasCall())
4757 continue;
4758 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4759 }
4760 continue;
4761 }
4762
4763 // We may need to do additional node cloning in this case.
4764 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
4765 // that were previously assigned to call PreviousAssignedFuncClone,
4766 // to record that they now call NewFuncClone.
4767 // The none type edge removal may remove some of this Clone's caller
4768 // edges, if it is reached via another of its caller's callees.
4769 // Iterate over a copy and skip any that were removed.
4770 auto CallerEdges = Clone->CallerEdges;
4771 for (auto CE : CallerEdges) {
4772 // Skip any that have been removed on an earlier iteration.
4773 if (CE->isRemoved()) {
4774 assert(!is_contained(Clone->CallerEdges, CE));
4775 continue;
4776 }
4777 assert(CE);
4778 // Ignore any caller that does not have a recorded callsite Call.
4779 if (!CE->Caller->hasCall())
4780 continue;
4781
4782 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
4783 // We subsequently fall through to later handling that
4784 // will perform any additional cloning required for
4785 // callers that were calling other function clones.
4786 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
4787 PreviousAssignedFuncClone)
4788 continue;
4789
4790 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4791
4792 // If we are cloning a function that was already assigned to some
4793 // callers, then essentially we are creating new callsite clones
4794 // of the other callsites in that function that are reached by those
4795 // callers. Clone the other callees of the current callsite's caller
4796 // that were already assigned to PreviousAssignedFuncClone
4797 // accordingly. This is important since we subsequently update the
4798 // calls from the nodes in the graph and their assignments to callee
4799 // functions recorded in CallsiteToCalleeFuncCloneMap.
4800 // The none type edge removal may remove some of this caller's
4801 // callee edges, if it is reached via another of its callees.
4802 // Iterate over a copy and skip any that were removed.
4803 auto CalleeEdges = CE->Caller->CalleeEdges;
4804 for (auto CalleeEdge : CalleeEdges) {
4805 // Skip any that have been removed on an earlier iteration when
4806 // cleaning up newly None type callee edges.
4807 if (CalleeEdge->isRemoved()) {
4808 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
4809 continue;
4810 }
4811 assert(CalleeEdge);
4812 ContextNode *Callee = CalleeEdge->Callee;
4813 // Skip the current callsite, we are looking for other
4814 // callsites Caller calls, as well as any that does not have a
4815 // recorded callsite Call.
4816 if (Callee == Clone || !Callee->hasCall())
4817 continue;
4818 // Skip direct recursive calls. We don't need/want to clone the
4819 // caller node again, and this loop would not behave as expected if
4820 // we tried.
4821 if (Callee == CalleeEdge->Caller)
4822 continue;
4823 ContextNode *NewClone =
4824 MoveEdgeToNewCalleeCloneAndSetUp(CalleeEdge);
4825 // Moving the edge may have resulted in some none type
4826 // callee edges on the original Callee.
4827 removeNoneTypeCalleeEdges(Callee);
4828 // Update NewClone with the new Call clone of this callsite's Call
4829 // created for the new function clone created earlier.
4830 // Recall that we have already ensured when building the graph
4831 // that each caller can only call callsites within the same
4832 // function, so we are guaranteed that Callee Call is in the
4833 // current OrigFunc.
4834 // CallMap is set up as indexed by original Call at clone 0.
4835 CallInfo OrigCall(Callee->getOrigNode()->Call);
4836 OrigCall.setCloneNo(0);
4837 DenseMap<CallInfo, CallInfo> &CallMap =
4838 FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
4839 assert(CallMap.count(OrigCall));
4840 CallInfo NewCall(CallMap[OrigCall]);
4841 assert(NewCall);
4842 NewClone->setCall(NewCall);
4843 // Need to do the same for all matching calls.
4844 for (auto &MatchingCall : NewClone->MatchingCalls) {
4845 CallInfo OrigMatchingCall(MatchingCall);
4846 OrigMatchingCall.setCloneNo(0);
4847 assert(CallMap.count(OrigMatchingCall));
4848 CallInfo NewCall(CallMap[OrigMatchingCall]);
4849 assert(NewCall);
4850 // Updates the call in the list.
4851 MatchingCall = NewCall;
4852 }
4853 }
4854 }
4855 // Fall through to handling below to perform the recording of the
4856 // function for this callsite clone. This enables handling of cases
4857 // where the callers were assigned to different clones of a function.
4858 }
4859
4860 auto FindFirstAvailFuncClone = [&]() {
4861 // Find first function in FuncCloneInfos without an assigned
4862 // clone of this callsite Node. We should always have one
4863 // available at this point due to the earlier cloning when the
4864 // FuncCloneInfos size was smaller than the clone number.
4865 for (auto &CF : FuncCloneInfos) {
4866 if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
4867 return CF.FuncClone;
4868 }
4869 llvm_unreachable(
4870 "Expected an available func clone for this callsite clone");
4871 };
4872
4873 // See if we can use existing function clone. Walk through
4874 // all caller edges to see if any have already been assigned to
4875 // a clone of this callsite's function. If we can use it, do so. If not,
4876 // because that function clone is already assigned to a different clone
4877 // of this callsite, then we need to clone again.
4878 // Basically, this checking is needed to handle the case where different
4879 // caller functions/callsites may need versions of this function
4880 // containing different mixes of callsite clones across the different
4881 // callsites within the function. If that happens, we need to create
4882 // additional function clones to handle the various combinations.
4883 //
4884 // Keep track of any new clones of this callsite created by the
4885 // following loop, as well as any existing clone that we decided to
4886 // assign this clone to.
4887 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
4888 FuncInfo FuncCloneAssignedToCurCallsiteClone;
4889 // Iterate over a copy of Clone's caller edges, since we may need to
4890 // remove edges in the moveEdgeTo* methods, and this simplifies the
4891 // handling and makes it less error-prone.
4892 auto CloneCallerEdges = Clone->CallerEdges;
4893 for (auto &Edge : CloneCallerEdges) {
4894 // Skip removed edges (due to direct recursive edges updated when
4895 // updating callee edges when moving an edge and subsequently
4896 // removed by call to removeNoneTypeCalleeEdges on the Clone).
4897 if (Edge->isRemoved())
4898 continue;
4899 // Ignore any caller that does not have a recorded callsite Call.
4900 if (!Edge->Caller->hasCall())
4901 continue;
4902 // If this caller already assigned to call a version of OrigFunc, need
4903 // to ensure we can assign this callsite clone to that function clone.
4904 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
4905 FuncInfo FuncCloneCalledByCaller =
4906 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4907 // First we need to confirm that this function clone is available
4908 // for use by this callsite node clone.
4909 //
4910 // While FuncCloneToCurNodeCloneMap is built only for this Node and
4911 // its callsite clones, one of those callsite clones X could have
4912 // been assigned to the same function clone called by Edge's caller
4913 // - if Edge's caller calls another callsite within Node's original
4914 // function, and that callsite has another caller reaching clone X.
4915 // We need to clone Node again in this case.
4916 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
4917 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
4918 Clone) ||
4919 // Detect when we have multiple callers of this callsite that
4920 // have already been assigned to specific, and different, clones
4921 // of OrigFunc (due to other unrelated callsites in Func they
4922 // reach via call contexts). Is this Clone of callsite Node
4923 // assigned to a different clone of OrigFunc? If so, clone Node
4924 // again.
4925 (FuncCloneAssignedToCurCallsiteClone &&
4926 FuncCloneAssignedToCurCallsiteClone !=
4927 FuncCloneCalledByCaller)) {
4928 // We need to use a different newly created callsite clone, in
4929 // order to assign it to another new function clone on a
4930 // subsequent iteration over the Clones array (adjusted below).
4931 // Note we specifically do not reset the
4932 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
4933 // when this new clone is processed later we know which version of
4934 // the function to copy (so that other callsite clones we have
4935 // assigned to that function clone are properly cloned over). See
4936 // comments in the function cloning handling earlier.
4937
4938 // Check if we already have cloned this callsite again while
4939 // walking through caller edges, for a caller calling the same
4940 // function clone. If so, we can move this edge to that new clone
4941 // rather than creating yet another new clone.
4942 if (FuncCloneToNewCallsiteCloneMap.count(
4943 FuncCloneCalledByCaller)) {
4944 ContextNode *NewClone =
4945 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
4946 moveEdgeToExistingCalleeClone(Edge, NewClone);
4947 // Cleanup any none type edges cloned over.
4948 removeNoneTypeCalleeEdges(NewClone);
4949 } else {
4950 // Create a new callsite clone.
4951 ContextNode *NewClone = MoveEdgeToNewCalleeCloneAndSetUp(Edge);
4952 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
4953 NewClone;
4954 // Add to list of clones and process later.
4955 ClonesWorklist.push_back(NewClone);
4956 }
4957 // Moving the caller edge may have resulted in some none type
4958 // callee edges.
4959 removeNoneTypeCalleeEdges(Clone);
4960 // We will handle the newly created callsite clone in a subsequent
4961 // iteration over this Node's Clones.
4962 continue;
4963 }
4964
4965 // Otherwise, we can use the function clone already assigned to this
4966 // caller.
4967 if (!FuncCloneAssignedToCurCallsiteClone) {
4968 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
4969 // Assign Clone to FuncCloneCalledByCaller
4970 AssignCallsiteCloneToFuncClone(
4971 FuncCloneCalledByCaller, Call, Clone,
4972 AllocationCallToContextNodeMap.count(Call));
4973 } else
4974 // Don't need to do anything - callsite is already calling this
4975 // function clone.
4976 assert(FuncCloneAssignedToCurCallsiteClone ==
4977 FuncCloneCalledByCaller);
4978
4979 } else {
4980 // We have not already assigned this caller to a version of
4981 // OrigFunc. Do the assignment now.
4982
4983 // First check if we have already assigned this callsite clone to a
4984 // clone of OrigFunc for another caller during this iteration over
4985 // its caller edges.
4986 if (!FuncCloneAssignedToCurCallsiteClone) {
4987 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
4988 assert(FuncCloneAssignedToCurCallsiteClone);
4989 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
4990 AssignCallsiteCloneToFuncClone(
4991 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4992 AllocationCallToContextNodeMap.count(Call));
4993 } else
4994 assert(FuncCloneToCurNodeCloneMap
4995 [FuncCloneAssignedToCurCallsiteClone] == Clone);
4996 // Update callers to record function version called.
4997 RecordCalleeFuncOfCallsite(Edge->Caller,
4998 FuncCloneAssignedToCurCallsiteClone);
4999 }
5000 }
5001 // If we didn't assign a function clone to this callsite clone yet, e.g.
5002 // none of its callers has a non-null call, do the assignment here.
5003 // We want to ensure that every callsite clone is assigned to some
5004 // function clone, so that the call updates below work as expected.
5005 // In particular if this is the original callsite, we want to ensure it
5006 // is assigned to the original function, otherwise the original function
5007 // will appear available for assignment to other callsite clones,
5008 // leading to unintended effects. For one, the unknown and not updated
5009 // callers will call into cloned paths leading to the wrong hints,
5010 // because they still call the original function (clone 0). Also,
5011 // because all callsites start out as being clone 0 by default, we can't
5012 // easily distinguish between callsites explicitly assigned to clone 0
5013 // vs those never assigned, which can lead to multiple updates of the
5014 // calls when invoking updateCall below, with mismatched clone values.
5015 // TODO: Add a flag to the callsite nodes or some other mechanism to
5016 // better distinguish and identify callsite clones that are not getting
5017 // assigned to function clones as expected.
5018 if (!FuncCloneAssignedToCurCallsiteClone) {
5019 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5020 assert(FuncCloneAssignedToCurCallsiteClone &&
5021 "No available func clone for this callsite clone");
5022 AssignCallsiteCloneToFuncClone(
5023 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5024 /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
5025 }
5026 }
5027 if (VerifyCCG) {
5028 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
5029 for (const auto &PE : Node->CalleeEdges)
5030 checkEdge<DerivedCCG, FuncTy, CallTy>(PE);
5031 for (const auto &CE : Node->CallerEdges)
5032 checkEdge<DerivedCCG, FuncTy, CallTy>(CE);
5033 for (auto *Clone : Node->Clones) {
5034 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
5035 for (const auto &PE : Clone->CalleeEdges)
5036 checkEdge<DerivedCCG, FuncTy, CallTy>(PE);
5037 for (const auto &CE : Clone->CallerEdges)
5038 checkEdge<DerivedCCG, FuncTy, CallTy>(CE);
5039 }
5040 }
5041 }
5042
5043 if (FuncCloneInfos.size() < 2)
5044 continue;
5045
5046 // In this case there is more than just the original function copy.
5047 // Record call clones of any callsite nodes in the function that did not
5048 // themselves get cloned for all of the function clones.
5049 for (auto &Call : CallsWithMetadata) {
5050 ContextNode *Node = getNodeForInst(Call);
5051 if (!Node || !Node->hasCall() || Node->emptyContextIds())
5052 continue;
5053 // If Node has enough clones already to cover all function clones, we can
5054 // skip it. Need to add one for the original copy.
5055 // Use >= in case there were clones that were skipped due to having empty
5056 // context ids
5057 if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
5058 continue;
5059 // First collect all function clones we cloned this callsite node for.
5060 // They may not be sequential, e.g. due to clones skipped for having empty context ids.
5061 DenseSet<unsigned> NodeCallClones;
5062 for (auto *C : Node->Clones)
5063 NodeCallClones.insert(C->Call.cloneNo());
5064 unsigned I = 0;
5065 // Now check all the function clones.
5066 for (auto &FC : FuncCloneInfos) {
5067 // Function clones should be sequential.
5068 assert(FC.FuncClone.cloneNo() == I);
5069 // Skip the first clone which got the original call.
5070 // Also skip any other clones created for this Node.
5071 if (++I == 1 || NodeCallClones.contains(I)) {
5072 continue;
5073 }
5074 // Record the call clones created for this callsite in this function
5075 // clone.
5076 auto &CallVector = UnassignedCallClones[Node][I];
5077 DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
5078 if (auto It = CallMap.find(Call); It != CallMap.end()) {
5079 CallInfo CallClone = It->second;
5080 CallVector.push_back(CallClone);
5081 } else {
5082 // All but the original clone (skipped earlier) should have an entry
5083 // for all calls.
5084 assert(false && "Expected to find call in CallMap");
5085 }
5086 // Need to do the same for all matching calls.
5087 for (auto &MatchingCall : Node->MatchingCalls) {
5088 if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) {
5089 CallInfo CallClone = It->second;
5090 CallVector.push_back(CallClone);
5091 } else {
5092 // All but the original clone (skipped earlier) should have an entry
5093 // for all calls.
5094 assert(false && "Expected to find call in CallMap");
5095 }
5096 }
5097 }
5098 }
5099 }
5100
5101 uint8_t BothTypes =
5102 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
5103
5104 auto UpdateCalls = [&](ContextNode *Node,
5105 DenseSet<const ContextNode *> &Visited,
5106 auto &&UpdateCalls) {
5107 auto Inserted = Visited.insert(Node);
5108 if (!Inserted.second)
5109 return;
5110
5111 for (auto *Clone : Node->Clones)
5112 UpdateCalls(Clone, Visited, UpdateCalls);
5113
5114 for (auto &Edge : Node->CallerEdges)
5115 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
5116
5117 // Skip if either no call to update, or if we ended up with no context ids
5118 // (we moved all edges onto other clones).
5119 if (!Node->hasCall() || Node->emptyContextIds())
5120 return;
5121
5122 if (Node->IsAllocation) {
5123 auto AT = allocTypeToUse(Node->AllocTypes);
5124 // If the allocation type is ambiguous, and more aggressive hinting
5125 // has been enabled via the MinClonedColdBytePercent flag, see if this
5126 // allocation should be hinted cold anyway because its fraction cold bytes
5127 // allocated is at least the given threshold.
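// For example, with MinClonedColdBytePercent set to 80, an ambiguous
// allocation where 90 of the 100 profiled bytes across its remaining contexts
// were cold satisfies 90 * 100 >= 100 * 80 in the check below, so it is hinted
// cold despite the mixed allocation types.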
5128 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
5129 !ContextIdToContextSizeInfos.empty()) {
5130 uint64_t TotalCold = 0;
5131 uint64_t Total = 0;
5132 for (auto Id : Node->getContextIds()) {
5133 auto TypeI = ContextIdToAllocationType.find(Id);
5134 assert(TypeI != ContextIdToAllocationType.end());
5135 auto CSI = ContextIdToContextSizeInfos.find(Id);
5136 if (CSI != ContextIdToContextSizeInfos.end()) {
5137 for (auto &Info : CSI->second) {
5138 Total += Info.TotalSize;
5139 if (TypeI->second == AllocationType::Cold)
5140 TotalCold += Info.TotalSize;
5141 }
5142 }
5143 }
5144 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
5145 AT = AllocationType::Cold;
5146 }
5147 updateAllocationCall(Node->Call, AT);
5148 assert(Node->MatchingCalls.empty());
5149 return;
5150 }
5151
5152 if (!CallsiteToCalleeFuncCloneMap.count(Node))
5153 return;
5154
5155 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
5156 updateCall(Node->Call, CalleeFunc);
5157 // Update all the matching calls as well.
5158 for (auto &Call : Node->MatchingCalls)
5159 updateCall(Call, CalleeFunc);
5160
5161 // Now update all calls recorded earlier that are still in function clones
5162 // which don't have a clone of this callsite node.
5163 if (!UnassignedCallClones.contains(Node))
5164 return;
5165 DenseSet<unsigned> NodeCallClones;
5166 for (auto *C : Node->Clones)
5167 NodeCallClones.insert(C->Call.cloneNo());
5168 // Note that we already confirmed Node is in this map a few lines above.
5169 auto &ClonedCalls = UnassignedCallClones[Node];
5170 for (auto &[CloneNo, CallVector] : ClonedCalls) {
5171 // Should start at 1 as we never create an entry for original node.
5172 assert(CloneNo > 0);
5173 // If we subsequently created a clone, skip this one.
5174 if (NodeCallClones.contains(CloneNo))
5175 continue;
5176 // Use the original Node's CalleeFunc.
5177 for (auto &Call : CallVector)
5178 updateCall(Call, CalleeFunc);
5179 }
5180 };
5181
5182 // Performs DFS traversal starting from allocation nodes to update calls to
5183 // reflect cloning decisions recorded earlier. For regular LTO this will
5184 // update the actual calls in the IR to call the appropriate function clone
5185 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
5186 // are recorded in the summary entries.
5187 DenseSet<const ContextNode *> Visited;
5188 for (auto &Entry : AllocationCallToContextNodeMap)
5189 UpdateCalls(Entry.second, Visited, UpdateCalls);
5190
5191 return Changed;
5192}
5193
5194// Compute a SHA1 hash of the callsite and alloc version information of clone I
5195// in the summary, to use in detection of duplicate clones.
5196static uint64_t ComputeHash(const FunctionSummary *FS, unsigned I) {
5197 SHA1 Hasher;
5198 // Update hash with any callsites that call non-default (non-zero) callee
5199 // versions.
5200 for (auto &SN : FS->callsites()) {
5201 // In theory all callsites and allocs in this function should have the same
5202 // number of clone entries, but handle any discrepancies gracefully below
5203 // for NDEBUG builds.
5204 assert(
5205 SN.Clones.size() > I &&
5206 "Callsite summary has fewer entries than other summaries in function");
5207 if (SN.Clones.size() <= I || !SN.Clones[I])
5208 continue;
5209 uint8_t Data[sizeof(SN.Clones[I])];
5210 support::endian::write32le(Data, SN.Clones[I]);
5211 Hasher.update(Data);
5212 }
5213 // Update hash with any allocs that have non-default (non-None) hints.
5214 for (auto &AN : FS->allocs()) {
5215 // In theory all callsites and allocs in this function should have the same
5216 // number of clone entries, but handle any discrepancies gracefully below
5217 // for NDEBUG builds.
5218 assert(AN.Versions.size() > I &&
5219 "Alloc summary has fewer entries than other summaries in function");
5220 if (AN.Versions.size() <= I ||
5221 (AllocationType)AN.Versions[I] == AllocationType::None)
5222 continue;
5223 Hasher.update(ArrayRef<uint8_t>(&AN.Versions[I], 1));
5224 }
5225 return support::endian::read64le(Hasher.result().data());
5226}
5227
5229 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
5231 &FuncToAliasMap,
5232 FunctionSummary *FS) {
5233 auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) {
5234 // We might have created this when adjusting a callsite in another
5235 // function. It should be a declaration.
5236 assert(DeclGV->isDeclaration());
5237 NewGV->takeName(DeclGV);
5238 DeclGV->replaceAllUsesWith(NewGV);
5239 DeclGV->eraseFromParent();
5240 };
5241
5242 // Handle aliases to this function, and create analogous alias clones to the
5243 // provided clone of this function.
5244 auto CloneFuncAliases = [&](Function *NewF, unsigned I) {
5245 if (!FuncToAliasMap.count(&F))
5246 return;
5247 for (auto *A : FuncToAliasMap[&F]) {
5248 std::string AliasName = getMemProfFuncName(A->getName(), I);
5249 auto *PrevA = M.getNamedAlias(AliasName);
5250 auto *NewA = GlobalAlias::create(A->getValueType(),
5251 A->getType()->getPointerAddressSpace(),
5252 A->getLinkage(), AliasName, NewF);
5253 NewA->copyAttributesFrom(A);
5254 if (PrevA)
5255 TakeDeclNameAndReplace(PrevA, NewA);
5256 }
5257 };
5258
5259 // The first "clone" is the original copy, we should only call this if we
5260 // needed to create new clones.
5261 assert(NumClones > 1);
5263 VMaps.reserve(NumClones - 1);
5264 FunctionsClonedThinBackend++;
5265
5266 // Map of hash of callsite/alloc versions to the instantiated function clone
5267 // (possibly the original) implementing those calls. Used to avoid
5268 // instantiating duplicate function clones.
5269 // FIXME: Ideally the thin link would not generate such duplicate clones to
5270 // start with, but right now it happens due to phase ordering in the function
5271 // assignment and the new clones it may produce. We simply make each
5272 // duplicate an alias to the matching instantiated clone recorded in the map
5273 // (except for available_externally which are made declarations as they would
5274 // be aliases in the prevailing module, and available_externally aliases are
5275 // not well supported right now).
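// For illustration: if clones 1 and 2 of a function end up with identical
// per-callsite callee versions and allocation hints, the hashes computed by
// ComputeHash match, and clone 2 is emitted as an alias to clone 1 (or as a
// bare declaration in the available_externally case) instead of a duplicate
// body.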
5276 DenseMap<uint64_t, Function *> HashToFunc;
5277
5278 // Save the hash of the original function version.
5279 HashToFunc[ComputeHash(FS, 0)] = &F;
5280
5281 for (unsigned I = 1; I < NumClones; I++) {
5282 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
5283 std::string Name = getMemProfFuncName(F.getName(), I);
5284 auto Hash = ComputeHash(FS, I);
5285 // If this clone would duplicate a previously seen clone, don't generate the
5286 // duplicate clone body, just make an alias to satisfy any (potentially
5287 // cross-module) references.
5288 if (HashToFunc.contains(Hash)) {
5289 FunctionCloneDuplicatesThinBackend++;
5290 auto *Func = HashToFunc[Hash];
5291 if (Func->hasAvailableExternallyLinkage()) {
5292 // Skip these as EliminateAvailableExternallyPass does not handle
5293 // available_externally aliases correctly and we end up with an
5294 // available_externally alias to a declaration. Just create a
5295 // declaration for now as we know we will have a definition in another
5296 // module.
5297 auto Decl = M.getOrInsertFunction(Name, Func->getFunctionType());
5298 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5299 << "created clone decl " << ore::NV("Decl", Decl.getCallee()));
5300 continue;
5301 }
5302 auto *PrevF = M.getFunction(Name);
5303 auto *Alias = GlobalAlias::create(Name, Func);
5304 if (PrevF)
5305 TakeDeclNameAndReplace(PrevF, Alias);
5306 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5307 << "created clone alias " << ore::NV("Alias", Alias));
5308
5309 // Now handle aliases to this function, and clone those as well.
5310 CloneFuncAliases(Func, I);
5311 continue;
5312 }
5313 auto *NewF = CloneFunction(&F, *VMaps.back());
5314 HashToFunc[Hash] = NewF;
5315 FunctionClonesThinBackend++;
5316 // Strip memprof and callsite metadata from clone as they are no longer
5317 // needed.
5318 for (auto &BB : *NewF) {
5319 for (auto &Inst : BB) {
5320 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
5321 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
5322 }
5323 }
5324 auto *PrevF = M.getFunction(Name);
5325 if (PrevF)
5326 TakeDeclNameAndReplace(PrevF, NewF);
5327 else
5328 NewF->setName(Name);
5329 updateSubprogramLinkageName(NewF, Name);
5330 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5331 << "created clone " << ore::NV("NewFunction", NewF));
5332
5333 // Now handle aliases to this function, and clone those as well.
5334 CloneFuncAliases(NewF, I);
5335 }
5336 return VMaps;
5337}
5338
5339// Locate the summary for F. This is complicated by the fact that it might
5340// have been internalized or promoted.
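// For illustration: a local function promoted during ThinLTO gets a
// ".llvm.<hash>" style suffix appended to its name, and an internalized
// symbol's summary is keyed by a GUID formed from its original local name plus
// its source file. The lookups below try the current GUID first and then fall
// back to recomputing the original GUID using the thinlto_src_file metadata.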
5341static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
5342 const ModuleSummaryIndex *ImportSummary,
5343 const Function *CallingFunc = nullptr) {
5344 // FIXME: Ideally we would retain the original GUID in some fashion on the
5345 // function (e.g. as metadata), but for now do our best to locate the
5346 // summary without that information.
5347 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
5348 if (!TheFnVI)
5349 // See if the function was internalized, by checking the index directly with
5350 // the original name (this avoids the name adjustment done by getGUID() for
5351 // internal symbols).
5352 TheFnVI = ImportSummary->getValueInfo(
5354 if (TheFnVI)
5355 return TheFnVI;
5356 // Now query with the original name before any promotion was performed.
5357 StringRef OrigName =
5358 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
5359 // When this pass is enabled, we always add thinlto_src_file provenance
5360 // metadata to imported function definitions, which allows us to recreate the
5361 // original internal symbol's GUID.
5362 auto SrcFileMD = F.getMetadata("thinlto_src_file");
5363 // If this is a call to an imported/promoted local for which we didn't import
5364 // the definition, the metadata will not exist on the declaration. However,
5365 // since we are doing this early, before any inlining in the LTO backend, we
5366 // can simply look at the metadata on the calling function which must have
5367 // been from the same module if F was an internal symbol originally.
5368 if (!SrcFileMD && F.isDeclaration()) {
5369 // We would only call this for a declaration for a direct callsite, in which
5370 // case the caller would have provided the calling function pointer.
5371 assert(CallingFunc);
5372 SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
5373 // If this is a promoted local (OrigName != F.getName()), since this is a
5374 // declaration, it must be imported from a different module and therefore we
5375 // should always find the metadata on its calling function. Any call to a
5376 // promoted local that came from this module should still be a definition.
5377 assert(SrcFileMD || OrigName == F.getName());
5378 }
5379 StringRef SrcFile = M.getSourceFileName();
5380 if (SrcFileMD)
5381 SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
5382 std::string OrigId = GlobalValue::getGlobalIdentifier(
5383 OrigName, GlobalValue::InternalLinkage, SrcFile);
5384 TheFnVI = ImportSummary->getValueInfo(
5385 GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5386 // Internal func in original module may have gotten a numbered suffix if we
5387 // imported an external function with the same name. This happens
5388 // automatically during IR linking for naming conflicts. It would have to
5389 // still be internal in that case (otherwise it would have been renamed on
5390 // promotion in which case we wouldn't have a naming conflict).
5391 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5392 F.getName().contains('.')) {
5393 OrigName = F.getName().rsplit('.').first;
5394 OrigId = GlobalValue::getGlobalIdentifier(
5395 OrigName, GlobalValue::InternalLinkage, SrcFile);
5396 TheFnVI = ImportSummary->getValueInfo(
5397 GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5398 }
5399 // The only way we may not have a VI is if this is a declaration created for
5400 // an imported reference. For distributed ThinLTO we may not have a VI for
5401 // such declarations in the distributed summary.
5402 assert(TheFnVI || F.isDeclaration());
5403 return TheFnVI;
5404}
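
The core of the fallback lookup above is recomputing the GUID that an originally-internal symbol had before promotion. The following small sketch isolates just that step, using the same GlobalValue helpers as the function above; guidForOriginalLocal is a hypothetical name and the snippet is illustrative only.

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/GlobalValue.h"
#include <string>

using namespace llvm;

// Recompute the summary GUID an originally-internal symbol would have had
// before promotion: rebuild its global identifier from the original name and
// the source file (from thinlto_src_file), then hash it as if external.
static GlobalValue::GUID guidForOriginalLocal(StringRef OrigName,
                                              StringRef SrcFile) {
  std::string OrigId = GlobalValue::getGlobalIdentifier(
      OrigName, GlobalValue::InternalLinkage, SrcFile);
  return GlobalValue::getGUIDAssumingExternalLinkage(OrigId);
}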
5405
5406bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5407 Module &M) {
5408 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5409 Symtab = std::make_unique<InstrProfSymtab>();
5410 // Don't add canonical names, to avoid mapping multiple functions to the
5411 // same symtab entry when they share a root name once "." suffixes are
5412 // stripped. If we picked the wrong one, this could lead to incorrect ICP
5413 // and calling a memprof clone that we don't actually create (linker unsats).
5414 // What this means is that the GUID of the function (or its PGOFuncName
5415 // metadata) *must* match that in the VP metadata to allow promotion.
5416 // In practice this should not be a limitation, since local functions should
5417 // have PGOFuncName metadata and global function names shouldn't need any
5418 // special handling (they should not get the ".llvm.*" suffix that the
5419 // canonicalization handling is attempting to strip).
5420 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5421 std::string SymtabFailure = toString(std::move(E));
5422 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5423 return false;
5424 }
5425 return true;
5426}
5427
5428#ifndef NDEBUG
5429// Sanity check that the MIB stack ids match between the summary and
5430// instruction metadata.
5431static void checkAllocContextIds(
5432 const AllocInfo &AllocNode, const MDNode *MemProfMD,
5433 const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
5434 const ModuleSummaryIndex *ImportSummary) {
5435 auto MIBIter = AllocNode.MIBs.begin();
5436 for (auto &MDOp : MemProfMD->operands()) {
5437 assert(MIBIter != AllocNode.MIBs.end());
5438 auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
5439 auto *MIBMD = cast<const MDNode>(MDOp);
5440 MDNode *StackMDNode = getMIBStackNode(MIBMD);
5441 assert(StackMDNode);
5442 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
5443 auto ContextIterBegin =
5444 StackContext.beginAfterSharedPrefix(CallsiteContext);
5445 // Skip the checking on the first iteration.
5446 uint64_t LastStackContextId =
5447 (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
5448 : 0;
5449 for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
5450 ++ContextIter) {
5451 // If this is a direct recursion, simply skip the duplicate
5452 // entries, to be consistent with how the summary ids were
5453 // generated during ModuleSummaryAnalysis.
5454 if (LastStackContextId == *ContextIter)
5455 continue;
5456 LastStackContextId = *ContextIter;
5457 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
5458 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5459 *ContextIter);
5460 StackIdIndexIter++;
5461 }
5462 MIBIter++;
5463 }
5464}
5465#endif
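
The direct-recursion handling in this check is equivalent to collapsing consecutive duplicate frames before comparing the metadata context against the summary's stack id indices; for example, a context {A, B, B, B, C} compares equal to a summary context {A, B, C}. A minimal sketch of that collapse over plain frame ids, using a hypothetical collapseDirectRecursion helper:

#include <cstdint>
#include <vector>

// Collapse consecutive duplicate frame ids, mirroring how direct recursion is
// skipped both here and when the stack id indices were generated during
// ModuleSummaryAnalysis.
static std::vector<uint64_t>
collapseDirectRecursion(const std::vector<uint64_t> &Frames) {
  std::vector<uint64_t> Out;
  for (uint64_t Id : Frames)
    if (Out.empty() || Out.back() != Id)
      Out.push_back(Id);
  return Out;
}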
5466
5467bool MemProfContextDisambiguation::applyImport(Module &M) {
5468 assert(ImportSummary);
5469 bool Changed = false;
5470
5471 // We also need to clone any aliases that reference cloned functions, because
5472 // the modified callsites may invoke via the alias. Keep track of the aliases
5473 // for each function.
5474 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5475 FuncToAliasMap;
5476 for (auto &A : M.aliases()) {
5477 auto *Aliasee = A.getAliaseeObject();
5478 if (auto *F = dyn_cast<Function>(Aliasee))
5479 FuncToAliasMap[F].insert(&A);
5480 }
5481
5482 if (!initializeIndirectCallPromotionInfo(M))
5483 return false;
5484
5485 for (auto &F : M) {
5486 if (F.isDeclaration() || isMemProfClone(F))
5487 continue;
5488
5489 OptimizationRemarkEmitter ORE(&F);
5490
5491 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5492 bool ClonesCreated = false;
5493 unsigned NumClonesCreated = 0;
5494 auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) {
5495 // We should at least have version 0 which is the original copy.
5496 assert(NumClones > 0);
5497 // If only one copy is needed, use the original.
5498 if (NumClones == 1)
5499 return;
5500 // If we already performed cloning of this function, confirm that the
5501 // requested number of clones matches (the thin link should ensure the
5502 // number of clones for each constituent callsite is consistent within
5503 // each function), before returning.
5504 if (ClonesCreated) {
5505 assert(NumClonesCreated == NumClones);
5506 return;
5507 }
5508 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS);
5509 // The first "clone" is the original copy, which doesn't have a VMap.
5510 assert(VMaps.size() == NumClones - 1);
5511 Changed = true;
5512 ClonesCreated = true;
5513 NumClonesCreated = NumClones;
5514 };
5515
5516 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5517 Function *CalledFunction, FunctionSummary *FS) {
5518 // Perform cloning if not yet done.
5519 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS);
5520
5521 assert(!isMemProfClone(*CalledFunction));
5522
5523 // Because we update the cloned calls by calling setCalledOperand (see
5524 // comment below), out of an abundance of caution make sure the called
5525 // function was actually the called operand (or its aliasee). We also
5526 // strip pointer casts when looking for calls (to match behavior during
5527 // summary generation); however, with opaque pointers this should not be
5528 // an issue in theory. Note we still clone the current function
5529 // (containing this call) above, as that could be needed for its callers.
5530 auto *GA = dyn_cast_or_null<GlobalAlias>(CB->getCalledOperand());
5531 if (CalledFunction != CB->getCalledOperand() &&
5532 (!GA || CalledFunction != GA->getAliaseeObject())) {
5533 SkippedCallsCloning++;
5534 return;
5535 }
5536 // Update the calls per the summary info.
5537 // Save orig name since it gets updated in the first iteration
5538 // below.
5539 auto CalleeOrigName = CalledFunction->getName();
5540 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5541 // If the VMap is empty, this clone was a duplicate of another and was
5542 // created as an alias or a declaration.
5543 if (J > 0 && VMaps[J - 1]->empty())
5544 continue;
5545 // Do nothing if this version calls the original version of its
5546 // callee.
5547 if (!StackNode.Clones[J])
5548 continue;
5549 auto NewF = M.getOrInsertFunction(
5550 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
5551 CalledFunction->getFunctionType());
5552 CallBase *CBClone;
5553 // Copy 0 is the original function.
5554 if (!J)
5555 CBClone = CB;
5556 else
5557 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5558 // Set the called operand directly instead of calling setCalledFunction,
5559 // as the latter mutates the function type on the call. In rare cases
5560 // we may have a slightly different type on a callee function
5561 // declaration due to it being imported from a different module with
5562 // incomplete types. We really just want to change the name of the
5563 // function to the clone, and not make any type changes.
5564 CBClone->setCalledOperand(NewF.getCallee());
5565 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5566 << ore::NV("Call", CBClone) << " in clone "
5567 << ore::NV("Caller", CBClone->getFunction())
5568 << " assigned to call function clone "
5569 << ore::NV("Callee", NewF.getCallee()));
5570 }
5571 };
5572
5573 // Locate the summary for F.
5574 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
5575 // If not found, this could be an imported local (see comment in
5576 // findValueInfoForFunc). Skip for now as it will be cloned in its original
5577 // module (where it would have been promoted to global scope so should
5578 // satisfy any reference in this module).
5579 if (!TheFnVI)
5580 continue;
5581
5582 auto *GVSummary =
5583 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
5584 if (!GVSummary) {
5585 // Must have been imported; use the summary which matches the definition
5586 // (there might be multiple summaries if this was a linkonce_odr).
5587 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
5588 assert(SrcModuleMD &&
5589 "enable-import-metadata is needed to emit thinlto_src_module");
5590 StringRef SrcModule =
5591 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
5592 for (auto &GVS : TheFnVI.getSummaryList()) {
5593 if (GVS->modulePath() == SrcModule) {
5594 GVSummary = GVS.get();
5595 break;
5596 }
5597 }
5598 assert(GVSummary && GVSummary->modulePath() == SrcModule);
5599 }
5600
5601 // If this was an imported alias skip it as we won't have the function
5602 // summary, and it should be cloned in the original module.
5603 if (isa<AliasSummary>(GVSummary))
5604 continue;
5605
5606 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
5607
5608 if (FS->allocs().empty() && FS->callsites().empty())
5609 continue;
5610
5611 auto SI = FS->callsites().begin();
5612 auto AI = FS->allocs().begin();
5613
5614 // To handle callsite infos synthesized for tail calls which have missing
5615 // frames in the profiled context, map callee VI to the synthesized callsite
5616 // info.
5617 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
5618 // Iterate the callsites for this function in reverse, since we place all
5619 // those synthesized for tail calls at the end.
5620 for (auto CallsiteIt = FS->callsites().rbegin();
5621 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
5622 auto &Callsite = *CallsiteIt;
5623 // Stop as soon as we see a non-synthesized callsite info (see comment
5624 // above loop). All the entries added for discovered tail calls have empty
5625 // stack ids.
5626 if (!Callsite.StackIdIndices.empty())
5627 break;
5628 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
5629 }
5630
5631 // Keeps track of needed ICP for the function.
5632 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
5633
5634 // Assume for now that the instructions are in the exact same order
5635 // as when the summary was created, but confirm this is correct by
5636 // matching the stack ids.
5637 for (auto &BB : F) {
5638 for (auto &I : BB) {
5639 auto *CB = dyn_cast<CallBase>(&I);
5640 // Same handling as when creating module summary.
5641 if (!mayHaveMemprofSummary(CB))
5642 continue;
5643
5644 auto *CalledValue = CB->getCalledOperand();
5645 auto *CalledFunction = CB->getCalledFunction();
5646 if (CalledValue && !CalledFunction) {
5647 CalledValue = CalledValue->stripPointerCasts();
5648 // Stripping pointer casts can reveal a called function.
5649 CalledFunction = dyn_cast<Function>(CalledValue);
5650 }
5651 // Check if this is an alias to a function. If so, get the
5652 // called aliasee for the checks below.
5653 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
5654 assert(!CalledFunction &&
5655 "Expected null called function in callsite for alias");
5656 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
5657 }
5658
5659 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
5660 I.getMetadata(LLVMContext::MD_callsite));
5661 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
5662
5663 // Include allocs that were already assigned a memprof function
5664 // attribute in the statistics.
5665 if (CB->getAttributes().hasFnAttr("memprof")) {
5666 assert(!MemProfMD);
5667 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
5668 ? AllocTypeColdThinBackend++
5669 : AllocTypeNotColdThinBackend++;
5670 OrigAllocsThinBackend++;
5671 AllocVersionsThinBackend++;
5672 if (!MaxAllocVersionsThinBackend)
5673 MaxAllocVersionsThinBackend = 1;
5674 continue;
5675 }
5676
5677 if (MemProfMD) {
5678 // Consult the next alloc node.
5679 assert(AI != FS->allocs().end());
5680 auto &AllocNode = *(AI++);
5681
5682#ifndef NDEBUG
5683 checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
5684 ImportSummary);
5685#endif
5686
5687 // Perform cloning if not yet done.
5688 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS);
5689
5690 OrigAllocsThinBackend++;
5691 AllocVersionsThinBackend += AllocNode.Versions.size();
5692 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
5693 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
5694
5695 // If there is only one version, that means we didn't end up considering
5696 // this function for cloning, and in that case the alloc will still have
5697 // None type or should have gotten the default NotCold. Skip it after
5698 // calling the clone helper, since that helper performs sanity checks
5699 // confirming we haven't yet decided that cloning is needed.
5700 // Note that we might have a single version that is cold due to the
5701 // MinClonedColdBytePercent heuristic; make sure we don't skip in that
5702 // case.
5703 if (AllocNode.Versions.size() == 1 &&
5704 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
5705 assert((AllocationType)AllocNode.Versions[0] ==
5706 AllocationType::NotCold ||
5707 (AllocationType)AllocNode.Versions[0] ==
5708 AllocationType::None);
5709 UnclonableAllocsThinBackend++;
5710 continue;
5711 }
5712
5713 // All versions should have a singular allocation type.
5714 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
5715 return Type == ((uint8_t)AllocationType::NotCold |
5716 (uint8_t)AllocationType::Cold);
5717 }));
5718
5719 // Update the allocation types per the summary info.
5720 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
5721 // If the VMap is empty, this clone was a duplicate of another and
5722 // was created as an alias or a declaration.
5723 if (J > 0 && VMaps[J - 1]->empty())
5724 continue;
5725 // Ignore any that didn't get an assigned allocation type.
5726 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
5727 continue;
5728 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
5729 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
5730 : AllocTypeNotColdThinBackend++;
5731 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
5732 auto A = llvm::Attribute::get(F.getContext(), "memprof",
5733 AllocTypeString);
5734 CallBase *CBClone;
5735 // Copy 0 is the original function.
5736 if (!J)
5737 CBClone = CB;
5738 else
5739 // Since VMaps are only created for new clones, we index with
5740 // clone J-1 (J==0 is the original clone and does not have a VMaps
5741 // entry).
5742 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5743 CBClone->addFnAttr(A);
5744 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
5745 << ore::NV("AllocationCall", CBClone) << " in clone "
5746 << ore::NV("Caller", CBClone->getFunction())
5747 << " marked with memprof allocation attribute "
5748 << ore::NV("Attribute", AllocTypeString));
5749 }
5750 } else if (!CallsiteContext.empty()) {
5751 if (!CalledFunction) {
5752#ifndef NDEBUG
5753 // We should have skipped inline assembly calls.
5754 auto *CI = dyn_cast<CallInst>(CB);
5755 assert(!CI || !CI->isInlineAsm());
5756#endif
5757 // We should have skipped direct calls via a Constant.
5758 assert(CalledValue && !isa<Constant>(CalledValue));
5759
5760 // This is an indirect call, see if we have profile information and
5761 // whether any clones were recorded for the profiled targets (that
5762 // we synthesized CallsiteInfo summary records for when building the
5763 // index).
5764 auto NumClones =
5765 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
5766
5767 // Perform cloning if not yet done. This is done here in case
5768 // we don't need to do ICP, but might need to clone this
5769 // function as it is the target of other cloned calls.
5770 if (NumClones)
5771 CloneFuncIfNeeded(NumClones, FS);
5772 }
5773
5774 else {
5775 // Consult the next callsite node.
5776 assert(SI != FS->callsites().end());
5777 auto &StackNode = *(SI++);
5778
5779#ifndef NDEBUG
5780 // Sanity check that the stack ids match between the summary and
5781 // instruction metadata.
5782 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
5783 for (auto StackId : CallsiteContext) {
5784 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
5785 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5786 StackId);
5787 StackIdIndexIter++;
5788 }
5789#endif
5790
5791 CloneCallsite(StackNode, CB, CalledFunction, FS);
5792 }
5793 } else if (CB->isTailCall() && CalledFunction) {
5794 // Locate the synthesized callsite info for the callee VI, if any was
5795 // created, and use that for cloning.
5796 ValueInfo CalleeVI =
5797 findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
5798 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
5799 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
5800 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
5801 CloneCallsite(Callsite->second, CB, CalledFunction, FS);
5802 }
5803 }
5804 }
5805 }
5806
5807 // Now do any promotion required for cloning.
5808 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
5809 }
5810
5811 // We skip some of the functions and instructions above, so remove all the
5812 // metadata in a single sweep here.
5813 for (auto &F : M) {
5814 // We can skip memprof clones because createFunctionClones already strips
5815 // the metadata from the newly created clones.
5816 if (F.isDeclaration() || isMemProfClone(F))
5817 continue;
5818 for (auto &BB : F) {
5819 for (auto &I : BB) {
5820 if (!isa<CallBase>(I))
5821 continue;
5822 I.setMetadata(LLVMContext::MD_memprof, nullptr);
5823 I.setMetadata(LLVMContext::MD_callsite, nullptr);
5824 }
5825 }
5826 }
5827
5828 return Changed;
5829}
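
Several of the loops above share the same clone-indexing convention: clone 0 is the original instruction, clone J > 0 is reached through VMaps[J - 1], and an empty VMap marks a clone that was emitted only as an alias or declaration. A hypothetical helper, shown only to make that convention explicit and assuming the usual LLVM headers are available, could look like:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <memory>

using namespace llvm;

// Resolve the instruction for clone J of a call. A null return means clone J
// was emitted as an alias/declaration (empty VMap) and has no body to update.
static CallBase *
getCallClone(CallBase *CB, ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
             unsigned J) {
  if (J == 0)
    return CB;              // clone 0 is the original instruction
  if (VMaps[J - 1]->empty())
    return nullptr;         // duplicate clone emitted as alias/declaration
  return cast<CallBase>((*VMaps[J - 1])[CB]);
}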
5830
5831unsigned MemProfContextDisambiguation::recordICPInfo(
5832 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
5833 ArrayRef<CallsiteInfo>::iterator &SI,
5834 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
5835 // First see if we have profile information for this indirect call.
5836 uint32_t NumCandidates;
5837 uint64_t TotalCount;
5838 auto CandidateProfileData =
5839 ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
5840 NumCandidates);
5841 if (CandidateProfileData.empty())
5842 return 0;
5843
5844 // Iterate through all of the candidate profiled targets along with the
5845 // CallsiteInfo summary records synthesized for them when building the index,
5846 // and see if any are cloned and/or refer to clones.
5847 bool ICPNeeded = false;
5848 unsigned NumClones = 0;
5849 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
5850 for (const auto &Candidate : CandidateProfileData) {
5851#ifndef NDEBUG
5852 auto CalleeValueInfo =
5853#endif
5854 ImportSummary->getValueInfo(Candidate.Value);
5855 // We might not have a ValueInfo if this is a distributed
5856 // ThinLTO backend and decided not to import that function.
5857 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
5858 assert(SI != AllCallsites.end());
5859 auto &StackNode = *(SI++);
5860 // See if any of the clones of the indirect callsite for this
5861 // profiled target should call a cloned version of the profiled
5862 // target. We only need to do the ICP here if so.
5863 ICPNeeded |= llvm::any_of(StackNode.Clones,
5864 [](unsigned CloneNo) { return CloneNo != 0; });
5865 // Every callsite in the same function should have been cloned the same
5866 // number of times.
5867 assert(!NumClones || NumClones == StackNode.Clones.size());
5868 NumClones = StackNode.Clones.size();
5869 }
5870 if (!ICPNeeded)
5871 return NumClones;
5872 // Save information for ICP, which is performed later to avoid messing up the
5873 // current function traversal.
5874 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
5875 TotalCount, CallsiteInfoStartIndex});
5876 return NumClones;
5877}
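
For indirect calls, the summary carries one synthesized CallsiteInfo per profiled candidate, in value-profile order, and ICP is only needed when some caller clone should call a non-original callee version. A toy model of that decision, with illustrative names that do not mirror the real CallsiteInfo layout:

#include <string>
#include <vector>

// Toy mirror of the records consumed above: one record per profiled candidate
// of an indirect call, carrying the callee version per caller clone.
struct ToyCallsiteRecord {
  std::string Callee;
  std::vector<unsigned> Clones; // Clones[J] = callee version for caller clone J
};

// ICP is required only if some caller clone must call a non-zero (cloned)
// version of some profiled target.
static bool needsICP(const std::vector<ToyCallsiteRecord> &Candidates) {
  for (const ToyCallsiteRecord &R : Candidates)
    for (unsigned V : R.Clones)
      if (V != 0)
        return true;
  return false;
}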
5878
5879void MemProfContextDisambiguation::performICP(
5880 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
5881 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
5882 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
5883 OptimizationRemarkEmitter &ORE) {
5884 // Now do any promotion required for cloning. Specifically, for each
5885 // recorded ICP candidate (which was only recorded because one clone of that
5886 // candidate should call a cloned target), we perform ICP (speculative
5887 // devirtualization) for each clone of the callsite, and update its callee
5888 // to the appropriate clone. Note that the ICP compares against the original
5889 // version of the target, which is what is in the vtable.
5890 for (auto &Info : ICallAnalysisInfo) {
5891 auto *CB = Info.CB;
5892 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
5893 auto TotalCount = Info.TotalCount;
5894 unsigned NumPromoted = 0;
5895 unsigned NumClones = 0;
5896
5897 for (auto &Candidate : Info.CandidateProfileData) {
5898 auto &StackNode = AllCallsites[CallsiteIndex++];
5899
5900 // All calls in the same function must have the same number of clones.
5901 assert(!NumClones || NumClones == StackNode.Clones.size());
5902 NumClones = StackNode.Clones.size();
5903
5904 // See if the target is in the module. If it wasn't imported, it is
5905 // possible that this profile could have been collected on a different
5906 // target (or version of the code), and we need to be conservative
5907 // (similar to what is done in the ICP pass).
5908 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
5909 if (TargetFunction == nullptr ||
5910 // Any ThinLTO global dead symbol removal should have already
5911 // occurred, so it should be safe to promote when the target is a
5912 // declaration.
5913 // TODO: Remove internal option once more fully tested.
5914 (MemProfRequireDefinitionForPromotion &&
5915 TargetFunction->isDeclaration())) {
5916 ORE.emit([&]() {
5917 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
5918 << "Memprof cannot promote indirect call: target with md5sum "
5919 << ore::NV("target md5sum", Candidate.Value) << " not found";
5920 });
5921 // FIXME: See if we can use the new declaration importing support to
5922 // at least get the declarations imported for this case. Hot indirect
5923 // targets should have been imported normally, however.
5924 continue;
5925 }
5926
5927 // Check if legal to promote
5928 const char *Reason = nullptr;
5929 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
5930 ORE.emit([&]() {
5931 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
5932 << "Memprof cannot promote indirect call to "
5933 << ore::NV("TargetFunction", TargetFunction)
5934 << " with count of " << ore::NV("TotalCount", TotalCount)
5935 << ": " << Reason;
5936 });
5937 continue;
5938 }
5939
5940 assert(!isMemProfClone(*TargetFunction));
5941
5942 // Handle each call clone, applying ICP so that each clone directly
5943 // calls the specified callee clone, guarded by the appropriate ICP
5944 // check.
5945 CallBase *CBClone = CB;
5946 for (unsigned J = 0; J < NumClones; J++) {
5947 // If the VMap is empty, this clone was a duplicate of another and was
5948 // created as an alias or a declaration.
5949 if (J > 0 && VMaps[J - 1]->empty())
5950 continue;
5951 // Copy 0 is the original function.
5952 if (J > 0)
5953 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5954 // We do the promotion using the original name, so that the comparison
5955 // is against the name in the vtable. Then just below, change the new
5956 // direct call to call the cloned function.
5957 auto &DirectCall =
5958 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
5959 TotalCount, isSamplePGO, &ORE);
5960 auto *TargetToUse = TargetFunction;
5961 // Keep the original target if this version calls the original version
5962 // of its callee; otherwise redirect to the recorded callee clone.
5963 if (StackNode.Clones[J]) {
5964 TargetToUse =
5965 cast<Function>(M.getOrInsertFunction(
5966 getMemProfFuncName(TargetFunction->getName(),
5967 StackNode.Clones[J]),
5968 TargetFunction->getFunctionType())
5969 .getCallee());
5970 }
5971 DirectCall.setCalledFunction(TargetToUse);
5972 // During matching we generate synthetic VP metadata for indirect calls
5973 // not already having any, from the memprof profile's callee GUIDs. If
5974 // we subsequently promote and inline those callees, we currently lose
5975 // the ability to generate this synthetic VP metadata. Optionally apply
5976 // a noinline attribute to promoted direct calls, where the threshold is
5977 // set to capture synthetic VP metadata targets which get a count of 1.
5978 if (MemProfICPNoInlineThreshold &&
5979 Candidate.Count < MemProfICPNoInlineThreshold)
5980 DirectCall.setIsNoInline();
5981 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5982 << ore::NV("Call", CBClone) << " in clone "
5983 << ore::NV("Caller", CBClone->getFunction())
5984 << " promoted and assigned to call function clone "
5985 << ore::NV("Callee", TargetToUse));
5986 }
5987
5988 // Update TotalCount (all clones should get same count above)
5989 TotalCount -= Candidate.Count;
5990 NumPromoted++;
5991 }
5992 // Adjust the MD.prof metadata for all clones, now that we have the new
5993 // TotalCount and the number promoted.
5994 CallBase *CBClone = CB;
5995 for (unsigned J = 0; J < NumClones; J++) {
5996 // If the VMap is empty, this clone was a duplicate of another and was
5997 // created as an alias or a declaration.
5998 if (J > 0 && VMaps[J - 1]->empty())
5999 continue;
6000 // Copy 0 is the original function.
6001 if (J > 0)
6002 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6003 // First delete the old one.
6004 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
6005 // If all targets were promoted, we don't need the MD.prof metadata.
6006 // Otherwise we need to re-annotate it with the un-promoted records.
6007 if (TotalCount != 0)
6008 annotateValueSite(
6009 M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
6010 TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
6011 }
6012 }
6013}
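
Conceptually, the promotion above inserts a pointer-equality guard against the original target (the value that actually appears in the vtable or function pointer), and only the resulting direct call is redirected to the memprof clone. A hand-written toy in plain C++ of how a promoted call site behaves; the names are illustrative and this is not output of the pass.

#include <cstdio>

// Stand-ins for the original profiled target and its memprof clone.
static void target() { std::puts("original target"); }
static void target_memprof_clone() { std::puts("memprof clone"); }

// A promoted call site: the guard compares against the original symbol, while
// the taken branch calls the clone selected for this calling context.
static void callSite(void (*Callee)()) {
  if (Callee == &target)
    target_memprof_clone(); // direct call, redirected to the clone
  else
    Callee();               // fallback remains an indirect call
}

int main() {
  callSite(&target); // takes the guarded, devirtualized path
  return 0;
}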
6014
6015template <typename DerivedCCG, typename FuncTy, typename CallTy>
6016bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
6017 if (DumpCCG) {
6018 dbgs() << "CCG before cloning:\n";
6019 dbgs() << *this;
6020 }
6021 if (ExportToDot)
6022 exportToDot("postbuild");
6023
6024 if (VerifyCCG) {
6025 check();
6026 }
6027
6028 identifyClones();
6029
6030 if (VerifyCCG) {
6031 check();
6032 }
6033
6034 if (DumpCCG) {
6035 dbgs() << "CCG after cloning:\n";
6036 dbgs() << *this;
6037 }
6038 if (ExportToDot)
6039 exportToDot("cloned");
6040
6041 bool Changed = assignFunctions();
6042
6043 if (DumpCCG) {
6044 dbgs() << "CCG after assigning function clones:\n";
6045 dbgs() << *this;
6046 }
6047 if (ExportToDot)
6048 exportToDot("clonefuncassign");
6049
6050 if (MemProfReportHintedSizes)
6051 printTotalSizes(errs());
6052
6053 return Changed;
6054}
6055
6056bool MemProfContextDisambiguation::processModule(
6057 Module &M,
6058 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
6059
6060 // If we have an import summary, then the cloning decisions were made during
6061 // the thin link on the index. Apply them and return.
6062 if (ImportSummary)
6063 return applyImport(M);
6064
6065 // TODO: If/when other types of memprof cloning are enabled beyond just for
6066 // hot and cold, we will need to change this to individually control the
6067 // AllocationType passed to addStackNodesForMIB during CCG construction.
6068 // Note that we specifically check this after applying imports above, so that
6069 // the option isn't needed to be passed to distributed ThinLTO backend
6070 // clang processes, which won't necessarily have visibility into the linker
6071 // dependences. Instead the information is communicated from the LTO link to
6072 // the backends via the combined summary index.
6073 if (!SupportsHotColdNew)
6074 return false;
6075
6076 ModuleCallsiteContextGraph CCG(M, OREGetter);
6077 return CCG.process();
6078}
6079
6080MemProfContextDisambiguation::MemProfContextDisambiguation(
6081 const ModuleSummaryIndex *Summary, bool isSamplePGO)
6082 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
6083 // Check the dot graph printing options once here, to make sure we have valid
6084 // and expected combinations.
6085 if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
6086 report_fatal_error(
6087 "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
6088 if (DotGraphScope == DotScope::Context &&
6089 !ContextIdForDot.getNumOccurrences())
6090 report_fatal_error(
6091 "-memprof-dot-scope=context requires -memprof-dot-context-id");
6092 if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
6093 ContextIdForDot.getNumOccurrences())
6094 report_fatal_error(
6095 "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
6096 "-memprof-dot-context-id");
6097 if (ImportSummary) {
6098 // The MemProfImportSummary should only be used for testing ThinLTO
6099 // distributed backend handling via opt, in which case we don't have a
6100 // summary from the pass pipeline.
6101 assert(MemProfImportSummary.empty());
6102 return;
6103 }
6104 if (MemProfImportSummary.empty())
6105 return;
6106
6107 auto ReadSummaryFile =
6108 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
6109 if (!ReadSummaryFile) {
6110 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
6111 "Error loading file '" + MemProfImportSummary +
6112 "': ");
6113 return;
6114 }
6115 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
6116 if (!ImportSummaryForTestingOrErr) {
6117 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
6118 "Error parsing file '" + MemProfImportSummary +
6119 "': ");
6120 return;
6121 }
6122 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
6123 ImportSummary = ImportSummaryForTesting.get();
6124}
6125
6127PreservedAnalyses MemProfContextDisambiguation::run(Module &M, ModuleAnalysisManager &AM) {
6128 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
6129 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
6130 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
6131 };
6132 if (!processModule(M, OREGetter))
6133 return PreservedAnalyses::all();
6134 return PreservedAnalyses::none();
6135}
6136
6137void MemProfContextDisambiguation::run(
6138 ModuleSummaryIndex &Index,
6139 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
6140 isPrevailing) {
6141 // TODO: If/when other types of memprof cloning are enabled beyond just for
6142 // hot and cold, we will need to change this to individually control the
6143 // AllocationType passed to addStackNodesForMIB during CCG construction.
6144 // The index was set from the option, so these should be in sync.
6145 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
6146 if (!SupportsHotColdNew)
6147 return;
6148
6149 IndexCallsiteContextGraph CCG(Index, isPrevailing);
6150 CCG.process();
6151}