1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
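// Illustrative example of the situation described above (user-level C++, not
// part of this file; function names are hypothetical): a single allocation
// site reached from two calling contexts. If the profile shows that only the
// B() -> A() context is cold, A() can be cloned so that just the clone
// reached from B() has its allocation hinted cold.
//
//   char *A() { return new char[128]; }   // one allocation site
//   char *B() { return A(); }             // context B -> A: cold in profile
//   char *C() { return A(); }             // context C -> A: not cold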
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/Module.h"
40#include "llvm/Pass.h"
44#include "llvm/Support/SHA1.h"
46#include "llvm/Transforms/IPO.h"
50#include <deque>
51#include <sstream>
52#include <unordered_map>
53#include <vector>
54using namespace llvm;
55using namespace llvm::memprof;
56
57#define DEBUG_TYPE "memprof-context-disambiguation"
58
59STATISTIC(FunctionClonesAnalysis,
60 "Number of function clones created during whole program analysis");
61STATISTIC(FunctionClonesThinBackend,
62 "Number of function clones created during ThinLTO backend");
63STATISTIC(FunctionsClonedThinBackend,
64 "Number of functions that had clones created during ThinLTO backend");
65STATISTIC(
66 FunctionCloneDuplicatesThinBackend,
67 "Number of function clone duplicates detected during ThinLTO backend");
68STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
69 "cloned) during whole program analysis");
70STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
71 "during whole program analysis");
72STATISTIC(AllocTypeNotColdThinBackend,
73 "Number of not cold static allocations (possibly cloned) during "
74 "ThinLTO backend");
75STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
76 "(possibly cloned) during ThinLTO backend");
77STATISTIC(OrigAllocsThinBackend,
78 "Number of original (not cloned) allocations with memprof profiles "
79 "during ThinLTO backend");
80STATISTIC(
81 AllocVersionsThinBackend,
82 "Number of allocation versions (including clones) during ThinLTO backend");
83STATISTIC(MaxAllocVersionsThinBackend,
84 "Maximum number of allocation versions created for an original "
85 "allocation during ThinLTO backend");
86STATISTIC(UnclonableAllocsThinBackend,
87 "Number of unclonable ambigous allocations during ThinLTO backend");
88STATISTIC(RemovedEdgesWithMismatchedCallees,
89 "Number of edges removed due to mismatched callees (profiled vs IR)");
90STATISTIC(FoundProfiledCalleeCount,
91 "Number of profiled callees found via tail calls");
92STATISTIC(FoundProfiledCalleeDepth,
93 "Aggregate depth of profiled callees found via tail calls");
94STATISTIC(FoundProfiledCalleeMaxDepth,
95 "Maximum depth of profiled callees found via tail calls");
96STATISTIC(FoundProfiledCalleeNonUniquelyCount,
97 "Number of profiled callees found via multiple tail call chains");
98STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
99STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
100STATISTIC(NonNewMergedNodes, "Number of non-new nodes used during merging");
101STATISTIC(MissingAllocForContextId,
102 "Number of missing alloc nodes for context ids");
103STATISTIC(SkippedCallsCloning,
104 "Number of calls skipped during cloning due to unexpected operand");
105STATISTIC(MismatchedCloneAssignments,
106 "Number of callsites assigned to call multiple non-matching clones");
107STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
108STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
109STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
110
112 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
113 cl::value_desc("filename"),
114 cl::desc("Specify the path prefix of the MemProf dot files."));
115
116static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
118 cl::desc("Export graph to dot files."));
119
120// TODO: Remove this option once new handling is validated more widely.
122 "memprof-merge-iteration", cl::init(true), cl::Hidden,
123 cl::desc("Iteratively apply merging on a node to catch new callers"));
124
125// How much of the graph to export to dot.
126enum class DotScope {
127 All, // The full CCG graph.
128 Alloc, // Only contexts for the specified allocation.
129 Context, // Only the specified context.
130};
131
133 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
136 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
137 clEnumValN(DotScope::Alloc, "alloc",
138 "Export only nodes with contexts feeding given "
139 "-memprof-dot-alloc-id"),
140 clEnumValN(DotScope::Context, "context",
141 "Export only nodes with given -memprof-dot-context-id")));
142
143static cl::opt<unsigned>
144 AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
145 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
146 "or to highlight if -memprof-dot-scope=all"));
147
149 "memprof-dot-context-id", cl::init(0), cl::Hidden,
150 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
151 "highlight otherwise"));
152
153static cl::opt<bool>
154 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
155 cl::desc("Dump CallingContextGraph to stdout after each stage."));
156
157static cl::opt<bool>
158 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
159 cl::desc("Perform verification checks on CallingContextGraph."));
160
161static cl::opt<bool>
162 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
163 cl::desc("Perform frequent verification checks on nodes."));
164
166 "memprof-import-summary",
167 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
168 cl::Hidden);
169
170static cl::opt<unsigned>
171 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
172 cl::Hidden,
173 cl::desc("Max depth to recursively search for missing "
174 "frames through tail calls."));
175
176// Optionally enable cloning of callsites involved with recursive cycles
178 "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
179 cl::desc("Allow cloning of callsites involved in recursive cycles"));
180
182 "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
183 cl::desc("Allow cloning of contexts through recursive cycles"));
184
185// Generally this is needed for correct assignment of allocation clones to
186// function clones, however, allow it to be disabled for debugging while the
187// functionality is new and being tested more widely.
188static cl::opt<bool>
189 MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
190 cl::desc("Merge clones before assigning functions"));
191
192// When disabled, try to detect and prevent cloning of recursive contexts.
193// This is only necessary until we support cloning through recursive cycles.
194// Leave on by default for now, as disabling requires a little bit of compile
195// time overhead and doesn't affect correctness, it will just inflate the cold
196// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
198 "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
199 cl::desc("Allow cloning of contexts having recursive cycles"));
200
201// Set the minimum absolute count threshold for allowing inlining of indirect
202// calls promoted during cloning.
204 "memprof-icp-noinline-threshold", cl::init(2), cl::Hidden,
205 cl::desc("Minimum absolute count for promoted target to be inlinable"));
206
207namespace llvm {
209 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
210 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
211
212// Indicate we are linking with an allocator that supports hot/cold operator
213// new interfaces.
215 "supports-hot-cold-new", cl::init(false), cl::Hidden,
216 cl::desc("Linking with hot/cold operator new interfaces"));
217
219 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
220 cl::desc(
221 "Require target function definition when promoting indirect calls"));
222
225
226} // namespace llvm
227
228namespace {
229/// CRTP base for graphs built from either IR or ThinLTO summary index.
230///
231/// The graph represents the call contexts in all memprof metadata on allocation
232/// calls, with nodes for the allocations themselves, as well as for the calls
233/// in each context. The graph is initially built from the allocation memprof
234/// metadata (or summary) MIBs. It is then updated to match calls that carry
235/// callsite metadata onto the nodes, reflecting any inlining performed on
236/// those calls.
237///
238/// Each MIB (representing an allocation's call context with allocation
239/// behavior) is assigned a unique context id during the graph build. The edges
240/// and nodes in the graph are decorated with the context ids they carry. This
241/// is used to correctly update the graph when cloning is performed so that we
242/// can uniquify the context for a single (possibly cloned) allocation.
243template <typename DerivedCCG, typename FuncTy, typename CallTy>
244class CallsiteContextGraph {
245public:
246 CallsiteContextGraph() = default;
247 CallsiteContextGraph(const CallsiteContextGraph &) = default;
248 CallsiteContextGraph(CallsiteContextGraph &&) = default;
249
250 /// Main entry point to perform analysis and transformations on graph.
251 bool process();
252
253 /// Perform cloning on the graph necessary to uniquely identify the allocation
254 /// behavior of an allocation based on its context.
255 void identifyClones();
256
257 /// Assign callsite clones to functions, cloning functions as needed to
258 /// accommodate the combinations of their callsite clones reached by callers.
259 /// For regular LTO this clones functions and callsites in the IR, but for
260 /// ThinLTO the cloning decisions are noted in the summaries and later applied
261 /// in applyImport.
262 bool assignFunctions();
263
264 void dump() const;
265 void print(raw_ostream &OS) const;
266 void printTotalSizes(raw_ostream &OS) const;
267
268 friend raw_ostream &operator<<(raw_ostream &OS,
269 const CallsiteContextGraph &CCG) {
270 CCG.print(OS);
271 return OS;
272 }
273
274 friend struct GraphTraits<
275 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
276 friend struct DOTGraphTraits<
277 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
278
279 void exportToDot(std::string Label) const;
280
281 /// Represents a function clone via FuncTy pointer and clone number pair.
282 struct FuncInfo final
283 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
284 using Base = std::pair<FuncTy *, unsigned>;
285 FuncInfo(const Base &B) : Base(B) {}
286 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
287 explicit operator bool() const { return this->first != nullptr; }
288 FuncTy *func() const { return this->first; }
289 unsigned cloneNo() const { return this->second; }
290 };
291
292 /// Represents a callsite clone via CallTy and clone number pair.
293 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
294 using Base = std::pair<CallTy, unsigned>;
295 CallInfo(const Base &B) : Base(B) {}
296 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
297 : Base(Call, CloneNo) {}
298 explicit operator bool() const { return (bool)this->first; }
299 CallTy call() const { return this->first; }
300 unsigned cloneNo() const { return this->second; }
301 void setCloneNo(unsigned N) { this->second = N; }
302 void print(raw_ostream &OS) const {
303 if (!operator bool()) {
304 assert(!cloneNo());
305 OS << "null Call";
306 return;
307 }
308 call()->print(OS);
309 OS << "\t(clone " << cloneNo() << ")";
310 }
311 void dump() const {
312 print(dbgs());
313 dbgs() << "\n";
314 }
315 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
316 Call.print(OS);
317 return OS;
318 }
319 };
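// A brief usage sketch (hypothetical values, not from this file): the clone
// number participates in the underlying std::pair's equality, so the same
// call paired with different clone numbers acts as a distinct key, e.g. when
// CallInfo is used in a DenseMap via the DenseMapInfo specialization further
// below.
//
//   CallInfo Orig(I);                  // clone 0 by default
//   CallInfo Cloned(I, /*CloneNo=*/1);
//   // Orig and Cloned compare unequal, so they are treated as different
//   // DenseMap keys.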
320
321 struct ContextEdge;
322
323 /// Node in the Callsite Context Graph
324 struct ContextNode {
325 // Assigned to nodes as they are created, useful for debugging.
326 unsigned NodeId = 0;
327
328 // Keep this for now since in the IR case where we have an Instruction* it
329 // is not as immediately discoverable. Used for printing richer information
330 // when dumping graph.
331 bool IsAllocation;
332
333 // Keeps track of when the Call was reset to null because there was
334 // recursion.
335 bool Recursive = false;
336
337 // This will be formed by ORing together the AllocationType enum values
338 // for contexts including this node.
339 uint8_t AllocTypes = 0;
340
341 // The corresponding allocation or interior call. This is the primary call
342 // for which we have created this node.
343 CallInfo Call;
344
345 // List of other calls that can be treated the same as the primary call
346 // through cloning. I.e. located in the same function and have the same
347 // (possibly pruned) stack ids. They will be updated the same way as the
348 // primary call when assigning to function clones.
349 SmallVector<CallInfo, 0> MatchingCalls;
350
351 // For alloc nodes this is a unique id assigned when constructed, and for
352 // callsite stack nodes it is the original stack id when the node is
353 // constructed from the memprof MIB metadata on the alloc nodes. Note that
354 // this is only used when matching callsite metadata onto the stack nodes
355 // created when processing the allocation memprof MIBs, and for labeling
356 // nodes in the dot graph. Therefore we don't bother to assign a value for
357 // clones.
358 uint64_t OrigStackOrAllocId = 0;
359
360 // Edges to all callees in the profiled call stacks.
361 // TODO: Should this be a map (from Callee node) for more efficient lookup?
362 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
363
364 // Edges to all callers in the profiled call stacks.
365 // TODO: Should this be a map (from Caller node) for more efficient lookup?
366 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
367
368 // Returns true if we also need to look at the caller edges when determining
369 // the node context ids and allocation type.
370 bool useCallerEdgesForContextInfo() const {
371 // Typically if the callee edges are empty either the caller edges are
372 // also empty, or this is an allocation (leaf node). However, if we are
373 // allowing recursive callsites and contexts this will be violated for
374 // incompletely cloned recursive cycles.
375 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
377 // When cloning for a recursive context, we might be in the midst of
378 // cloning for a recurrence and have moved context ids off of a caller
379 // edge onto the clone but not yet off of the incoming caller (back)
380 // edge. If we don't look at those, we miss the fact that this node
381 // still has context ids of interest.
382 return IsAllocation || CloneRecursiveContexts;
383 }
384
385 // Compute the context ids for this node from the union of its edge context
386 // ids.
387 DenseSet<uint32_t> getContextIds() const {
388 unsigned Count = 0;
389 // Compute the number of ids for reserve below. In general we only need to
390 // look at one set of edges, typically the callee edges, since other than
391 // allocations and in some cases during recursion cloning, all the context
392 // ids on the callers should also flow out via callee edges.
393 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
394 Count += Edge->getContextIds().size();
395 DenseSet<uint32_t> ContextIds;
396 ContextIds.reserve(Count);
398 CalleeEdges, useCallerEdgesForContextInfo()
399 ? CallerEdges
400 : std::vector<std::shared_ptr<ContextEdge>>());
401 for (const auto &Edge : Edges)
402 ContextIds.insert_range(Edge->getContextIds());
403 return ContextIds;
404 }
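// A minimal sketch of the range concatenation used just above (assuming the
// elided declaration of Edges uses llvm::concat from llvm/ADT/STLExtras.h,
// which iterates two containers as a single range):
//
//   std::vector<int> A = {1, 2}, B = {3};
//   int Sum = 0;
//   for (int V : llvm::concat<const int>(A, B))
//     Sum += V; // visits 1, 2, 3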
405
406 // Compute the allocation type for this node from the OR of its edge
407 // allocation types.
408 uint8_t computeAllocType() const {
409 uint8_t BothTypes =
413 CalleeEdges, useCallerEdgesForContextInfo()
414 ? CallerEdges
415 : std::vector<std::shared_ptr<ContextEdge>>());
416 for (const auto &Edge : Edges) {
417 AllocType |= Edge->AllocTypes;
418 // Bail early if alloc type reached both, no further refinement.
419 if (AllocType == BothTypes)
420 return AllocType;
421 }
422 return AllocType;
423 }
424
425 // The context ids set for this node is empty if its edge context ids are
426 // also all empty.
427 bool emptyContextIds() const {
429 CalleeEdges, useCallerEdgesForContextInfo()
430 ? CallerEdges
431 : std::vector<std::shared_ptr<ContextEdge>>());
432 for (const auto &Edge : Edges) {
433 if (!Edge->getContextIds().empty())
434 return false;
435 }
436 return true;
437 }
438
439 // List of clones of this ContextNode, initially empty.
440 std::vector<ContextNode *> Clones;
441
442 // If a clone, points to the original uncloned node.
443 ContextNode *CloneOf = nullptr;
444
445 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
446
447 ContextNode(bool IsAllocation, CallInfo C)
448 : IsAllocation(IsAllocation), Call(C) {}
449
450 void addClone(ContextNode *Clone) {
451 if (CloneOf) {
452 CloneOf->Clones.push_back(Clone);
453 Clone->CloneOf = CloneOf;
454 } else {
455 Clones.push_back(Clone);
456 assert(!Clone->CloneOf);
457 Clone->CloneOf = this;
458 }
459 }
460
461 ContextNode *getOrigNode() {
462 if (!CloneOf)
463 return this;
464 return CloneOf;
465 }
466
467 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
468 unsigned int ContextId);
469
470 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
471 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
472 void eraseCalleeEdge(const ContextEdge *Edge);
473 void eraseCallerEdge(const ContextEdge *Edge);
474
475 void setCall(CallInfo C) { Call = C; }
476
477 bool hasCall() const { return (bool)Call.call(); }
478
479 void printCall(raw_ostream &OS) const { Call.print(OS); }
480
481 // True if this node was effectively removed from the graph, in which case
482 // it should have an allocation type of None and empty context ids.
483 bool isRemoved() const {
484 // Typically if the callee edges are empty either the caller edges are
485 // also empty, or this is an allocation (leaf node). However, if we are
486 // allowing recursive callsites and contexts this will be violated for
487 // incompletely cloned recursive cycles.
489 (AllocTypes == (uint8_t)AllocationType::None) ==
490 emptyContextIds());
491 return AllocTypes == (uint8_t)AllocationType::None;
492 }
493
494 void dump() const;
495 void print(raw_ostream &OS) const;
496
497 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
498 Node.print(OS);
499 return OS;
500 }
501 };
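// Sketch of the clone bookkeeping above (hypothetical nodes): addClone always
// registers new clones under the original node, so for an original node O
// with an existing clone C1:
//
//   C1->addClone(C2);
//   // O->Clones == {C1, C2}, C2->CloneOf == O, C1->getOrigNode() == O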
502
503 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
504 /// callee.
505 struct ContextEdge {
506 ContextNode *Callee;
507 ContextNode *Caller;
508
509 // This will be formed by ORing together the AllocationType enum values
510 // for contexts including this edge.
511 uint8_t AllocTypes = 0;
512
513 // Set just before initiating cloning when cloning of recursive contexts is
514 // enabled. Used to defer cloning of backedges until we have done cloning of
515 // the callee node for non-backedge caller edges. This exposes cloning
516 // opportunities through the backedge of the cycle.
517 // TODO: Note that this is not updated during cloning, and it is unclear
518 // whether that would be needed.
519 bool IsBackedge = false;
520
521 // The set of IDs for contexts including this edge.
522 DenseSet<uint32_t> ContextIds;
523
524 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
525 DenseSet<uint32_t> ContextIds)
526 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
527 ContextIds(std::move(ContextIds)) {}
528
529 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
530
531 // Helper to clear the fields of this edge when we are removing it from the
532 // graph.
533 inline void clear() {
534 ContextIds.clear();
535 AllocTypes = (uint8_t)AllocationType::None;
536 Caller = nullptr;
537 Callee = nullptr;
538 }
539
540 // Check if edge was removed from the graph. This is useful while iterating
541 // over a copy of edge lists when performing operations that mutate the
542 // graph in ways that might remove one of the edges.
543 inline bool isRemoved() const {
544 if (Callee || Caller)
545 return false;
546 // Any edges that have been removed from the graph but are still in a
547 // shared_ptr somewhere should have all fields null'ed out by clear()
548 // above.
549 assert(AllocTypes == (uint8_t)AllocationType::None);
550 assert(ContextIds.empty());
551 return true;
552 }
553
554 void dump() const;
555 void print(raw_ostream &OS) const;
556
557 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
558 Edge.print(OS);
559 return OS;
560 }
561 };
562
563 /// Helpers to remove edges that have allocation type None (due to not
564 /// carrying any context ids) after transformations.
565 void removeNoneTypeCalleeEdges(ContextNode *Node);
566 void removeNoneTypeCallerEdges(ContextNode *Node);
567 void
568 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
569 DenseSet<const ContextNode *> &Visited);
570
571protected:
572 /// Get a list of nodes corresponding to the stack ids in the given callsite
573 /// context.
574 template <class NodeT, class IteratorT>
575 std::vector<uint64_t>
576 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
577
578 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
579 /// metadata (or summary).
580 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
581
582 /// Adds nodes for the given MIB stack ids.
583 template <class NodeT, class IteratorT>
584 void addStackNodesForMIB(ContextNode *AllocNode,
585 CallStack<NodeT, IteratorT> &StackContext,
586 CallStack<NodeT, IteratorT> &CallsiteContext,
588 ArrayRef<ContextTotalSize> ContextSizeInfo);
589
590 /// Matches all callsite metadata (or summary) to the nodes created for
591 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
592 /// inlining performed on those callsite instructions.
593 void updateStackNodes();
594
595 /// Update graph to conservatively handle any callsite stack nodes that target
596 /// multiple different callee target functions.
597 void handleCallsitesWithMultipleTargets();
598
599 /// Mark backedges via the standard DFS based backedge algorithm.
600 void markBackedges();
601
602 /// Merge clones generated during cloning for different allocations but that
603 /// are called by the same caller node, to ensure proper function assignment.
604 void mergeClones();
605
606 // Try to partition calls on the given node (already placed into the AllCalls
607 // array) by callee function, creating new copies of Node as needed to hold
608 // calls with different callees, and moving the callee edges appropriately.
609 // Returns true if partitioning was successful.
610 bool partitionCallsByCallee(
611 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
612 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
613
614 /// Save lists of calls with MemProf metadata in each function, for faster
615 /// iteration.
616 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
617
618 /// Map from callsite node to the enclosing caller function.
619 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
620
621 // When exporting to dot, and an allocation id is specified, contains the
622 // context ids on that allocation.
623 DenseSet<uint32_t> DotAllocContextIds;
624
625private:
626 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
627
628 // Structure to keep track of information for each call as we are matching
629 // non-allocation callsites onto context nodes created from the allocation
630 // call metadata / summary contexts.
631 struct CallContextInfo {
632 // The callsite we're trying to match.
633 CallTy Call;
634 // The callsites stack ids that have a context node in the graph.
635 std::vector<uint64_t> StackIds;
636 // The function containing this callsite.
637 const FuncTy *Func;
638 // Initially empty; if needed, this will be updated to contain the context
639 // ids for use in a new context node created for this callsite.
640 DenseSet<uint32_t> ContextIds;
641 };
642
643 /// Helper to remove edge from graph, updating edge iterator if it is provided
644 /// (in which case CalleeIter indicates which edge list is being iterated).
645 /// This will also perform the necessary clearing of the ContextEdge members
646 /// to enable later checking if the edge has been removed (since we may have
647 /// other copies of the shared_ptr in existence, and in fact rely on this to
648 /// enable removal while iterating over a copy of a node's edge list).
649 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
650 bool CalleeIter = true);
651
652 /// Assigns the given Node to calls at or inlined into the location with
653 /// the Node's stack id, after post order traversing and processing its
654 /// caller nodes. Uses the call information recorded in the given
655 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
656 /// as needed. Called by updateStackNodes which sets up the given
657 /// StackIdToMatchingCalls map.
658 void assignStackNodesPostOrder(
659 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
660 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
661 DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
662
663 /// Duplicates the given set of context ids, updating the provided
664 /// map from each original id with the newly generated context ids,
665 /// and returning the new duplicated id set.
666 DenseSet<uint32_t> duplicateContextIds(
667 const DenseSet<uint32_t> &StackSequenceContextIds,
668 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
669
670 /// Propagates all duplicated context ids across the graph.
671 void propagateDuplicateContextIds(
672 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
673
674 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
675 /// else to its callers. Also updates OrigNode's edges to remove any context
676 /// ids moved to the newly created edge.
677 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
678 bool TowardsCallee,
679 DenseSet<uint32_t> RemainingContextIds);
680
681 /// Get the stack id corresponding to the given Id or Index (for IR this will
682 /// return itself, for a summary index this will return the id recorded in the
683 /// index for that stack id index value).
684 uint64_t getStackId(uint64_t IdOrIndex) const {
685 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
686 }
687
688 /// Returns true if the given call targets the callee of the given edge, or if
689 /// we were able to identify the call chain through intermediate tail calls.
690 /// In the latter case new context nodes are added to the graph for the
691 /// identified tail calls, and their synthesized nodes are added to
692 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
693 /// the updated edges and to prepare it for an increment in the caller.
694 bool
695 calleesMatch(CallTy Call, EdgeIter &EI,
696 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
697
698 // Return the callee function of the given call, or nullptr if it can't be
699 // determined
700 const FuncTy *getCalleeFunc(CallTy Call) {
701 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
702 }
703
704 /// Returns true if the given call targets the given function, or if we were
705 /// able to identify the call chain through intermediate tail calls (in which
706 /// case FoundCalleeChain will be populated).
707 bool calleeMatchesFunc(
708 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
709 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
710 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
711 Call, Func, CallerFunc, FoundCalleeChain);
712 }
713
714 /// Returns true if both call instructions have the same callee.
715 bool sameCallee(CallTy Call1, CallTy Call2) {
716 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
717 }
718
719 /// Get a list of nodes corresponding to the stack ids in the given
720 /// callsite's context.
721 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
722 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
723 Call);
724 }
725
726 /// Get the last stack id in the context for callsite.
727 uint64_t getLastStackId(CallTy Call) {
728 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
729 }
730
731 /// Update the allocation call to record type of allocated memory.
732 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
733 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
734 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
735 }
736
737 /// Get the AllocationType assigned to the given allocation instruction clone.
738 AllocationType getAllocationCallType(const CallInfo &Call) const {
739 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
740 }
741
742 /// Update non-allocation call to invoke (possibly cloned) function
743 /// CalleeFunc.
744 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
745 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
746 }
747
748 /// Clone the given function for the given callsite, recording the mapping of
749 /// all of the function's tracked calls to their new versions in the CallMap.
750 /// Assigns new clones to clone number CloneNo.
751 FuncInfo cloneFunctionForCallsite(
752 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
753 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
754 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
755 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
756 }
757
758 /// Gets a label to use in the dot graph for the given call clone in the given
759 /// function.
760 std::string getLabel(const FuncTy *Func, const CallTy Call,
761 unsigned CloneNo) const {
762 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
763 }
764
765 // Create and return a new ContextNode.
766 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
767 CallInfo C = CallInfo()) {
768 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
769 auto *NewNode = NodeOwner.back().get();
770 if (F)
771 NodeToCallingFunc[NewNode] = F;
772 NewNode->NodeId = NodeOwner.size();
773 return NewNode;
774 }
775
776 /// Helpers to find the node corresponding to the given call or stackid.
777 ContextNode *getNodeForInst(const CallInfo &C);
778 ContextNode *getNodeForAlloc(const CallInfo &C);
779 ContextNode *getNodeForStackId(uint64_t StackId);
780
781 /// Computes the alloc type corresponding to the given context ids, by
782 /// unioning their recorded alloc types.
783 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
784
785 /// Returns the allocation type of the intersection of the contexts of two
786 /// nodes (based on their provided context id sets), optimized for the case
787 /// when Node1Ids is smaller than Node2Ids.
788 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
789 const DenseSet<uint32_t> &Node2Ids) const;
790
791 /// Returns the allocation type of the intersection of the contexts of two
792 /// nodes (based on their provided context id sets).
793 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
794 const DenseSet<uint32_t> &Node2Ids) const;
795
796 /// Create a clone of Edge's callee and move Edge to that new callee node,
797 /// performing the necessary context id and allocation type updates.
798 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
799 /// moved to an edge to the new callee.
800 ContextNode *
801 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
802 DenseSet<uint32_t> ContextIdsToMove = {});
803
804 /// Change the callee of Edge to existing callee clone NewCallee, performing
805 /// the necessary context id and allocation type updates.
806 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
807 /// moved to an edge to the new callee.
808 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
809 ContextNode *NewCallee,
810 bool NewClone = false,
811 DenseSet<uint32_t> ContextIdsToMove = {});
812
813 /// Change the caller of the edge at the given callee edge iterator to be
814 /// NewCaller, performing the necessary context id and allocation type
815 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
816 /// a simplified version of it as we always move the given edge and all of its
817 /// context ids.
818 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
819 ContextNode *NewCaller);
820
821 /// Recursive helper for marking backedges via DFS.
822 void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
823 DenseSet<const ContextNode *> &CurrentStack);
824
825 /// Recursive helper for merging clones.
826 void
827 mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
828 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
829 /// Main worker for merging callee clones for a given node.
830 void mergeNodeCalleeClones(
831 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
832 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
833 /// Helper to find other callers of the given set of callee edges that can
834 /// share the same callee merge node.
835 void findOtherCallersToShareMerge(
836 ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
837 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
838 DenseSet<ContextNode *> &OtherCallersToShareMerge);
839
840 /// Recursively perform cloning on the graph for the given Node and its
841 /// callers, in order to uniquely identify the allocation behavior of an
842 /// allocation given its context. The context ids of the allocation being
843 /// processed are given in AllocContextIds.
844 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
845 const DenseSet<uint32_t> &AllocContextIds);
846
847 /// Map from each context ID to the AllocationType assigned to that context.
848 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
849
850 /// Map from each contextID to the profiled full contexts and their total
851 /// sizes (there may be more than one due to context trimming),
852 /// optionally populated when requested (via MemProfReportHintedSizes or
853 /// MinClonedColdBytePercent).
854 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
855
856 /// Identifies the context node created for a stack id when adding the MIB
857 /// contexts to the graph. This is used to locate the context nodes when
858 /// trying to assign the corresponding callsites with those stack ids to these
859 /// nodes.
860 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
861
862 /// Maps to track the calls to their corresponding nodes in the graph.
863 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
864 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
865
866 /// Owner of all ContextNode unique_ptrs.
867 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
868
869 /// Perform sanity checks on graph when requested.
870 void check() const;
871
872 /// Keeps track of the last unique context id assigned.
873 unsigned int LastContextId = 0;
874};
875
876template <typename DerivedCCG, typename FuncTy, typename CallTy>
877using ContextNode =
878 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
879template <typename DerivedCCG, typename FuncTy, typename CallTy>
880using ContextEdge =
881 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
882template <typename DerivedCCG, typename FuncTy, typename CallTy>
883using FuncInfo =
884 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
885template <typename DerivedCCG, typename FuncTy, typename CallTy>
886using CallInfo =
887 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
888
889/// CRTP derived class for graphs built from IR (regular LTO).
890class ModuleCallsiteContextGraph
891 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
892 Instruction *> {
893public:
894 ModuleCallsiteContextGraph(
895 Module &M,
896 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
897
898private:
899 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
900 Instruction *>;
901
902 uint64_t getStackId(uint64_t IdOrIndex) const;
903 const Function *getCalleeFunc(Instruction *Call);
904 bool calleeMatchesFunc(
905 Instruction *Call, const Function *Func, const Function *CallerFunc,
906 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
907 bool sameCallee(Instruction *Call1, Instruction *Call2);
908 bool findProfiledCalleeThroughTailCalls(
909 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
910 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
911 bool &FoundMultipleCalleeChains);
912 uint64_t getLastStackId(Instruction *Call);
913 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
914 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
915 AllocationType getAllocationCallType(const CallInfo &Call) const;
916 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
917 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
918 Instruction *>::FuncInfo
919 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
920 DenseMap<CallInfo, CallInfo> &CallMap,
921 std::vector<CallInfo> &CallsWithMetadataInFunc,
922 unsigned CloneNo);
923 std::string getLabel(const Function *Func, const Instruction *Call,
924 unsigned CloneNo) const;
925
926 const Module &Mod;
927 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
928};
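// Usage sketch for this IR-based graph (not from this file; assumes a Module
// M and a FunctionAnalysisManager FAM are available in the caller):
//
//   auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
//     return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
//   };
//   ModuleCallsiteContextGraph CCG(M, OREGetter);
//   bool Changed = CCG.process();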
929
930/// Represents a call in the summary index graph, which can either be an
931/// allocation or an interior callsite node in an allocation's context.
932/// Holds a pointer to the corresponding data structure in the index.
933struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
934 IndexCall() : PointerUnion() {}
935 IndexCall(std::nullptr_t) : IndexCall() {}
936 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
937 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
938 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
939
940 IndexCall *operator->() { return this; }
941
942 void print(raw_ostream &OS) const {
943 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
944 if (auto *AI = dyn_cast<AllocInfo *>(Base)) {
945 OS << *AI;
946 } else {
947 auto *CI = dyn_cast<CallsiteInfo *>(Base);
948 assert(CI);
949 OS << *CI;
950 }
951 }
952};
953} // namespace
954
955namespace llvm {
956template <> struct simplify_type<IndexCall> {
957 using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
958 static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
959};
960template <> struct simplify_type<const IndexCall> {
961 using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
962 static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
963};
964} // namespace llvm
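// Sketch of what the simplify_type specializations above enable (hypothetical
// AI; assumes the usual isa/dyn_cast support for PointerUnion): an IndexCall
// can be queried like the PointerUnion it wraps.
//
//   AllocInfo *AI = getSomeAllocInfo();   // hypothetical helper
//   IndexCall IC(AI);
//   bool IsAlloc = isa<AllocInfo *>(IC);  // true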
965
966namespace {
967/// CRTP derived class for graphs built from summary index (ThinLTO).
968class IndexCallsiteContextGraph
969 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
970 IndexCall> {
971public:
972 IndexCallsiteContextGraph(
973 ModuleSummaryIndex &Index,
974 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
975 isPrevailing);
976
977 ~IndexCallsiteContextGraph() {
978 // Now that we are done with the graph it is safe to add the new
979 // CallsiteInfo structs to the function summary vectors. The graph nodes
980 // point into locations within these vectors, so we don't want to add them
981 // any earlier.
982 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
983 auto *FS = I.first;
984 for (auto &Callsite : I.second)
985 FS->addCallsite(*Callsite.second);
986 }
987 }
988
989private:
990 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
991 IndexCall>;
992
993 uint64_t getStackId(uint64_t IdOrIndex) const;
994 const FunctionSummary *getCalleeFunc(IndexCall &Call);
995 bool calleeMatchesFunc(
996 IndexCall &Call, const FunctionSummary *Func,
997 const FunctionSummary *CallerFunc,
998 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
999 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
1000 bool findProfiledCalleeThroughTailCalls(
1001 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1002 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1003 bool &FoundMultipleCalleeChains);
1004 uint64_t getLastStackId(IndexCall &Call);
1005 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
1006 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
1007 AllocationType getAllocationCallType(const CallInfo &Call) const;
1008 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
1009 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1010 IndexCall>::FuncInfo
1011 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
1012 DenseMap<CallInfo, CallInfo> &CallMap,
1013 std::vector<CallInfo> &CallsWithMetadataInFunc,
1014 unsigned CloneNo);
1015 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
1016 unsigned CloneNo) const;
1017
1018 // Saves a mapping from each function summary containing memprof records
1019 // back to its VI, for use in checking and debugging.
1020 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1021
1022 const ModuleSummaryIndex &Index;
1023 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1024 isPrevailing;
1025
1026 // Saves/owns the callsite info structures synthesized for missing tail call
1027 // frames that we discover while building the graph.
1028 // It maps from the summary of the function making the tail call, to a map
1029 // of callee ValueInfo to corresponding synthesized callsite info.
1030 std::unordered_map<FunctionSummary *,
1031 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1032 FunctionCalleesToSynthesizedCallsiteInfos;
1033};
1034} // namespace
1035
1036namespace llvm {
1037template <>
1038struct DenseMapInfo<typename CallsiteContextGraph<
1039 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
1040 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
1041template <>
1042struct DenseMapInfo<typename CallsiteContextGraph<
1043 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
1044 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
1045template <>
1046struct DenseMapInfo<IndexCall>
1047 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1048} // end namespace llvm
1049
1050namespace {
1051
1052// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1053// type we should actually use on the corresponding allocation.
1054// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1055// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1056// from NotCold.
1057AllocationType allocTypeToUse(uint8_t AllocTypes) {
1058 assert(AllocTypes != (uint8_t)AllocationType::None);
1059 if (AllocTypes ==
1060 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1061 return AllocationType::NotCold;
1062 else
1063 return (AllocationType)AllocTypes;
1064}
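// For example (values follow directly from the comment and code above):
//
//   allocTypeToUse((uint8_t)AllocationType::Cold) == AllocationType::Cold
//   allocTypeToUse((uint8_t)AllocationType::NotCold |
//                  (uint8_t)AllocationType::Cold)  == AllocationType::NotCold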
1065
1066// Helper to check if the alloc types for all edges recorded in the
1067// InAllocTypes vector match the alloc types for all edges in the Edges
1068// vector.
1069template <typename DerivedCCG, typename FuncTy, typename CallTy>
1070bool allocTypesMatch(
1071 const std::vector<uint8_t> &InAllocTypes,
1072 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1073 &Edges) {
1074 // This should be called only when the InAllocTypes vector was computed for
1075 // this set of Edges. Make sure the sizes are the same.
1076 assert(InAllocTypes.size() == Edges.size());
1077 return std::equal(
1078 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1079 [](const uint8_t &l,
1080 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1081 // Can share if one of the edges is None type - don't
1082 // care about the type along that edge as it doesn't
1083 // exist for those context ids.
1084 if (l == (uint8_t)AllocationType::None ||
1085 r->AllocTypes == (uint8_t)AllocationType::None)
1086 return true;
1087 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
1088 });
1089}
1090
1091// Helper to check if the alloc types for all edges recorded in the
1092// InAllocTypes vector match the alloc types for callee edges in the given
1093// clone. Because the InAllocTypes were computed from the original node's callee
1094// edges, and other cloning could have happened after this clone was created, we
1095// need to find the matching clone callee edge, which may or may not exist.
1096template <typename DerivedCCG, typename FuncTy, typename CallTy>
1097bool allocTypesMatchClone(
1098 const std::vector<uint8_t> &InAllocTypes,
1099 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1100 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1101 assert(Node);
1102 // InAllocTypes should have been computed for the original node's callee
1103 // edges.
1104 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1105 // First create a map of the clone callee edge callees to the edge alloc type.
1106 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
1107 EdgeCalleeMap;
1108 for (const auto &E : Clone->CalleeEdges) {
1109 assert(!EdgeCalleeMap.contains(E->Callee));
1110 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1111 }
1112 // Next, walk the original node's callees, and look for the corresponding
1113 // clone edge to that callee.
1114 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1115 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1116 // Not found is ok, we will simply add an edge if we use this clone.
1117 if (Iter == EdgeCalleeMap.end())
1118 continue;
1119 // Can share if one of the edges is None type - don't
1120 // care about the type along that edge as it doesn't
1121 // exist for those context ids.
1122 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1123 Iter->second == (uint8_t)AllocationType::None)
1124 continue;
1125 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1126 return false;
1127 }
1128 return true;
1129}
1130
1131} // end anonymous namespace
1132
1133template <typename DerivedCCG, typename FuncTy, typename CallTy>
1134typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1135CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1136 const CallInfo &C) {
1137 ContextNode *Node = getNodeForAlloc(C);
1138 if (Node)
1139 return Node;
1140
1141 return NonAllocationCallToContextNodeMap.lookup(C);
1142}
1143
1144template <typename DerivedCCG, typename FuncTy, typename CallTy>
1145typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1146CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1147 const CallInfo &C) {
1148 return AllocationCallToContextNodeMap.lookup(C);
1149}
1150
1151template <typename DerivedCCG, typename FuncTy, typename CallTy>
1152typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1153CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1154 uint64_t StackId) {
1155 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1156 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1157 return StackEntryNode->second;
1158 return nullptr;
1159}
1160
1161template <typename DerivedCCG, typename FuncTy, typename CallTy>
1162void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1163 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1164 unsigned int ContextId) {
1165 for (auto &Edge : CallerEdges) {
1166 if (Edge->Caller == Caller) {
1167 Edge->AllocTypes |= (uint8_t)AllocType;
1168 Edge->getContextIds().insert(ContextId);
1169 return;
1170 }
1171 }
1172 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1173 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1174 CallerEdges.push_back(Edge);
1175 Caller->CalleeEdges.push_back(Edge);
1176}
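// Sketch (hypothetical nodes N and C): repeated contexts through the same
// caller accumulate on one edge rather than creating parallel edges.
//
//   N->addOrUpdateCallerEdge(C, AllocationType::NotCold, /*ContextId=*/1);
//   N->addOrUpdateCallerEdge(C, AllocationType::Cold, /*ContextId=*/2);
//   // N->CallerEdges.size() == 1; that edge has AllocTypes == NotCold|Cold
//   // and ContextIds == {1, 2}, and is shared with C->CalleeEdges.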
1177
1178template <typename DerivedCCG, typename FuncTy, typename CallTy>
1179void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1180 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1181 assert(!EI || (*EI)->get() == Edge);
1182 assert(!Edge->isRemoved());
1183 // Save the Caller and Callee pointers so we can erase Edge from their edge
1184 // lists after clearing Edge below. We do the clearing first in case it is
1185 // destructed after removing from the edge lists (if those were the last
1186 // shared_ptr references to Edge).
1187 auto *Callee = Edge->Callee;
1188 auto *Caller = Edge->Caller;
1189
1190 // Make sure the edge fields are cleared out so we can properly detect
1191 // removed edges if Edge is not destructed because there is still a shared_ptr
1192 // reference.
1193 Edge->clear();
1194
1195#ifndef NDEBUG
1196 auto CalleeCallerCount = Callee->CallerEdges.size();
1197 auto CallerCalleeCount = Caller->CalleeEdges.size();
1198#endif
1199 if (!EI) {
1200 Callee->eraseCallerEdge(Edge);
1201 Caller->eraseCalleeEdge(Edge);
1202 } else if (CalleeIter) {
1203 Callee->eraseCallerEdge(Edge);
1204 *EI = Caller->CalleeEdges.erase(*EI);
1205 } else {
1206 Caller->eraseCalleeEdge(Edge);
1207 *EI = Callee->CallerEdges.erase(*EI);
1208 }
1209 assert(Callee->CallerEdges.size() < CalleeCallerCount);
1210 assert(Caller->CalleeEdges.size() < CallerCalleeCount);
1211}
1212
1213template <typename DerivedCCG, typename FuncTy, typename CallTy>
1214void CallsiteContextGraph<
1215 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1216 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1217 auto Edge = *EI;
1218 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1219 assert(Edge->ContextIds.empty());
1220 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1221 } else
1222 ++EI;
1223 }
1224}
1225
1226template <typename DerivedCCG, typename FuncTy, typename CallTy>
1227void CallsiteContextGraph<
1228 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1229 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1230 auto Edge = *EI;
1231 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1232 assert(Edge->ContextIds.empty());
1233 Edge->Caller->eraseCalleeEdge(Edge.get());
1234 EI = Node->CallerEdges.erase(EI);
1235 } else
1236 ++EI;
1237 }
1238}
1239
1240template <typename DerivedCCG, typename FuncTy, typename CallTy>
1241typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1242CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1243 findEdgeFromCallee(const ContextNode *Callee) {
1244 for (const auto &Edge : CalleeEdges)
1245 if (Edge->Callee == Callee)
1246 return Edge.get();
1247 return nullptr;
1248}
1249
1250template <typename DerivedCCG, typename FuncTy, typename CallTy>
1251typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1252CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1253 findEdgeFromCaller(const ContextNode *Caller) {
1254 for (const auto &Edge : CallerEdges)
1255 if (Edge->Caller == Caller)
1256 return Edge.get();
1257 return nullptr;
1258}
1259
1260template <typename DerivedCCG, typename FuncTy, typename CallTy>
1261void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1262 eraseCalleeEdge(const ContextEdge *Edge) {
1263 auto EI = llvm::find_if(
1264 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1265 return CalleeEdge.get() == Edge;
1266 });
1267 assert(EI != CalleeEdges.end());
1268 CalleeEdges.erase(EI);
1269}
1270
1271template <typename DerivedCCG, typename FuncTy, typename CallTy>
1272void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1273 eraseCallerEdge(const ContextEdge *Edge) {
1274 auto EI = llvm::find_if(
1275 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1276 return CallerEdge.get() == Edge;
1277 });
1278 assert(EI != CallerEdges.end());
1279 CallerEdges.erase(EI);
1280}
1281
1282template <typename DerivedCCG, typename FuncTy, typename CallTy>
1283uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1284 DenseSet<uint32_t> &ContextIds) const {
1285 uint8_t BothTypes =
1286 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1287 uint8_t AllocType = (uint8_t)AllocationType::None;
1288 for (auto Id : ContextIds) {
1289 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1290 // Bail early if alloc type reached both, no further refinement.
1291 if (AllocType == BothTypes)
1292 return AllocType;
1293 }
1294 return AllocType;
1295}
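// For example, with ContextIdToAllocationType mapping 1 -> NotCold and
// 2 -> Cold (hypothetical ids):
//
//   DenseSet<uint32_t> OnlyNotCold({1}), Both({1, 2});
//   computeAllocType(OnlyNotCold) == (uint8_t)AllocationType::NotCold
//   computeAllocType(Both) == ((uint8_t)AllocationType::NotCold |
//                              (uint8_t)AllocationType::Cold)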
1296
1297template <typename DerivedCCG, typename FuncTy, typename CallTy>
1298uint8_t
1299CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1300 const DenseSet<uint32_t> &Node1Ids,
1301 const DenseSet<uint32_t> &Node2Ids) const {
1302 uint8_t BothTypes =
1303 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1304 uint8_t AllocType = (uint8_t)AllocationType::None;
1305 for (auto Id : Node1Ids) {
1306 if (!Node2Ids.count(Id))
1307 continue;
1308 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1309 // Bail early if alloc type reached both, no further refinement.
1310 if (AllocType == BothTypes)
1311 return AllocType;
1312 }
1313 return AllocType;
1314}
1315
1316template <typename DerivedCCG, typename FuncTy, typename CallTy>
1317uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1318 const DenseSet<uint32_t> &Node1Ids,
1319 const DenseSet<uint32_t> &Node2Ids) const {
1320 if (Node1Ids.size() < Node2Ids.size())
1321 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1322 else
1323 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1324}
1325
1326template <typename DerivedCCG, typename FuncTy, typename CallTy>
1327typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1328CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1329 CallInfo Call, const FuncTy *F) {
1330 assert(!getNodeForAlloc(Call));
1331 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1332 AllocationCallToContextNodeMap[Call] = AllocNode;
1333 // Use LastContextId as a unique id for MIB allocation nodes.
1334 AllocNode->OrigStackOrAllocId = LastContextId;
1335 // Alloc type should be updated as we add in the MIBs. We should assert
1336 // afterwards that it is not still None.
1337 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1338
1339 return AllocNode;
1340}
1341
1342static std::string getAllocTypeString(uint8_t AllocTypes) {
1343 if (!AllocTypes)
1344 return "None";
1345 std::string Str;
1346 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1347 Str += "NotCold";
1348 if (AllocTypes & (uint8_t)AllocationType::Cold)
1349 Str += "Cold";
1350 return Str;
1351}
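// E.g. getAllocTypeString((uint8_t)AllocationType::Cold) returns "Cold", and
// getAllocTypeString((uint8_t)AllocationType::NotCold |
//                    (uint8_t)AllocationType::Cold) returns "NotColdCold".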
1352
1353template <typename DerivedCCG, typename FuncTy, typename CallTy>
1354template <class NodeT, class IteratorT>
1355void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1356 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1357 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1358 ArrayRef<ContextTotalSize> ContextSizeInfo) {
1359 // Treating the hot alloc type as NotCold before the disambiguation for "hot"
1360 // is done.
1361 if (AllocType == AllocationType::Hot)
1362 AllocType = AllocationType::NotCold;
1363
1364 ContextIdToAllocationType[++LastContextId] = AllocType;
1365
1366 if (!ContextSizeInfo.empty()) {
1367 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1368 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1369 }
1370
1371 // Update alloc type and context ids for this MIB.
1372 AllocNode->AllocTypes |= (uint8_t)AllocType;
1373
1374 // Now add or update nodes for each stack id in alloc's context.
1375 // Later when processing the stack ids on non-alloc callsites we will adjust
1376 // for any inlining in the context.
1377 ContextNode *PrevNode = AllocNode;
1378 // Look for recursion (direct recursion should have been collapsed by
1379 // module summary analysis, here we should just be detecting mutual
1380 // recursion). Mark these nodes so we don't try to clone.
1381 SmallSet<uint64_t, 8> StackIdSet;
1382 // Skip any on the allocation call (inlining).
1383 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1384 ContextIter != StackContext.end(); ++ContextIter) {
1385 auto StackId = getStackId(*ContextIter);
1386 ContextNode *StackNode = getNodeForStackId(StackId);
1387 if (!StackNode) {
1388 StackNode = createNewNode(/*IsAllocation=*/false);
1389 StackEntryIdToContextNodeMap[StackId] = StackNode;
1390 StackNode->OrigStackOrAllocId = StackId;
1391 }
1392 // Marking a node recursive will prevent its cloning completely, even for
1393 // non-recursive contexts flowing through it.
1394 if (!AllowRecursiveCallsites) {
1395 auto Ins = StackIdSet.insert(StackId);
1396 if (!Ins.second)
1397 StackNode->Recursive = true;
1398 }
1399 StackNode->AllocTypes |= (uint8_t)AllocType;
1400 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1401 PrevNode = StackNode;
1402 }
1403}
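// Rough illustration of the chain built above for one MIB, using hypothetical
// stack ids: for a context with frames [A (closest to the allocation), B, C]
// and no shared prefix with the callsite context, the loop creates or reuses
// stack nodes for A, B and C and adds caller edges so that the allocation node
// is called from A, A from B, and B from C, each edge tagged with the new
// LastContextId and this MIB's alloc type.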
1404
1405template <typename DerivedCCG, typename FuncTy, typename CallTy>
1406DenseSet<uint32_t>
1407CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1408 const DenseSet<uint32_t> &StackSequenceContextIds,
1409 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1410 DenseSet<uint32_t> NewContextIds;
1411 for (auto OldId : StackSequenceContextIds) {
1412 NewContextIds.insert(++LastContextId);
1413 OldToNewContextIds[OldId].insert(LastContextId);
1414 assert(ContextIdToAllocationType.count(OldId));
1415 // The new context has the same allocation type as original.
1416 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1417 if (DotAllocContextIds.contains(OldId))
1418 DotAllocContextIds.insert(LastContextId);
1419 }
1420 return NewContextIds;
1421}
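// Sketch with hypothetical ids: if StackSequenceContextIds == {5, 7} and
// LastContextId == 10 on entry, the loop above allocates two new ids (11 and
// 12), maps each old id to its new duplicate in OldToNewContextIds, copies the
// allocation types of the old ids to the new ones, and returns {11, 12}.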
1422
1423template <typename DerivedCCG, typename FuncTy, typename CallTy>
1424void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1425 propagateDuplicateContextIds(
1426 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1427 // Build a set of duplicated context ids corresponding to the input id set.
1428 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1429 DenseSet<uint32_t> NewIds;
1430 for (auto Id : ContextIds)
1431 if (auto NewId = OldToNewContextIds.find(Id);
1432 NewId != OldToNewContextIds.end())
1433 NewIds.insert_range(NewId->second);
1434 return NewIds;
1435 };
1436
1437 // Recursively update context ids sets along caller edges.
1438 auto UpdateCallers = [&](ContextNode *Node,
1439 DenseSet<const ContextEdge *> &Visited,
1440 auto &&UpdateCallers) -> void {
1441 for (const auto &Edge : Node->CallerEdges) {
1442 auto Inserted = Visited.insert(Edge.get());
1443 if (!Inserted.second)
1444 continue;
1445 ContextNode *NextNode = Edge->Caller;
1446 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1447 // Only need to recursively iterate to NextNode via this caller edge if
1448 // it resulted in any added ids to NextNode.
1449 if (!NewIdsToAdd.empty()) {
1450 Edge->getContextIds().insert_range(NewIdsToAdd);
1451 UpdateCallers(NextNode, Visited, UpdateCallers);
1452 }
1453 }
1454 };
1455
1456 DenseSet<const ContextEdge *> Visited;
1457 for (auto &Entry : AllocationCallToContextNodeMap) {
1458 auto *Node = Entry.second;
1459 UpdateCallers(Node, Visited, UpdateCallers);
1460 }
1461}
1462
1463template <typename DerivedCCG, typename FuncTy, typename CallTy>
1464void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1465 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1466 // This must be passed by value to make a copy since it will be adjusted
1467 // as ids are moved.
1468 DenseSet<uint32_t> RemainingContextIds) {
1469 auto &OrigEdges =
1470 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1471 DenseSet<uint32_t> RecursiveContextIds;
1472 DenseSet<uint32_t> AllCallerContextIds;
1473 if (AllowRecursiveCallsites) {
1474 // Identify which context ids are recursive, which is needed to properly
1475 // update the RemainingContextIds set. The relevant recursive context ids
1476 // are those that are in multiple edges.
1477 for (auto &CE : OrigEdges) {
1478 AllCallerContextIds.reserve(CE->getContextIds().size());
1479 for (auto Id : CE->getContextIds())
1480 if (!AllCallerContextIds.insert(Id).second)
1481 RecursiveContextIds.insert(Id);
1482 }
1483 }
1484 // Increment iterator in loop so that we can remove edges as needed.
1485 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1486 auto Edge = *EI;
1487 DenseSet<uint32_t> NewEdgeContextIds;
1488 DenseSet<uint32_t> NotFoundContextIds;
1489 // Remove any matching context ids from Edge, returning the set that were
1490 // found and removed; these become the new edge's context ids. Also update
1491 // the remaining (not found) ids.
1492 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1493 NotFoundContextIds);
1494 // Update the remaining context ids set for the later edges. This is a
1495 // compile time optimization.
1496 if (RecursiveContextIds.empty()) {
1497 // No recursive ids, so all of the previously remaining context ids that
1498 // were not seen on this edge are the new remaining set.
1499 RemainingContextIds.swap(NotFoundContextIds);
1500 } else {
1501 // Keep the recursive ids in the remaining set as we expect to see those
1502 // on another edge. We can remove the non-recursive remaining ids that
1503 // were seen on this edge, however. We already have the set of remaining
1504 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1505 // non-recursive and only remove those. Note that despite the higher
1506 // overhead of updating the remaining context ids set when recursion
1507 // handling is enabled, it was found to be at worst performance neutral
1508 // and in one case a clear win.
1509 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1510 set_difference(NewEdgeContextIds, RecursiveContextIds);
1511 set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
1512 }
1513 // If no matching context ids for this edge, skip it.
1514 if (NewEdgeContextIds.empty()) {
1515 ++EI;
1516 continue;
1517 }
1518 if (TowardsCallee) {
1519 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1520 auto NewEdge = std::make_shared<ContextEdge>(
1521 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1522 NewNode->CalleeEdges.push_back(NewEdge);
1523 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1524 } else {
1525 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1526 auto NewEdge = std::make_shared<ContextEdge>(
1527 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1528 NewNode->CallerEdges.push_back(NewEdge);
1529 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1530 }
1531 // Remove old edge if context ids empty.
1532 if (Edge->getContextIds().empty()) {
1533 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1534 continue;
1535 }
1536 ++EI;
1537 }
1538}
1539
1540template <typename DerivedCCG, typename FuncTy, typename CallTy>
1541static void checkEdge(
1542 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1543 // Confirm that alloc type is not None and that we have at least one context
1544 // id.
1545 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1546 assert(!Edge->ContextIds.empty());
1547}
1548
1549template <typename DerivedCCG, typename FuncTy, typename CallTy>
1550static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1551 bool CheckEdges = true) {
1552 if (Node->isRemoved())
1553 return;
1554#ifndef NDEBUG
1555 // Compute node's context ids once for use in asserts.
1556 auto NodeContextIds = Node->getContextIds();
1557#endif
1558 // Node's context ids should be the union of both its callee and caller edge
1559 // context ids.
1560 if (Node->CallerEdges.size()) {
1561 DenseSet<uint32_t> CallerEdgeContextIds(
1562 Node->CallerEdges.front()->ContextIds);
1563 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1564 if (CheckEdges)
1565 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1566 set_union(CallerEdgeContextIds, Edge->ContextIds);
1567 }
1568 // Node can have more context ids than callers if some contexts terminate at
1569 // node and some are longer. If we are allowing recursive callsites and
1570 // contexts this will be violated for incompletely cloned recursive cycles,
1571 // so skip the checking in that case.
1572 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1573 NodeContextIds == CallerEdgeContextIds ||
1574 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1575 }
1576 if (Node->CalleeEdges.size()) {
1577 DenseSet<uint32_t> CalleeEdgeContextIds(
1578 Node->CalleeEdges.front()->ContextIds);
1579 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1580 if (CheckEdges)
1581 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1582 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1583 }
1584 // If we are allowing recursive callsites and contexts this will be violated
1585 // for incompletely cloned recursive cycles, so skip the checking in that
1586 // case.
1587 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1588 NodeContextIds == CalleeEdgeContextIds);
1589 }
1590 // FIXME: Since this checking is only invoked under an option, we should
1591 // change the error checking from using assert to something that will trigger
1592 // an error on a release build.
1593#ifndef NDEBUG
1594 // Make sure we don't end up with duplicate edges between the same caller and
1595 // callee.
1596 DenseSet<const ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1597 for (const auto &E : Node->CalleeEdges)
1598 NodeSet.insert(E->Callee);
1599 assert(NodeSet.size() == Node->CalleeEdges.size());
1600#endif
1601}
1602
1603template <typename DerivedCCG, typename FuncTy, typename CallTy>
1604void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1605 assignStackNodesPostOrder(
1606 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1607 DenseMap<uint64_t, std::vector<CallContextInfo>>
1608 &StackIdToMatchingCalls,
1609 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1610 auto Inserted = Visited.insert(Node);
1611 if (!Inserted.second)
1612 return;
1613 // Post order traversal. Iterate over a copy since we may add nodes and
1614 // therefore new callers during the recursive call, invalidating any
1615 // iterator over the original edge vector. We don't need to process these
1616 // new nodes as they were already processed on creation.
1617 auto CallerEdges = Node->CallerEdges;
1618 for (auto &Edge : CallerEdges) {
1619 // Skip any that have been removed during the recursion.
1620 if (Edge->isRemoved()) {
1621 assert(!is_contained(Node->CallerEdges, Edge));
1622 continue;
1623 }
1624 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1625 CallToMatchingCall);
1626 }
1627
1628 // If this node's stack id is in the map, update the graph to contain new
1629 // nodes representing any inlining at interior callsites. Note we move the
1630 // associated context ids over to the new nodes.
1631
1632 // Ignore this node if it is for an allocation or we didn't record any
1633 // stack id lists ending at it.
1634 if (Node->IsAllocation ||
1635 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1636 return;
1637
1638 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1639 // Handle the simple case first. A single call with a single stack id.
1640 // In this case there is no need to create any new context nodes, simply
1641 // assign the context node for stack id to this Call.
1642 if (Calls.size() == 1) {
1643 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1644 if (Ids.size() == 1) {
1645 assert(SavedContextIds.empty());
1646 // It should be this Node
1647 assert(Node == getNodeForStackId(Ids[0]));
1648 if (Node->Recursive)
1649 return;
1650 Node->setCall(Call);
1651 NonAllocationCallToContextNodeMap[Call] = Node;
1652 NodeToCallingFunc[Node] = Func;
1653 return;
1654 }
1655 }
1656
1657#ifndef NDEBUG
1658 // Find the node for the last stack id, which should be the same
1659 // across all calls recorded for this id, and is this node's id.
1660 uint64_t LastId = Node->OrigStackOrAllocId;
1661 ContextNode *LastNode = getNodeForStackId(LastId);
1662 // We should only have kept stack ids that had nodes.
1663 assert(LastNode);
1664 assert(LastNode == Node);
1665#else
1666 ContextNode *LastNode = Node;
1667#endif
1668
1669 // Compute the last node's context ids once, as it is shared by all calls in
1670 // this entry.
1671 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1672
1673 [[maybe_unused]] bool PrevIterCreatedNode = false;
1674 bool CreatedNode = false;
1675 for (unsigned I = 0; I < Calls.size();
1676 I++, PrevIterCreatedNode = CreatedNode) {
1677 CreatedNode = false;
1678 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1679 // Skip any for which we didn't assign any ids; these don't get a node in
1680 // the graph.
1681 if (SavedContextIds.empty()) {
1682 // If this call has a matching call (located in the same function and
1683 // having the same stack ids), simply add it to the context node created
1684 // for its matching call earlier. These can be treated the same through
1685 // cloning and get updated at the same time.
1686 if (!CallToMatchingCall.contains(Call))
1687 continue;
1688 auto MatchingCall = CallToMatchingCall[Call];
1689 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1690 // This should only happen if we had a prior iteration, and it didn't
1691 // create a node because of the below recomputation of context ids
1692 // finding none remaining and continuing early.
1693 assert(I > 0 && !PrevIterCreatedNode);
1694 continue;
1695 }
1696 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1697 Call);
1698 continue;
1699 }
1700
1701 assert(LastId == Ids.back());
1702
1703 // Recompute the context ids for this stack id sequence (the
1704 // intersection of the context ids of the corresponding nodes).
1705 // Start with the ids we saved in the map for this call, which could be
1706 // duplicated context ids. We have to recompute as we might have overlap
1707 // between the saved context ids for different last nodes, and
1708 // removed them already during the post order traversal.
1709 set_intersect(SavedContextIds, LastNodeContextIds);
1710 ContextNode *PrevNode = LastNode;
1711 bool Skip = false;
1712 // Iterate backwards through the stack Ids, starting after the last Id
1713 // in the list, which was handled once outside for all Calls.
1714 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1715 auto Id = *IdIter;
1716 ContextNode *CurNode = getNodeForStackId(Id);
1717 // We should only have kept stack ids that had nodes and weren't
1718 // recursive.
1719 assert(CurNode);
1720 assert(!CurNode->Recursive);
1721
1722 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1723 if (!Edge) {
1724 Skip = true;
1725 break;
1726 }
1727 PrevNode = CurNode;
1728
1729 // Update the context ids, which is the intersection of the ids along
1730 // all edges in the sequence.
1731 set_intersect(SavedContextIds, Edge->getContextIds());
1732
1733 // If we now have no context ids for clone, skip this call.
1734 if (SavedContextIds.empty()) {
1735 Skip = true;
1736 break;
1737 }
1738 }
1739 if (Skip)
1740 continue;
1741
1742 // Create new context node.
1743 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1744 NonAllocationCallToContextNodeMap[Call] = NewNode;
1745 CreatedNode = true;
1746 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1747
1748 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1749 assert(FirstNode);
1750
1751 // Connect to callees of innermost stack frame in inlined call chain.
1752 // This updates context ids for FirstNode's callee's to reflect those
1753 // moved to NewNode.
1754 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1755
1756 // Connect to callers of outermost stack frame in inlined call chain.
1757 // This updates context ids for FirstNode's caller's to reflect those
1758 // moved to NewNode.
1759 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1760
1761 // Now we need to remove context ids from edges/nodes between First and
1762 // Last Node.
1763 PrevNode = nullptr;
1764 for (auto Id : Ids) {
1765 ContextNode *CurNode = getNodeForStackId(Id);
1766 // We should only have kept stack ids that had nodes.
1767 assert(CurNode);
1768
1769 // Remove the context ids moved to NewNode from CurNode, and the
1770 // edge from the prior node.
1771 if (PrevNode) {
1772 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1773 // If the sequence contained recursion, we might have already removed
1774 // some edges during the connectNewNode calls above.
1775 if (!PrevEdge) {
1776 PrevNode = CurNode;
1777 continue;
1778 }
1779 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1780 if (PrevEdge->getContextIds().empty())
1781 removeEdgeFromGraph(PrevEdge);
1782 }
1783 // Since we update the edges from leaf to tail, only look at the callee
1784 // edges. This isn't an alloc node, so if there are no callee edges, the
1785 // alloc type is None.
1786 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1787 ? (uint8_t)AllocationType::None
1788 : CurNode->computeAllocType();
1789 PrevNode = CurNode;
1790 }
1791 if (VerifyNodes) {
1792 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1793 for (auto Id : Ids) {
1794 ContextNode *CurNode = getNodeForStackId(Id);
1795 // We should only have kept stack ids that had nodes.
1796 assert(CurNode);
1797 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1798 }
1799 }
1800 }
1801}
1802
1803template <typename DerivedCCG, typename FuncTy, typename CallTy>
1804void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1805 // Map of stack id to all calls with that as the last (outermost caller)
1806 // callsite id that has a context node (some might not due to pruning
1807 // performed during matching of the allocation profile contexts).
1808 // The CallContextInfo contains the Call and a list of its stack ids with
1809 // ContextNodes, the function containing Call, and the set of context ids
1810 // the analysis will eventually identify for use in any new node created
1811 // for that callsite.
1812 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1813 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1814 for (auto &Call : CallsWithMetadata) {
1815 // Ignore allocations, already handled.
1816 if (AllocationCallToContextNodeMap.count(Call))
1817 continue;
1818 auto StackIdsWithContextNodes =
1819 getStackIdsWithContextNodesForCall(Call.call());
1820 // If there were no nodes created for MIBs on allocs (maybe this was in
1821 // the unambiguous part of the MIB stack that was pruned), ignore.
1822 if (StackIdsWithContextNodes.empty())
1823 continue;
1824 // Otherwise, record this Call along with the list of ids for the last
1825 // (outermost caller) stack id with a node.
1826 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1827 {Call.call(), StackIdsWithContextNodes, Func, {}});
1828 }
1829 }
1830
1831 // First make a pass through all stack ids that correspond to a call,
1832 // as identified in the above loop. Compute the context ids corresponding to
1833 // each of these calls when they correspond to multiple stack ids due to
1834 // inlining. Perform any duplication of context ids required when
1835 // there is more than one call with the same stack ids. Their (possibly newly
1836 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1837 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1838 // Save a map from each call to any that are found to match it. I.e. located
1839 // in the same function and have the same (possibly pruned) stack ids. We use
1840 // this to avoid creating extra graph nodes as they can be treated the same.
1841 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1842 for (auto &It : StackIdToMatchingCalls) {
1843 auto &Calls = It.getSecond();
1844 // Skip single calls with a single stack id. These don't need a new node.
1845 if (Calls.size() == 1) {
1846 auto &Ids = Calls[0].StackIds;
1847 if (Ids.size() == 1)
1848 continue;
1849 }
1850 // In order to do the best and maximal matching of inlined calls to context
1851 // node sequences we will sort the vectors of stack ids in descending order
1852 // of length, and within each length, lexicographically by stack id. The
1853 // latter is so that we can specially handle calls that have identical stack
1854 // id sequences (either due to cloning or artificially because of the MIB
1855 // context pruning). Those with the same Ids are then sorted by function to
1856 // facilitate efficiently mapping them to the same context node.
1857 // Because the functions are pointers, to ensure a stable sort first assign
1858 // each function pointer to its first index in the Calls array, and then use
1859 // that to sort by.
1860 DenseMap<const FuncTy *, unsigned> FuncToIndex;
1861 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1862 FuncToIndex.insert({CallCtxInfo.Func, Idx});
1863 llvm::stable_sort(
1864 Calls,
1865 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1866 return A.StackIds.size() > B.StackIds.size() ||
1867 (A.StackIds.size() == B.StackIds.size() &&
1868 (A.StackIds < B.StackIds ||
1869 (A.StackIds == B.StackIds &&
1870 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1871 });
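// For example, with hypothetical entries the comparator above orders:
//   [10, 20, 30] (func F1)  before  [10, 20] (any func)   -- longer list first
//   [10, 20] (F1)           before  [10, 30] (F1)         -- lexicographic ids
//   [10, 20] (F1)           before  [10, 20] (F2)         -- F1 appears first
//                                                            in Calls (FuncToIndex)
// so calls with identical stack ids and the same function end up adjacent.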
1872
1873 // Find the node for the last stack id, which should be the same
1874 // across all calls recorded for this id, and is the id for this
1875 // entry in the StackIdToMatchingCalls map.
1876 uint64_t LastId = It.getFirst();
1877 ContextNode *LastNode = getNodeForStackId(LastId);
1878 // We should only have kept stack ids that had nodes.
1879 assert(LastNode);
1880
1881 if (LastNode->Recursive)
1882 continue;
1883
1884 // Initialize the context ids with the last node's. We will subsequently
1885 // refine the context ids by computing the intersection along all edges.
1886 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1887 assert(!LastNodeContextIds.empty());
1888
1889#ifndef NDEBUG
1890 // Save the set of functions seen for a particular set of the same stack
1891 // ids. This is used to ensure that they have been correctly sorted to be
1892 // adjacent in the Calls list, since we rely on that to efficiently place
1893 // all such matching calls onto the same context node.
1894 DenseSet<const FuncTy *> MatchingIdsFuncSet;
1895#endif
1896
1897 for (unsigned I = 0; I < Calls.size(); I++) {
1898 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1899 assert(SavedContextIds.empty());
1900 assert(LastId == Ids.back());
1901
1902#ifndef NDEBUG
1903 // If this call has a different set of ids than the last one, clear the
1904 // set used to ensure they are sorted properly.
1905 if (I > 0 && Ids != Calls[I - 1].StackIds)
1906 MatchingIdsFuncSet.clear();
1907#endif
1908
1909 // First compute the context ids for this stack id sequence (the
1910 // intersection of the context ids of the corresponding nodes).
1911 // Start with the remaining saved ids for the last node.
1912 assert(!LastNodeContextIds.empty());
1913 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1914
1915 ContextNode *PrevNode = LastNode;
1916 ContextNode *CurNode = LastNode;
1917 bool Skip = false;
1918
1919 // Iterate backwards through the stack Ids, starting after the last Id
1920 // in the list, which was handled once outside for all Calls.
1921 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1922 auto Id = *IdIter;
1923 CurNode = getNodeForStackId(Id);
1924 // We should only have kept stack ids that had nodes.
1925 assert(CurNode);
1926
1927 if (CurNode->Recursive) {
1928 Skip = true;
1929 break;
1930 }
1931
1932 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1933 // If there is no edge then the nodes belong to different MIB contexts,
1934 // and we should skip this inlined context sequence. For example, this
1935 // particular inlined context may include stack ids A->B, and we may
1936 // indeed have nodes for both A and B, but it is possible that they were
1937 // never profiled in sequence in a single MIB for any allocation (i.e.
1938 // we might have profiled an allocation that involves the callsite A,
1939 // but through a different one of its callee callsites, and we might
1940 // have profiled an allocation that involves callsite B, but reached
1941 // from a different caller callsite).
1942 if (!Edge) {
1943 Skip = true;
1944 break;
1945 }
1946 PrevNode = CurNode;
1947
1948 // Update the context ids, which is the intersection of the ids along
1949 // all edges in the sequence.
1950 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1951
1952 // If we now have no context ids for clone, skip this call.
1953 if (StackSequenceContextIds.empty()) {
1954 Skip = true;
1955 break;
1956 }
1957 }
1958 if (Skip)
1959 continue;
1960
1961 // If some of this call's stack ids did not have corresponding nodes (due
1962 // to pruning), don't include any context ids for contexts that extend
1963 // beyond these nodes. Otherwise we would be matching part of unrelated /
1964 // not fully matching stack contexts. To do this, subtract any context ids
1965 // found in caller nodes of the last node found above.
1966 if (Ids.back() != getLastStackId(Call)) {
1967 for (const auto &PE : LastNode->CallerEdges) {
1968 set_subtract(StackSequenceContextIds, PE->getContextIds());
1969 if (StackSequenceContextIds.empty())
1970 break;
1971 }
1972 // If we now have no context ids for clone, skip this call.
1973 if (StackSequenceContextIds.empty())
1974 continue;
1975 }
1976
1977#ifndef NDEBUG
1978 // If the prior call had the same stack ids this set would not be empty.
1979 // Check if we already have a call that "matches" because it is located
1980 // in the same function. If the Calls list was sorted properly we should
1981 // not encounter this situation as all such entries should be adjacent
1982 // and processed in bulk further below.
1983 assert(!MatchingIdsFuncSet.contains(Func));
1984
1985 MatchingIdsFuncSet.insert(Func);
1986#endif
1987
1988 // Check if the next set of stack ids is the same (since the Calls vector
1989 // of tuples is sorted by the stack ids we can just look at the next one).
1990 // If so, save them in the CallToMatchingCall map so that they get
1991 // assigned to the same context node, and skip them.
1992 bool DuplicateContextIds = false;
1993 for (unsigned J = I + 1; J < Calls.size(); J++) {
1994 auto &CallCtxInfo = Calls[J];
1995 auto &NextIds = CallCtxInfo.StackIds;
1996 if (NextIds != Ids)
1997 break;
1998 auto *NextFunc = CallCtxInfo.Func;
1999 if (NextFunc != Func) {
2000 // We have another Call with the same ids but that cannot share this
2001 // node, must duplicate ids for it.
2002 DuplicateContextIds = true;
2003 break;
2004 }
2005 auto &NextCall = CallCtxInfo.Call;
2006 CallToMatchingCall[NextCall] = Call;
2007 // Update I so that it gets incremented correctly to skip this call.
2008 I = J;
2009 }
2010
2011 // If we don't have duplicate context ids, then we can assign all the
2012 // context ids computed for the original node sequence to this call.
2013 // If there are duplicate calls with the same stack ids then we synthesize
2014 // new context ids that are duplicates of the originals. These are
2015 // assigned to SavedContextIds, which is a reference into the map entry
2016 // for this call, allowing us to access these ids later on.
2017 OldToNewContextIds.reserve(OldToNewContextIds.size() +
2018 StackSequenceContextIds.size());
2019 SavedContextIds =
2020 DuplicateContextIds
2021 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2022 : StackSequenceContextIds;
2023 assert(!SavedContextIds.empty());
2024
2025 if (!DuplicateContextIds) {
2026 // Update saved last node's context ids to remove those that are
2027 // assigned to other calls, so that it is ready for the next call at
2028 // this stack id.
2029 set_subtract(LastNodeContextIds, StackSequenceContextIds);
2030 if (LastNodeContextIds.empty())
2031 break;
2032 }
2033 }
2034 }
2035
2036 // Propagate the duplicate context ids over the graph.
2037 propagateDuplicateContextIds(OldToNewContextIds);
2038
2039 if (VerifyCCG)
2040 check();
2041
2042 // Now perform a post-order traversal over the graph, starting with the
2043 // allocation nodes, essentially processing nodes from callers to callees.
2044 // For any that contains an id in the map, update the graph to contain new
2045 // nodes representing any inlining at interior callsites. Note we move the
2046 // associated context ids over to the new nodes.
2047 DenseSet<const ContextNode *> Visited;
2048 for (auto &Entry : AllocationCallToContextNodeMap)
2049 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
2050 CallToMatchingCall);
2051 if (VerifyCCG)
2052 check();
2053}
2054
2055uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2056 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2057 Call->getMetadata(LLVMContext::MD_callsite));
2058 return CallsiteContext.back();
2059}
2060
2061uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2063 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2064 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2065 // Need to convert index into stack id.
2066 return Index.getStackIdAtIndex(CallsiteContext.back());
2067}
2068
2069static const std::string MemProfCloneSuffix = ".memprof.";
2070
2071static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2072 // We use CloneNo == 0 to refer to the original version, which doesn't get
2073 // renamed with a suffix.
2074 if (!CloneNo)
2075 return Base.str();
2076 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2077}
2078
2079static bool isMemProfClone(const Function &F) {
2080 return F.getName().contains(MemProfCloneSuffix);
2081}
2082
2083// Return the clone number of the given function by extracting it from the
2084// memprof suffix. Assumes the caller has already confirmed it is a memprof
2085// clone.
2086static unsigned getMemProfCloneNum(const Function &F) {
2087 assert(isMemProfClone(F));
2088 auto Pos = F.getName().find_last_of('.');
2089 assert(Pos > 0);
2090 unsigned CloneNo;
2091 bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
2092 assert(!Err);
2093 (void)Err;
2094 return CloneNo;
2095}
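// Naming scheme examples for the helpers above:
//   getMemProfFuncName("foo", 0) == "foo"            (original, no suffix)
//   getMemProfFuncName("foo", 2) == "foo.memprof.2"  (clone number 2)
//   isMemProfClone(F) is true for "foo.memprof.2", and getMemProfCloneNum then
//   parses the trailing "2" back out of the name.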
2096
2097std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2098 const Instruction *Call,
2099 unsigned CloneNo) const {
2100 return (Twine(Call->getFunction()->getName()) + " -> " +
2101 cast<CallBase>(Call)->getCalledFunction()->getName())
2102 .str();
2103}
2104
2105std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2106 const IndexCall &Call,
2107 unsigned CloneNo) const {
2108 auto VI = FSToVIMap.find(Func);
2109 assert(VI != FSToVIMap.end());
2110 std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo);
2111 if (isa<AllocInfo *>(Call))
2112 return CallerName + " -> alloc";
2113 else {
2114 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
2115 return CallerName + " -> " +
2116 getMemProfFuncName(Callsite->Callee.name(),
2117 Callsite->Clones[CloneNo]);
2118 }
2119}
2120
2121std::vector<uint64_t>
2122ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2123 Instruction *Call) {
2124 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2125 Call->getMetadata(LLVMContext::MD_callsite));
2126 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2127 CallsiteContext);
2128}
2129
2130std::vector<uint64_t>
2131IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2133 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2134 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2135 return getStackIdsWithContextNodes<CallsiteInfo,
2136 SmallVector<unsigned>::const_iterator>(
2137 CallsiteContext);
2138}
2139
2140template <typename DerivedCCG, typename FuncTy, typename CallTy>
2141template <class NodeT, class IteratorT>
2142std::vector<uint64_t>
2143CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2144 CallStack<NodeT, IteratorT> &CallsiteContext) {
2145 std::vector<uint64_t> StackIds;
2146 for (auto IdOrIndex : CallsiteContext) {
2147 auto StackId = getStackId(IdOrIndex);
2148 ContextNode *Node = getNodeForStackId(StackId);
2149 if (!Node)
2150 break;
2151 StackIds.push_back(StackId);
2152 }
2153 return StackIds;
2154}
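// Sketch with hypothetical ids: if the callsite metadata lists frames
// [A, B, C] (innermost first) but only A and B received context nodes during
// allocation context matching (C was pruned), the walk above returns [A, B],
// stopping at the first id without a node.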
2155
2156ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2157 Module &M,
2158 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2159 : Mod(M), OREGetter(OREGetter) {
2160 for (auto &F : M) {
2161 std::vector<CallInfo> CallsWithMetadata;
2162 for (auto &BB : F) {
2163 for (auto &I : BB) {
2164 if (!isa<CallBase>(I))
2165 continue;
2166 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
2167 CallsWithMetadata.push_back(&I);
2168 auto *AllocNode = addAllocNode(&I, &F);
2169 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
2170 assert(CallsiteMD);
2171 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2172 // Add all of the MIBs and their stack nodes.
2173 for (auto &MDOp : MemProfMD->operands()) {
2174 auto *MIBMD = cast<const MDNode>(MDOp);
2175 std::vector<ContextTotalSize> ContextSizeInfo;
2176 // Collect the context size information if it exists.
2177 if (MIBMD->getNumOperands() > 2) {
2178 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2179 MDNode *ContextSizePair =
2180 dyn_cast<MDNode>(MIBMD->getOperand(I));
2181 assert(ContextSizePair->getNumOperands() == 2);
2182 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2183 ContextSizePair->getOperand(0))
2184 ->getZExtValue();
2185 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2186 ContextSizePair->getOperand(1))
2187 ->getZExtValue();
2188 ContextSizeInfo.push_back({FullStackId, TotalSize});
2189 }
2190 }
2191 MDNode *StackNode = getMIBStackNode(MIBMD);
2192 assert(StackNode);
2193 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2194 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2195 AllocNode, StackContext, CallsiteContext,
2196 getMIBAllocType(MIBMD), ContextSizeInfo);
2197 }
2198 // If exporting the graph to dot and an allocation id of interest was
2199 // specified, record all the context ids for this allocation node.
2200 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2201 DotAllocContextIds = AllocNode->getContextIds();
2202 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2203 // Memprof and callsite metadata on memory allocations no longer
2204 // needed.
2205 I.setMetadata(LLVMContext::MD_memprof, nullptr);
2206 I.setMetadata(LLVMContext::MD_callsite, nullptr);
2207 }
2208 // For callsite metadata, add to list for this function for later use.
2209 else if (I.getMetadata(LLVMContext::MD_callsite)) {
2210 CallsWithMetadata.push_back(&I);
2211 }
2212 }
2213 }
2214 if (!CallsWithMetadata.empty())
2215 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2216 }
2217
2218 if (DumpCCG) {
2219 dbgs() << "CCG before updating call stack chains:\n";
2220 dbgs() << *this;
2221 }
2222
2223 if (ExportToDot)
2224 exportToDot("prestackupdate");
2225
2226 updateStackNodes();
2227
2228 if (ExportToDot)
2229 exportToDot("poststackupdate");
2230
2231 handleCallsitesWithMultipleTargets();
2232
2233 markBackedges();
2234
2235 // Strip off remaining callsite metadata, no longer needed.
2236 for (auto &FuncEntry : FuncToCallsWithMetadata)
2237 for (auto &Call : FuncEntry.second)
2238 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2239}
2240
2241IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2242 ModuleSummaryIndex &Index,
2243 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2244 isPrevailing)
2245 : Index(Index), isPrevailing(isPrevailing) {
2246 for (auto &I : Index) {
2247 auto VI = Index.getValueInfo(I);
2248 for (auto &S : VI.getSummaryList()) {
2249 // We should only add the prevailing nodes. Otherwise we may try to clone
2250 // in a weak copy that won't be linked (and may be different than the
2251 // prevailing version).
2252 // We only keep the memprof summary on the prevailing copy now when
2253 // building the combined index as a space optimization; however, don't
2254 // rely on this optimization. The linker doesn't resolve local linkage
2255 // values so don't check whether those are prevailing.
2256 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2257 !isPrevailing(VI.getGUID(), S.get()))
2258 continue;
2259 auto *FS = dyn_cast<FunctionSummary>(S.get());
2260 if (!FS)
2261 continue;
2262 std::vector<CallInfo> CallsWithMetadata;
2263 if (!FS->allocs().empty()) {
2264 for (auto &AN : FS->mutableAllocs()) {
2265 // This can happen because of recursion elimination handling that
2266 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2267 // We still added them to the summary because we need to be able to
2268 // correlate properly in applyImport in the backends.
2269 if (AN.MIBs.empty())
2270 continue;
2271 IndexCall AllocCall(&AN);
2272 CallsWithMetadata.push_back(AllocCall);
2273 auto *AllocNode = addAllocNode(AllocCall, FS);
2274 // Pass an empty CallStack to the CallsiteContext (second)
2275 // parameter, since for ThinLTO we already collapsed out the inlined
2276 // stack ids on the allocation call during ModuleSummaryAnalysis.
2277 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2278 EmptyContext;
2279 unsigned I = 0;
2280 assert(!metadataMayIncludeContextSizeInfo() ||
2281 AN.ContextSizeInfos.size() == AN.MIBs.size());
2282 // Now add all of the MIBs and their stack nodes.
2283 for (auto &MIB : AN.MIBs) {
2284 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2285 StackContext(&MIB);
2286 std::vector<ContextTotalSize> ContextSizeInfo;
2287 if (!AN.ContextSizeInfos.empty()) {
2288 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2289 ContextSizeInfo.push_back({FullStackId, TotalSize});
2290 }
2291 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2292 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2293 ContextSizeInfo);
2294 I++;
2295 }
2296 // If exporting the graph to dot and an allocation id of interest was
2297 // specified, record all the context ids for this allocation node.
2298 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2299 DotAllocContextIds = AllocNode->getContextIds();
2300 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2301 // Initialize version 0 on the summary alloc node to the current alloc
2302 // type, unless it has both types in which case make it default, so
2303 // that in the case where we aren't able to clone, the original version
2304 // always ends up with the default allocation behavior.
2305 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2306 }
2307 }
2308 // For callsite metadata, add to list for this function for later use.
2309 if (!FS->callsites().empty())
2310 for (auto &SN : FS->mutableCallsites()) {
2311 IndexCall StackNodeCall(&SN);
2312 CallsWithMetadata.push_back(StackNodeCall);
2313 }
2314
2315 if (!CallsWithMetadata.empty())
2316 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2317
2318 if (!FS->allocs().empty() || !FS->callsites().empty())
2319 FSToVIMap[FS] = VI;
2320 }
2321 }
2322
2323 if (DumpCCG) {
2324 dbgs() << "CCG before updating call stack chains:\n";
2325 dbgs() << *this;
2326 }
2327
2328 if (ExportToDot)
2329 exportToDot("prestackupdate");
2330
2331 updateStackNodes();
2332
2333 if (ExportToDot)
2334 exportToDot("poststackupdate");
2335
2336 handleCallsitesWithMultipleTargets();
2337
2338 markBackedges();
2339}
2340
2341template <typename DerivedCCG, typename FuncTy, typename CallTy>
2342void CallsiteContextGraph<DerivedCCG, FuncTy,
2343 CallTy>::handleCallsitesWithMultipleTargets() {
2344 // Look for and workaround callsites that call multiple functions.
2345 // This can happen for indirect calls, which need better handling, and in
2346 // more rare cases (e.g. macro expansion).
2347 // TODO: To fix this for indirect calls we will want to perform speculative
2348 // devirtualization using either the normal PGO info with ICP, or using the
2349 // information in the profiled MemProf contexts. We can do this prior to
2350 // this transformation for regular LTO, and for ThinLTO we can simulate that
2351 // effect in the summary and perform the actual speculative devirtualization
2352 // while cloning in the ThinLTO backend.
2353
2354 // Keep track of the new nodes synthesized for discovered tail calls missing
2355 // from the profiled contexts.
2356 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2357
2358 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2359 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2360 auto *Node = Entry.second;
2361 assert(Node->Clones.empty());
2362 // Check all node callees and see if in the same function.
2363 // We need to check all of the calls recorded in this Node, because in some
2364 // cases we may have had multiple calls with the same debug info calling
2365 // different callees. This can happen, for example, when an object is
2366 // constructed in the parameter list - the destructor call of the object has
2367 // the same debug info (line/col) as the call the object was passed to.
2368 // Here we will prune any that don't match all callee nodes.
2369 std::vector<CallInfo> AllCalls;
2370 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2371 AllCalls.push_back(Node->Call);
2372 llvm::append_range(AllCalls, Node->MatchingCalls);
2373
2374 // First see if we can partition the calls by callee function, creating new
2375 // nodes to host each set of calls calling the same callees. This is
2376 // necessary to support indirect calls with ThinLTO, for which we
2377 // synthesized CallsiteInfo records for each target. They will all have the
2378 // same callsite stack ids and would be sharing a context node at this
2379 // point. We need to perform separate cloning for each, which will be
2380 // applied along with speculative devirtualization in the ThinLTO backends
2381 // as needed. Note this does not currently support looking through tail
2382 // calls; it is unclear if we need that for indirect call targets.
2383 // First partition calls by callee func. Map indexed by func, value is
2384 // struct with list of matching calls, assigned node.
2385 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2386 continue;
2387
2388 auto It = AllCalls.begin();
2389 // Iterate through the calls until we find the first that matches.
2390 for (; It != AllCalls.end(); ++It) {
2391 auto ThisCall = *It;
2392 bool Match = true;
2393 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2394 ++EI) {
2395 auto Edge = *EI;
2396 if (!Edge->Callee->hasCall())
2397 continue;
2398 assert(NodeToCallingFunc.count(Edge->Callee));
2399 // Check if the called function matches that of the callee node.
2400 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2401 Match = false;
2402 break;
2403 }
2404 }
2405 // Found a call that matches the callee nodes, we can quit now.
2406 if (Match) {
2407 // If the first match is not the primary call on the Node, update it
2408 // now. We will update the list of matching calls further below.
2409 if (Node->Call != ThisCall) {
2410 Node->setCall(ThisCall);
2411 // We need to update the NonAllocationCallToContextNodeMap, but don't
2412 // want to do this during iteration over that map, so save the calls
2413 // that need updated entries.
2414 NewCallToNode.push_back({ThisCall, Node});
2415 }
2416 break;
2417 }
2418 }
2419 // We will update this list below (or leave it cleared if there was no
2420 // match found above).
2421 Node->MatchingCalls.clear();
2422 // If we hit the end of the AllCalls vector, no call matching the callee
2423 // nodes was found, clear the call information in the node.
2424 if (It == AllCalls.end()) {
2425 RemovedEdgesWithMismatchedCallees++;
2426 // Work around by setting Node to have a null call, so it gets
2427 // skipped during cloning. Otherwise assignFunctions will assert
2428 // because its data structures are not designed to handle this case.
2429 Node->setCall(CallInfo());
2430 continue;
2431 }
2432 // Now add back any matching calls that call the same function as the
2433 // matching primary call on Node.
2434 for (++It; It != AllCalls.end(); ++It) {
2435 auto ThisCall = *It;
2436 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2437 continue;
2438 Node->MatchingCalls.push_back(ThisCall);
2439 }
2440 }
2441
2442 // Remove all mismatched nodes identified in the above loop from the node map
2443 // (checking whether they have a null call which is set above). For a
2444 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2445 // to do the removal via remove_if than by individually erasing entries above.
2446 // Also remove any entries if we updated the node's primary call above.
2447 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2448 return !it.second->hasCall() || it.second->Call != it.first;
2449 });
2450
2451 // Add entries for any new primary calls recorded above.
2452 for (auto &[Call, Node] : NewCallToNode)
2453 NonAllocationCallToContextNodeMap[Call] = Node;
2454
2455 // Add the new nodes after the above loop so that the iteration is not
2456 // invalidated.
2457 for (auto &[Call, Node] : TailCallToContextNodeMap)
2458 NonAllocationCallToContextNodeMap[Call] = Node;
2459}
2460
2461template <typename DerivedCCG, typename FuncTy, typename CallTy>
2462bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2463 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2464 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2465 // Struct to keep track of all the calls having the same callee function,
2466 // and the node we eventually assign to them. Eventually we will record the
2467 // context node assigned to this group of calls.
2468 struct CallsWithSameCallee {
2469 std::vector<CallInfo> Calls;
2470 ContextNode *Node = nullptr;
2471 };
2472
2473 // First partition calls by callee function. Build map from each function
2474 // to the list of matching calls.
2475 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2476 for (auto ThisCall : AllCalls) {
2477 auto *F = getCalleeFunc(ThisCall.call());
2478 if (F)
2479 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2480 }
2481
2482 // Next, walk through all callee edges. For each callee node, get its
2483 // containing function and see if it was recorded in the above map (meaning we
2484 // have at least one matching call). Build another map from each callee node
2485 // with a matching call to the structure instance created above containing all
2486 // the calls.
2487 DenseMap<const ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2488 for (const auto &Edge : Node->CalleeEdges) {
2489 if (!Edge->Callee->hasCall())
2490 continue;
2491 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2492 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2493 CalleeNodeToCallInfo[Edge->Callee] =
2494 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2495 }
2496
2497 // If there are no entries in the second map, then there were no matching
2498 // calls/callees, nothing to do here. Return so we can go to the handling that
2499 // looks through tail calls.
2500 if (CalleeNodeToCallInfo.empty())
2501 return false;
2502
2503 // Walk through all callee edges again. Any and all callee edges that didn't
2504 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2505 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2506 // ignored during cloning. If it is in the map, then we use the node recorded
2507 // in that entry (creating it if needed), and move the callee edge to it.
2508 // The first callee will use the original node instead of creating a new one.
2509 // Note that any of the original calls on this node (in AllCalls) that didn't
2510 // have a callee function automatically get dropped from the node as part of
2511 // this process.
2512 ContextNode *UnmatchedCalleesNode = nullptr;
2513 // Track whether we already assigned original node to a callee.
2514 bool UsedOrigNode = false;
2515 assert(NodeToCallingFunc[Node]);
2516 // Iterate over a copy of Node's callee edges, since we may need to remove
2517 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2518 // makes it less error-prone.
2519 auto CalleeEdges = Node->CalleeEdges;
2520 for (auto &Edge : CalleeEdges) {
2521 if (!Edge->Callee->hasCall())
2522 continue;
2523
2524 // Will be updated below to point to whatever (caller) node this callee edge
2525 // should be moved to.
2526 ContextNode *CallerNodeToUse = nullptr;
2527
2528 // Handle the case where there were no matching calls first. Move this
2529 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2530 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2531 if (!UnmatchedCalleesNode)
2532 UnmatchedCalleesNode =
2533 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2534 CallerNodeToUse = UnmatchedCalleesNode;
2535 } else {
2536 // Look up the information recorded for this callee node, and use the
2537 // recorded caller node (creating it if needed).
2538 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2539 if (!Info->Node) {
2540 // If we haven't assigned any callees to the original node use it.
2541 if (!UsedOrigNode) {
2542 Info->Node = Node;
2543 // Clear the set of matching calls which will be updated below.
2544 Node->MatchingCalls.clear();
2545 UsedOrigNode = true;
2546 } else
2547 Info->Node =
2548 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2549 assert(!Info->Calls.empty());
2550 // The first call becomes the primary call for this caller node, and the
2551 // rest go in the matching calls list.
2552 Info->Node->setCall(Info->Calls.front());
2553 llvm::append_range(Info->Node->MatchingCalls,
2554 llvm::drop_begin(Info->Calls));
2555 // Save the primary call to node correspondence so that we can update
2556 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2557 // caller of this function.
2558 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2559 }
2560 CallerNodeToUse = Info->Node;
2561 }
2562
2563 // Don't need to move the edge if we are using the original node.
2564 if (CallerNodeToUse == Node)
2565 continue;
2566
2567 moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2568 }
2569 // Now that we are done moving edges, clean up any caller edges that ended
2570 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2571 // caller edges from Node are replicated onto the new callers, and it
2572 // simplifies the handling to leave them until we have moved all
2573 // edges/context ids.
2574 for (auto &I : CalleeNodeToCallInfo)
2575 removeNoneTypeCallerEdges(I.second->Node);
2576 if (UnmatchedCalleesNode)
2577 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2578 removeNoneTypeCallerEdges(Node);
2579
2580 return true;
2581}
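// Worked example (hypothetical): suppose Node's calls are {c1, c2}, where c1
// calls F1 and c2 calls F2, and Node has callee edges into nodes contained in
// F1 and F2. Assuming the F1 callee edge is visited first, c1 stays on the
// original Node, while the F2 callee edge is moved to a newly created node
// whose primary call is c2; callee edges matching neither call would be parked
// on UnmatchedCalleesNode with a null call so cloning ignores them.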
2582
2583uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2584 // In the Module (IR) case this is already the Id.
2585 return IdOrIndex;
2586}
2587
2588uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2589 // In the Index case this is an index into the stack id list in the summary
2590 // index, convert it to an Id.
2591 return Index.getStackIdAtIndex(IdOrIndex);
2592}
2593
2594template <typename DerivedCCG, typename FuncTy, typename CallTy>
2595bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2596 CallTy Call, EdgeIter &EI,
2597 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2598 auto Edge = *EI;
2599 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2600 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2601 // Will be populated in order of callee to caller if we find a chain of tail
2602 // calls between the profiled caller and callee.
2603 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2604 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2605 FoundCalleeChain))
2606 return false;
2607
2608 // The usual case where the profiled callee matches that of the IR/summary.
2609 if (FoundCalleeChain.empty())
2610 return true;
2611
2612 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2613 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2614 // If there is already an edge between these nodes, simply update it and
2615 // return.
2616 if (CurEdge) {
2617 CurEdge->ContextIds.insert_range(Edge->ContextIds);
2618 CurEdge->AllocTypes |= Edge->AllocTypes;
2619 return;
2620 }
2621 // Otherwise, create a new edge and insert it into the caller and callee
2622 // lists.
2623 auto NewEdge = std::make_shared<ContextEdge>(
2624 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2625 Callee->CallerEdges.push_back(NewEdge);
2626 if (Caller == Edge->Caller) {
2627 // If we are inserting the new edge into the current edge's caller, insert
2628 // the new edge before the current iterator position, and then increment
2629 // back to the current edge.
2630 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2631 ++EI;
2632 assert(*EI == Edge &&
2633 "Iterator position not restored after insert and increment");
2634 } else
2635 Caller->CalleeEdges.push_back(NewEdge);
2636 };
2637
2638 // Create new nodes for each found callee and connect in between the profiled
2639 // caller and callee.
2640 auto *CurCalleeNode = Edge->Callee;
2641 for (auto &[NewCall, Func] : FoundCalleeChain) {
2642 ContextNode *NewNode = nullptr;
2643 // First check if we have already synthesized a node for this tail call.
2644 if (TailCallToContextNodeMap.count(NewCall)) {
2645 NewNode = TailCallToContextNodeMap[NewCall];
2646 NewNode->AllocTypes |= Edge->AllocTypes;
2647 } else {
2648 FuncToCallsWithMetadata[Func].push_back({NewCall});
2649 // Create Node and record node info.
2650 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2651 TailCallToContextNodeMap[NewCall] = NewNode;
2652 NewNode->AllocTypes = Edge->AllocTypes;
2653 }
2654
2655 // Hook up node to its callee node
2656 AddEdge(NewNode, CurCalleeNode);
2657
2658 CurCalleeNode = NewNode;
2659 }
2660
2661 // Hook up edge's original caller to new callee node.
2662 AddEdge(Edge->Caller, CurCalleeNode);
2663
2664#ifndef NDEBUG
2665 // Save this because Edge's fields get cleared below when removed.
2666 auto *Caller = Edge->Caller;
2667#endif
2668
2669 // Remove old edge
2670 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2671
2672 // To simplify the increment of EI in the caller, subtract one from EI.
2673 // In the final AddEdge call we would have either added a new callee edge,
2674 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2675 // that there is at least one callee edge.
2676 assert(!Caller->CalleeEdges.empty());
2677 --EI;
2678
2679 return true;
2680}
2681
2682bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2683 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2684 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2685 bool &FoundMultipleCalleeChains) {
2686 // Stop recursive search if we have already explored the maximum specified
2687 // depth.
2688 if (Depth > TailCallSearchDepth)
2689 return false;
2690
2691 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2692 FoundCalleeChain.push_back({Callsite, F});
2693 };
2694
2695 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2696 if (!CalleeFunc) {
2697 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2698 assert(Alias);
2699 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2700 assert(CalleeFunc);
2701 }
2702
2703 // Look for tail calls in this function, and check if they either call the
2704 // profiled callee directly, or indirectly (via a recursive search).
2705 // Only succeed if there is a single unique tail call chain found between the
2706 // profiled caller and callee, otherwise we could perform incorrect cloning.
2707 bool FoundSingleCalleeChain = false;
2708 for (auto &BB : *CalleeFunc) {
2709 for (auto &I : BB) {
2710 auto *CB = dyn_cast<CallBase>(&I);
2711 if (!CB || !CB->isTailCall())
2712 continue;
2713 auto *CalledValue = CB->getCalledOperand();
2714 auto *CalledFunction = CB->getCalledFunction();
2715 if (CalledValue && !CalledFunction) {
2716 CalledValue = CalledValue->stripPointerCasts();
2717 // Stripping pointer casts can reveal a called function.
2718 CalledFunction = dyn_cast<Function>(CalledValue);
2719 }
2720 // Check if this is an alias to a function. If so, get the
2721 // called aliasee for the checks below.
2722 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2723 assert(!CalledFunction &&
2724 "Expected null called function in callsite for alias");
2725 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2726 }
2727 if (!CalledFunction)
2728 continue;
2729 if (CalledFunction == ProfiledCallee) {
2730 if (FoundSingleCalleeChain) {
2731 FoundMultipleCalleeChains = true;
2732 return false;
2733 }
2734 FoundSingleCalleeChain = true;
2735 FoundProfiledCalleeCount++;
2736 FoundProfiledCalleeDepth += Depth;
2737 if (Depth > FoundProfiledCalleeMaxDepth)
2738 FoundProfiledCalleeMaxDepth = Depth;
2739 SaveCallsiteInfo(&I, CalleeFunc);
2740 } else if (findProfiledCalleeThroughTailCalls(
2741 ProfiledCallee, CalledFunction, Depth + 1,
2742 FoundCalleeChain, FoundMultipleCalleeChains)) {
2743 // findProfiledCalleeThroughTailCalls should not have returned
2744 // true if FoundMultipleCalleeChains.
2745 assert(!FoundMultipleCalleeChains);
2746 if (FoundSingleCalleeChain) {
2747 FoundMultipleCalleeChains = true;
2748 return false;
2749 }
2750 FoundSingleCalleeChain = true;
2751 SaveCallsiteInfo(&I, CalleeFunc);
2752 } else if (FoundMultipleCalleeChains)
2753 return false;
2754 }
2755 }
2756
2757 return FoundSingleCalleeChain;
2758}
2759
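// ---- Editorial note, not part of the original source ----
// Hedged usage sketch mirroring the call made from calleeMatchesFunc below;
// ProfiledF (const Function *) and CalledV (Value *) are placeholder names:
//
//   std::vector<std::pair<Instruction *, Function *>> Chain;
//   bool Multiple = false;
//   if (findProfiledCalleeThroughTailCalls(ProfiledF, CalledV, /*Depth=*/1,
//                                          Chain, Multiple))
//     ; // Chain holds the unique tail-call path to ProfiledF.
//
// A false return with Multiple set means several candidate chains exist, and
// the caller conservatively gives up rather than risk incorrect cloning.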
2760const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2761 auto *CB = dyn_cast<CallBase>(Call);
2762 if (!CB->getCalledOperand() || CB->isIndirectCall())
2763 return nullptr;
2764 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2765 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2766 if (Alias)
2767 return dyn_cast<Function>(Alias->getAliasee());
2768 return dyn_cast<Function>(CalleeVal);
2769}
2770
2771bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2772 Instruction *Call, const Function *Func, const Function *CallerFunc,
2773 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2774 auto *CB = dyn_cast<CallBase>(Call);
2775 if (!CB->getCalledOperand() || CB->isIndirectCall())
2776 return false;
2777 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2778 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2779 if (CalleeFunc == Func)
2780 return true;
2781 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2782 if (Alias && Alias->getAliasee() == Func)
2783 return true;
2784
2785 // Recursively search for the profiled callee through tail calls starting with
2786 // the actual Callee. The discovered tail call chain is saved in
2787 // FoundCalleeChain, and we will fix up the graph to include these callsites
2788 // after returning.
2789 // FIXME: We will currently redo the same recursive walk if we find the same
2790 // mismatched callee from another callsite. We can improve this with more
2791 // bookkeeping of the created chain of new nodes for each mismatch.
2792 unsigned Depth = 1;
2793 bool FoundMultipleCalleeChains = false;
2794 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2795 FoundCalleeChain,
2796 FoundMultipleCalleeChains)) {
2797 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2798 << Func->getName() << " from " << CallerFunc->getName()
2799 << " that actually called " << CalleeVal->getName()
2800 << (FoundMultipleCalleeChains
2801 ? " (found multiple possible chains)"
2802 : "")
2803 << "\n");
2804 if (FoundMultipleCalleeChains)
2805 FoundProfiledCalleeNonUniquelyCount++;
2806 return false;
2807 }
2808
2809 return true;
2810}
2811
2812bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2813 Instruction *Call2) {
2814 auto *CB1 = cast<CallBase>(Call1);
2815 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2816 return false;
2817 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2818 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2819 auto *CB2 = cast<CallBase>(Call2);
2820 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2821 return false;
2822 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2823 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2824 return CalleeFunc1 == CalleeFunc2;
2825}
2826
2827bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2828 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2829 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2830 bool &FoundMultipleCalleeChains) {
2831 // Stop recursive search if we have already explored the maximum specified
2832 // depth.
2833 if (Depth > TailCallSearchDepth)
2834 return false;
2835
2836 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2837 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2838 // been synthesized.
2839 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2840 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2841 // StackIds is empty (we don't have debug info available in the index for
2842 // these callsites)
2843 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2844 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2845 CallsiteInfo *NewCallsiteInfo =
2846 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2847 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2848 };
2849
2850 // Look for tail calls in this function, and check if they either call the
2851 // profiled callee directly, or indirectly (via a recursive search).
2852 // Only succeed if there is a single unique tail call chain found between the
2853 // profiled caller and callee, otherwise we could perform incorrect cloning.
2854 bool FoundSingleCalleeChain = false;
2855 for (auto &S : CurCallee.getSummaryList()) {
2856 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2857 !isPrevailing(CurCallee.getGUID(), S.get()))
2858 continue;
2859 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2860 if (!FS)
2861 continue;
2862 auto FSVI = CurCallee;
2863 auto *AS = dyn_cast<AliasSummary>(S.get());
2864 if (AS)
2865 FSVI = AS->getAliaseeVI();
2866 for (auto &CallEdge : FS->calls()) {
2867 if (!CallEdge.second.hasTailCall())
2868 continue;
2869 if (CallEdge.first == ProfiledCallee) {
2870 if (FoundSingleCalleeChain) {
2871 FoundMultipleCalleeChains = true;
2872 return false;
2873 }
2874 FoundSingleCalleeChain = true;
2875 FoundProfiledCalleeCount++;
2876 FoundProfiledCalleeDepth += Depth;
2877 if (Depth > FoundProfiledCalleeMaxDepth)
2878 FoundProfiledCalleeMaxDepth = Depth;
2879 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2880 // Add FS to FSToVIMap in case it isn't already there.
2881 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2882 FSToVIMap[FS] = FSVI;
2883 } else if (findProfiledCalleeThroughTailCalls(
2884 ProfiledCallee, CallEdge.first, Depth + 1,
2885 FoundCalleeChain, FoundMultipleCalleeChains)) {
2886 // findProfiledCalleeThroughTailCalls should not have returned
2887 // true if FoundMultipleCalleeChains.
2888 assert(!FoundMultipleCalleeChains);
2889 if (FoundSingleCalleeChain) {
2890 FoundMultipleCalleeChains = true;
2891 return false;
2892 }
2893 FoundSingleCalleeChain = true;
2894 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2895 // Add FS to FSToVIMap in case it isn't already there.
2896 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2897 FSToVIMap[FS] = FSVI;
2898 } else if (FoundMultipleCalleeChains)
2899 return false;
2900 }
2901 }
2902
2903 return FoundSingleCalleeChain;
2904}
2905
2906const FunctionSummary *
2907IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2908 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2909 if (Callee.getSummaryList().empty())
2910 return nullptr;
2911 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2912}
2913
2914bool IndexCallsiteContextGraph::calleeMatchesFunc(
2915 IndexCall &Call, const FunctionSummary *Func,
2916 const FunctionSummary *CallerFunc,
2917 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2918 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2919 // If there is no summary list then this is a call to an externally defined
2920 // symbol.
2921 AliasSummary *Alias =
2922 Callee.getSummaryList().empty()
2923 ? nullptr
2924 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2925 assert(FSToVIMap.count(Func));
2926 auto FuncVI = FSToVIMap[Func];
2927 if (Callee == FuncVI ||
2928 // If callee is an alias, check the aliasee, since only function
2929 // summary base objects will contain the stack node summaries and thus
2930 // get a context node.
2931 (Alias && Alias->getAliaseeVI() == FuncVI))
2932 return true;
2933
2934 // Recursively search for the profiled callee through tail calls starting with
2935 // the actual Callee. The discovered tail call chain is saved in
2936 // FoundCalleeChain, and we will fix up the graph to include these callsites
2937 // after returning.
2938 // FIXME: We will currently redo the same recursive walk if we find the same
2939 // mismatched callee from another callsite. We can improve this with more
2940 // bookkeeping of the created chain of new nodes for each mismatch.
2941 unsigned Depth = 1;
2942 bool FoundMultipleCalleeChains = false;
2943 if (!findProfiledCalleeThroughTailCalls(
2944 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2945 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2946 << " from " << FSToVIMap[CallerFunc]
2947 << " that actually called " << Callee
2948 << (FoundMultipleCalleeChains
2949 ? " (found multiple possible chains)"
2950 : "")
2951 << "\n");
2952 if (FoundMultipleCalleeChains)
2953 FoundProfiledCalleeNonUniquelyCount++;
2954 return false;
2955 }
2956
2957 return true;
2958}
2959
2960bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2961 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
2962 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
2963 return Callee1 == Callee2;
2964}
2965
2966template <typename DerivedCCG, typename FuncTy, typename CallTy>
2967void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2968 const {
2969 print(dbgs());
2970 dbgs() << "\n";
2971}
2972
2973template <typename DerivedCCG, typename FuncTy, typename CallTy>
2974void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2975 raw_ostream &OS) const {
2976 OS << "Node " << this << "\n";
2977 OS << "\t";
2978 printCall(OS);
2979 if (Recursive)
2980 OS << " (recursive)";
2981 OS << "\n";
2982 if (!MatchingCalls.empty()) {
2983 OS << "\tMatchingCalls:\n";
2984 for (auto &MatchingCall : MatchingCalls) {
2985 OS << "\t";
2986 MatchingCall.print(OS);
2987 OS << "\n";
2988 }
2989 }
2990 OS << "\tNodeId: " << NodeId << "\n";
2991 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2992 OS << "\tContextIds:";
2993 // Make a copy of the computed context ids that we can sort for stability.
2994 auto ContextIds = getContextIds();
2995 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2996 std::sort(SortedIds.begin(), SortedIds.end());
2997 for (auto Id : SortedIds)
2998 OS << " " << Id;
2999 OS << "\n";
3000 OS << "\tCalleeEdges:\n";
3001 for (auto &Edge : CalleeEdges)
3002 OS << "\t\t" << *Edge << " (Callee NodeId: " << Edge->Callee->NodeId
3003 << ")\n";
3004 OS << "\tCallerEdges:\n";
3005 for (auto &Edge : CallerEdges)
3006 OS << "\t\t" << *Edge << " (Caller NodeId: " << Edge->Caller->NodeId
3007 << ")\n";
3008 if (!Clones.empty()) {
3009 OS << "\tClones: ";
3010 bool First = true;
3011 for (auto *C : Clones) {
3012 if (!First)
3013 OS << ", ";
3014 First = false;
3015 OS << C << " NodeId: " << C->NodeId;
3016 }
3017 OS << "\n";
3018 } else if (CloneOf) {
3019 OS << "\tClone of " << CloneOf << " NodeId: " << CloneOf->NodeId << "\n";
3020 }
3021}
3022
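// ---- Editorial note, not part of the original source ----
// Rough shape of the output emitted by print() above (pointers, ids and the
// printCall line are illustrative; printCall differs between the IR and index
// graphs):
//
//   Node 0x55d1c0a2b3c0
//           <call printed by printCall>
//           NodeId: 42
//           AllocTypes: Cold
//           ContextIds: 1 4 7
//           CalleeEdges:
//                   Edge from Callee 0x... to Caller: 0x... AllocTypes: Cold ContextIds: 1 4 (Callee NodeId: 7)
//           CallerEdges: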
3023template <typename DerivedCCG, typename FuncTy, typename CallTy>
3024void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
3025 const {
3026 print(dbgs());
3027 dbgs() << "\n";
3028}
3029
3030template <typename DerivedCCG, typename FuncTy, typename CallTy>
3031void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
3032 raw_ostream &OS) const {
3033 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
3034 << (IsBackedge ? " (BE)" : "")
3035 << " AllocTypes: " << getAllocTypeString(AllocTypes);
3036 OS << " ContextIds:";
3037 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3038 std::sort(SortedIds.begin(), SortedIds.end());
3039 for (auto Id : SortedIds)
3040 OS << " " << Id;
3041}
3042
3043template <typename DerivedCCG, typename FuncTy, typename CallTy>
3044void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3045 print(dbgs());
3046}
3047
3048template <typename DerivedCCG, typename FuncTy, typename CallTy>
3049void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3050 raw_ostream &OS) const {
3051 OS << "Callsite Context Graph:\n";
3052 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3053 for (const auto Node : nodes<GraphType>(this)) {
3054 if (Node->isRemoved())
3055 continue;
3056 Node->print(OS);
3057 OS << "\n";
3058 }
3059}
3060
3061template <typename DerivedCCG, typename FuncTy, typename CallTy>
3062void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3063 raw_ostream &OS) const {
3064 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3065 for (const auto Node : nodes<GraphType>(this)) {
3066 if (Node->isRemoved())
3067 continue;
3068 if (!Node->IsAllocation)
3069 continue;
3070 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3071 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3072 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3073 std::sort(SortedIds.begin(), SortedIds.end());
3074 for (auto Id : SortedIds) {
3075 auto TypeI = ContextIdToAllocationType.find(Id);
3076 assert(TypeI != ContextIdToAllocationType.end());
3077 auto CSI = ContextIdToContextSizeInfos.find(Id);
3078 if (CSI != ContextIdToContextSizeInfos.end()) {
3079 for (auto &Info : CSI->second) {
3080 OS << "MemProf hinting: "
3081 << getAllocTypeString((uint8_t)TypeI->second)
3082 << " full allocation context " << Info.FullStackId
3083 << " with total size " << Info.TotalSize << " is "
3084 << getAllocTypeString(Node->AllocTypes) << " after cloning";
3085 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3086 OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
3087 << " due to cold byte percent";
3088 // Print the internal context id to aid debugging and visualization.
3089 OS << " (context id " << Id << ")";
3090 OS << "\n";
3091 }
3092 }
3093 }
3094 }
3095}
3096
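// ---- Editorial note, not part of the original source ----
// Example of a report line produced by printTotalSizes (values illustrative,
// shown wrapped here; the real output is a single line):
//
//   MemProf hinting: cold full allocation context 123456789 with total size
//   4096 is cold after cloning (context id 7)
//
// The "marked ... due to cold byte percent" clause is only appended when the
// attribute chosen for the call differs from the node's allocation type.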
3097template <typename DerivedCCG, typename FuncTy, typename CallTy>
3098void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3099 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3100 for (const auto Node : nodes<GraphType>(this)) {
3101 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3102 for (auto &Edge : Node->CallerEdges)
3103 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3104 }
3105}
3106
3107template <typename DerivedCCG, typename FuncTy, typename CallTy>
3108struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
3109 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3110 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
3111
3112 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
3113 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
3114
3115 using nodes_iterator =
3116 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
3117 decltype(&getNode)>;
3118
3119 static nodes_iterator nodes_begin(GraphType G) {
3120 return nodes_iterator(G->NodeOwner.begin(), &getNode);
3121 }
3122
3123 static nodes_iterator nodes_end(GraphType G) {
3124 return nodes_iterator(G->NodeOwner.end(), &getNode);
3125 }
3126
3127 static NodeRef getEntryNode(GraphType G) {
3128 return G->NodeOwner.begin()->get();
3129 }
3130
3131 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
3132 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
3133 GetCallee(const EdgePtrTy &P) {
3134 return P->Callee;
3135 }
3136
3137 using ChildIteratorType =
3138 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
3139 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
3140 decltype(&GetCallee)>;
3141
3142 static ChildIteratorType child_begin(NodeRef N) {
3143 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
3144 }
3145
3146 static ChildIteratorType child_end(NodeRef N) {
3147 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
3148 }
3149};
3150
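// ---- Editorial note, not part of the original source ----
// With this GraphTraits specialization, LLVM's generic graph utilities apply
// directly, e.g. the nodes<GraphType>(this) ranges in print()/check() above
// and the WriteGraph call in exportToDot below. A minimal sketch (G is a
// const CallsiteContextGraph<...> *):
//
//   for (auto *N : nodes<GraphType>(G))            // NodeOwner via getNode
//     for (auto *Callee : children<GraphType>(N))  // CalleeEdges via GetCallee
//       (void)Callee;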
3151template <typename DerivedCCG, typename FuncTy, typename CallTy>
3152struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
3153 : public DefaultDOTGraphTraits {
3154 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
3155 // If the user requested the full graph to be exported, but provided an
3156 // allocation id, or if the user gave a context id and requested more than
3157 // just a specific context to be exported, note that highlighting is
3158 // enabled.
3159 DoHighlight =
3160 (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
3161 (ContextIdForDot.getNumOccurrences() &&
3162 DotGraphScope != DotScope::Context);
3163 }
3164
3165 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3166 using GTraits = GraphTraits<GraphType>;
3167 using NodeRef = typename GTraits::NodeRef;
3168 using ChildIteratorType = typename GTraits::ChildIteratorType;
3169
3170 static std::string getNodeLabel(NodeRef Node, GraphType G) {
3171 std::string LabelString =
3172 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
3173 Twine(Node->OrigStackOrAllocId) + " NodeId: " + Twine(Node->NodeId))
3174 .str();
3175 LabelString += "\n";
3176 if (Node->hasCall()) {
3177 auto Func = G->NodeToCallingFunc.find(Node);
3178 assert(Func != G->NodeToCallingFunc.end());
3179 LabelString +=
3180 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
3181 } else {
3182 LabelString += "null call";
3183 if (Node->Recursive)
3184 LabelString += " (recursive)";
3185 else
3186 LabelString += " (external)";
3187 }
3188 return LabelString;
3189 }
3190
3191 static std::string getNodeAttributes(NodeRef Node, GraphType G) {
3192 auto ContextIds = Node->getContextIds();
3193 // If highlighting enabled, see if this node contains any of the context ids
3194 // of interest. If so, it will use a different color and a larger fontsize
3195 // (which makes the node larger as well).
3196 bool Highlight = false;
3197 if (DoHighlight) {
3198 assert(ContextIdForDot.getNumOccurrences() ||
3199 AllocIdForDot.getNumOccurrences());
3200 if (ContextIdForDot.getNumOccurrences())
3201 Highlight = ContextIds.contains(ContextIdForDot);
3202 else
3203 Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
3204 }
3205 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
3206 getContextIds(ContextIds) + "\"")
3207 .str();
3208 // Default fontsize is 14
3209 if (Highlight)
3210 AttributeString += ",fontsize=\"30\"";
3211 AttributeString +=
3212 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"")
3213 .str();
3214 if (Node->CloneOf) {
3215 AttributeString += ",color=\"blue\"";
3216 AttributeString += ",style=\"filled,bold,dashed\"";
3217 } else
3218 AttributeString += ",style=\"filled\"";
3219 return AttributeString;
3220 }
3221
3222 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
3223 GraphType G) {
3224 auto &Edge = *(ChildIter.getCurrent());
3225 // If highlighting enabled, see if this edge contains any of the context ids
3226 // of interest. If so, it will use a different color and a heavier arrow
3227 // size and weight (the larger weight makes the highlighted path
3228 // straighter).
3229 bool Highlight = false;
3230 if (DoHighlight) {
3231 assert(ContextIdForDot.getNumOccurrences() ||
3232 AllocIdForDot.getNumOccurrences());
3233 if (ContextIdForDot.getNumOccurrences())
3234 Highlight = Edge->ContextIds.contains(ContextIdForDot);
3235 else
3236 Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
3237 }
3238 auto Color = getColor(Edge->AllocTypes, Highlight);
3239 std::string AttributeString =
3240 (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
3241 // fillcolor is the arrow head and color is the line
3242 Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
3243 "\"")
3244 .str();
3245 if (Edge->IsBackedge)
3246 AttributeString += ",style=\"dotted\"";
3247 // Default penwidth and weight are both 1.
3248 if (Highlight)
3249 AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
3250 return AttributeString;
3251 }
3252
3253 // Since the NodeOwners list includes nodes that are no longer connected to
3254 // the graph, skip them here.
3255 static bool isNodeHidden(NodeRef Node, GraphType G) {
3256 if (Node->isRemoved())
3257 return true;
3258 // If a scope smaller than the full graph was requested, see if this node
3259 // contains any of the context ids of interest.
3260 if (DotGraphScope == DotScope::Alloc)
3261 return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
3262 if (DotGraphScope == DotScope::Context)
3263 return !Node->getContextIds().contains(ContextIdForDot);
3264 return false;
3265 }
3266
3267private:
3268 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3269 std::string IdString = "ContextIds:";
3270 if (ContextIds.size() < 100) {
3271 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3272 std::sort(SortedIds.begin(), SortedIds.end());
3273 for (auto Id : SortedIds)
3274 IdString += (" " + Twine(Id)).str();
3275 } else {
3276 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3277 }
3278 return IdString;
3279 }
3280
3281 static std::string getColor(uint8_t AllocTypes, bool Highlight) {
3282 // If DoHighlight is not enabled, we want to use the highlight colors for
3283 // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
3284 // both compatible with the color scheme before highlighting was supported,
3285 // and for the NotCold+Cold color the non-highlight color is a bit more
3286 // readable.
3287 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3288 // Color "brown1" actually looks like a lighter red.
3289 return !DoHighlight || Highlight ? "brown1" : "lightpink";
3290 if (AllocTypes == (uint8_t)AllocationType::Cold)
3291 return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
3292 if (AllocTypes ==
3293 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3294 return Highlight ? "magenta" : "mediumorchid1";
3295 return "gray";
3296 }
3297
3298 static std::string getNodeId(NodeRef Node) {
3299 std::stringstream SStream;
3300 SStream << std::hex << "N0x" << (unsigned long long)Node;
3301 std::string Result = SStream.str();
3302 return Result;
3303 }
3304
3305 // True if we should highlight a specific context or allocation's contexts in
3306 // the emitted graph.
3307 static bool DoHighlight;
3308};
3309
3310template <typename DerivedCCG, typename FuncTy, typename CallTy>
3311bool DOTGraphTraits<
3312 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
3313 false;
3314
3315template <typename DerivedCCG, typename FuncTy, typename CallTy>
3316void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3317 std::string Label) const {
3318 WriteGraph(this, "", false, Label,
3319 DotFilePathPrefix + "ccg." + Label + ".dot");
3320}
3321
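// ---- Editorial note, not part of the original source ----
// For example, exportToDot("cloned") writes a file named
// <DotFilePathPrefix>ccg.cloned.dot, which can be rendered offline, e.g. with
// `dot -Tsvg ccg.cloned.dot -o ccg.cloned.svg` (the "cloned" label is just an
// illustration).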
3322template <typename DerivedCCG, typename FuncTy, typename CallTy>
3323typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3324CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3325 const std::shared_ptr<ContextEdge> &Edge,
3326 DenseSet<uint32_t> ContextIdsToMove) {
3327 ContextNode *Node = Edge->Callee;
3328 assert(NodeToCallingFunc.count(Node));
3329 ContextNode *Clone =
3330 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3331 Node->addClone(Clone);
3332 Clone->MatchingCalls = Node->MatchingCalls;
3333 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3334 ContextIdsToMove);
3335 return Clone;
3336}
3337
3338template <typename DerivedCCG, typename FuncTy, typename CallTy>
3339void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3340 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3341 ContextNode *NewCallee, bool NewClone,
3342 DenseSet<uint32_t> ContextIdsToMove) {
3343 // NewCallee and Edge's current callee must be clones of the same original
3344 // node (Edge's current callee may be the original node too).
3345 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3346
3347 bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3348
3349 ContextNode *OldCallee = Edge->Callee;
3350
3351 // We might already have an edge to the new callee from earlier cloning for a
3352 // different allocation. If one exists we will reuse it.
3353 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3354
3355 // Callers will pass an empty ContextIdsToMove set when they want to move the
3356 // edge. Copy in Edge's ids for simplicity.
3357 if (ContextIdsToMove.empty())
3358 ContextIdsToMove = Edge->getContextIds();
3359
3360 // If we are moving all of Edge's ids, then just move the whole Edge.
3361 // Otherwise only move the specified subset, to a new edge if needed.
3362 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3363 // First, update the alloc types on New Callee from Edge.
3364 // Do this before we potentially clear Edge's fields below!
3365 NewCallee->AllocTypes |= Edge->AllocTypes;
3366 // Moving the whole Edge.
3367 if (ExistingEdgeToNewCallee) {
3368 // Since we already have an edge to NewCallee, simply move the ids
3369 // onto it, and remove the existing Edge.
3370 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3371 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3372 assert(Edge->ContextIds == ContextIdsToMove);
3373 removeEdgeFromGraph(Edge.get());
3374 } else {
3375 // Otherwise just reconnect Edge to NewCallee.
3376 Edge->Callee = NewCallee;
3377 NewCallee->CallerEdges.push_back(Edge);
3378 // Remove it from callee where it was previously connected.
3379 OldCallee->eraseCallerEdge(Edge.get());
3380 // Don't need to update Edge's context ids since we are simply
3381 // reconnecting it.
3382 }
3383 } else {
3384 // Only moving a subset of Edge's ids.
3385 // Compute the alloc type of the subset of ids being moved.
3386 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3387 if (ExistingEdgeToNewCallee) {
3388 // Since we already have an edge to NewCallee, simply move the ids
3389 // onto it.
3390 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3391 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3392 } else {
3393 // Otherwise, create a new edge to NewCallee for the ids being moved.
3394 auto NewEdge = std::make_shared<ContextEdge>(
3395 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3396 Edge->Caller->CalleeEdges.push_back(NewEdge);
3397 NewCallee->CallerEdges.push_back(NewEdge);
3398 }
3399 // In either case, need to update the alloc types on NewCallee, and remove
3400 // those ids and update the alloc type on the original Edge.
3401 NewCallee->AllocTypes |= CallerEdgeAllocType;
3402 set_subtract(Edge->ContextIds, ContextIdsToMove);
3403 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3404 }
3405 // Now walk the old callee node's callee edges and move Edge's context ids
3406 // over to the corresponding edge into the clone (which is created here if
3407 // this is a newly created clone).
3408 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3409 ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3410 // If this is a direct recursion edge, use NewCallee (the clone) as the
3411 // callee as well, so that any edge updated/created here is also direct
3412 // recursive.
3413 if (CalleeToUse == OldCallee) {
3414 // If this is a recursive edge, see if we already moved a recursive edge
3415 // (which would have to have been this one) - if we were only moving a
3416 // subset of context ids it would still be on OldCallee.
3417 if (EdgeIsRecursive) {
3418 assert(OldCalleeEdge == Edge);
3419 continue;
3420 }
3421 CalleeToUse = NewCallee;
3422 }
3423 // The context ids moving to the new callee are the subset of this edge's
3424 // context ids and the context ids on the caller edge being moved.
3425 DenseSet<uint32_t> EdgeContextIdsToMove =
3426 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3427 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3428 OldCalleeEdge->AllocTypes =
3429 computeAllocType(OldCalleeEdge->getContextIds());
3430 if (!NewClone) {
3431 // Update context ids / alloc type on corresponding edge to NewCallee.
3432 // There is a chance this may not exist if we are reusing an existing
3433 // clone, specifically during function assignment, where we would have
3434 // removed none type edges after creating the clone. If we can't find
3435 // a corresponding edge there, fall through to the cloning below.
3436 if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3437 NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3438 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3439 continue;
3440 }
3441 }
3442 auto NewEdge = std::make_shared<ContextEdge>(
3443 CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
3444 EdgeContextIdsToMove);
3445 NewCallee->CalleeEdges.push_back(NewEdge);
3446 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3447 }
3448 // Recompute the node alloc type now that its callee edges have been
3449 // updated (since we will compute from those edges).
3450 OldCallee->AllocTypes = OldCallee->computeAllocType();
3451 // OldCallee alloc type should be None iff its context id set is now empty.
3452 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3453 OldCallee->emptyContextIds());
3454 if (VerifyCCG) {
3455 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3456 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3457 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3458 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3459 /*CheckEdges=*/false);
3460 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3461 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3462 /*CheckEdges=*/false);
3463 }
3464}
3465
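// ---- Editorial note, not part of the original source ----
// Worked example for the partial-move case above (ids are made up): if Edge
// carries ContextIds {1,2,3} and ContextIdsToMove is {1,2}, then a new (or
// pre-existing) edge from Edge->Caller into NewCallee receives {1,2} and their
// alloc type, Edge keeps {3} with a recomputed alloc type, and each of
// OldCallee's callee edges forwards the intersection of its own ids with
// {1,2} to the corresponding edge out of NewCallee.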
3466template <typename DerivedCCG, typename FuncTy, typename CallTy>
3467void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3468 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3469 ContextNode *NewCaller) {
3470 auto *OldCallee = Edge->Callee;
3471 auto *NewCallee = OldCallee;
3472 // If this edge was direct recursive, make any new/updated edge also direct
3473 // recursive to NewCaller.
3474 bool Recursive = Edge->Caller == Edge->Callee;
3475 if (Recursive)
3476 NewCallee = NewCaller;
3477
3478 ContextNode *OldCaller = Edge->Caller;
3479 OldCaller->eraseCalleeEdge(Edge.get());
3480
3481 // We might already have an edge to the new caller. If one exists we will
3482 // reuse it.
3483 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3484
3485 if (ExistingEdgeToNewCaller) {
3486 // Since we already have an edge to NewCaller, simply move the ids
3487 // onto it, and remove the existing Edge.
3488 ExistingEdgeToNewCaller->getContextIds().insert_range(
3489 Edge->getContextIds());
3490 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3491 Edge->ContextIds.clear();
3492 Edge->AllocTypes = (uint8_t)AllocationType::None;
3493 OldCallee->eraseCallerEdge(Edge.get());
3494 } else {
3495 // Otherwise just reconnect Edge to NewCaller.
3496 Edge->Caller = NewCaller;
3497 NewCaller->CalleeEdges.push_back(Edge);
3498 if (Recursive) {
3499 assert(NewCallee == NewCaller);
3500 // In the case of (direct) recursive edges, we update the callee as well
3501 // so that it becomes recursive on the new caller.
3502 Edge->Callee = NewCallee;
3503 NewCallee->CallerEdges.push_back(Edge);
3504 OldCallee->eraseCallerEdge(Edge.get());
3505 }
3506 // Don't need to update Edge's context ids since we are simply
3507 // reconnecting it.
3508 }
3509 // In either case, need to update the alloc types on New Caller.
3510 NewCaller->AllocTypes |= Edge->AllocTypes;
3511
3512 // Now walk the old caller node's caller edges and move Edge's context ids
3513 // over to the corresponding edge into the node (which is created here if
3514 // this is a newly created node). We can tell whether this is a newly created
3515 // node by seeing if it has any caller edges yet.
3516#ifndef NDEBUG
3517 bool IsNewNode = NewCaller->CallerEdges.empty();
3518#endif
3519 // If we just moved a direct recursive edge, presumably its context ids should
3520 // also flow out of OldCaller via some other non-recursive callee edge. We
3521 // don't want to remove the recursive context ids from other caller edges yet,
3522 // otherwise the context ids get into an inconsistent state on OldCaller.
3523 // We will update these context ids on the non-recursive caller edge when and
3524 // if they are updated on the non-recursive callee.
3525 if (!Recursive) {
3526 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3527 auto OldCallerCaller = OldCallerEdge->Caller;
3528 // The context ids moving to the new caller are the subset of this edge's
3529 // context ids and the context ids on the callee edge being moved.
3530 DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3531 OldCallerEdge->getContextIds(), Edge->getContextIds());
3532 if (OldCaller == OldCallerCaller) {
3533 OldCallerCaller = NewCaller;
3534 // Don't actually move this one. The caller will move it directly via a
3535 // call to this function with this as the Edge if it is appropriate to
3536 // move to a different node that has a matching callee (itself).
3537 continue;
3538 }
3539 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3540 OldCallerEdge->AllocTypes =
3541 computeAllocType(OldCallerEdge->getContextIds());
3542 // In this function we expect that any pre-existing node already has edges
3543 // from the same callers as the old node. That should be true in the
3544 // current use case, where we will remove None-type edges after copying
3545 // over all caller edges from the callee.
3546 auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3547 // Since we would have skipped caller edges when moving a direct recursive
3548 // edge, this may not hold true when recursive handling is enabled.
3549 assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3550 if (ExistingCallerEdge) {
3551 ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3552 ExistingCallerEdge->AllocTypes |=
3553 computeAllocType(EdgeContextIdsToMove);
3554 continue;
3555 }
3556 auto NewEdge = std::make_shared<ContextEdge>(
3557 NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
3558 EdgeContextIdsToMove);
3559 NewCaller->CallerEdges.push_back(NewEdge);
3560 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3561 }
3562 }
3563 // Recompute the node alloc type now that its caller edges have been
3564 // updated (since we will compute from those edges).
3565 OldCaller->AllocTypes = OldCaller->computeAllocType();
3566 // OldCaller alloc type should be None iff its context id set is now empty.
3567 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3568 OldCaller->emptyContextIds());
3569 if (VerifyCCG) {
3570 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3571 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3572 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3573 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3574 /*CheckEdges=*/false);
3575 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3576 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3577 /*CheckEdges=*/false);
3578 }
3579}
3580
3581template <typename DerivedCCG, typename FuncTy, typename CallTy>
3582void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3583 recursivelyRemoveNoneTypeCalleeEdges(
3584 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3585 auto Inserted = Visited.insert(Node);
3586 if (!Inserted.second)
3587 return;
3588
3589 removeNoneTypeCalleeEdges(Node);
3590
3591 for (auto *Clone : Node->Clones)
3592 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3593
3594 // The recursive call may remove some of this Node's caller edges.
3595 // Iterate over a copy and skip any that were removed.
3596 auto CallerEdges = Node->CallerEdges;
3597 for (auto &Edge : CallerEdges) {
3598 // Skip any that have been removed by an earlier recursive call.
3599 if (Edge->isRemoved()) {
3600 assert(!is_contained(Node->CallerEdges, Edge));
3601 continue;
3602 }
3603 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3604 }
3605}
3606
3607// This is the standard DFS based backedge discovery algorithm.
3608template <typename DerivedCCG, typename FuncTy, typename CallTy>
3609void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3610 // If we are cloning recursive contexts, find and mark backedges from all root
3611 // callers, using the typical DFS based backedge analysis.
3612 if (!CloneRecursiveContexts)
3613 return;
3614 DenseSet<const ContextNode *> Visited;
3615 DenseSet<const ContextNode *> CurrentStack;
3616 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3617 auto *Node = Entry.second;
3618 if (Node->isRemoved())
3619 continue;
3620 // It is a root if it doesn't have callers.
3621 if (!Node->CallerEdges.empty())
3622 continue;
3623 markBackedges(Node, Visited, CurrentStack);
3624 assert(CurrentStack.empty());
3625 }
3626}
3627
3628// Recursive helper for above markBackedges method.
3629template <typename DerivedCCG, typename FuncTy, typename CallTy>
3630void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3631 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3632 DenseSet<const ContextNode *> &CurrentStack) {
3633 auto I = Visited.insert(Node);
3634 // We should only call this for unvisited nodes.
3635 assert(I.second);
3636 (void)I;
3637 for (auto &CalleeEdge : Node->CalleeEdges) {
3638 auto *Callee = CalleeEdge->Callee;
3639 if (Visited.count(Callee)) {
3640 // Since this was already visited we need to check if it is currently on
3641 // the recursive stack in which case it is a backedge.
3642 if (CurrentStack.count(Callee))
3643 CalleeEdge->IsBackedge = true;
3644 continue;
3645 }
3646 CurrentStack.insert(Callee);
3647 markBackedges(Callee, Visited, CurrentStack);
3648 CurrentStack.erase(Callee);
3649 }
3650}
3651
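// ---- Editorial note, not part of the original source ----
// Worked example of the marking above: given caller->callee edges A -> B,
// B -> C and C -> B (a cycle through B and C, with A a root that has no
// callers), the DFS from A visits B then C; when C's callee B is found both
// visited and still on CurrentStack, the C -> B edge gets IsBackedge set,
// while A -> B and B -> C do not.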
3652template <typename DerivedCCG, typename FuncTy, typename CallTy>
3653void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3654 DenseSet<const ContextNode *> Visited;
3655 for (auto &Entry : AllocationCallToContextNodeMap) {
3656 Visited.clear();
3657 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3658 }
3659 Visited.clear();
3660 for (auto &Entry : AllocationCallToContextNodeMap)
3661 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3662 if (VerifyCCG)
3663 check();
3664}
3665
3666// helper function to check an AllocType is cold or notcold or both.
3667bool checkColdOrNotCold(uint8_t AllocType) {
3668 return (AllocType == (uint8_t)AllocationType::Cold) ||
3669 (AllocType == (uint8_t)AllocationType::NotCold) ||
3670 (AllocType ==
3671 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3672}
3673
3674template <typename DerivedCCG, typename FuncTy, typename CallTy>
3675void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3676 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3677 const DenseSet<uint32_t> &AllocContextIds) {
3678 if (VerifyNodes)
3679 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3680 assert(!Node->CloneOf);
3681
3682 // If Node has a null call, then either it wasn't found in the module (regular
3683 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3684 // cloning (e.g. recursion, calls multiple targets, etc).
3685 // Do this here so that we don't try to recursively clone callers below, which
3686 // isn't useful at least for this node.
3687 if (!Node->hasCall())
3688 return;
3689
3690 // No need to look at any callers if allocation type already unambiguous.
3691 if (hasSingleAllocType(Node->AllocTypes))
3692 return;
3693
3694#ifndef NDEBUG
3695 auto Insert =
3696#endif
3697 Visited.insert(Node);
3698 // We should not have visited this node yet.
3699 assert(Insert.second);
3700 // The recursive call to identifyClones may delete the current edge from the
3701 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3702 // in an iterator and having the recursive call erase from it. Other edges may
3703 // also get removed during the recursion, which will have null Callee and
3704 // Caller pointers (and are deleted later), so we skip those below.
3705 {
3706 auto CallerEdges = Node->CallerEdges;
3707 for (auto &Edge : CallerEdges) {
3708 // Skip any that have been removed by an earlier recursive call.
3709 if (Edge->isRemoved()) {
3710 assert(!is_contained(Node->CallerEdges, Edge));
3711 continue;
3712 }
3713 // Defer backedges. See comments further below where these edges are
3714 // handled during the cloning of this Node.
3715 if (Edge->IsBackedge) {
3716 // We should only mark these if cloning recursive contexts, where we
3717 // need to do this deferral.
3718 assert(CloneRecursiveContexts);
3719 continue;
3720 }
3721 // Ignore any caller we previously visited via another edge.
3722 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3723 identifyClones(Edge->Caller, Visited, AllocContextIds);
3724 }
3725 }
3726 }
3727
3728 // Check if we reached an unambiguous call or have only a single caller.
3729 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3730 return;
3731
3732 // We need to clone.
3733
3734 // Try to keep the original version as alloc type NotCold. This will make
3735 // cases with indirect calls or any other situation with an unknown call to
3736 // the original function get the default behavior. We do this by sorting the
3737 // CallerEdges of the Node we will clone by alloc type.
3738 //
3739 // Give NotCold edge the lowest sort priority so those edges are at the end of
3740 // the caller edges vector, and stay on the original version (since the below
3741 // code clones greedily until it finds all remaining edges have the same type
3742 // and leaves the remaining ones on the original Node).
3743 //
3744 // We shouldn't actually have any None type edges, so the sorting priority for
3745 // that is arbitrary, and we assert in that case below.
3746 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3747 /*Cold*/ 1,
3748 /*NotColdCold*/ 2};
3749 llvm::stable_sort(Node->CallerEdges,
3750 [&](const std::shared_ptr<ContextEdge> &A,
3751 const std::shared_ptr<ContextEdge> &B) {
3752 // Nodes with non-empty context ids should be sorted
3753 // before those with empty context ids.
3754 if (A->ContextIds.empty())
3755 // Either B ContextIds are non-empty (in which case we
3756 // should return false because B < A), or B ContextIds
3757 // are empty, in which case they are equal, and we
3758 // should maintain the original relative ordering.
3759 return false;
3760 if (B->ContextIds.empty())
3761 return true;
3762
3763 if (A->AllocTypes == B->AllocTypes)
3764 // Use the first context id for each edge as a
3765 // tie-breaker.
3766 return *A->ContextIds.begin() < *B->ContextIds.begin();
3767 return AllocTypeCloningPriority[A->AllocTypes] <
3768 AllocTypeCloningPriority[B->AllocTypes];
3769 });
3770
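// ---- Editorial note, not part of the original source ----
// Example of the resulting order (made-up edges): caller edges with alloc
// types {NotCold, NotColdCold, Cold} sort as Cold (priority 1), NotColdCold
// (2), NotCold (4), so the loop below peels cold and mixed callers off into
// clones first, leaving purely NotCold callers on the original Node.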
3771 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3772
3773 DenseSet<uint32_t> RecursiveContextIds;
3774 assert(AllowRecursiveContexts || !CloneRecursiveContexts);
3775 // If we are allowing recursive callsites, but have also disabled recursive
3776 // contexts, look for context ids that show up in multiple caller edges.
3777 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
3778 DenseSet<uint32_t> AllCallerContextIds;
3779 for (auto &CE : Node->CallerEdges) {
3780 // Resize to the largest set of caller context ids, since we know the
3781 // final set will be at least that large.
3782 AllCallerContextIds.reserve(CE->getContextIds().size());
3783 for (auto Id : CE->getContextIds())
3784 if (!AllCallerContextIds.insert(Id).second)
3785 RecursiveContextIds.insert(Id);
3786 }
3787 }
3788
3789 // Iterate until we find no more opportunities for disambiguating the alloc
3790 // types via cloning. In most cases this loop will terminate once the Node
3791 // has a single allocation type, in which case no more cloning is needed.
3792 // Iterate over a copy of Node's caller edges, since we may need to remove
3793 // edges in the moveEdgeTo* methods, and this simplifies the handling and
3794 // makes it less error-prone.
3795 auto CallerEdges = Node->CallerEdges;
3796 for (auto &CallerEdge : CallerEdges) {
3797 // Skip any that have been removed by an earlier recursive call.
3798 if (CallerEdge->isRemoved()) {
3799 assert(!is_contained(Node->CallerEdges, CallerEdge));
3800 continue;
3801 }
3802 assert(CallerEdge->Callee == Node);
3803
3804 // See if cloning the prior caller edge left this node with a single alloc
3805 // type or a single caller. In that case no more cloning of Node is needed.
3806 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3807 break;
3808
3809 // If the caller was not successfully matched to a call in the IR/summary,
3810 // there is no point in trying to clone for it as we can't update that call.
3811 if (!CallerEdge->Caller->hasCall())
3812 continue;
3813
3814 // Only need to process the ids along this edge pertaining to the given
3815 // allocation.
3816 auto CallerEdgeContextsForAlloc =
3817 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3818 if (!RecursiveContextIds.empty())
3819 CallerEdgeContextsForAlloc =
3820 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
3821 if (CallerEdgeContextsForAlloc.empty())
3822 continue;
3823
3824 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3825
3826 // Compute the node callee edge alloc types corresponding to the context ids
3827 // for this caller edge.
3828 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3829 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
3830 for (auto &CalleeEdge : Node->CalleeEdges)
3831 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3832 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3833
3834 // Don't clone if doing so will not disambiguate any alloc types amongst
3835 // caller edges (including the callee edges that would be cloned).
3836 // Otherwise we will simply move all edges to the clone.
3837 //
3838 // First check if by cloning we will disambiguate the caller allocation
3839 // type from node's allocation type. Query allocTypeToUse so that we don't
3840 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3841 // neither of these should be None type.
3842 //
3843 // Then check if by cloning node at least one of the callee edges will be
3844 // disambiguated by splitting out different context ids.
3845 //
3846 // However, always do the cloning if this is a backedge, in which case we
3847 // have not yet cloned along this caller edge.
3848 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3849 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3850 if (!CallerEdge->IsBackedge &&
3851 allocTypeToUse(CallerAllocTypeForAlloc) ==
3852 allocTypeToUse(Node->AllocTypes) &&
3853 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3854 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
3855 continue;
3856 }
3857
3858 if (CallerEdge->IsBackedge) {
3859 // We should only mark these if cloning recursive contexts, where we
3860 // need to do this deferral.
3861 assert(CloneRecursiveContexts);
3862 DeferredBackedges++;
3863 }
3864
3865 // If this is a backedge, we now do recursive cloning starting from its
3866 // caller since we may have moved unambiguous caller contexts to a clone
3867 // of this Node in a previous iteration of the current loop, giving more
3868 // opportunity for cloning through the backedge. Because we sorted the
3869 // caller edges earlier so that cold caller edges are first, we would have
3870 // visited and cloned this node for any unambiguously cold non-recursive
3871 // callers before any ambiguous backedge callers. Note that we don't do this
3872 // if the caller is already cloned or visited during cloning (e.g. via a
3873 // different context path from the allocation).
3874 // TODO: Can we do better in the case where the caller was already visited?
3875 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
3876 !Visited.count(CallerEdge->Caller)) {
3877 const auto OrigIdCount = CallerEdge->getContextIds().size();
3878 // Now do the recursive cloning of this backedge's caller, which was
3879 // deferred earlier.
3880 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
3881 removeNoneTypeCalleeEdges(CallerEdge->Caller);
3882 // See if the recursive call to identifyClones moved the context ids to a
3883 // new edge from this node to a clone of caller, and switch to looking at
3884 // that new edge so that we clone Node for the new caller clone.
3885 bool UpdatedEdge = false;
3886 if (OrigIdCount > CallerEdge->getContextIds().size()) {
3887 for (auto E : Node->CallerEdges) {
3888 // Only interested in clones of the current edge's caller.
3889 if (E->Caller->CloneOf != CallerEdge->Caller)
3890 continue;
3891 // See if this edge contains any of the context ids originally on the
3892 // current caller edge.
3893 auto CallerEdgeContextsForAllocNew =
3894 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
3895 if (CallerEdgeContextsForAllocNew.empty())
3896 continue;
3897 // Make sure we don't pick a previously existing caller edge of this
3898 // Node, which would be processed on a different iteration of the
3899 // outer loop over the saved CallerEdges.
3900 if (llvm::is_contained(CallerEdges, E))
3901 continue;
3902 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
3903 // are updated further below for all cases where we just invoked
3904 // identifyClones recursively.
3905 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
3906 CallerEdge = E;
3907 UpdatedEdge = true;
3908 break;
3909 }
3910 }
3911 // If cloning removed this edge (and we didn't update it to a new edge
3912 // above), we're done with this edge. It's possible we moved all of the
3913 // context ids to an existing clone, in which case there's no need to do
3914 // further processing for them.
3915 if (CallerEdge->isRemoved())
3916 continue;
3917
3918 // Now we need to update the information used for the cloning decisions
3919 // further below, as we may have modified edges and their context ids.
3920
3921 // Note if we changed the CallerEdge above we would have already updated
3922 // the context ids.
3923 if (!UpdatedEdge) {
3924 CallerEdgeContextsForAlloc = set_intersection(
3925 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
3926 if (CallerEdgeContextsForAlloc.empty())
3927 continue;
3928 }
3929 // Update the other information that depends on the edges and on the now
3930 // updated CallerEdgeContextsForAlloc.
3931 CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3932 CalleeEdgeAllocTypesForCallerEdge.clear();
3933 for (auto &CalleeEdge : Node->CalleeEdges) {
3934 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3935 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3936 }
3937 }
3938
3939 // First see if we can use an existing clone. Check each clone and its
3940 // callee edges for matching alloc types.
3941 ContextNode *Clone = nullptr;
3942 for (auto *CurClone : Node->Clones) {
3943 if (allocTypeToUse(CurClone->AllocTypes) !=
3944 allocTypeToUse(CallerAllocTypeForAlloc))
3945 continue;
3946
3947 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3948 hasSingleAllocType(CallerAllocTypeForAlloc);
3949 // The above check should mean that if both have single alloc types that
3950 // they should be equal.
3951 assert(!BothSingleAlloc ||
3952 CurClone->AllocTypes == CallerAllocTypeForAlloc);
3953
3954 // If either both have a single alloc type (which are the same), or if the
3955 // clone's callee edges have the same alloc types as those for the current
3956 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3957 // then we can reuse this clone.
3958 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3959 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3960 Clone = CurClone;
3961 break;
3962 }
3963 }
3964
3965 // The edge iterator is adjusted when we move the CallerEdge to the clone.
3966 if (Clone)
3967 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
3968 CallerEdgeContextsForAlloc);
3969 else
3970 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
3971
3972 // Sanity check that no alloc types on clone or its edges are None.
3973 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3974 }
3975
3976 // We should still have some context ids on the original Node.
3977 assert(!Node->emptyContextIds());
3978
3979 // Sanity check that no alloc types on node or edges are None.
3980 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3981
3982 if (VerifyNodes)
3983 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3984}
3985
3986void ModuleCallsiteContextGraph::updateAllocationCall(
3987 CallInfo &Call, AllocationType AllocType) {
3988 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
3990 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
3991 "memprof", AllocTypeString);
3992 cast<CallBase>(Call.call())->addFnAttr(A);
3993 OREGetter(Call.call()->getFunction())
3994 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3995 << ore::NV("AllocationCall", Call.call()) << " in clone "
3996 << ore::NV("Caller", Call.call()->getFunction())
3997 << " marked with memprof allocation attribute "
3998 << ore::NV("Attribute", AllocTypeString));
3999}
4000
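// ---- Editorial note, not part of the original source ----
// After updateAllocationCall, the allocation call site carries a "memprof"
// string function attribute; in textual IR this looks roughly like
// (illustrative only):
//
//   %call = call ptr @_Znwm(i64 32) #1
//   ...
//   attributes #1 = { "memprof"="cold" }
//
// getAllocationCallType below reads this same attribute back when computing
// statistics and the size reports.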
4001void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
4002 AllocationType AllocType) {
4003 auto *AI = cast<AllocInfo *>(Call.call());
4004 assert(AI);
4005 assert(AI->Versions.size() > Call.cloneNo());
4006 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
4007}
4008
4009AllocationType
4010ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4011 const auto *CB = cast<CallBase>(Call.call());
4012 if (!CB->getAttributes().hasFnAttr("memprof"))
4013 return AllocationType::None;
4014 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4015 ? AllocationType::Cold
4016 : AllocationType::NotCold;
4017}
4018
4019AllocationType
4020IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4021 const auto *AI = cast<AllocInfo *>(Call.call());
4022 assert(AI->Versions.size() > Call.cloneNo());
4023 return (AllocationType)AI->Versions[Call.cloneNo()];
4024}
4025
4026void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4027 FuncInfo CalleeFunc) {
4028 auto *CurF = getCalleeFunc(CallerCall.call());
4029 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4030 if (isMemProfClone(*CurF)) {
4031 // If we already assigned this callsite to call a specific non-default
4032 // clone (i.e. not the original function which is clone 0), ensure that we
4033 // aren't trying to now update it to call a different clone, which is
4034 // indicative of a bug in the graph or function assignment.
4035 auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
4036 if (CurCalleeCloneNo != NewCalleeCloneNo) {
4037 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4038 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4039 << "\n");
4040 MismatchedCloneAssignments++;
4041 }
4042 }
4043 if (NewCalleeCloneNo > 0)
4044 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
4045 OREGetter(CallerCall.call()->getFunction())
4046 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
4047 << ore::NV("Call", CallerCall.call()) << " in clone "
4048 << ore::NV("Caller", CallerCall.call()->getFunction())
4049 << " assigned to call function clone "
4050 << ore::NV("Callee", CalleeFunc.func()));
4051}
4052
4053void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4054 FuncInfo CalleeFunc) {
4055 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
4056 assert(CI &&
4057 "Caller cannot be an allocation which should not have profiled calls");
4058 assert(CI->Clones.size() > CallerCall.cloneNo());
4059 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4060 auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
4061 // If we already assigned this callsite to call a specific non-default
4062 // clone (i.e. not the original function which is clone 0), ensure that we
4063 // aren't trying to now update it to call a different clone, which is
4064 // indicative of a bug in the graph or function assignment.
4065 if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
4066 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4067 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4068 << "\n");
4069 MismatchedCloneAssignments++;
4070 }
4071 CurCalleeCloneNo = NewCalleeCloneNo;
4072}
4073
4074 // Update the debug information attached to NewFunc to use the clone Name. Note
4075 // this needs to be done both for any existing DISubprogram for the definition
4076 // and for any separate declaration DISubprogram.
4077 static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
4078 assert(Name == NewFunc->getName());
4079 auto *SP = NewFunc->getSubprogram();
4080 if (!SP)
4081 return;
4082 auto *MDName = MDString::get(NewFunc->getParent()->getContext(), Name);
4083 SP->replaceLinkageName(MDName);
4084 DISubprogram *Decl = SP->getDeclaration();
4085 if (!Decl)
4086 return;
4087 TempDISubprogram NewDecl = Decl->clone();
4088 NewDecl->replaceLinkageName(MDName);
4089 SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl)));
4090}
4091
4092CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4093 Instruction *>::FuncInfo
4094ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4095 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4096 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4097 // Use existing LLVM facilities for cloning and obtaining Call in clone
4098 ValueToValueMapTy VMap;
4099 auto *NewFunc = CloneFunction(Func.func(), VMap);
4100 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4101 assert(!Func.func()->getParent()->getFunction(Name));
4102 NewFunc->setName(Name);
4103 updateSubprogramLinkageName(NewFunc, Name);
4104 for (auto &Inst : CallsWithMetadataInFunc) {
4105 // This map always has the initial version in it.
4106 assert(Inst.cloneNo() == 0);
4107 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4108 }
4109 OREGetter(Func.func())
4110 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4111 << "created clone " << ore::NV("NewFunction", NewFunc));
4112 return {NewFunc, CloneNo};
4113}
4114
4115CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4116 IndexCall>::FuncInfo
4117IndexCallsiteContextGraph::cloneFunctionForCallsite(
4118 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4119 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4120 // Check how many clones we have of Call (and therefore function).
4121 // The next clone number is the current size of versions array.
4122 // Confirm this matches the CloneNo provided by the caller, which is based on
4123 // the number of function clones we have.
4124 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4125 ? cast<AllocInfo *>(Call.call())->Versions.size()
4126 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4127 // Walk all the instructions in this function. Create a new version for
4128 // each (by adding an entry to the Versions/Clones summary array), and copy
4129 // over the version being called for the function clone being cloned here.
4130 // Additionally, add an entry to the CallMap for the new function clone,
4131 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4132 // to the new call clone.
4133 for (auto &Inst : CallsWithMetadataInFunc) {
4134 // This map always has the initial version in it.
4135 assert(Inst.cloneNo() == 0);
4136 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4137 assert(AI->Versions.size() == CloneNo);
4138 // We assign the allocation type later (in updateAllocationCall), just add
4139 // an entry for it here.
4140 AI->Versions.push_back(0);
4141 } else {
4142 auto *CI = cast<CallsiteInfo *>(Inst.call());
4143 assert(CI && CI->Clones.size() == CloneNo);
4144 // We assign the clone number later (in updateCall), just add an entry for
4145 // it here.
4146 CI->Clones.push_back(0);
4147 }
4148 CallMap[Inst] = {Inst.call(), CloneNo};
4149 }
4150 return {Func.func(), CloneNo};
4151}
4152
4153// We perform cloning for each allocation node separately. However, this
4154// sometimes results in a situation where the same node calls multiple
4155// clones of the same callee, created for different allocations. This
4156// causes issues when assigning functions to these clones, as each node can
4157// in reality only call a single callee clone.
4158//
4159// To address this, before assigning functions, merge callee clone nodes as
4160// needed using a post order traversal from the allocations. We attempt to
4161// use existing clones as the merge node when legal, and to share them
4162// among callers with the same properties (callers calling the same set of
4163// callee clone nodes for the same allocations).
4164//
4165// Without this fix, in some cases incorrect function assignment will lead
4166// to calling the wrong allocation clone.
4167template <typename DerivedCCG, typename FuncTy, typename CallTy>
4168void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4169 if (!MergeClones)
4170 return;
4171
4172 // Generate a map from context id to the associated allocation node for use
4173 // when merging clones.
4174 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4175 for (auto &Entry : AllocationCallToContextNodeMap) {
4176 auto *Node = Entry.second;
4177 for (auto Id : Node->getContextIds())
4178 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4179 for (auto *Clone : Node->Clones) {
4180 for (auto Id : Clone->getContextIds())
4181 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4182 }
4183 }
4184
4185 // Post order traversal starting from allocations to ensure each callsite
4186 // calls a single clone of its callee. Callee nodes that are clones of each
4187 // other are merged (via new merge nodes if needed) to achieve this.
4188 DenseSet<const ContextNode *> Visited;
4189 for (auto &Entry : AllocationCallToContextNodeMap) {
4190 auto *Node = Entry.second;
4191
4192 mergeClones(Node, Visited, ContextIdToAllocationNode);
4193
4194 // Make a copy so the recursive post order traversal that may create new
4195 // clones doesn't mess up iteration. Note that the recursive traversal
4196 // itself does not call mergeClones on any of these nodes, which are all
4197 // (clones of) allocations.
4198 auto Clones = Node->Clones;
4199 for (auto *Clone : Clones)
4200 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4201 }
4202
4203 if (DumpCCG) {
4204 dbgs() << "CCG after merging:\n";
4205 dbgs() << *this;
4206 }
4207 if (ExportToDot)
4208 exportToDot("aftermerge");
4209
4210 if (VerifyCCG) {
4211 check();
4212 }
4213}
4214
4215// Recursive helper for above mergeClones method.
4216template <typename DerivedCCG, typename FuncTy, typename CallTy>
4217void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4218 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4219 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4220 auto Inserted = Visited.insert(Node);
4221 if (!Inserted.second)
4222 return;
4223
4224 // Iteratively perform merging on this node to handle new caller nodes created
4225 // during the recursive traversal. We could do something more elegant such as
4226 // maintain a worklist, but this is a simple approach that doesn't cause a
4227 // measurable compile time effect, as most nodes don't have many caller
4228 // edges to check.
4229 bool FoundUnvisited = true;
4230 unsigned Iters = 0;
4231 while (FoundUnvisited) {
4232 Iters++;
4233 FoundUnvisited = false;
4234 // Make a copy since the recursive call may move a caller edge to a new
4235 // callee, messing up the iterator.
4236 auto CallerEdges = Node->CallerEdges;
4237 for (auto CallerEdge : CallerEdges) {
4238 // Skip any caller edge moved onto a different callee during recursion.
4239 if (CallerEdge->Callee != Node)
4240 continue;
4241 // If we found an unvisited caller, note that we should check the caller
4242 // edges again as mergeClones may add or change caller nodes.
4243 if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
4244 FoundUnvisited = true;
4245 mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4246 }
4247 }
4248
4249 TotalMergeInvokes++;
4250 TotalMergeIters += Iters;
4251 if (Iters > MaxMergeIters)
4252 MaxMergeIters = Iters;
4253
4254 // Merge for this node after we handle its callers.
4255 mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4256}
4257
4258template <typename DerivedCCG, typename FuncTy, typename CallTy>
4259void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4260 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4261 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4262 // Ignore Node if we moved all of its contexts to clones.
4263 if (Node->emptyContextIds())
4264 return;
4265
4266 // First identify groups of clones among Node's callee edges, by building
4267 // a map from each callee base node to the associated callee edges from Node.
4268 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4269 OrigNodeToCloneEdges;
4270 for (const auto &E : Node->CalleeEdges) {
4271 auto *Callee = E->Callee;
4272 if (!Callee->CloneOf && Callee->Clones.empty())
4273 continue;
4274 ContextNode *Base = Callee->getOrigNode();
4275 OrigNodeToCloneEdges[Base].push_back(E);
4276 }
4277
4278 // Helper for callee edge sorting below. Orders edge A before edge B if A's
4279 // callee has fewer caller edges than B's; ties are broken by placing clones
4280 // before non-clones, and then by comparing the first context id on each edge.
4281 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4282 const std::shared_ptr<ContextEdge> &B) {
4283 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4284 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4285 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4286 return true;
4287 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4288 return false;
4289 // Use the first context id for each edge as a
4290 // tie-breaker.
4291 return *A->ContextIds.begin() < *B->ContextIds.begin();
4292 };
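// For example (hypothetical values): given edges whose callees have
// (CallerEdges.size(), is-clone, first context id) of (1, false, 7),
// (1, true, 5) and (3, true, 2), stable_sort with this comparator orders them
// as (1, true, 5), (1, false, 7), (3, true, 2): fewest caller edges first,
// clones before the original on a tie, then the smallest leading context id.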
4293
4294 // Process each set of callee clones called by Node, performing the needed
4295 // merging.
4296 for (auto Entry : OrigNodeToCloneEdges) {
4297 // CalleeEdges is the set of edges from Node reaching callees that are
4298 // mutual clones of each other.
4299 auto &CalleeEdges = Entry.second;
4300 auto NumCalleeClones = CalleeEdges.size();
4301 // A single edge means there is no merging needed.
4302 if (NumCalleeClones == 1)
4303 continue;
4304 // Sort the CalleeEdges calling this group of clones in ascending order of
4305 // their caller edge counts, preferring clones over the original node in
4306 // cases of a tie. This simplifies finding an existing node to use as the
4307 // merge node.
4308 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4309
4310 /// Find other callers of the given set of callee edges that can
4311 /// share the same callee merge node. See the comments at this method
4312 /// definition for details.
4313 DenseSet<ContextNode *> OtherCallersToShareMerge;
4314 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4315 OtherCallersToShareMerge);
4316
4317 // Now do the actual merging. Identify existing or create a new MergeNode
4318 // during the first iteration. Move each callee over, along with edges from
4319 // other callers we've determined above can share the same merge node.
4320 ContextNode *MergeNode = nullptr;
4321 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4322 for (auto CalleeEdge : CalleeEdges) {
4323 auto *OrigCallee = CalleeEdge->Callee;
4324 // If we don't have a MergeNode yet (only happens on the first iteration,
4325 // as a new one will be created when we go to move the first callee edge
4326 // over as needed), see if we can use this callee.
4327 if (!MergeNode) {
4328 // If there are no other callers, simply use this callee.
4329 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4330 MergeNode = OrigCallee;
4331 NonNewMergedNodes++;
4332 continue;
4333 }
4334 // Otherwise, if we have identified other caller nodes that can share
4335 // the merge node with Node, see if all of OrigCallee's callers are
4336 // going to share the same merge node. In that case we can use callee
4337 // (since all of its callers would move to the new merge node).
4338 if (!OtherCallersToShareMerge.empty()) {
4339 bool MoveAllCallerEdges = true;
4340 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4341 if (CalleeCallerE == CalleeEdge)
4342 continue;
4343 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4344 MoveAllCallerEdges = false;
4345 break;
4346 }
4347 }
4348 // If we are going to move all callers over, we can use this callee as
4349 // the MergeNode.
4350 if (MoveAllCallerEdges) {
4351 MergeNode = OrigCallee;
4352 NonNewMergedNodes++;
4353 continue;
4354 }
4355 }
4356 }
4357 // Move this callee edge, creating a new merge node if necessary.
4358 if (MergeNode) {
4359 assert(MergeNode != OrigCallee);
4360 moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
4361 /*NewClone*/ false);
4362 } else {
4363 MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
4364 NewMergedNodes++;
4365 }
4366 // Now move all identified edges from other callers over to the merge node
4367 // as well.
4368 if (!OtherCallersToShareMerge.empty()) {
4369 // Make and iterate over a copy of OrigCallee's caller edges because
4370 // some of these will be moved off of the OrigCallee and that would mess
4371 // up the iteration from OrigCallee.
4372 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4373 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4374 if (CalleeCallerE == CalleeEdge)
4375 continue;
4376 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4377 continue;
4378 CallerToMoveCount[CalleeCallerE->Caller]++;
4379 moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
4380 /*NewClone*/ false);
4381 }
4382 }
4383 removeNoneTypeCalleeEdges(OrigCallee);
4384 removeNoneTypeCalleeEdges(MergeNode);
4385 }
4386 }
4387}
4388
4389// Look for other nodes that have edges to the same set of callee
4390// clones as the current Node. Those can share the eventual merge node
4391// (reducing cloning and binary size overhead) iff:
4392// - they have edges to the same set of callee clones
4393// - each callee edge reaches a subset of the same allocations as Node's
4394// corresponding edge to the same callee clone.
4395// The second requirement is to ensure that we don't undo any of the
4396// necessary cloning to distinguish contexts with different allocation
4397// behavior.
4398// FIXME: This is somewhat conservative, as we really just need to ensure
4399// that they don't reach the same allocations as contexts on edges from Node
4400// going to any of the *other* callee clones being merged. However, that
4401// requires more tracking and checking to get right.
4402template <typename DerivedCCG, typename FuncTy, typename CallTy>
4403void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
4404 findOtherCallersToShareMerge(
4405 ContextNode *Node,
4406 std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
4407 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
4408 DenseSet<ContextNode *> &OtherCallersToShareMerge) {
4409 auto NumCalleeClones = CalleeEdges.size();
4410 // This map counts how many edges to the same callee clone exist for other
4411 // caller nodes of each callee clone.
4412 DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
4413 // Counts the number of other caller nodes that have edges to all callee
4414 // clones that don't violate the allocation context checking.
4415 unsigned PossibleOtherCallerNodes = 0;
4416
4417 // We only need to look at other Caller nodes if the first callee edge has
4418 // multiple callers (recall they are sorted in ascending order above).
4419 if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
4420 return;
4421
4422 // For each callee edge:
4423 // - Collect the count of other caller nodes calling the same callees.
4424 // - Collect the alloc nodes reached by contexts on each callee edge.
4425 DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
4426 for (auto CalleeEdge : CalleeEdges) {
4427 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4428 // For each other caller of the same callee, increment the count of
4429 // edges reaching the same callee clone.
4430 for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
4431 if (CalleeCallerEdges->Caller == Node) {
4432 assert(CalleeCallerEdges == CalleeEdge);
4433 continue;
4434 }
4435 OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
4436 // If this caller edge now reaches all of the same callee clones,
4437 // increment the count of candidate other caller nodes.
4438 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
4439 NumCalleeClones)
4440 PossibleOtherCallerNodes++;
4441 }
4442 // Collect the alloc nodes reached by contexts on each callee edge, for
4443 // later analysis.
4444 for (auto Id : CalleeEdge->getContextIds()) {
4445 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4446 if (!Alloc) {
4447 // FIXME: unclear why this happens occasionally, presumably
4448 // imperfect graph updates possibly with recursion.
4449 MissingAllocForContextId++;
4450 continue;
4451 }
4452 CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
4453 }
4454 }
4455
4456 // Now walk the callee edges again, and make sure that for each candidate
4457 // caller node all of its edges to the callees reach the same allocs (or
4458 // a subset) as those along the corresponding callee edge from Node.
4459 for (auto CalleeEdge : CalleeEdges) {
4460 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4461 // Stop if we do not have any (more) candidate other caller nodes.
4462 if (!PossibleOtherCallerNodes)
4463 break;
4464 auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
4465 // Check each other caller of this callee clone.
4466 for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
4467 // Not interested in the callee edge from Node itself.
4468 if (CalleeCallerE == CalleeEdge)
4469 continue;
4470 // Skip any callers that didn't have callee edges to all the same
4471 // callee clones.
4472 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
4473 NumCalleeClones)
4474 continue;
4475 // Make sure that each context along edge from candidate caller node
4476 // reaches an allocation also reached by this callee edge from Node.
4477 for (auto Id : CalleeCallerE->getContextIds()) {
4478 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4479 if (!Alloc)
4480 continue;
4481 // If not, simply reset the map entry to 0 so caller is ignored, and
4482 // reduce the count of candidate other caller nodes.
4483 if (!CurCalleeAllocNodes.contains(Alloc)) {
4484 OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
4485 PossibleOtherCallerNodes--;
4486 break;
4487 }
4488 }
4489 }
4490 }
4491
4492 if (!PossibleOtherCallerNodes)
4493 return;
4494
4495 // Build the set of other caller nodes that can use the same callee merge
4496 // node.
4497 for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
4498 if (Count != NumCalleeClones)
4499 continue;
4500 OtherCallersToShareMerge.insert(OtherCaller);
4501 }
4502}
4503
4504// This method assigns cloned callsites to functions, cloning the functions as
4505// needed. The assignment is greedy and proceeds roughly as follows:
4506//
4507// For each function Func:
4508// For each call with graph Node having clones:
4509// Initialize ClonesWorklist to Node and its clones
4510// Initialize NodeCloneCount to 0
4511// While ClonesWorklist is not empty:
4512// Clone = pop front ClonesWorklist
4513// NodeCloneCount++
4514// If Func has been cloned less than NodeCloneCount times:
4515// If NodeCloneCount is 1:
4516// Assign Clone to original Func
4517// Continue
4518// Create a new function clone
4519// If other callers not assigned to call a function clone yet:
4520// Assign them to call new function clone
4521// Continue
4522// Assign any other caller calling the cloned version to new clone
4523//
4524// For each caller of Clone:
4525// If caller is assigned to call a specific function clone:
4526// If we cannot assign Clone to that function clone:
4527// Create new callsite Clone NewClone
4528// Add NewClone to ClonesWorklist
4529// Continue
4530// Assign Clone to existing caller's called function clone
4531// Else:
4532// If Clone not already assigned to a function clone:
4533// Assign to first function clone without assignment
4534// Assign caller to selected function clone
4535// For each call with graph Node having clones:
4536// If number func clones > number call's callsite Node clones:
4537// Record func CallInfo clones without Node clone in UnassignedCallClones
4538// For callsite Nodes in DFS order from allocations:
4539// If IsAllocation:
4540// Update allocation with alloc type
4541// Else:
4542 // For Call, all MatchingCalls, and associated UnassignedCallClones:
4543// Update call to call recorded callee clone
4544//
4545template <typename DerivedCCG, typename FuncTy, typename CallTy>
4546bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4547 bool Changed = false;
4548
4549 mergeClones();
4550
4551 // Keep track of the assignment of nodes (callsites) to function clones they
4552 // call.
4553 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4554
4555 // Update caller node to call function version CalleeFunc, by recording the
4556 // assignment in CallsiteToCalleeFuncCloneMap.
4557 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4558 const FuncInfo &CalleeFunc) {
4559 assert(Caller->hasCall());
4560 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4561 };
4562
4563 // Information for a single clone of this Func.
4564 struct FuncCloneInfo {
4565 // The function clone.
4566 FuncInfo FuncClone;
4567 // Remappings of each call of interest (from original uncloned call to the
4568 // corresponding cloned call in this function clone).
4569 DenseMap<CallInfo, CallInfo> CallMap;
4570 };
4571
4572 // Map to keep track of information needed to update calls in function clones
4573 // when their corresponding callsite node was not itself cloned for that
4574 // function clone. Because of call context pruning (i.e. we only keep as much
4575 // caller information as needed to distinguish hot vs cold), we may not have
4576 // caller edges coming to each callsite node from all possible function
4577 // callers. A function clone may get created for other callsites in the
4578 // function for which there are caller edges that were not pruned. Any other
4579 // callsites in that function clone, which were not themselves cloned for
4580 // that function clone, should get updated the same way as the corresponding
4581 // callsite in the original function (which may call a clone of its callee).
4582 //
4583 // We build this map after completing function cloning for each function, so
4584 // that we can record the information from its call maps before they are
4585 // destructed. The map will be used as we update calls to update any still
4586 // unassigned call clones. Note that we may create new node clones as we clone
4587 // other functions, so later on we check which node clones were still not
4588 // created. To this end, the inner map is a map from function clone number to
4589 // the list of calls cloned for that function (can be more than one due to the
4590 // Node's MatchingCalls array).
4591 //
4592 // The alternative is creating new callsite clone nodes below as we clone the
4593 // function, but that is trickier to get right and likely adds more overhead.
4594 //
4595 // Inner map is a std::map so sorted by key (clone number), in order to get
4596 // ordered remarks in the full LTO case.
4597 DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
4598 UnassignedCallClones;
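// Hypothetical shape of this map for a node N in function Foo, where Foo
// received an extra function clone for which N itself was never cloned:
//   UnassignedCallClones[N] = { <func clone no> : [N's call in that function
//                               clone, N's matching calls in that clone] }
// When calls are updated later, these entries are rewritten to use the same
// callee assignment as the original N, unless a node clone covering that
// function clone has been created in the meantime.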
4599
4600 // Walk all functions for which we saw calls with memprof metadata, and handle
4601 // cloning for each such function's calls.
4602 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4603 FuncInfo OrigFunc(Func);
4604 // Map from each clone number of OrigFunc to information about that function
4605 // clone (the function clone FuncInfo and call remappings). The index into
4606 // the vector is the clone number, as function clones are created and
4607 // numbered sequentially.
4608 std::vector<FuncCloneInfo> FuncCloneInfos;
4609 for (auto &Call : CallsWithMetadata) {
4610 ContextNode *Node = getNodeForInst(Call);
4611 // Skip call if we do not have a node for it (all uses of its stack ids
4612 // were either on inlined chains or pruned from the MIBs), or if we did
4613 // not create any clones for it.
4614 if (!Node || Node->Clones.empty())
4615 continue;
4616 assert(Node->hasCall() &&
4617 "Not having a call should have prevented cloning");
4618
4619 // Track the assignment of function clones to clones of the current
4620 // callsite Node being handled.
4621 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4622
4623 // Assign callsite version CallsiteClone to function version FuncClone,
4624 // and also assign (possibly cloned) Call to CallsiteClone.
4625 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4626 CallInfo &Call,
4627 ContextNode *CallsiteClone,
4628 bool IsAlloc) {
4629 // Record the clone of callsite node assigned to this function clone.
4630 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4631
4632 assert(FuncCloneInfos.size() > FuncClone.cloneNo());
4633 DenseMap<CallInfo, CallInfo> &CallMap =
4634 FuncCloneInfos[FuncClone.cloneNo()].CallMap;
4635 CallInfo CallClone(Call);
4636 if (auto It = CallMap.find(Call); It != CallMap.end())
4637 CallClone = It->second;
4638 CallsiteClone->setCall(CallClone);
4639 // Need to do the same for all matching calls.
4640 for (auto &MatchingCall : Node->MatchingCalls) {
4641 CallInfo CallClone(MatchingCall);
4642 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4643 CallClone = It->second;
4644 // Updates the call in the list.
4645 MatchingCall = CallClone;
4646 }
4647 };
4648
4649 // Invokes moveEdgeToNewCalleeClone which creates a new clone, and then
4650 // performs the necessary fixups (removing none type edges, and
4651 // importantly, propagating any function call assignment of the original
4652 // node to the new clone).
4653 auto MoveEdgeToNewCalleeCloneAndSetUp =
4654 [&](const std::shared_ptr<ContextEdge> &Edge) {
4655 ContextNode *OrigCallee = Edge->Callee;
4656 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4657 removeNoneTypeCalleeEdges(NewClone);
4658 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4659 // If the original Callee was already assigned to call a specific
4660 // function version, make sure its new clone is assigned to call
4661 // that same function clone.
4662 if (CallsiteToCalleeFuncCloneMap.count(OrigCallee))
4663 RecordCalleeFuncOfCallsite(
4664 NewClone, CallsiteToCalleeFuncCloneMap[OrigCallee]);
4665 return NewClone;
4666 };
4667
4668 // Keep track of the clones of callsite Node that need to be assigned to
4669 // function clones. This list may be expanded in the loop body below if we
4670 // find additional cloning is required.
4671 std::deque<ContextNode *> ClonesWorklist;
4672 // Ignore original Node if we moved all of its contexts to clones.
4673 if (!Node->emptyContextIds())
4674 ClonesWorklist.push_back(Node);
4675 llvm::append_range(ClonesWorklist, Node->Clones);
4676
4677 // Now walk through all of the clones of this callsite Node that we need,
4678 // and determine the assignment to a corresponding clone of the current
4679 // function (creating new function clones as needed).
4680 unsigned NodeCloneCount = 0;
4681 while (!ClonesWorklist.empty()) {
4682 ContextNode *Clone = ClonesWorklist.front();
4683 ClonesWorklist.pop_front();
4684 NodeCloneCount++;
4685 if (VerifyNodes)
4686 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4687
4688 // Need to create a new function clone if we have more callsite clones
4689 // than existing function clones, which would have been assigned to an
4690 // earlier clone in the list (we assign callsite clones to function
4691 // clones greedily).
4692 if (FuncCloneInfos.size() < NodeCloneCount) {
4693 // If this is the first callsite copy, assign to original function.
4694 if (NodeCloneCount == 1) {
4695 // Since FuncCloneInfos is empty in this case, no clones have
4696 // been created for this function yet, and no callers should have
4697 // been assigned a function clone for this callee node yet.
4698 assert(llvm::none_of(
4699 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4700 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4701 }));
4702 // Initialize with empty call map, assign Clone to original function
4703 // and its callers, and skip to the next clone.
4704 FuncCloneInfos.push_back(
4705 {OrigFunc, DenseMap<CallInfo, CallInfo>()});
4706 AssignCallsiteCloneToFuncClone(
4707 OrigFunc, Call, Clone,
4708 AllocationCallToContextNodeMap.count(Call));
4709 for (auto &CE : Clone->CallerEdges) {
4710 // Ignore any caller that does not have a recorded callsite Call.
4711 if (!CE->Caller->hasCall())
4712 continue;
4713 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
4714 }
4715 continue;
4716 }
4717
4718 // First locate which copy of OrigFunc to clone again. If a caller
4719 // of this callsite clone was already assigned to call a particular
4720 // function clone, we need to redirect all of those callers to the
4721 // new function clone, and update their other callees within this
4722 // function.
4723 FuncInfo PreviousAssignedFuncClone;
4724 auto EI = llvm::find_if(
4725 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4726 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4727 });
4728 bool CallerAssignedToCloneOfFunc = false;
4729 if (EI != Clone->CallerEdges.end()) {
4730 const std::shared_ptr<ContextEdge> &Edge = *EI;
4731 PreviousAssignedFuncClone =
4732 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4733 CallerAssignedToCloneOfFunc = true;
4734 }
4735
4736 // Clone function and save it along with the CallInfo map created
4737 // during cloning in the FuncCloneInfos.
4738 DenseMap<CallInfo, CallInfo> NewCallMap;
4739 unsigned CloneNo = FuncCloneInfos.size();
4740 assert(CloneNo > 0 && "Clone 0 is the original function, which "
4741 "should already exist in the map");
4742 FuncInfo NewFuncClone = cloneFunctionForCallsite(
4743 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
4744 FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
4745 FunctionClonesAnalysis++;
4746 Changed = true;
4747
4748 // If no caller callsites were already assigned to a clone of this
4749 // function, we can simply assign this clone to the new func clone
4750 // and update all callers to it, then skip to the next clone.
4751 if (!CallerAssignedToCloneOfFunc) {
4752 AssignCallsiteCloneToFuncClone(
4753 NewFuncClone, Call, Clone,
4754 AllocationCallToContextNodeMap.count(Call));
4755 for (auto &CE : Clone->CallerEdges) {
4756 // Ignore any caller that does not have a recorded callsite Call.
4757 if (!CE->Caller->hasCall())
4758 continue;
4759 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4760 }
4761 continue;
4762 }
4763
4764 // We may need to do additional node cloning in this case.
4765 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
4766 // that were previously assigned to call PreviousAssignedFuncClone,
4767 // to record that they now call NewFuncClone.
4768 // The none type edge removal may remove some of this Clone's caller
4769 // edges, if it is reached via another of its caller's callees.
4770 // Iterate over a copy and skip any that were removed.
4771 auto CallerEdges = Clone->CallerEdges;
4772 for (auto CE : CallerEdges) {
4773 // Skip any that have been removed on an earlier iteration.
4774 if (CE->isRemoved()) {
4775 assert(!is_contained(Clone->CallerEdges, CE));
4776 continue;
4777 }
4778 assert(CE);
4779 // Ignore any caller that does not have a recorded callsite Call.
4780 if (!CE->Caller->hasCall())
4781 continue;
4782
4783 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
4784 // We subsequently fall through to later handling that
4785 // will perform any additional cloning required for
4786 // callers that were calling other function clones.
4787 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
4788 PreviousAssignedFuncClone)
4789 continue;
4790
4791 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4792
4793 // If we are cloning a function that was already assigned to some
4794 // callers, then essentially we are creating new callsite clones
4795 // of the other callsites in that function that are reached by those
4796 // callers. Clone the other callees of the current callsite's caller
4797 // that were already assigned to PreviousAssignedFuncClone
4798 // accordingly. This is important since we subsequently update the
4799 // calls from the nodes in the graph and their assignments to callee
4800 // functions recorded in CallsiteToCalleeFuncCloneMap.
4801 // The none type edge removal may remove some of this caller's
4802 // callee edges, if it is reached via another of its callees.
4803 // Iterate over a copy and skip any that were removed.
4804 auto CalleeEdges = CE->Caller->CalleeEdges;
4805 for (auto CalleeEdge : CalleeEdges) {
4806 // Skip any that have been removed on an earlier iteration when
4807 // cleaning up newly None type callee edges.
4808 if (CalleeEdge->isRemoved()) {
4809 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
4810 continue;
4811 }
4812 assert(CalleeEdge);
4813 ContextNode *Callee = CalleeEdge->Callee;
4814 // Skip the current callsite, we are looking for other
4815 // callsites Caller calls, as well as any that does not have a
4816 // recorded callsite Call.
4817 if (Callee == Clone || !Callee->hasCall())
4818 continue;
4819 // Skip direct recursive calls. We don't need/want to clone the
4820 // caller node again, and this loop will not behave as expected if
4821 // we tried.
4822 if (Callee == CalleeEdge->Caller)
4823 continue;
4824 ContextNode *NewClone =
4825 MoveEdgeToNewCalleeCloneAndSetUp(CalleeEdge);
4826 // Moving the edge may have resulted in some none type
4827 // callee edges on the original Callee.
4828 removeNoneTypeCalleeEdges(Callee);
4829 // Update NewClone with the new Call clone of this callsite's Call
4830 // created for the new function clone created earlier.
4831 // Recall that we have already ensured when building the graph
4832 // that each caller can only call callsites within the same
4833 // function, so we are guaranteed that Callee Call is in the
4834 // current OrigFunc.
4835 // CallMap is set up as indexed by original Call at clone 0.
4836 CallInfo OrigCall(Callee->getOrigNode()->Call);
4837 OrigCall.setCloneNo(0);
4838 DenseMap<CallInfo, CallInfo> &CallMap =
4839 FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
4840 assert(CallMap.count(OrigCall));
4841 CallInfo NewCall(CallMap[OrigCall]);
4842 assert(NewCall);
4843 NewClone->setCall(NewCall);
4844 // Need to do the same for all matching calls.
4845 for (auto &MatchingCall : NewClone->MatchingCalls) {
4846 CallInfo OrigMatchingCall(MatchingCall);
4847 OrigMatchingCall.setCloneNo(0);
4848 assert(CallMap.count(OrigMatchingCall));
4849 CallInfo NewCall(CallMap[OrigMatchingCall]);
4850 assert(NewCall);
4851 // Updates the call in the list.
4852 MatchingCall = NewCall;
4853 }
4854 }
4855 }
4856 // Fall through to handling below to perform the recording of the
4857 // function for this callsite clone. This enables handling of cases
4858 // where the callers were assigned to different clones of a function.
4859 }
4860
4861 auto FindFirstAvailFuncClone = [&]() {
4862 // Find first function in FuncCloneInfos without an assigned
4863 // clone of this callsite Node. We should always have one
4864 // available at this point due to the earlier cloning when the
4865 // FuncCloneInfos size was smaller than the clone number.
4866 for (auto &CF : FuncCloneInfos) {
4867 if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
4868 return CF.FuncClone;
4869 }
4870 llvm_unreachable(
4871 "Expected an available func clone for this callsite clone");
4872 };
4873
4874 // See if we can use existing function clone. Walk through
4875 // all caller edges to see if any have already been assigned to
4876 // a clone of this callsite's function. If we can use it, do so. If not,
4877 // because that function clone is already assigned to a different clone
4878 // of this callsite, then we need to clone again.
4879 // Basically, this checking is needed to handle the case where different
4880 // caller functions/callsites may need versions of this function
4881 // containing different mixes of callsite clones across the different
4882 // callsites within the function. If that happens, we need to create
4883 // additional function clones to handle the various combinations.
4884 //
4885 // Keep track of any new clones of this callsite created by the
4886 // following loop, as well as any existing clone that we decided to
4887 // assign this clone to.
4888 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
4889 FuncInfo FuncCloneAssignedToCurCallsiteClone;
4890 // Iterate over a copy of Clone's caller edges, since we may need to
4891 // remove edges in the moveEdgeTo* methods, and this simplifies the
4892 // handling and makes it less error-prone.
4893 auto CloneCallerEdges = Clone->CallerEdges;
4894 for (auto &Edge : CloneCallerEdges) {
4895 // Skip removed edges (due to direct recursive edges updated when
4896 // updating callee edges when moving an edge and subsequently
4897 // removed by call to removeNoneTypeCalleeEdges on the Clone).
4898 if (Edge->isRemoved())
4899 continue;
4900 // Ignore any caller that does not have a recorded callsite Call.
4901 if (!Edge->Caller->hasCall())
4902 continue;
4903 // If this caller already assigned to call a version of OrigFunc, need
4904 // to ensure we can assign this callsite clone to that function clone.
4905 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
4906 FuncInfo FuncCloneCalledByCaller =
4907 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4908 // First we need to confirm that this function clone is available
4909 // for use by this callsite node clone.
4910 //
4911 // While FuncCloneToCurNodeCloneMap is built only for this Node and
4912 // its callsite clones, one of those callsite clones X could have
4913 // been assigned to the same function clone called by Edge's caller
4914 // - if Edge's caller calls another callsite within Node's original
4915 // function, and that callsite has another caller reaching clone X.
4916 // We need to clone Node again in this case.
4917 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
4918 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
4919 Clone) ||
4920 // Detect when we have multiple callers of this callsite that
4921 // have already been assigned to specific, and different, clones
4922 // of OrigFunc (due to other unrelated callsites in Func they
4923 // reach via call contexts). Is this Clone of callsite Node
4924 // assigned to a different clone of OrigFunc? If so, clone Node
4925 // again.
4926 (FuncCloneAssignedToCurCallsiteClone &&
4927 FuncCloneAssignedToCurCallsiteClone !=
4928 FuncCloneCalledByCaller)) {
4929 // We need to use a different newly created callsite clone, in
4930 // order to assign it to another new function clone on a
4931 // subsequent iteration over the Clones array (adjusted below).
4932 // Note we specifically do not reset the
4933 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
4934 // when this new clone is processed later we know which version of
4935 // the function to copy (so that other callsite clones we have
4936 // assigned to that function clone are properly cloned over). See
4937 // comments in the function cloning handling earlier.
4938
4939 // Check if we already have cloned this callsite again while
4940 // walking through caller edges, for a caller calling the same
4941 // function clone. If so, we can move this edge to that new clone
4942 // rather than creating yet another new clone.
4943 if (FuncCloneToNewCallsiteCloneMap.count(
4944 FuncCloneCalledByCaller)) {
4945 ContextNode *NewClone =
4946 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
4947 moveEdgeToExistingCalleeClone(Edge, NewClone);
4948 // Cleanup any none type edges cloned over.
4949 removeNoneTypeCalleeEdges(NewClone);
4950 } else {
4951 // Create a new callsite clone.
4952 ContextNode *NewClone = MoveEdgeToNewCalleeCloneAndSetUp(Edge);
4953 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
4954 NewClone;
4955 // Add to list of clones and process later.
4956 ClonesWorklist.push_back(NewClone);
4957 }
4958 // Moving the caller edge may have resulted in some none type
4959 // callee edges.
4960 removeNoneTypeCalleeEdges(Clone);
4961 // We will handle the newly created callsite clone in a subsequent
4962 // iteration over this Node's Clones.
4963 continue;
4964 }
4965
4966 // Otherwise, we can use the function clone already assigned to this
4967 // caller.
4968 if (!FuncCloneAssignedToCurCallsiteClone) {
4969 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
4970 // Assign Clone to FuncCloneCalledByCaller
4971 AssignCallsiteCloneToFuncClone(
4972 FuncCloneCalledByCaller, Call, Clone,
4973 AllocationCallToContextNodeMap.count(Call));
4974 } else
4975 // Don't need to do anything - callsite is already calling this
4976 // function clone.
4977 assert(FuncCloneAssignedToCurCallsiteClone ==
4978 FuncCloneCalledByCaller);
4979
4980 } else {
4981 // We have not already assigned this caller to a version of
4982 // OrigFunc. Do the assignment now.
4983
4984 // First check if we have already assigned this callsite clone to a
4985 // clone of OrigFunc for another caller during this iteration over
4986 // its caller edges.
4987 if (!FuncCloneAssignedToCurCallsiteClone) {
4988 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
4989 assert(FuncCloneAssignedToCurCallsiteClone);
4990 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
4991 AssignCallsiteCloneToFuncClone(
4992 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4993 AllocationCallToContextNodeMap.count(Call));
4994 } else
4995 assert(FuncCloneToCurNodeCloneMap
4996 [FuncCloneAssignedToCurCallsiteClone] == Clone);
4997 // Update callers to record function version called.
4998 RecordCalleeFuncOfCallsite(Edge->Caller,
4999 FuncCloneAssignedToCurCallsiteClone);
5000 }
5001 }
5002 // If we didn't assign a function clone to this callsite clone yet, e.g.
5003 // none of its callers has a non-null call, do the assignment here.
5004 // We want to ensure that every callsite clone is assigned to some
5005 // function clone, so that the call updates below work as expected.
5006 // In particular if this is the original callsite, we want to ensure it
5007 // is assigned to the original function, otherwise the original function
5008 // will appear available for assignment to other callsite clones,
5009 // leading to unintended effects. For one, the unknown and not updated
5010 // callers will call into cloned paths leading to the wrong hints,
5011 // because they still call the original function (clone 0). Also,
5012 // because all callsites start out as being clone 0 by default, we can't
5013 // easily distinguish between callsites explicitly assigned to clone 0
5014 // vs those never assigned, which can lead to multiple updates of the
5015 // calls when invoking updateCall below, with mismatched clone values.
5016 // TODO: Add a flag to the callsite nodes or some other mechanism to
5017 // better distinguish and identify callsite clones that are not getting
5018 // assigned to function clones as expected.
5019 if (!FuncCloneAssignedToCurCallsiteClone) {
5020 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5021 assert(FuncCloneAssignedToCurCallsiteClone &&
5022 "No available func clone for this callsite clone");
5023 AssignCallsiteCloneToFuncClone(
5024 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5025 /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
5026 }
5027 }
5028 if (VerifyCCG) {
5029 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
5030 for (const auto &PE : Node->CalleeEdges)
5031 checkEdge<DerivedCCG, FuncTy, CallTy>(PE);
5032 for (const auto &CE : Node->CallerEdges)
5033 checkEdge<DerivedCCG, FuncTy, CallTy>(CE);
5034 for (auto *Clone : Node->Clones) {
5035 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
5036 for (const auto &PE : Clone->CalleeEdges)
5037 checkEdge<DerivedCCG, FuncTy, CallTy>(PE);
5038 for (const auto &CE : Clone->CallerEdges)
5039 checkEdge<DerivedCCG, FuncTy, CallTy>(CE);
5040 }
5041 }
5042 }
5043
5044 if (FuncCloneInfos.size() < 2)
5045 continue;
5046
5047 // In this case there is more than just the original function copy.
5048 // Record call clones of any callsite nodes in the function that did not
5049 // themselves get cloned for all of the function clones.
5050 for (auto &Call : CallsWithMetadata) {
5051 ContextNode *Node = getNodeForInst(Call);
5052 if (!Node || !Node->hasCall() || Node->emptyContextIds())
5053 continue;
5054 // If Node has enough clones already to cover all function clones, we can
5055 // skip it. Need to add one for the original copy.
5056 // Use >= in case there were clones that were skipped due to having empty
5057 // context ids.
5058 if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
5059 continue;
5060 // First collect all function clones we cloned this callsite node for.
5061 // They may not be sequential, e.g. due to clones with empty context ids.
5062 DenseSet<unsigned> NodeCallClones;
5063 for (auto *C : Node->Clones)
5064 NodeCallClones.insert(C->Call.cloneNo());
5065 unsigned I = 0;
5066 // Now check all the function clones.
5067 for (auto &FC : FuncCloneInfos) {
5068 // Function clones should be sequential.
5069 assert(FC.FuncClone.cloneNo() == I);
5070 // Skip the first clone which got the original call.
5071 // Also skip any other clones created for this Node.
5072 if (++I == 1 || NodeCallClones.contains(I)) {
5073 continue;
5074 }
5075 // Record the call clones created for this callsite in this function
5076 // clone.
5077 auto &CallVector = UnassignedCallClones[Node][I];
5078 DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
5079 if (auto It = CallMap.find(Call); It != CallMap.end()) {
5080 CallInfo CallClone = It->second;
5081 CallVector.push_back(CallClone);
5082 } else {
5083 // All but the original clone (skipped earlier) should have an entry
5084 // for all calls.
5085 assert(false && "Expected to find call in CallMap");
5086 }
5087 // Need to do the same for all matching calls.
5088 for (auto &MatchingCall : Node->MatchingCalls) {
5089 if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) {
5090 CallInfo CallClone = It->second;
5091 CallVector.push_back(CallClone);
5092 } else {
5093 // All but the original clone (skipped earlier) should have an entry
5094 // for all calls.
5095 assert(false && "Expected to find call in CallMap");
5096 }
5097 }
5098 }
5099 }
5100 }
5101
5102 uint8_t BothTypes =
5103 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
5104
5105 auto UpdateCalls = [&](ContextNode *Node,
5106 DenseSet<const ContextNode *> &Visited,
5107 auto &&UpdateCalls) {
5108 auto Inserted = Visited.insert(Node);
5109 if (!Inserted.second)
5110 return;
5111
5112 for (auto *Clone : Node->Clones)
5113 UpdateCalls(Clone, Visited, UpdateCalls);
5114
5115 for (auto &Edge : Node->CallerEdges)
5116 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
5117
5118 // Skip if either no call to update, or if we ended up with no context ids
5119 // (we moved all edges onto other clones).
5120 if (!Node->hasCall() || Node->emptyContextIds())
5121 return;
5122
5123 if (Node->IsAllocation) {
5124 auto AT = allocTypeToUse(Node->AllocTypes);
5125 // If the allocation type is ambiguous, and more aggressive hinting
5126 // has been enabled via the MinClonedColdBytePercent flag, see if this
5127 // allocation should be hinted cold anyway because the fraction of its bytes
5128 // allocated cold is at least the given threshold.
5129 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
5130 !ContextIdToContextSizeInfos.empty()) {
5131 uint64_t TotalCold = 0;
5132 uint64_t Total = 0;
5133 for (auto Id : Node->getContextIds()) {
5134 auto TypeI = ContextIdToAllocationType.find(Id);
5135 assert(TypeI != ContextIdToAllocationType.end());
5136 auto CSI = ContextIdToContextSizeInfos.find(Id);
5137 if (CSI != ContextIdToContextSizeInfos.end()) {
5138 for (auto &Info : CSI->second) {
5139 Total += Info.TotalSize;
5140 if (TypeI->second == AllocationType::Cold)
5141 TotalCold += Info.TotalSize;
5142 }
5143 }
5144 }
5145 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
5146 AT = AllocationType::Cold;
5147 }
5148 updateAllocationCall(Node->Call, AT);
5149 assert(Node->MatchingCalls.empty());
5150 return;
5151 }
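// Worked example of the threshold check above (hypothetical numbers): if the
// contexts reaching this ambiguous allocation account for Total = 1,000,000
// bytes, of which TotalCold = 850,000 bytes were profiled cold, then with
// MinClonedColdBytePercent = 80 the comparison 850,000 * 100 >= 1,000,000 * 80
// holds (85,000,000 >= 80,000,000), so the allocation is hinted cold even
// though it also has not-cold contexts.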
5152
5153 if (!CallsiteToCalleeFuncCloneMap.count(Node))
5154 return;
5155
5156 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
5157 updateCall(Node->Call, CalleeFunc);
5158 // Update all the matching calls as well.
5159 for (auto &Call : Node->MatchingCalls)
5160 updateCall(Call, CalleeFunc);
5161
5162 // Now update all calls recorded earlier that are still in function clones
5163 // which don't have a clone of this callsite node.
5164 if (!UnassignedCallClones.contains(Node))
5165 return;
5166 DenseSet<unsigned> NodeCallClones;
5167 for (auto *C : Node->Clones)
5168 NodeCallClones.insert(C->Call.cloneNo());
5169 // Note that we already confirmed Node is in this map a few lines above.
5170 auto &ClonedCalls = UnassignedCallClones[Node];
5171 for (auto &[CloneNo, CallVector] : ClonedCalls) {
5172 // Should start at 1 as we never create an entry for original node.
5173 assert(CloneNo > 0);
5174 // If we subsequently created a clone, skip this one.
5175 if (NodeCallClones.contains(CloneNo))
5176 continue;
5177 // Use the original Node's CalleeFunc.
5178 for (auto &Call : CallVector)
5179 updateCall(Call, CalleeFunc);
5180 }
5181 };
5182
5183 // Performs DFS traversal starting from allocation nodes to update calls to
5184 // reflect cloning decisions recorded earlier. For regular LTO this will
5185 // update the actual calls in the IR to call the appropriate function clone
5186 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
5187 // are recorded in the summary entries.
5188 DenseSet<const ContextNode *> Visited;
5189 for (auto &Entry : AllocationCallToContextNodeMap)
5190 UpdateCalls(Entry.second, Visited, UpdateCalls);
5191
5192 return Changed;
5193}
5194
5195// Compute a SHA1 hash of the callsite and alloc version information of clone I
5196 // in the summary, to use in detection of duplicate clones.
5197 static uint64_t ComputeHash(const FunctionSummary *FS, unsigned I) {
5198 SHA1 Hasher;
5199 // Update hash with any callsites that call non-default (non-zero) callee
5200 // versions.
5201 for (auto &SN : FS->callsites()) {
5202 // In theory all callsites and allocs in this function should have the same
5203 // number of clone entries, but handle any discrepancies gracefully below
5204 // for NDEBUG builds.
5205 assert(
5206 SN.Clones.size() > I &&
5207 "Callsite summary has fewer entries than other summaries in function");
5208 if (SN.Clones.size() <= I || !SN.Clones[I])
5209 continue;
5210 uint8_t Data[sizeof(SN.Clones[I])];
5211 support::endian::write32le(Data, SN.Clones[I]);
5212 Hasher.update(Data);
5213 }
5214 // Update hash with any allocs that have non-default (non-None) hints.
5215 for (auto &AN : FS->allocs()) {
5216 // In theory all callsites and allocs in this function should have the same
5217 // number of clone entries, but handle any discrepancies gracefully below
5218 // for NDEBUG builds.
5219 assert(AN.Versions.size() > I &&
5220 "Alloc summary has fewer entries than other summaries in function");
5221 if (AN.Versions.size() <= I ||
5222 (AllocationType)AN.Versions[I] == AllocationType::None)
5223 continue;
5224 Hasher.update(ArrayRef<uint8_t>(&AN.Versions[I], 1));
5225 }
5226 return support::endian::read64le(Hasher.result().data());
5227}
5228
5229 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
5230 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
5231 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5232 &FuncToAliasMap,
5233 FunctionSummary *FS) {
5234 auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) {
5235 // We might have created this when adjusting a callsite in another
5236 // function. It should be a declaration.
5237 assert(DeclGV->isDeclaration());
5238 NewGV->takeName(DeclGV);
5239 DeclGV->replaceAllUsesWith(NewGV);
5240 DeclGV->eraseFromParent();
5241 };
5242
5243 // Handle aliases to this function, and create analogous alias clones to the
5244 // provided clone of this function.
5245 auto CloneFuncAliases = [&](Function *NewF, unsigned I) {
5246 if (!FuncToAliasMap.count(&F))
5247 return;
5248 for (auto *A : FuncToAliasMap[&F]) {
5249 std::string AliasName = getMemProfFuncName(A->getName(), I);
5250 auto *PrevA = M.getNamedAlias(AliasName);
5251 auto *NewA = GlobalAlias::create(A->getValueType(),
5252 A->getType()->getPointerAddressSpace(),
5253 A->getLinkage(), AliasName, NewF);
5254 NewA->copyAttributesFrom(A);
5255 if (PrevA)
5256 TakeDeclNameAndReplace(PrevA, NewA);
5257 }
5258 };
5259
5260 // The first "clone" is the original copy; we should only call this if we
5261 // needed to create new clones.
5262 assert(NumClones > 1);
5263 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5264 VMaps.reserve(NumClones - 1);
5265 FunctionsClonedThinBackend++;
5266
5267 // Map of hash of callsite/alloc versions to the instantiated function clone
5268 // (possibly the original) implementing those calls. Used to avoid
5269 // instantiating duplicate function clones.
5270 // FIXME: Ideally the thin link would not generate such duplicate clones to
5271 // start with, but right now it happens due to phase ordering in the function
5272 // assignment and the possible new clones it produces. We simply make each
5273 // duplicate an alias to the matching instantiated clone recorded in the map
5274 // (except for available_externally which are made declarations as they would
5275 // be aliases in the prevailing module, and available_externally aliases are
5276 // not well supported right now).
5277 DenseMap<uint64_t, Function *> HashToFunc;
5278
5279 // Save the hash of the original function version.
5280 HashToFunc[ComputeHash(FS, 0)] = &F;
5281
5282 for (unsigned I = 1; I < NumClones; I++) {
5283 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
5284 std::string Name = getMemProfFuncName(F.getName(), I);
5285 auto Hash = ComputeHash(FS, I);
5286 // If this clone would duplicate a previously seen clone, don't generate the
5287 // duplicate clone body, just make an alias to satisfy any (potentially
5288 // cross-module) references.
5289 if (HashToFunc.contains(Hash)) {
5290 FunctionCloneDuplicatesThinBackend++;
5291 auto *Func = HashToFunc[Hash];
5292 if (Func->hasAvailableExternallyLinkage()) {
5293 // Skip these as EliminateAvailableExternallyPass does not handle
5294 // available_externally aliases correctly and we end up with an
5295 // available_externally alias to a declaration. Just create a
5296 // declaration for now as we know we will have a definition in another
5297 // module.
5298 auto Decl = M.getOrInsertFunction(Name, Func->getFunctionType());
5299 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5300 << "created clone decl " << ore::NV("Decl", Decl.getCallee()));
5301 continue;
5302 }
5303 auto *PrevF = M.getFunction(Name);
5304 auto *Alias = GlobalAlias::create(Name, Func);
5305 if (PrevF)
5306 TakeDeclNameAndReplace(PrevF, Alias);
5307 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5308 << "created clone alias " << ore::NV("Alias", Alias));
5309
5310 // Now handle aliases to this function, and clone those as well.
5311 CloneFuncAliases(Func, I);
5312 continue;
5313 }
5314 auto *NewF = CloneFunction(&F, *VMaps.back());
5315 HashToFunc[Hash] = NewF;
5316 FunctionClonesThinBackend++;
5317 // Strip memprof and callsite metadata from clone as they are no longer
5318 // needed.
5319 for (auto &BB : *NewF) {
5320 for (auto &Inst : BB) {
5321 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
5322 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
5323 }
5324 }
5325 auto *PrevF = M.getFunction(Name);
5326 if (PrevF)
5327 TakeDeclNameAndReplace(PrevF, NewF);
5328 else
5329 NewF->setName(Name);
5330 updateSubprogramLinkageName(NewF, Name);
5331 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5332 << "created clone " << ore::NV("NewFunction", NewF));
5333
5334 // Now handle aliases to this function, and clone those as well.
5335 CloneFuncAliases(NewF, I);
5336 }
5337 return VMaps;
5338}
5339
5340// Locate the summary for F. This is complicated by the fact that it might
5341// have been internalized or promoted.
5342 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
5343 const ModuleSummaryIndex *ImportSummary,
5344 const Function *CallingFunc = nullptr) {
5345 // FIXME: Ideally we would retain the original GUID in some fashion on the
5346 // function (e.g. as metadata), but for now do our best to locate the
5347 // summary without that information.
5348 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
5349 if (!TheFnVI)
5350 // See if theFn was internalized, by checking the index directly with the
5351 // original name (this avoids the name adjustment done by getGUID() for
5352 // internal symbols).
5353 TheFnVI = ImportSummary->getValueInfo(
5354 GlobalValue::getGUIDAssumingExternalLinkage(F.getName()));
5355 if (TheFnVI)
5356 return TheFnVI;
5357 // Now query with the original name before any promotion was performed.
5358  StringRef OrigName =
5359      ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
5360 // When this pass is enabled, we always add thinlto_src_file provenance
5361 // metadata to imported function definitions, which allows us to recreate the
5362 // original internal symbol's GUID.
5363 auto SrcFileMD = F.getMetadata("thinlto_src_file");
5364 // If this is a call to an imported/promoted local for which we didn't import
5365 // the definition, the metadata will not exist on the declaration. However,
5366 // since we are doing this early, before any inlining in the LTO backend, we
5367 // can simply look at the metadata on the calling function which must have
5368 // been from the same module if F was an internal symbol originally.
5369 if (!SrcFileMD && F.isDeclaration()) {
5370 // We would only call this for a declaration for a direct callsite, in which
5371 // case the caller would have provided the calling function pointer.
5372 assert(CallingFunc);
5373 SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
5374 // If this is a promoted local (OrigName != F.getName()), since this is a
5375 // declaration, it must be imported from a different module and therefore we
5376 // should always find the metadata on its calling function. Any call to a
5377 // promoted local that came from this module should still be a definition.
5378 assert(SrcFileMD || OrigName == F.getName());
5379 }
5380 StringRef SrcFile = M.getSourceFileName();
5381 if (SrcFileMD)
5382 SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
5383 std::string OrigId = GlobalValue::getGlobalIdentifier(
5384 OrigName, GlobalValue::InternalLinkage, SrcFile);
5385  TheFnVI = ImportSummary->getValueInfo(
5386      GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5387 // Internal func in original module may have gotten a numbered suffix if we
5388 // imported an external function with the same name. This happens
5389 // automatically during IR linking for naming conflicts. It would have to
5390 // still be internal in that case (otherwise it would have been renamed on
5391 // promotion in which case we wouldn't have a naming conflict).
5392 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5393 F.getName().contains('.')) {
5394    OrigName = F.getName().rsplit('.').first;
5395    OrigId = GlobalValue::getGlobalIdentifier(
5396 OrigName, GlobalValue::InternalLinkage, SrcFile);
5397    TheFnVI = ImportSummary->getValueInfo(
5398        GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5399 }
5400 // The only way we may not have a VI is if this is a declaration created for
5401 // an imported reference. For distributed ThinLTO we may not have a VI for
5402 // such declarations in the distributed summary.
5403 assert(TheFnVI || F.isDeclaration());
5404 return TheFnVI;
5405}
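// A minimal sketch of the GUID reconstruction above for a promoted local,
// using the same GlobalValue helpers called in findValueInfoForFunc (the
// wrapper name below is hypothetical and for illustration only; it requires
// llvm/IR/GlobalValue.h and llvm/ADT/StringRef.h):
static GlobalValue::GUID guidForOriginalLocal(StringRef OrigName,
                                              StringRef SrcFile) {
  // Locals are keyed in the index by an identifier built from the defining
  // source file plus the original (pre-promotion) symbol name.
  std::string OrigId = GlobalValue::getGlobalIdentifier(
      OrigName, GlobalValue::InternalLinkage, SrcFile);
  return GlobalValue::getGUIDAssumingExternalLinkage(OrigId);
}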
5406
5407bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5408 Module &M) {
5409 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5410 Symtab = std::make_unique<InstrProfSymtab>();
5411  // Don't add canonical names, to avoid adding multiple functions to the
5412  // symtab when they both have the same root name with "." suffixes stripped.
5413 // If we pick the wrong one then this could lead to incorrect ICP and calling
5414 // a memprof clone that we don't actually create (resulting in linker unsats).
5415 // What this means is that the GUID of the function (or its PGOFuncName
5416 // metadata) *must* match that in the VP metadata to allow promotion.
5417 // In practice this should not be a limitation, since local functions should
5418 // have PGOFuncName metadata and global function names shouldn't need any
5419 // special handling (they should not get the ".llvm.*" suffix that the
5420 // canonicalization handling is attempting to strip).
5421 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5422 std::string SymtabFailure = toString(std::move(E));
5423 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5424 return false;
5425 }
5426 return true;
5427}
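// Illustration of the hazard avoided by AddCanonical=false above (names are
// made up): with canonical names added, a promoted local "foo.llvm.123" would
// also be registered in the symtab under the stripped name "foo". An indirect
// call whose value-profile metadata recorded the GUID of a different "foo"
// could then resolve to the wrong function and be redirected to a memprof
// clone that is never actually created, producing a linker unsat.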
5428
5429#ifndef NDEBUG
5430// Sanity check that the MIB stack ids match between the summary and
5431// instruction metadata.
5432static void checkAllocContextIds(
5433 const AllocInfo &AllocNode, const MDNode *MemProfMD,
5434 const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
5435 const ModuleSummaryIndex *ImportSummary) {
5436 auto MIBIter = AllocNode.MIBs.begin();
5437 for (auto &MDOp : MemProfMD->operands()) {
5438 assert(MIBIter != AllocNode.MIBs.end());
5439 auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
5440 auto *MIBMD = cast<const MDNode>(MDOp);
5441 MDNode *StackMDNode = getMIBStackNode(MIBMD);
5442 assert(StackMDNode);
5443 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
5444 auto ContextIterBegin =
5445 StackContext.beginAfterSharedPrefix(CallsiteContext);
5446 // Skip the checking on the first iteration.
5447 uint64_t LastStackContextId =
5448 (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
5449 : 0;
5450 for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
5451 ++ContextIter) {
5452 // If this is a direct recursion, simply skip the duplicate
5453 // entries, to be consistent with how the summary ids were
5454 // generated during ModuleSummaryAnalysis.
5455 if (LastStackContextId == *ContextIter)
5456 continue;
5457 LastStackContextId = *ContextIter;
5458 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
5459 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5460 *ContextIter);
5461 StackIdIndexIter++;
5462 }
5463 MIBIter++;
5464 }
5465}
5466#endif
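// A minimal sketch of the duplicate skipping performed above: consecutive
// equal stack ids (direct recursion) are collapsed before comparison, matching
// how the summary ids were generated during ModuleSummaryAnalysis. The helper
// below is hypothetical and for illustration only:
static SmallVector<uint64_t> collapseDirectRecursion(ArrayRef<uint64_t> Ids) {
  SmallVector<uint64_t> Collapsed;
  for (uint64_t Id : Ids)
    if (Collapsed.empty() || Collapsed.back() != Id)
      Collapsed.push_back(Id);
  return Collapsed; // e.g. {5, 7, 7, 9} -> {5, 7, 9}
}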
5467
5468bool MemProfContextDisambiguation::applyImport(Module &M) {
5469 assert(ImportSummary);
5470 bool Changed = false;
5471
5472 // We also need to clone any aliases that reference cloned functions, because
5473 // the modified callsites may invoke via the alias. Keep track of the aliases
5474 // for each function.
5475 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5476 FuncToAliasMap;
5477 for (auto &A : M.aliases()) {
5478 auto *Aliasee = A.getAliaseeObject();
5479 if (auto *F = dyn_cast<Function>(Aliasee))
5480 FuncToAliasMap[F].insert(&A);
5481 }
5482
5483 if (!initializeIndirectCallPromotionInfo(M))
5484 return false;
5485
5486 for (auto &F : M) {
5487 if (F.isDeclaration() || isMemProfClone(F))
5488 continue;
5489
5490 OptimizationRemarkEmitter ORE(&F);
5491
5492    SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5493 bool ClonesCreated = false;
5494 unsigned NumClonesCreated = 0;
5495 auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) {
5496 // We should at least have version 0 which is the original copy.
5497 assert(NumClones > 0);
5498      // If only one copy is needed, use the original.
5499 if (NumClones == 1)
5500 return;
5501 // If we already performed cloning of this function, confirm that the
5502 // requested number of clones matches (the thin link should ensure the
5503 // number of clones for each constituent callsite is consistent within
5504 // each function), before returning.
5505 if (ClonesCreated) {
5506 assert(NumClonesCreated == NumClones);
5507 return;
5508 }
5509 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS);
5510 // The first "clone" is the original copy, which doesn't have a VMap.
5511 assert(VMaps.size() == NumClones - 1);
5512 Changed = true;
5513 ClonesCreated = true;
5514 NumClonesCreated = NumClones;
5515 };
5516
5517 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5518 Function *CalledFunction, FunctionSummary *FS) {
5519 // Perform cloning if not yet done.
5520 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS);
5521
5522 assert(!isMemProfClone(*CalledFunction));
5523
5524 // Because we update the cloned calls by calling setCalledOperand (see
5525 // comment below), out of an abundance of caution make sure the called
5526 // function was actually the called operand (or its aliasee). We also
5527 // strip pointer casts when looking for calls (to match behavior during
5528 // summary generation), however, with opaque pointers in theory this
5529 // should not be an issue. Note we still clone the current function
5530 // (containing this call) above, as that could be needed for its callers.
5531 auto *GA = dyn_cast_or_null<GlobalAlias>(CB->getCalledOperand());
5532 if (CalledFunction != CB->getCalledOperand() &&
5533 (!GA || CalledFunction != GA->getAliaseeObject())) {
5534 SkippedCallsCloning++;
5535 return;
5536 }
5537 // Update the calls per the summary info.
5538 // Save orig name since it gets updated in the first iteration
5539 // below.
5540 auto CalleeOrigName = CalledFunction->getName();
5541 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5542 // If the VMap is empty, this clone was a duplicate of another and was
5543 // created as an alias or a declaration.
5544 if (J > 0 && VMaps[J - 1]->empty())
5545 continue;
5546 // Do nothing if this version calls the original version of its
5547 // callee.
5548 if (!StackNode.Clones[J])
5549 continue;
5550 auto NewF = M.getOrInsertFunction(
5551 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
5552 CalledFunction->getFunctionType());
5553 CallBase *CBClone;
5554 // Copy 0 is the original function.
5555 if (!J)
5556 CBClone = CB;
5557 else
5558 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5559 // Set the called operand directly instead of calling setCalledFunction,
5560 // as the latter mutates the function type on the call. In rare cases
5561 // we may have a slightly different type on a callee function
5562 // declaration due to it being imported from a different module with
5563 // incomplete types. We really just want to change the name of the
5564 // function to the clone, and not make any type changes.
5565 CBClone->setCalledOperand(NewF.getCallee());
5566 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5567 << ore::NV("Call", CBClone) << " in clone "
5568 << ore::NV("Caller", CBClone->getFunction())
5569 << " assigned to call function clone "
5570 << ore::NV("Callee", NewF.getCallee()));
5571 }
5572 };
5573
5574 // Locate the summary for F.
5575 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
5576 // If not found, this could be an imported local (see comment in
5577 // findValueInfoForFunc). Skip for now as it will be cloned in its original
5578 // module (where it would have been promoted to global scope so should
5579 // satisfy any reference in this module).
5580 if (!TheFnVI)
5581 continue;
5582
5583 auto *GVSummary =
5584 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
5585 if (!GVSummary) {
5586      // Must have been imported, use the summary which matches the definition
5587 // (might be multiple if this was a linkonce_odr).
5588 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
5589 assert(SrcModuleMD &&
5590 "enable-import-metadata is needed to emit thinlto_src_module");
5591 StringRef SrcModule =
5592 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
5593 for (auto &GVS : TheFnVI.getSummaryList()) {
5594 if (GVS->modulePath() == SrcModule) {
5595 GVSummary = GVS.get();
5596 break;
5597 }
5598 }
5599 assert(GVSummary && GVSummary->modulePath() == SrcModule);
5600 }
5601
5602 // If this was an imported alias skip it as we won't have the function
5603 // summary, and it should be cloned in the original module.
5604 if (isa<AliasSummary>(GVSummary))
5605 continue;
5606
5607 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
5608
5609 if (FS->allocs().empty() && FS->callsites().empty())
5610 continue;
5611
5612 auto SI = FS->callsites().begin();
5613 auto AI = FS->allocs().begin();
5614
5615 // To handle callsite infos synthesized for tail calls which have missing
5616 // frames in the profiled context, map callee VI to the synthesized callsite
5617 // info.
5618 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
5619 // Iterate the callsites for this function in reverse, since we place all
5620 // those synthesized for tail calls at the end.
5621 for (auto CallsiteIt = FS->callsites().rbegin();
5622 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
5623 auto &Callsite = *CallsiteIt;
5624 // Stop as soon as we see a non-synthesized callsite info (see comment
5625 // above loop). All the entries added for discovered tail calls have empty
5626 // stack ids.
5627 if (!Callsite.StackIdIndices.empty())
5628 break;
5629 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
5630 }
5631
5632 // Keeps track of needed ICP for the function.
5633 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
5634
5635 // Assume for now that the instructions are in the exact same order
5636 // as when the summary was created, but confirm this is correct by
5637 // matching the stack ids.
5638 for (auto &BB : F) {
5639 for (auto &I : BB) {
5640 auto *CB = dyn_cast<CallBase>(&I);
5641 // Same handling as when creating module summary.
5642 if (!mayHaveMemprofSummary(CB))
5643 continue;
5644
5645 auto *CalledValue = CB->getCalledOperand();
5646 auto *CalledFunction = CB->getCalledFunction();
5647 if (CalledValue && !CalledFunction) {
5648 CalledValue = CalledValue->stripPointerCasts();
5649 // Stripping pointer casts can reveal a called function.
5650 CalledFunction = dyn_cast<Function>(CalledValue);
5651 }
5652 // Check if this is an alias to a function. If so, get the
5653 // called aliasee for the checks below.
5654 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
5655 assert(!CalledFunction &&
5656 "Expected null called function in callsite for alias");
5657 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
5658 }
5659
5660 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
5661 I.getMetadata(LLVMContext::MD_callsite));
5662 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
5663
5664 // Include allocs that were already assigned a memprof function
5665 // attribute in the statistics. Only do this for those that do not have
5666 // memprof metadata, since we add an "ambiguous" memprof attribute by
5667 // default.
5668 if (CB->getAttributes().hasFnAttr("memprof") && !MemProfMD) {
5669 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
5670 ? AllocTypeColdThinBackend++
5671 : AllocTypeNotColdThinBackend++;
5672 OrigAllocsThinBackend++;
5673 AllocVersionsThinBackend++;
5674 if (!MaxAllocVersionsThinBackend)
5675 MaxAllocVersionsThinBackend = 1;
5676 continue;
5677 }
5678
5679 if (MemProfMD) {
5680 // Consult the next alloc node.
5681 assert(AI != FS->allocs().end());
5682 auto &AllocNode = *(AI++);
5683
5684#ifndef NDEBUG
5685 checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
5686 ImportSummary);
5687#endif
5688
5689 // Perform cloning if not yet done.
5690 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS);
5691
5692 OrigAllocsThinBackend++;
5693 AllocVersionsThinBackend += AllocNode.Versions.size();
5694 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
5695 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
5696
5697            // If there is only one version, that means we didn't end up
5698            // considering this function for cloning, and in that case the alloc
5699            // will still have None type or should have gotten the default NotCold.
5700            // Skip it, but only after calling the clone helper above, since that
5701            // performs sanity checks confirming we haven't yet decided to clone.
5702 // We might have a single version that is cold due to the
5703 // MinClonedColdBytePercent heuristic, make sure we don't skip in that
5704 // case.
5705 if (AllocNode.Versions.size() == 1 &&
5706 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
5707 assert((AllocationType)AllocNode.Versions[0] ==
5708 AllocationType::NotCold ||
5709 (AllocationType)AllocNode.Versions[0] ==
5710 AllocationType::None);
5711 UnclonableAllocsThinBackend++;
5712 continue;
5713 }
5714
5715 // All versions should have a singular allocation type.
5716 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
5717 return Type == ((uint8_t)AllocationType::NotCold |
5718 (uint8_t)AllocationType::Cold);
5719 }));
5720
5721 // Update the allocation types per the summary info.
5722 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
5723 // If the VMap is empty, this clone was a duplicate of another and
5724 // was created as an alias or a declaration.
5725 if (J > 0 && VMaps[J - 1]->empty())
5726 continue;
5727 // Ignore any that didn't get an assigned allocation type.
5728 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
5729 continue;
5730 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
5731 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
5732 : AllocTypeNotColdThinBackend++;
5733 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
5734 auto A = llvm::Attribute::get(F.getContext(), "memprof",
5735 AllocTypeString);
5736 CallBase *CBClone;
5737 // Copy 0 is the original function.
5738 if (!J)
5739 CBClone = CB;
5740 else
5741 // Since VMaps are only created for new clones, we index with
5742 // clone J-1 (J==0 is the original clone and does not have a VMaps
5743 // entry).
5744 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5745            removeAnyExistingAmbiguousAttribute(CBClone);
5746            CBClone->addFnAttr(A);
5747 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
5748 << ore::NV("AllocationCall", CBClone) << " in clone "
5749 << ore::NV("Caller", CBClone->getFunction())
5750 << " marked with memprof allocation attribute "
5751 << ore::NV("Attribute", AllocTypeString));
5752 }
5753 } else if (!CallsiteContext.empty()) {
5754 if (!CalledFunction) {
5755#ifndef NDEBUG
5756 // We should have skipped inline assembly calls.
5757 auto *CI = dyn_cast<CallInst>(CB);
5758 assert(!CI || !CI->isInlineAsm());
5759#endif
5760 // We should have skipped direct calls via a Constant.
5761 assert(CalledValue && !isa<Constant>(CalledValue));
5762
5763 // This is an indirect call, see if we have profile information and
5764 // whether any clones were recorded for the profiled targets (that
5765 // we synthesized CallsiteInfo summary records for when building the
5766 // index).
5767 auto NumClones =
5768 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
5769
5770 // Perform cloning if not yet done. This is done here in case
5771 // we don't need to do ICP, but might need to clone this
5772 // function as it is the target of other cloned calls.
5773 if (NumClones)
5774 CloneFuncIfNeeded(NumClones, FS);
5775 }
5776
5777 else {
5778 // Consult the next callsite node.
5779 assert(SI != FS->callsites().end());
5780 auto &StackNode = *(SI++);
5781
5782#ifndef NDEBUG
5783 // Sanity check that the stack ids match between the summary and
5784 // instruction metadata.
5785 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
5786 for (auto StackId : CallsiteContext) {
5787 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
5788 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5789 StackId);
5790 StackIdIndexIter++;
5791 }
5792#endif
5793
5794 CloneCallsite(StackNode, CB, CalledFunction, FS);
5795 }
5796 } else if (CB->isTailCall() && CalledFunction) {
5797 // Locate the synthesized callsite info for the callee VI, if any was
5798 // created, and use that for cloning.
5799 ValueInfo CalleeVI =
5800 findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
5801 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
5802 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
5803 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
5804 CloneCallsite(Callsite->second, CB, CalledFunction, FS);
5805 }
5806 }
5807 }
5808 }
5809
5810 // Now do any promotion required for cloning.
5811 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
5812 }
5813
5814 // We skip some of the functions and instructions above, so remove all the
5815 // metadata in a single sweep here.
5816 for (auto &F : M) {
5817 // We can skip memprof clones because createFunctionClones already strips
5818 // the metadata from the newly created clones.
5819 if (F.isDeclaration() || isMemProfClone(F))
5820 continue;
5821 for (auto &BB : F) {
5822 for (auto &I : BB) {
5823 if (!isa<CallBase>(I))
5824 continue;
5825 I.setMetadata(LLVMContext::MD_memprof, nullptr);
5826 I.setMetadata(LLVMContext::MD_callsite, nullptr);
5827 }
5828 }
5829 }
5830
5831 return Changed;
5832}
5833
5834unsigned MemProfContextDisambiguation::recordICPInfo(
5835 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
5836    ArrayRef<CallsiteInfo>::iterator &SI,
5837    SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
5838 // First see if we have profile information for this indirect call.
5839 uint32_t NumCandidates;
5840 uint64_t TotalCount;
5841 auto CandidateProfileData =
5842 ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
5843 NumCandidates);
5844 if (CandidateProfileData.empty())
5845 return 0;
5846
5847 // Iterate through all of the candidate profiled targets along with the
5848 // CallsiteInfo summary records synthesized for them when building the index,
5849 // and see if any are cloned and/or refer to clones.
5850 bool ICPNeeded = false;
5851 unsigned NumClones = 0;
5852 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
5853 for (const auto &Candidate : CandidateProfileData) {
5854#ifndef NDEBUG
5855 auto CalleeValueInfo =
5856#endif
5857 ImportSummary->getValueInfo(Candidate.Value);
5858 // We might not have a ValueInfo if this is a distributed
5859 // ThinLTO backend and decided not to import that function.
5860 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
5861 assert(SI != AllCallsites.end());
5862 auto &StackNode = *(SI++);
5863 // See if any of the clones of the indirect callsite for this
5864 // profiled target should call a cloned version of the profiled
5865 // target. We only need to do the ICP here if so.
5866 ICPNeeded |= llvm::any_of(StackNode.Clones,
5867 [](unsigned CloneNo) { return CloneNo != 0; });
5868 // Every callsite in the same function should have been cloned the same
5869 // number of times.
5870 assert(!NumClones || NumClones == StackNode.Clones.size());
5871 NumClones = StackNode.Clones.size();
5872 }
5873 if (!ICPNeeded)
5874 return NumClones;
5875 // Save information for ICP, which is performed later to avoid messing up the
5876 // current function traversal.
5877 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
5878 TotalCount, CallsiteInfoStartIndex});
5879 return NumClones;
5880}
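// Worked example of the decision above (made-up values): an indirect call with
// two profiled targets and three caller clones might have synthesized
// CallsiteInfo records of
//   target T1: Clones = {0, 1, 0}   // caller clone 1 should call T1.memprof.1
//   target T2: Clones = {0, 0, 0}   // every caller clone keeps the original T2
// ICP is queued because at least one entry is non-zero, and NumClones == 3 is
// returned so CloneFuncIfNeeded can still create the caller clones.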
5881
5882void MemProfContextDisambiguation::performICP(
5883 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
5884 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
5885 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
5886 OptimizationRemarkEmitter &ORE) {
5887 // Now do any promotion required for cloning. Specifically, for each
5888 // recorded ICP candidate (which was only recorded because one clone of that
5889 // candidate should call a cloned target), we perform ICP (speculative
5890 // devirtualization) for each clone of the callsite, and update its callee
5891 // to the appropriate clone. Note that the ICP compares against the original
5892 // version of the target, which is what is in the vtable.
5893 for (auto &Info : ICallAnalysisInfo) {
5894 auto *CB = Info.CB;
5895 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
5896 auto TotalCount = Info.TotalCount;
5897 unsigned NumPromoted = 0;
5898 unsigned NumClones = 0;
5899
5900 for (auto &Candidate : Info.CandidateProfileData) {
5901 auto &StackNode = AllCallsites[CallsiteIndex++];
5902
5903 // All calls in the same function must have the same number of clones.
5904 assert(!NumClones || NumClones == StackNode.Clones.size());
5905 NumClones = StackNode.Clones.size();
5906
5907 // See if the target is in the module. If it wasn't imported, it is
5908 // possible that this profile could have been collected on a different
5909 // target (or version of the code), and we need to be conservative
5910 // (similar to what is done in the ICP pass).
5911 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
5912 if (TargetFunction == nullptr ||
5913 // Any ThinLTO global dead symbol removal should have already
5914 // occurred, so it should be safe to promote when the target is a
5915 // declaration.
5916 // TODO: Remove internal option once more fully tested.
5917          (MemProfRequireDefinitionForPromotion &&
5918           TargetFunction->isDeclaration())) {
5919 ORE.emit([&]() {
5920 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
5921 << "Memprof cannot promote indirect call: target with md5sum "
5922 << ore::NV("target md5sum", Candidate.Value) << " not found";
5923 });
5924 // FIXME: See if we can use the new declaration importing support to
5925 // at least get the declarations imported for this case. Hot indirect
5926 // targets should have been imported normally, however.
5927 continue;
5928 }
5929
5930 // Check if legal to promote
5931 const char *Reason = nullptr;
5932 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
5933 ORE.emit([&]() {
5934 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
5935 << "Memprof cannot promote indirect call to "
5936 << ore::NV("TargetFunction", TargetFunction)
5937 << " with count of " << ore::NV("TotalCount", TotalCount)
5938 << ": " << Reason;
5939 });
5940 continue;
5941 }
5942
5943 assert(!isMemProfClone(*TargetFunction));
5944
5945 // Handle each call clone, applying ICP so that each clone directly
5946 // calls the specified callee clone, guarded by the appropriate ICP
5947 // check.
5948 CallBase *CBClone = CB;
5949 for (unsigned J = 0; J < NumClones; J++) {
5950 // If the VMap is empty, this clone was a duplicate of another and was
5951 // created as an alias or a declaration.
5952 if (J > 0 && VMaps[J - 1]->empty())
5953 continue;
5954 // Copy 0 is the original function.
5955 if (J > 0)
5956 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5957 // We do the promotion using the original name, so that the comparison
5958 // is against the name in the vtable. Then just below, change the new
5959 // direct call to call the cloned function.
5960 auto &DirectCall =
5961 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
5962 TotalCount, isSamplePGO, &ORE);
5963 auto *TargetToUse = TargetFunction;
5964 // Call original if this version calls the original version of its
5965 // callee.
5966 if (StackNode.Clones[J]) {
5967 TargetToUse =
5968 cast<Function>(M.getOrInsertFunction(
5969 getMemProfFuncName(TargetFunction->getName(),
5970 StackNode.Clones[J]),
5971 TargetFunction->getFunctionType())
5972 .getCallee());
5973 }
5974 DirectCall.setCalledFunction(TargetToUse);
5975 // During matching we generate synthetic VP metadata for indirect calls
5976 // not already having any, from the memprof profile's callee GUIDs. If
5977 // we subsequently promote and inline those callees, we currently lose
5978 // the ability to generate this synthetic VP metadata. Optionally apply
5979 // a noinline attribute to promoted direct calls, where the threshold is
5980 // set to capture synthetic VP metadata targets which get a count of 1.
5981        if (MemProfICPNoInlineThreshold &&
5982            Candidate.Count < MemProfICPNoInlineThreshold)
5983 DirectCall.setIsNoInline();
5984 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5985 << ore::NV("Call", CBClone) << " in clone "
5986 << ore::NV("Caller", CBClone->getFunction())
5987 << " promoted and assigned to call function clone "
5988 << ore::NV("Callee", TargetToUse));
5989 }
5990
5991 // Update TotalCount (all clones should get same count above)
5992 TotalCount -= Candidate.Count;
5993 NumPromoted++;
5994 }
5995 // Adjust the MD.prof metadata for all clones, now that we have the new
5996 // TotalCount and the number promoted.
5997 CallBase *CBClone = CB;
5998 for (unsigned J = 0; J < NumClones; J++) {
5999 // If the VMap is empty, this clone was a duplicate of another and was
6000 // created as an alias or a declaration.
6001 if (J > 0 && VMaps[J - 1]->empty())
6002 continue;
6003 // Copy 0 is the original function.
6004 if (J > 0)
6005 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6006 // First delete the old one.
6007 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
6008 // If all promoted, we don't need the MD.prof metadata.
6009      // Otherwise we need to update it with the remaining un-promoted records.
6010 if (TotalCount != 0)
6011        annotateValueSite(
6012            M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
6013 TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
6014 }
6015 }
6016}
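// Shape of the transformation applied above for a single candidate
// (illustrative names): an indirect call in caller clone 1, whose profiled
// target T maps to callee clone 2 (StackNode.Clones[1] == 2), becomes roughly
//   if (callee == T)        // promoteIndirectCall compares against the
//     T.memprof.2(args);    // original T (what the vtable holds), then the
//   else                    // new direct call is redirected to the clone
//     callee(args);         // remaining indirect call, !prof updated below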
6017
6018template <typename DerivedCCG, typename FuncTy, typename CallTy>
6019bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
6020 if (DumpCCG) {
6021 dbgs() << "CCG before cloning:\n";
6022 dbgs() << *this;
6023 }
6024 if (ExportToDot)
6025 exportToDot("postbuild");
6026
6027 if (VerifyCCG) {
6028 check();
6029 }
6030
6031 identifyClones();
6032
6033 if (VerifyCCG) {
6034 check();
6035 }
6036
6037 if (DumpCCG) {
6038 dbgs() << "CCG after cloning:\n";
6039 dbgs() << *this;
6040 }
6041 if (ExportToDot)
6042 exportToDot("cloned");
6043
6044 bool Changed = assignFunctions();
6045
6046 if (DumpCCG) {
6047 dbgs() << "CCG after assigning function clones:\n";
6048 dbgs() << *this;
6049 }
6050 if (ExportToDot)
6051 exportToDot("clonefuncassign");
6052
6054 printTotalSizes(errs());
6055
6056 return Changed;
6057}
6058
6059bool MemProfContextDisambiguation::processModule(
6060 Module &M,
6061 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
6062
6063 // If we have an import summary, then the cloning decisions were made during
6064 // the thin link on the index. Apply them and return.
6065 if (ImportSummary)
6066 return applyImport(M);
6067
6068 // TODO: If/when other types of memprof cloning are enabled beyond just for
6069 // hot and cold, we will need to change this to individually control the
6070 // AllocationType passed to addStackNodesForMIB during CCG construction.
6071 // Note that we specifically check this after applying imports above, so that
6072 // the option isn't needed to be passed to distributed ThinLTO backend
6073 // clang processes, which won't necessarily have visibility into the linker
6074 // dependences. Instead the information is communicated from the LTO link to
6075 // the backends via the combined summary index.
6076 if (!SupportsHotColdNew)
6077 return false;
6078
6079 ModuleCallsiteContextGraph CCG(M, OREGetter);
6080 return CCG.process();
6081}
6082
6083MemProfContextDisambiguation::MemProfContextDisambiguation(
6084    const ModuleSummaryIndex *Summary, bool isSamplePGO)
6085 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
6086 // Check the dot graph printing options once here, to make sure we have valid
6087 // and expected combinations.
6088 if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
6089    report_fatal_error(
6090        "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
6091  if (DotGraphScope == DotScope::Context &&
6092      !ContextIdForDot.getNumOccurrences())
6093    report_fatal_error(
6094        "-memprof-dot-scope=context requires -memprof-dot-context-id");
6095 if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
6096 ContextIdForDot.getNumOccurrences())
6097    report_fatal_error(
6098        "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
6099 "-memprof-dot-context-id");
6100 if (ImportSummary) {
6101 // The MemProfImportSummary should only be used for testing ThinLTO
6102 // distributed backend handling via opt, in which case we don't have a
6103 // summary from the pass pipeline.
6104    assert(MemProfImportSummary.empty());
6105    return;
6106 }
6107 if (MemProfImportSummary.empty())
6108 return;
6109
6110  auto ReadSummaryFile =
6111      errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
6112 if (!ReadSummaryFile) {
6113 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
6114 "Error loading file '" + MemProfImportSummary +
6115 "': ");
6116 return;
6117 }
6118 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
6119 if (!ImportSummaryForTestingOrErr) {
6120 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
6121 "Error parsing file '" + MemProfImportSummary +
6122 "': ");
6123 return;
6124 }
6125 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
6126 ImportSummary = ImportSummaryForTesting.get();
6127}
6128
6129PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
6130                                                    ModuleAnalysisManager &AM) {
6131  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
6132 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
6133 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
6134 };
6135 if (!processModule(M, OREGetter))
6136 return PreservedAnalyses::all();
6137 return PreservedAnalyses::none();
6138}
6139
6140void MemProfContextDisambiguation::run(
6141    ModuleSummaryIndex &Index,
6142    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
6143 isPrevailing) {
6144 // TODO: If/when other types of memprof cloning are enabled beyond just for
6145 // hot and cold, we will need to change this to individually control the
6146 // AllocationType passed to addStackNodesForMIB during CCG construction.
6147 // The index was set from the option, so these should be in sync.
6148 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
6149 if (!SupportsHotColdNew)
6150 return;
6151
6152 IndexCallsiteContextGraph CCG(Index, isPrevailing);
6153 CCG.process();
6154}