1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
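//
// Illustrative sketch (not part of this file): an allocation call carrying
// MemProf profile information might look roughly like
//   %call = call ptr @_Znam(i64 10), !memprof !0, !callsite !5
//   !0 = !{!1, !3}                     ; list of MIBs
//   !1 = !{!2, !"notcold"}             ; MIB: call stack + allocation type
//   !2 = !{i64 123, i64 456}           ; stack ids (hypothetical values)
//   !3 = !{!4, !"cold"}
//   !4 = !{i64 123, i64 789}
//   !5 = !{i64 123}                    ; callsite stack id(s)
// After context disambiguation, each (possibly cloned) allocation call is
// annotated with a "memprof" string attribute (e.g. "memprof"="cold") for the
// later transformation mentioned above.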
22
23#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/Module.h"
39#include "llvm/Pass.h"
44#include "llvm/Transforms/IPO.h"
48#include <deque>
49#include <sstream>
50#include <unordered_map>
51#include <vector>
52using namespace llvm;
53using namespace llvm::memprof;
54
55#define DEBUG_TYPE "memprof-context-disambiguation"
56
57STATISTIC(FunctionClonesAnalysis,
58 "Number of function clones created during whole program analysis");
59STATISTIC(FunctionClonesThinBackend,
60 "Number of function clones created during ThinLTO backend");
61STATISTIC(FunctionsClonedThinBackend,
62 "Number of functions that had clones created during ThinLTO backend");
63STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
64 "cloned) during whole program analysis");
65STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
66 "during whole program analysis");
67STATISTIC(AllocTypeNotColdThinBackend,
68 "Number of not cold static allocations (possibly cloned) during "
69 "ThinLTO backend");
70STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
71 "(possibly cloned) during ThinLTO backend");
72STATISTIC(OrigAllocsThinBackend,
73 "Number of original (not cloned) allocations with memprof profiles "
74 "during ThinLTO backend");
75STATISTIC(
76 AllocVersionsThinBackend,
77 "Number of allocation versions (including clones) during ThinLTO backend");
78STATISTIC(MaxAllocVersionsThinBackend,
79 "Maximum number of allocation versions created for an original "
80 "allocation during ThinLTO backend");
81STATISTIC(UnclonableAllocsThinBackend,
82 "Number of unclonable ambigous allocations during ThinLTO backend");
83STATISTIC(RemovedEdgesWithMismatchedCallees,
84 "Number of edges removed due to mismatched callees (profiled vs IR)");
85STATISTIC(FoundProfiledCalleeCount,
86 "Number of profiled callees found via tail calls");
87STATISTIC(FoundProfiledCalleeDepth,
88 "Aggregate depth of profiled callees found via tail calls");
89STATISTIC(FoundProfiledCalleeMaxDepth,
90 "Maximum depth of profiled callees found via tail calls");
91STATISTIC(FoundProfiledCalleeNonUniquelyCount,
92 "Number of profiled callees found via multiple tail call chains");
93STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
94STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
95STATISTIC(NonNewMergedNodes, "Number of non new nodes used during merging");
96STATISTIC(MissingAllocForContextId,
97 "Number of missing alloc nodes for context ids");
98STATISTIC(SkippedCallsCloning,
99 "Number of calls skipped during cloning due to unexpected operand");
100STATISTIC(MismatchedCloneAssignments,
101 "Number of callsites assigned to call multiple non-matching clones");
102STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
103STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
104STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
105
107 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
108 cl::value_desc("filename"),
109 cl::desc("Specify the path prefix of the MemProf dot files."));
110
111static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
112 cl::Hidden,
113 cl::desc("Export graph to dot files."));
114
115// TODO: Remove this option once new handling is validated more widely.
117 "memprof-merge-iteration", cl::init(true), cl::Hidden,
118 cl::desc("Iteratively apply merging on a node to catch new callers"));
119
120// How much of the graph to export to dot.
121enum class DotScope {
122 All, // The full CCG graph.
123 Alloc, // Only contexts for the specified allocation.
124 Context, // Only the specified context.
125};
126
128 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
131 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
132 clEnumValN(DotScope::Alloc, "alloc",
133 "Export only nodes with contexts feeding given "
134 "-memprof-dot-alloc-id"),
135 clEnumValN(DotScope::Context, "context",
136 "Export only nodes with given -memprof-dot-context-id")));
137
138static cl::opt<unsigned>
139 AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
140 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
141 "or to highlight if -memprof-dot-scope=all"));
142
144 "memprof-dot-context-id", cl::init(0), cl::Hidden,
145 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
146 "highlight otherwise"));
147
148static cl::opt<bool>
149 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
150 cl::desc("Dump CallingContextGraph to stdout after each stage."));
151
152static cl::opt<bool>
153 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
154 cl::desc("Perform verification checks on CallingContextGraph."));
155
156static cl::opt<bool>
157 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
158 cl::desc("Perform frequent verification checks on nodes."));
159
161 "memprof-import-summary",
162 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
163 cl::Hidden);
164
165static cl::opt<unsigned>
166 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
167 cl::Hidden,
168 cl::desc("Max depth to recursively search for missing "
169 "frames through tail calls."));
170
171// Optionally enable cloning of callsites involved with recursive cycles
173 "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
174 cl::desc("Allow cloning of callsites involved in recursive cycles"));
175
177 "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
178 cl::desc("Allow cloning of contexts through recursive cycles"));
179
180// Generally this is needed for correct assignment of allocation clones to
181// function clones; however, allow it to be disabled for debugging while the
182// functionality is new and being tested more widely.
183static cl::opt<bool>
184 MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
185 cl::desc("Merge clones before assigning functions"));
186
187// When disabled, try to detect and prevent cloning of recursive contexts.
188// This is only necessary until we support cloning through recursive cycles.
189// Leave on by default for now, as disabling requires a little bit of compile
190// time overhead and doesn't affect correctness; it will just inflate the cold
191// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
193 "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
194 cl::desc("Allow cloning of contexts having recursive cycles"));
195
196// Set the minimum absolute count threshold for allowing inlining of indirect
197// calls promoted during cloning.
199 "memprof-icp-noinline-threshold", cl::init(2), cl::Hidden,
200 cl::desc("Minimum absolute count for promoted target to be inlinable"));
201
202namespace llvm {
204 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
205 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
206
207// Indicate we are linking with an allocator that supports hot/cold operator
208// new interfaces.
210 "supports-hot-cold-new", cl::init(false), cl::Hidden,
211 cl::desc("Linking with hot/cold operator new interfaces"));
212
214 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
215 cl::desc(
216 "Require target function definition when promoting indirect calls"));
217} // namespace llvm
218
221
222namespace {
223/// CRTP base for graphs built from either IR or ThinLTO summary index.
224///
225/// The graph represents the call contexts in all memprof metadata on allocation
226/// calls, with nodes for the allocations themselves, as well as for the calls
227/// in each context. The graph is initially built from the allocation memprof
228/// metadata (or summary) MIBs. It is then updated to match calls with callsite
229/// metadata onto the nodes, updating it to reflect any inlining performed on
230/// those calls.
231///
232/// Each MIB (representing an allocation's call context with allocation
233/// behavior) is assigned a unique context id during the graph build. The edges
234/// and nodes in the graph are decorated with the context ids they carry. This
235/// is used to correctly update the graph when cloning is performed so that we
236/// can uniquify the context for a single (possibly cloned) allocation.
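///
/// For example (illustrative only): if an allocation in function C is reached
/// through two profiled contexts A->B->C (cold) and D->B->C (not cold), the
/// two MIBs get distinct context ids. Cloning B and C along the cold context
/// isolates that id onto the clones, so the cloned allocation call can be
/// marked cold while the original remains not cold.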
237template <typename DerivedCCG, typename FuncTy, typename CallTy>
238class CallsiteContextGraph {
239public:
240 CallsiteContextGraph() = default;
241 CallsiteContextGraph(const CallsiteContextGraph &) = default;
242 CallsiteContextGraph(CallsiteContextGraph &&) = default;
243
244 /// Main entry point to perform analysis and transformations on graph.
245 bool process();
246
247 /// Perform cloning on the graph necessary to uniquely identify the allocation
248 /// behavior of an allocation based on its context.
249 void identifyClones();
250
251 /// Assign callsite clones to functions, cloning functions as needed to
252 /// accommodate the combinations of their callsite clones reached by callers.
253 /// For regular LTO this clones functions and callsites in the IR, but for
254 /// ThinLTO the cloning decisions are noted in the summaries and later applied
255 /// in applyImport.
256 bool assignFunctions();
257
258 void dump() const;
259 void print(raw_ostream &OS) const;
260 void printTotalSizes(raw_ostream &OS) const;
261
262 friend raw_ostream &operator<<(raw_ostream &OS,
263 const CallsiteContextGraph &CCG) {
264 CCG.print(OS);
265 return OS;
266 }
267
268 friend struct GraphTraits<
269 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
270 friend struct DOTGraphTraits<
271 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
272
273 void exportToDot(std::string Label) const;
274
275 /// Represents a function clone via FuncTy pointer and clone number pair.
276 struct FuncInfo final
277 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
278 using Base = std::pair<FuncTy *, unsigned>;
279 FuncInfo(const Base &B) : Base(B) {}
280 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
281 explicit operator bool() const { return this->first != nullptr; }
282 FuncTy *func() const { return this->first; }
283 unsigned cloneNo() const { return this->second; }
284 };
285
286 /// Represents a callsite clone via CallTy and clone number pair.
287 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
288 using Base = std::pair<CallTy, unsigned>;
289 CallInfo(const Base &B) : Base(B) {}
290 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
291 : Base(Call, CloneNo) {}
292 explicit operator bool() const { return (bool)this->first; }
293 CallTy call() const { return this->first; }
294 unsigned cloneNo() const { return this->second; }
295 void setCloneNo(unsigned N) { this->second = N; }
296 void print(raw_ostream &OS) const {
297 if (!operator bool()) {
298 assert(!cloneNo());
299 OS << "null Call";
300 return;
301 }
302 call()->print(OS);
303 OS << "\t(clone " << cloneNo() << ")";
304 }
305 void dump() const {
306 print(dbgs());
307 dbgs() << "\n";
308 }
309 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
310 Call.print(OS);
311 return OS;
312 }
313 };
314
315 struct ContextEdge;
316
317 /// Node in the Callsite Context Graph
318 struct ContextNode {
319 // Keep this for now since in the IR case where we have an Instruction* it
320 // is not as immediately discoverable. Used for printing richer information
321 // when dumping graph.
322 bool IsAllocation;
323
324 // Keeps track of when the Call was reset to null because there was
325 // recursion.
326 bool Recursive = false;
327
328 // This will be formed by ORing together the AllocationType enum values
329 // for contexts including this node.
330 uint8_t AllocTypes = 0;
331
332 // The corresponding allocation or interior call. This is the primary call
333 // for which we have created this node.
334 CallInfo Call;
335
336 // List of other calls that can be treated the same as the primary call
337 // through cloning. I.e. located in the same function and have the same
338 // (possibly pruned) stack ids. They will be updated the same way as the
339 // primary call when assigning to function clones.
340 SmallVector<CallInfo, 0> MatchingCalls;
341
342 // For alloc nodes this is a unique id assigned when constructed, and for
343 // callsite stack nodes it is the original stack id when the node is
344 // constructed from the memprof MIB metadata on the alloc nodes. Note that
345 // this is only used when matching callsite metadata onto the stack nodes
346 // created when processing the allocation memprof MIBs, and for labeling
347 // nodes in the dot graph. Therefore we don't bother to assign a value for
348 // clones.
349 uint64_t OrigStackOrAllocId = 0;
350
351 // Edges to all callees in the profiled call stacks.
352 // TODO: Should this be a map (from Callee node) for more efficient lookup?
353 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
354
355 // Edges to all callers in the profiled call stacks.
356 // TODO: Should this be a map (from Caller node) for more efficient lookup?
357 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
358
359 // Returns true if we need to look at the callee edges for determining the
360 // node context ids and allocation type.
361 bool useCallerEdgesForContextInfo() const {
362 // Typically if the callee edges are empty either the caller edges are
363 // also empty, or this is an allocation (leaf node). However, if we are
364 // allowing recursive callsites and contexts this will be violated for
365 // incompletely cloned recursive cycles.
366 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
367 (AllowRecursiveCallsites && AllowRecursiveContexts));
368 // When cloning for a recursive context, during cloning we might be in the
369 // midst of cloning for a recurrence and have moved context ids off of a
370 // caller edge onto the clone but not yet off of the incoming caller
371 // (back) edge. If we don't look at those we miss the fact that this node
372 // still has context ids of interest.
373 return IsAllocation || CloneRecursiveContexts;
374 }
375
376 // Compute the context ids for this node from the union of its edge context
377 // ids.
378 DenseSet<uint32_t> getContextIds() const {
379 unsigned Count = 0;
380 // Compute the number of ids for reserve below. In general we only need to
381 // look at one set of edges, typically the callee edges, since other than
382 // allocations and in some cases during recursion cloning, all the context
383 // ids on the callers should also flow out via callee edges.
384 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
385 Count += Edge->getContextIds().size();
386 DenseSet<uint32_t> ContextIds;
387 ContextIds.reserve(Count);
388 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
389 CalleeEdges, useCallerEdgesForContextInfo()
390 ? CallerEdges
391 : std::vector<std::shared_ptr<ContextEdge>>());
392 for (const auto &Edge : Edges)
393 ContextIds.insert_range(Edge->getContextIds());
394 return ContextIds;
395 }
396
397 // Compute the allocation type for this node from the OR of its edge
398 // allocation types.
399 uint8_t computeAllocType() const {
400 uint8_t BothTypes =
401 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
402 uint8_t AllocType = (uint8_t)AllocationType::None;
403 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
404 CalleeEdges, useCallerEdgesForContextInfo()
405 ? CallerEdges
406 : std::vector<std::shared_ptr<ContextEdge>>());
407 for (const auto &Edge : Edges) {
408 AllocType |= Edge->AllocTypes;
409 // Bail early if alloc type reached both, no further refinement.
410 if (AllocType == BothTypes)
411 return AllocType;
412 }
413 return AllocType;
414 }
415
416 // The context ids set for this node is empty if its edge context ids are
417 // also all empty.
418 bool emptyContextIds() const {
419 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
420 CalleeEdges, useCallerEdgesForContextInfo()
421 ? CallerEdges
422 : std::vector<std::shared_ptr<ContextEdge>>());
423 for (const auto &Edge : Edges) {
424 if (!Edge->getContextIds().empty())
425 return false;
426 }
427 return true;
428 }
429
430 // List of clones of this ContextNode, initially empty.
431 std::vector<ContextNode *> Clones;
432
433 // If a clone, points to the original uncloned node.
434 ContextNode *CloneOf = nullptr;
435
436 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
437
438 ContextNode(bool IsAllocation, CallInfo C)
439 : IsAllocation(IsAllocation), Call(C) {}
440
441 void addClone(ContextNode *Clone) {
442 if (CloneOf) {
443 CloneOf->Clones.push_back(Clone);
444 Clone->CloneOf = CloneOf;
445 } else {
446 Clones.push_back(Clone);
447 assert(!Clone->CloneOf);
448 Clone->CloneOf = this;
449 }
450 }
451
452 ContextNode *getOrigNode() {
453 if (!CloneOf)
454 return this;
455 return CloneOf;
456 }
457
458 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
459 unsigned int ContextId);
460
461 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
462 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
463 void eraseCalleeEdge(const ContextEdge *Edge);
464 void eraseCallerEdge(const ContextEdge *Edge);
465
466 void setCall(CallInfo C) { Call = C; }
467
468 bool hasCall() const { return (bool)Call.call(); }
469
470 void printCall(raw_ostream &OS) const { Call.print(OS); }
471
472 // True if this node was effectively removed from the graph, in which case
473 // it should have an allocation type of None and empty context ids.
474 bool isRemoved() const {
475 // Typically if the callee edges are empty either the caller edges are
476 // also empty, or this is an allocation (leaf node). However, if we are
477 // allowing recursive callsites and contexts this will be violated for
478 // incompletely cloned recursive cycles.
479 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
480 (AllocTypes == (uint8_t)AllocationType::None) ==
481 emptyContextIds());
482 return AllocTypes == (uint8_t)AllocationType::None;
483 }
484
485 void dump() const;
486 void print(raw_ostream &OS) const;
487
488 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
489 Node.print(OS);
490 return OS;
491 }
492 };
493
494 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
495 /// callee.
496 struct ContextEdge {
497 ContextNode *Callee;
498 ContextNode *Caller;
499
500 // This will be formed by ORing together the AllocationType enum values
501 // for contexts including this edge.
502 uint8_t AllocTypes = 0;
503
504 // Set just before initiating cloning when cloning of recursive contexts is
505 // enabled. Used to defer cloning of backedges until we have done cloning of
506 // the callee node for non-backedge caller edges. This exposes cloning
507 // opportunities through the backedge of the cycle.
508 // TODO: Note that this is not updated during cloning, and it is unclear
509 // whether that would be needed.
510 bool IsBackedge = false;
511
512 // The set of IDs for contexts including this edge.
513 DenseSet<uint32_t> ContextIds;
514
515 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
516 DenseSet<uint32_t> ContextIds)
517 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
518 ContextIds(std::move(ContextIds)) {}
519
520 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
521
522 // Helper to clear the fields of this edge when we are removing it from the
523 // graph.
524 inline void clear() {
525 ContextIds.clear();
526 AllocTypes = (uint8_t)AllocationType::None;
527 Caller = nullptr;
528 Callee = nullptr;
529 }
530
531 // Check if edge was removed from the graph. This is useful while iterating
532 // over a copy of edge lists when performing operations that mutate the
533 // graph in ways that might remove one of the edges.
534 inline bool isRemoved() const {
535 if (Callee || Caller)
536 return false;
537 // Any edges that have been removed from the graph but are still in a
538 // shared_ptr somewhere should have all fields null'ed out by clear()
539 // above.
540 assert(AllocTypes == (uint8_t)AllocationType::None);
541 assert(ContextIds.empty());
542 return true;
543 }
544
545 void dump() const;
546 void print(raw_ostream &OS) const;
547
548 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
549 Edge.print(OS);
550 return OS;
551 }
552 };
553
554 /// Helpers to remove edges that have allocation type None (due to not
555 /// carrying any context ids) after transformations.
556 void removeNoneTypeCalleeEdges(ContextNode *Node);
557 void removeNoneTypeCallerEdges(ContextNode *Node);
558 void
559 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
560 DenseSet<const ContextNode *> &Visited);
561
562protected:
563 /// Get a list of nodes corresponding to the stack ids in the given callsite
564 /// context.
565 template <class NodeT, class IteratorT>
566 std::vector<uint64_t>
567 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
568
569 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
570 /// metadata (or summary).
571 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
572
573 /// Adds nodes for the given MIB stack ids.
574 template <class NodeT, class IteratorT>
575 void addStackNodesForMIB(ContextNode *AllocNode,
576 CallStack<NodeT, IteratorT> &StackContext,
577 CallStack<NodeT, IteratorT> &CallsiteContext,
578 AllocationType AllocType,
579 ArrayRef<ContextTotalSize> ContextSizeInfo);
580
581 /// Matches all callsite metadata (or summary) to the nodes created for
582 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
583 /// inlining performed on those callsite instructions.
584 void updateStackNodes();
585
586 /// Update graph to conservatively handle any callsite stack nodes that target
587 /// multiple different callee functions.
588 void handleCallsitesWithMultipleTargets();
589
590 /// Mark backedges via the standard DFS based backedge algorithm.
591 void markBackedges();
592
593 /// Merge clones generated during cloning for different allocations but that
594 /// are called by the same caller node, to ensure proper function assignment.
595 void mergeClones();
596
597 // Try to partition calls on the given node (already placed into the AllCalls
598 // array) by callee function, creating new copies of Node as needed to hold
599 // calls with different callees, and moving the callee edges appropriately.
600 // Returns true if partitioning was successful.
601 bool partitionCallsByCallee(
602 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
603 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
604
605 /// Save lists of calls with MemProf metadata in each function, for faster
606 /// iteration.
607 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
608
609 /// Map from callsite node to the enclosing caller function.
610 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
611
612 // When exporting to dot, and an allocation id is specified, contains the
613 // context ids on that allocation.
614 DenseSet<uint32_t> DotAllocContextIds;
615
616private:
617 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
618
619 // Structure to keep track of information for each call as we are matching
620 // non-allocation callsites onto context nodes created from the allocation
621 // call metadata / summary contexts.
622 struct CallContextInfo {
623 // The callsite we're trying to match.
624 CallTy Call;
625 // The callsites stack ids that have a context node in the graph.
626 std::vector<uint64_t> StackIds;
627 // The function containing this callsite.
628 const FuncTy *Func;
629 // Initially empty, if needed this will be updated to contain the context
630 // ids for use in a new context node created for this callsite.
631 DenseSet<uint32_t> ContextIds;
632 };
633
634 /// Helper to remove edge from graph, updating edge iterator if it is provided
635 /// (in which case CalleeIter indicates which edge list is being iterated).
636 /// This will also perform the necessary clearing of the ContextEdge members
637 /// to enable later checking if the edge has been removed (since we may have
638 /// other copies of the shared_ptr in existence, and in fact rely on this to
639 /// enable removal while iterating over a copy of a node's edge list).
640 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
641 bool CalleeIter = true);
642
643 /// Assigns the given Node to calls at or inlined into the location with
644 /// the Node's stack id, after post order traversing and processing its
645 /// caller nodes. Uses the call information recorded in the given
646 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
647 /// as needed. Called by updateStackNodes which sets up the given
648 /// StackIdToMatchingCalls map.
649 void assignStackNodesPostOrder(
650 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
651 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
652 DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
653
654 /// Duplicates the given set of context ids, updating the provided
655 /// map from each original id with the newly generated context ids,
656 /// and returning the new duplicated id set.
657 DenseSet<uint32_t> duplicateContextIds(
658 const DenseSet<uint32_t> &StackSequenceContextIds,
659 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
660
661 /// Propagates all duplicated context ids across the graph.
662 void propagateDuplicateContextIds(
663 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
664
665 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
666 /// else to its callers. Also updates OrigNode's edges to remove any context
667 /// ids moved to the newly created edge.
668 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
669 bool TowardsCallee,
670 DenseSet<uint32_t> RemainingContextIds);
671
672 /// Get the stack id corresponding to the given Id or Index (for IR this will
673 /// return itself, for a summary index this will return the id recorded in the
674 /// index for that stack id index value).
675 uint64_t getStackId(uint64_t IdOrIndex) const {
676 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
677 }
678
679 /// Returns true if the given call targets the callee of the given edge, or if
680 /// we were able to identify the call chain through intermediate tail calls.
681 /// In the latter case new context nodes are added to the graph for the
682 /// identified tail calls, and their synthesized nodes are added to
683 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
684 /// the updated edges and to prepare it for an increment in the caller.
685 bool
686 calleesMatch(CallTy Call, EdgeIter &EI,
687 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
688
689 // Return the callee function of the given call, or nullptr if it can't be
690 // determined
691 const FuncTy *getCalleeFunc(CallTy Call) {
692 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
693 }
694
695 /// Returns true if the given call targets the given function, or if we were
696 /// able to identify the call chain through intermediate tail calls (in which
697 /// case FoundCalleeChain will be populated).
698 bool calleeMatchesFunc(
699 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
700 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
701 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
702 Call, Func, CallerFunc, FoundCalleeChain);
703 }
704
705 /// Returns true if both call instructions have the same callee.
706 bool sameCallee(CallTy Call1, CallTy Call2) {
707 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
708 }
709
710 /// Get a list of nodes corresponding to the stack ids in the given
711 /// callsite's context.
712 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
713 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
714 Call);
715 }
716
717 /// Get the last stack id in the context for callsite.
718 uint64_t getLastStackId(CallTy Call) {
719 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
720 }
721
722 /// Update the allocation call to record type of allocated memory.
723 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
724 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
725 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
726 }
727
728 /// Get the AllocationType assigned to the given allocation instruction clone.
729 AllocationType getAllocationCallType(const CallInfo &Call) const {
730 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
731 }
732
733 /// Update non-allocation call to invoke (possibly cloned) function
734 /// CalleeFunc.
735 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
736 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
737 }
738
739 /// Clone the given function for the given callsite, recording mapping of all
740 /// of the functions tracked calls to their new versions in the CallMap.
741 /// Assigns new clones to clone number CloneNo.
742 FuncInfo cloneFunctionForCallsite(
743 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
744 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
745 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
746 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
747 }
748
749 /// Gets a label to use in the dot graph for the given call clone in the given
750 /// function.
751 std::string getLabel(const FuncTy *Func, const CallTy Call,
752 unsigned CloneNo) const {
753 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
754 }
755
756 // Create and return a new ContextNode.
757 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
758 CallInfo C = CallInfo()) {
759 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
760 auto *NewNode = NodeOwner.back().get();
761 if (F)
762 NodeToCallingFunc[NewNode] = F;
763 return NewNode;
764 }
765
766 /// Helpers to find the node corresponding to the given call or stackid.
767 ContextNode *getNodeForInst(const CallInfo &C);
768 ContextNode *getNodeForAlloc(const CallInfo &C);
769 ContextNode *getNodeForStackId(uint64_t StackId);
770
771 /// Computes the alloc type corresponding to the given context ids, by
772 /// unioning their recorded alloc types.
773 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
774
775 /// Returns the allocation type of the intersection of the contexts of two
776 /// nodes (based on their provided context id sets), optimized for the case
777 /// when Node1Ids is smaller than Node2Ids.
778 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
779 const DenseSet<uint32_t> &Node2Ids) const;
780
781 /// Returns the allocation type of the intersection of the contexts of two
782 /// nodes (based on their provided context id sets).
783 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
784 const DenseSet<uint32_t> &Node2Ids) const;
785
786 /// Create a clone of Edge's callee and move Edge to that new callee node,
787 /// performing the necessary context id and allocation type updates.
788 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
789 /// moved to an edge to the new callee.
790 ContextNode *
791 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
792 DenseSet<uint32_t> ContextIdsToMove = {});
793
794 /// Change the callee of Edge to existing callee clone NewCallee, performing
795 /// the necessary context id and allocation type updates.
796 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
797 /// moved to an edge to the new callee.
798 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
799 ContextNode *NewCallee,
800 bool NewClone = false,
801 DenseSet<uint32_t> ContextIdsToMove = {});
802
803 /// Change the caller of the edge at the given callee edge iterator to be
804 /// NewCaller, performing the necessary context id and allocation type
805 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
806 /// a simplified version of it as we always move the given edge and all of its
807 /// context ids.
808 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
809 ContextNode *NewCaller);
810
811 /// Recursive helper for marking backedges via DFS.
812 void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
813 DenseSet<const ContextNode *> &CurrentStack);
814
815 /// Recursive helper for merging clones.
816 void
817 mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
818 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
819 /// Main worker for merging callee clones for a given node.
820 void mergeNodeCalleeClones(
821 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
822 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
823 /// Helper to find other callers of the given set of callee edges that can
824 /// share the same callee merge node.
825 void findOtherCallersToShareMerge(
826 ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
827 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
828 DenseSet<ContextNode *> &OtherCallersToShareMerge);
829
830 /// Recursively perform cloning on the graph for the given Node and its
831 /// callers, in order to uniquely identify the allocation behavior of an
832 /// allocation given its context. The context ids of the allocation being
833 /// processed are given in AllocContextIds.
834 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
835 const DenseSet<uint32_t> &AllocContextIds);
836
837 /// Map from each context ID to the AllocationType assigned to that context.
838 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
839
840 /// Map from each contextID to the profiled full contexts and their total
841 /// sizes (there may be more than one due to context trimming),
842 /// optionally populated when requested (via MemProfReportHintedSizes or
843 /// MinClonedColdBytePercent).
844 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
845
846 /// Identifies the context node created for a stack id when adding the MIB
847 /// contexts to the graph. This is used to locate the context nodes when
848 /// trying to assign the corresponding callsites with those stack ids to these
849 /// nodes.
850 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
851
852 /// Maps to track the calls to their corresponding nodes in the graph.
853 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
854 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
855
856 /// Owner of all ContextNode unique_ptrs.
857 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
858
859 /// Perform sanity checks on graph when requested.
860 void check() const;
861
862 /// Keeps track of the last unique context id assigned.
863 unsigned int LastContextId = 0;
864};
865
866template <typename DerivedCCG, typename FuncTy, typename CallTy>
867using ContextNode =
868 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
869template <typename DerivedCCG, typename FuncTy, typename CallTy>
870using ContextEdge =
871 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
872template <typename DerivedCCG, typename FuncTy, typename CallTy>
873using FuncInfo =
874 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
875template <typename DerivedCCG, typename FuncTy, typename CallTy>
876using CallInfo =
877 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
878
879/// CRTP derived class for graphs built from IR (regular LTO).
880class ModuleCallsiteContextGraph
881 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
882 Instruction *> {
883public:
884 ModuleCallsiteContextGraph(
885 Module &M,
886 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
887
888private:
889 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
890 Instruction *>;
891
892 uint64_t getStackId(uint64_t IdOrIndex) const;
893 const Function *getCalleeFunc(Instruction *Call);
894 bool calleeMatchesFunc(
895 Instruction *Call, const Function *Func, const Function *CallerFunc,
896 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
897 bool sameCallee(Instruction *Call1, Instruction *Call2);
898 bool findProfiledCalleeThroughTailCalls(
899 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
900 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
901 bool &FoundMultipleCalleeChains);
902 uint64_t getLastStackId(Instruction *Call);
903 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
904 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
905 AllocationType getAllocationCallType(const CallInfo &Call) const;
906 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
907 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
908 Instruction *>::FuncInfo
909 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
910 DenseMap<CallInfo, CallInfo> &CallMap,
911 std::vector<CallInfo> &CallsWithMetadataInFunc,
912 unsigned CloneNo);
913 std::string getLabel(const Function *Func, const Instruction *Call,
914 unsigned CloneNo) const;
915
916 const Module &Mod;
917 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
918};
919
920/// Represents a call in the summary index graph, which can either be an
921/// allocation or an interior callsite node in an allocation's context.
922/// Holds a pointer to the corresponding data structure in the index.
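/// Note that operator-> below returns the IndexCall itself, so templated graph
/// code written against the IR CallTy (an Instruction * dereferenced via ->)
/// also works when instantiated with this value type.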
923struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
924 IndexCall() : PointerUnion() {}
925 IndexCall(std::nullptr_t) : IndexCall() {}
926 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
927 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
928 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
929
930 IndexCall *operator->() { return this; }
931
932 void print(raw_ostream &OS) const {
933 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
934 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) {
935 OS << *AI;
936 } else {
937 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base);
938 assert(CI);
939 OS << *CI;
940 }
941 }
942};
943} // namespace
944
945namespace llvm {
946template <> struct simplify_type<IndexCall> {
947 using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
948 static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
949};
950template <> struct simplify_type<const IndexCall> {
951 using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
952 static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
953};
954} // namespace llvm
955
956namespace {
957/// CRTP derived class for graphs built from summary index (ThinLTO).
958class IndexCallsiteContextGraph
959 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
960 IndexCall> {
961public:
962 IndexCallsiteContextGraph(
963 ModuleSummaryIndex &Index,
964 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
965 isPrevailing);
966
967 ~IndexCallsiteContextGraph() {
968 // Now that we are done with the graph it is safe to add the new
969 // CallsiteInfo structs to the function summary vectors. The graph nodes
970 // point into locations within these vectors, so we don't want to add them
971 // any earlier.
972 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
973 auto *FS = I.first;
974 for (auto &Callsite : I.second)
975 FS->addCallsite(*Callsite.second);
976 }
977 }
978
979private:
980 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
981 IndexCall>;
982
983 uint64_t getStackId(uint64_t IdOrIndex) const;
984 const FunctionSummary *getCalleeFunc(IndexCall &Call);
985 bool calleeMatchesFunc(
986 IndexCall &Call, const FunctionSummary *Func,
987 const FunctionSummary *CallerFunc,
988 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
989 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
990 bool findProfiledCalleeThroughTailCalls(
991 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
992 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
993 bool &FoundMultipleCalleeChains);
994 uint64_t getLastStackId(IndexCall &Call);
995 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
996 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
997 AllocationType getAllocationCallType(const CallInfo &Call) const;
998 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
999 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1000 IndexCall>::FuncInfo
1001 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
1002 DenseMap<CallInfo, CallInfo> &CallMap,
1003 std::vector<CallInfo> &CallsWithMetadataInFunc,
1004 unsigned CloneNo);
1005 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
1006 unsigned CloneNo) const;
1007
1008 // Saves mapping from function summaries containing memprof records back to
1009 // their VI, for use in checking and debugging.
1010 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1011
1012 const ModuleSummaryIndex &Index;
1013 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1014 isPrevailing;
1015
1016 // Saves/owns the callsite info structures synthesized for missing tail call
1017 // frames that we discover while building the graph.
1018 // It maps from the summary of the function making the tail call, to a map
1019 // of callee ValueInfo to corresponding synthesized callsite info.
1020 std::unordered_map<FunctionSummary *,
1021 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1022 FunctionCalleesToSynthesizedCallsiteInfos;
1023};
1024} // namespace
1025
1026namespace llvm {
1027template <>
1028struct DenseMapInfo<typename CallsiteContextGraph<
1029 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
1030 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
1031template <>
1032struct DenseMapInfo<typename CallsiteContextGraph<
1033 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
1034 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
1035template <>
1036struct DenseMapInfo<IndexCall>
1037 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1038} // end namespace llvm
1039
1040namespace {
1041
1042// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1043// type we should actually use on the corresponding allocation.
1044// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1045// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1046// from NotCold.
1047AllocationType allocTypeToUse(uint8_t AllocTypes) {
1048 assert(AllocTypes != (uint8_t)AllocationType::None);
1049 if (AllocTypes ==
1050 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1051 return AllocationType::NotCold;
1052 else
1053 return (AllocationType)AllocTypes;
1054}
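// For example, allocTypeToUse((uint8_t)AllocationType::NotCold |
// (uint8_t)AllocationType::Cold) returns NotCold, matching the fallback used
// when an allocation with both types cannot be cloned apart.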
1055
1056// Helper to check if the alloc types for all edges recorded in the
1057// InAllocTypes vector match the alloc types for all edges in the Edges
1058// vector.
1059template <typename DerivedCCG, typename FuncTy, typename CallTy>
1060bool allocTypesMatch(
1061 const std::vector<uint8_t> &InAllocTypes,
1062 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1063 &Edges) {
1064 // This should be called only when the InAllocTypes vector was computed for
1065 // this set of Edges. Make sure the sizes are the same.
1066 assert(InAllocTypes.size() == Edges.size());
1067 return std::equal(
1068 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1069 [](const uint8_t &l,
1070 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1071 // Can share if one of the edges is None type - don't
1072 // care about the type along that edge as it doesn't
1073 // exist for those context ids.
1074 if (l == (uint8_t)AllocationType::None ||
1075 r->AllocTypes == (uint8_t)AllocationType::None)
1076 return true;
1077 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
1078 });
1079}
1080
1081// Helper to check if the alloc types for all edges recorded in the
1082// InAllocTypes vector match the alloc types for callee edges in the given
1083// clone. Because the InAllocTypes were computed from the original node's callee
1084// edges, and other cloning could have happened after this clone was created, we
1085// need to find the matching clone callee edge, which may or may not exist.
1086template <typename DerivedCCG, typename FuncTy, typename CallTy>
1087bool allocTypesMatchClone(
1088 const std::vector<uint8_t> &InAllocTypes,
1089 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1090 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1091 assert(Node);
1092 // InAllocTypes should have been computed for the original node's callee
1093 // edges.
1094 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1095 // First create a map of the clone callee edge callees to the edge alloc type.
1096 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
1097 EdgeCalleeMap;
1098 for (const auto &E : Clone->CalleeEdges) {
1099 assert(!EdgeCalleeMap.contains(E->Callee));
1100 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1101 }
1102 // Next, walk the original node's callees, and look for the corresponding
1103 // clone edge to that callee.
1104 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1105 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1106 // Not found is ok, we will simply add an edge if we use this clone.
1107 if (Iter == EdgeCalleeMap.end())
1108 continue;
1109 // Can share if one of the edges is None type - don't
1110 // care about the type along that edge as it doesn't
1111 // exist for those context ids.
1112 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1113 Iter->second == (uint8_t)AllocationType::None)
1114 continue;
1115 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1116 return false;
1117 }
1118 return true;
1119}
1120
1121} // end anonymous namespace
1122
1123template <typename DerivedCCG, typename FuncTy, typename CallTy>
1124typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1125CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1126 const CallInfo &C) {
1127 ContextNode *Node = getNodeForAlloc(C);
1128 if (Node)
1129 return Node;
1130
1131 return NonAllocationCallToContextNodeMap.lookup(C);
1132}
1133
1134template <typename DerivedCCG, typename FuncTy, typename CallTy>
1135typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1136CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1137 const CallInfo &C) {
1138 return AllocationCallToContextNodeMap.lookup(C);
1139}
1140
1141template <typename DerivedCCG, typename FuncTy, typename CallTy>
1142typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1143CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1144 uint64_t StackId) {
1145 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1146 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1147 return StackEntryNode->second;
1148 return nullptr;
1149}
1150
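// Adds the given context id and allocation type to an existing caller edge
// from this node to Caller if one exists, otherwise creates a new edge and
// links it into both nodes' edge lists. Called while building the graph from
// the MIB contexts (see addStackNodesForMIB further below).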
1151template <typename DerivedCCG, typename FuncTy, typename CallTy>
1152void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1153 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1154 unsigned int ContextId) {
1155 for (auto &Edge : CallerEdges) {
1156 if (Edge->Caller == Caller) {
1157 Edge->AllocTypes |= (uint8_t)AllocType;
1158 Edge->getContextIds().insert(ContextId);
1159 return;
1160 }
1161 }
1162 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1163 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1164 CallerEdges.push_back(Edge);
1165 Caller->CalleeEdges.push_back(Edge);
1166}
1167
1168template <typename DerivedCCG, typename FuncTy, typename CallTy>
1169void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1170 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1171 assert(!EI || (*EI)->get() == Edge);
1172 assert(!Edge->isRemoved());
1173 // Save the Caller and Callee pointers so we can erase Edge from their edge
1174 // lists after clearing Edge below. We do the clearing first in case it is
1175 // destructed after removing from the edge lists (if those were the last
1176 // shared_ptr references to Edge).
1177 auto *Callee = Edge->Callee;
1178 auto *Caller = Edge->Caller;
1179
1180 // Make sure the edge fields are cleared out so we can properly detect
1181 // removed edges if Edge is not destructed because there is still a shared_ptr
1182 // reference.
1183 Edge->clear();
1184
1185#ifndef NDEBUG
1186 auto CalleeCallerCount = Callee->CallerEdges.size();
1187 auto CallerCalleeCount = Caller->CalleeEdges.size();
1188#endif
1189 if (!EI) {
1190 Callee->eraseCallerEdge(Edge);
1191 Caller->eraseCalleeEdge(Edge);
1192 } else if (CalleeIter) {
1193 Callee->eraseCallerEdge(Edge);
1194 *EI = Caller->CalleeEdges.erase(*EI);
1195 } else {
1196 Caller->eraseCalleeEdge(Edge);
1197 *EI = Callee->CallerEdges.erase(*EI);
1198 }
1199 assert(Callee->CallerEdges.size() < CalleeCallerCount);
1200 assert(Caller->CalleeEdges.size() < CallerCalleeCount);
1201}
1202
1203template <typename DerivedCCG, typename FuncTy, typename CallTy>
1204void CallsiteContextGraph<
1205 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1206 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1207 auto Edge = *EI;
1208 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1209 assert(Edge->ContextIds.empty());
1210 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1211 } else
1212 ++EI;
1213 }
1214}
1215
1216template <typename DerivedCCG, typename FuncTy, typename CallTy>
1217void CallsiteContextGraph<
1218 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1219 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1220 auto Edge = *EI;
1221 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1222 assert(Edge->ContextIds.empty());
1223 Edge->Caller->eraseCalleeEdge(Edge.get());
1224 EI = Node->CallerEdges.erase(EI);
1225 } else
1226 ++EI;
1227 }
1228}
1229
1230template <typename DerivedCCG, typename FuncTy, typename CallTy>
1231typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1232CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1233 findEdgeFromCallee(const ContextNode *Callee) {
1234 for (const auto &Edge : CalleeEdges)
1235 if (Edge->Callee == Callee)
1236 return Edge.get();
1237 return nullptr;
1238}
1239
1240template <typename DerivedCCG, typename FuncTy, typename CallTy>
1241typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1242CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1243 findEdgeFromCaller(const ContextNode *Caller) {
1244 for (const auto &Edge : CallerEdges)
1245 if (Edge->Caller == Caller)
1246 return Edge.get();
1247 return nullptr;
1248}
1249
1250template <typename DerivedCCG, typename FuncTy, typename CallTy>
1251void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1252 eraseCalleeEdge(const ContextEdge *Edge) {
1253 auto EI = llvm::find_if(
1254 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1255 return CalleeEdge.get() == Edge;
1256 });
1257 assert(EI != CalleeEdges.end());
1258 CalleeEdges.erase(EI);
1259}
1260
1261template <typename DerivedCCG, typename FuncTy, typename CallTy>
1262void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1263 eraseCallerEdge(const ContextEdge *Edge) {
1264 auto EI = llvm::find_if(
1265 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1266 return CallerEdge.get() == Edge;
1267 });
1268 assert(EI != CallerEdges.end());
1269 CallerEdges.erase(EI);
1270}
1271
1272template <typename DerivedCCG, typename FuncTy, typename CallTy>
1273uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1274 DenseSet<uint32_t> &ContextIds) const {
1275 uint8_t BothTypes =
1276 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1277 uint8_t AllocType = (uint8_t)AllocationType::None;
1278 for (auto Id : ContextIds) {
1279 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1280 // Bail early if alloc type reached both, no further refinement.
1281 if (AllocType == BothTypes)
1282 return AllocType;
1283 }
1284 return AllocType;
1285}
1286
1287template <typename DerivedCCG, typename FuncTy, typename CallTy>
1288uint8_t
1289CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1290 const DenseSet<uint32_t> &Node1Ids,
1291 const DenseSet<uint32_t> &Node2Ids) const {
1292 uint8_t BothTypes =
1293 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1294 uint8_t AllocType = (uint8_t)AllocationType::None;
1295 for (auto Id : Node1Ids) {
1296 if (!Node2Ids.count(Id))
1297 continue;
1298 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1299 // Bail early if alloc type reached both, no further refinement.
1300 if (AllocType == BothTypes)
1301 return AllocType;
1302 }
1303 return AllocType;
1304}
1305
1306template <typename DerivedCCG, typename FuncTy, typename CallTy>
1307uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1308 const DenseSet<uint32_t> &Node1Ids,
1309 const DenseSet<uint32_t> &Node2Ids) const {
1310 if (Node1Ids.size() < Node2Ids.size())
1311 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1312 else
1313 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1314}
1315
1316template <typename DerivedCCG, typename FuncTy, typename CallTy>
1317typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1318CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1319 CallInfo Call, const FuncTy *F) {
1320 assert(!getNodeForAlloc(Call));
1321 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1322 AllocationCallToContextNodeMap[Call] = AllocNode;
1323 // Use LastContextId as a unique id for MIB allocation nodes.
1324 AllocNode->OrigStackOrAllocId = LastContextId;
1325 // Alloc type should be updated as we add in the MIBs. We should assert
1326 // afterwards that it is not still None.
1327 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1328
1329 return AllocNode;
1330}
1331
1332static std::string getAllocTypeString(uint8_t AllocTypes) {
1333 if (!AllocTypes)
1334 return "None";
1335 std::string Str;
1336 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1337 Str += "NotCold";
1338 if (AllocTypes & (uint8_t)AllocationType::Cold)
1339 Str += "Cold";
1340 return Str;
1341}
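// Note that when both type bits are set this yields the combined string
// "NotColdCold", which is how ambiguous allocation types show up when
// printing nodes and edges.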
1342
1343template <typename DerivedCCG, typename FuncTy, typename CallTy>
1344template <class NodeT, class IteratorT>
1345void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1346 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1347 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1348 ArrayRef<ContextTotalSize> ContextSizeInfo) {
1349 // Treat the hot alloc type as NotCold until the disambiguation for "hot"
1350 // is done.
1351 if (AllocType == AllocationType::Hot)
1352 AllocType = AllocationType::NotCold;
1353
1354 ContextIdToAllocationType[++LastContextId] = AllocType;
1355
1356 if (!ContextSizeInfo.empty()) {
1357 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1358 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1359 }
1360
1361 // Update alloc type and context ids for this MIB.
1362 AllocNode->AllocTypes |= (uint8_t)AllocType;
1363
1364 // Now add or update nodes for each stack id in alloc's context.
1365 // Later when processing the stack ids on non-alloc callsites we will adjust
1366 // for any inlining in the context.
1367 ContextNode *PrevNode = AllocNode;
1368 // Look for recursion (direct recursion should have been collapsed by
1369 // module summary analysis; here we should just be detecting mutual
1370 // recursion). Mark these nodes so we don't try to clone.
1371 SmallSet<uint64_t, 8> StackIdSet;
1372 // Skip any stack ids that are on the allocation call itself (due to inlining).
1373 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1374 ContextIter != StackContext.end(); ++ContextIter) {
1375 auto StackId = getStackId(*ContextIter);
1376 ContextNode *StackNode = getNodeForStackId(StackId);
1377 if (!StackNode) {
1378 StackNode = createNewNode(/*IsAllocation=*/false);
1379 StackEntryIdToContextNodeMap[StackId] = StackNode;
1380 StackNode->OrigStackOrAllocId = StackId;
1381 }
1382 // Marking a node recursive will prevent its cloning completely, even for
1383 // non-recursive contexts flowing through it.
1384 if (!AllowRecursiveCallsites) {
1385 auto Ins = StackIdSet.insert(StackId);
1386 if (!Ins.second)
1387 StackNode->Recursive = true;
1388 }
1389 StackNode->AllocTypes |= (uint8_t)AllocType;
1390 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1391 PrevNode = StackNode;
1392 }
1393}
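// Rough sketch of the loop above with hypothetical stack ids: for an MIB
// whose context (after the shared allocation-call prefix) is [B, C], with C
// the outermost caller, a fresh context id is assigned, stack nodes are
// created or reused for B and C, and caller edges are added from AllocNode to
// B and from B to C, each edge carrying the new context id and alloc type.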
1394
1395template <typename DerivedCCG, typename FuncTy, typename CallTy>
1396DenseSet<uint32_t>
1397CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1398 const DenseSet<uint32_t> &StackSequenceContextIds,
1399 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1400 DenseSet<uint32_t> NewContextIds;
1401 for (auto OldId : StackSequenceContextIds) {
1402 NewContextIds.insert(++LastContextId);
1403 OldToNewContextIds[OldId].insert(LastContextId);
1404 assert(ContextIdToAllocationType.count(OldId));
1405 // The new context has the same allocation type as original.
1406 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1407 if (DotAllocContextIds.contains(OldId))
1408 DotAllocContextIds.insert(LastContextId);
1409 }
1410 return NewContextIds;
1411}
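// For example, if StackSequenceContextIds were the hypothetical set {3, 7},
// the helper above would mint two fresh ids (say 101 and 102), record the
// mappings 3 -> {101} and 7 -> {102} in OldToNewContextIds, and give each new
// id the same allocation type as the id it duplicates.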
1412
1413template <typename DerivedCCG, typename FuncTy, typename CallTy>
1414void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1415 propagateDuplicateContextIds(
1416 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1417 // Build a set of duplicated context ids corresponding to the input id set.
1418 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1419 DenseSet<uint32_t> NewIds;
1420 for (auto Id : ContextIds)
1421 if (auto NewId = OldToNewContextIds.find(Id);
1422 NewId != OldToNewContextIds.end())
1423 NewIds.insert_range(NewId->second);
1424 return NewIds;
1425 };
1426
1427 // Recursively update context ids sets along caller edges.
1428 auto UpdateCallers = [&](ContextNode *Node,
1429 DenseSet<const ContextEdge *> &Visited,
1430 auto &&UpdateCallers) -> void {
1431 for (const auto &Edge : Node->CallerEdges) {
1432 auto Inserted = Visited.insert(Edge.get());
1433 if (!Inserted.second)
1434 continue;
1435 ContextNode *NextNode = Edge->Caller;
1436 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1437 // Only need to recursively iterate to NextNode via this caller edge if
1438 // it resulted in any added ids to NextNode.
1439 if (!NewIdsToAdd.empty()) {
1440 Edge->getContextIds().insert_range(NewIdsToAdd);
1441 UpdateCallers(NextNode, Visited, UpdateCallers);
1442 }
1443 }
1444 };
1445
1446 DenseSet<const ContextEdge *> Visited;
1447 for (auto &Entry : AllocationCallToContextNodeMap) {
1448 auto *Node = Entry.second;
1449 UpdateCallers(Node, Visited, UpdateCallers);
1450 }
1451}
1452
1453template <typename DerivedCCG, typename FuncTy, typename CallTy>
1454void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1455 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1456 // This must be passed by value to make a copy since it will be adjusted
1457 // as ids are moved.
1458 DenseSet<uint32_t> RemainingContextIds) {
1459 auto &OrigEdges =
1460 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1461 DenseSet<uint32_t> RecursiveContextIds;
1462 DenseSet<uint32_t> AllCallerContextIds;
1463 if (AllowRecursiveCallsites) {
1464 // Identify which context ids are recursive, which is needed to properly
1465 // update the RemainingContextIds set. The relevant recursive context ids
1466 // are those that appear in multiple edges.
1467 for (auto &CE : OrigEdges) {
1468 AllCallerContextIds.reserve(CE->getContextIds().size());
1469 for (auto Id : CE->getContextIds())
1470 if (!AllCallerContextIds.insert(Id).second)
1471 RecursiveContextIds.insert(Id);
1472 }
1473 }
1474 // Increment iterator in loop so that we can remove edges as needed.
1475 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1476 auto Edge = *EI;
1477 DenseSet<uint32_t> NewEdgeContextIds;
1478 DenseSet<uint32_t> NotFoundContextIds;
1479 // Remove any matching context ids from Edge, returning the set that were
1480 // found and removed; these become the new edge's context ids. Also update
1481 // the remaining (not found) ids.
1482 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1483 NotFoundContextIds);
1484 // Update the remaining context ids set for the later edges. This is a
1485 // compile time optimization.
1486 if (RecursiveContextIds.empty()) {
1487 // No recursive ids, so all of the previously remaining context ids that
1488 // were not seen on this edge are the new remaining set.
1489 RemainingContextIds.swap(NotFoundContextIds);
1490 } else {
1491 // Keep the recursive ids in the remaining set as we expect to see those
1492 // on another edge. We can remove the non-recursive remaining ids that
1493 // were seen on this edge, however. We already have the set of remaining
1494 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1495 // non-recursive and only remove those. Note that despite the higher
1496 // overhead of updating the remaining context ids set when recursion
1497 // handling is enabled, it was found to be at worst performance neutral
1498 // and in one case a clear win.
1499 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1500 set_difference(NewEdgeContextIds, RecursiveContextIds);
1501 set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
1502 }
1503 // If no matching context ids for this edge, skip it.
1504 if (NewEdgeContextIds.empty()) {
1505 ++EI;
1506 continue;
1507 }
1508 if (TowardsCallee) {
1509 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1510 auto NewEdge = std::make_shared<ContextEdge>(
1511 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1512 NewNode->CalleeEdges.push_back(NewEdge);
1513 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1514 } else {
1515 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1516 auto NewEdge = std::make_shared<ContextEdge>(
1517 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1518 NewNode->CallerEdges.push_back(NewEdge);
1519 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1520 }
1521 // Remove old edge if context ids empty.
1522 if (Edge->getContextIds().empty()) {
1523 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1524 continue;
1525 }
1526 ++EI;
1527 }
1528}
1529
1530template <typename DerivedCCG, typename FuncTy, typename CallTy>
1531static void checkEdge(
1532 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1533 // Confirm that alloc type is not None and that we have at least one context
1534 // id.
1535 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1536 assert(!Edge->ContextIds.empty());
1537}
1538
1539template <typename DerivedCCG, typename FuncTy, typename CallTy>
1540static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1541 bool CheckEdges = true) {
1542 if (Node->isRemoved())
1543 return;
1544#ifndef NDEBUG
1545 // Compute node's context ids once for use in asserts.
1546 auto NodeContextIds = Node->getContextIds();
1547#endif
1548 // Node's context ids should be the union of both its callee and caller edge
1549 // context ids.
1550 if (Node->CallerEdges.size()) {
1551 DenseSet<uint32_t> CallerEdgeContextIds(
1552 Node->CallerEdges.front()->ContextIds);
1553 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1554 if (CheckEdges)
1555 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1556 set_union(CallerEdgeContextIds, Edge->ContextIds);
1557 }
1558 // Node can have more context ids than callers if some contexts terminate at
1559 // node and some are longer. If we are allowing recursive callsites and
1560 // contexts this will be violated for incompletely cloned recursive cycles,
1561 // so skip the checking in that case.
1562 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1563 NodeContextIds == CallerEdgeContextIds ||
1564 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1565 }
1566 if (Node->CalleeEdges.size()) {
1567 DenseSet<uint32_t> CalleeEdgeContextIds(
1568 Node->CalleeEdges.front()->ContextIds);
1569 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1570 if (CheckEdges)
1571 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1572 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1573 }
1574 // If we are allowing recursive callsites and contexts this will be violated
1575 // for incompletely cloned recursive cycles, so skip the checking in that
1576 // case.
1577 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1578 NodeContextIds == CalleeEdgeContextIds);
1579 }
1580 // FIXME: Since this checking is only invoked under an option, we should
1581 // change the error checking from using assert to something that will trigger
1582 // an error on a release build.
1583#ifndef NDEBUG
1584 // Make sure we don't end up with duplicate edges between the same caller and
1585 // callee.
1586 DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1587 for (const auto &E : Node->CalleeEdges)
1588 NodeSet.insert(E->Callee);
1589 assert(NodeSet.size() == Node->CalleeEdges.size());
1590#endif
1591}
1592
1593template <typename DerivedCCG, typename FuncTy, typename CallTy>
1594void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1595 assignStackNodesPostOrder(
1596 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1597 DenseMap<uint64_t, std::vector<CallContextInfo>>
1598 &StackIdToMatchingCalls,
1599 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1600 auto Inserted = Visited.insert(Node);
1601 if (!Inserted.second)
1602 return;
1603 // Post order traversal. Iterate over a copy since we may add nodes and
1604 // therefore new callers during the recursive call, invalidating any
1605 // iterator over the original edge vector. We don't need to process these
1606 // new nodes as they were already processed on creation.
1607 auto CallerEdges = Node->CallerEdges;
1608 for (auto &Edge : CallerEdges) {
1609 // Skip any that have been removed during the recursion.
1610 if (Edge->isRemoved()) {
1611 assert(!is_contained(Node->CallerEdges, Edge));
1612 continue;
1613 }
1614 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1615 CallToMatchingCall);
1616 }
1617
1618 // If this node's stack id is in the map, update the graph to contain new
1619 // nodes representing any inlining at interior callsites. Note we move the
1620 // associated context ids over to the new nodes.
1621
1622 // Ignore this node if it is for an allocation or we didn't record any
1623 // stack id lists ending at it.
1624 if (Node->IsAllocation ||
1625 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1626 return;
1627
1628 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1629 // Handle the simple case first. A single call with a single stack id.
1630 // In this case there is no need to create any new context nodes, simply
1631 // assign the context node for stack id to this Call.
1632 if (Calls.size() == 1) {
1633 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1634 if (Ids.size() == 1) {
1635 assert(SavedContextIds.empty());
1636 // It should be this Node
1637 assert(Node == getNodeForStackId(Ids[0]));
1638 if (Node->Recursive)
1639 return;
1640 Node->setCall(Call);
1641 NonAllocationCallToContextNodeMap[Call] = Node;
1642 NodeToCallingFunc[Node] = Func;
1643 return;
1644 }
1645 }
1646
1647#ifndef NDEBUG
1648 // Find the node for the last stack id, which should be the same
1649 // across all calls recorded for this id, and is this node's id.
1650 uint64_t LastId = Node->OrigStackOrAllocId;
1651 ContextNode *LastNode = getNodeForStackId(LastId);
1652 // We should only have kept stack ids that had nodes.
1653 assert(LastNode);
1654 assert(LastNode == Node);
1655#else
1656 ContextNode *LastNode = Node;
1657#endif
1658
1659 // Compute the last node's context ids once, as it is shared by all calls in
1660 // this entry.
1661 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1662
1663 [[maybe_unused]] bool PrevIterCreatedNode = false;
1664 bool CreatedNode = false;
1665 for (unsigned I = 0; I < Calls.size();
1666 I++, PrevIterCreatedNode = CreatedNode) {
1667 CreatedNode = false;
1668 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1669 // Skip any for which we didn't assign any ids; these don't get a node in
1670 // the graph.
1671 if (SavedContextIds.empty()) {
1672 // If this call has a matching call (located in the same function and
1673 // having the same stack ids), simply add it to the context node created
1674 // for its matching call earlier. These can be treated the same through
1675 // cloning and get updated at the same time.
1676 if (!CallToMatchingCall.contains(Call))
1677 continue;
1678 auto MatchingCall = CallToMatchingCall[Call];
1679 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1680 // This should only happen if we had a prior iteration, and it didn't
1681 // create a node because of the below recomputation of context ids
1682 // finding none remaining and continuing early.
1683 assert(I > 0 && !PrevIterCreatedNode);
1684 continue;
1685 }
1686 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1687 Call);
1688 continue;
1689 }
1690
1691 assert(LastId == Ids.back());
1692
1693 // Recompute the context ids for this stack id sequence (the
1694 // intersection of the context ids of the corresponding nodes).
1695 // Start with the ids we saved in the map for this call, which could be
1696 // duplicated context ids. We have to recompute as we might have overlap
1697 // between the saved context ids for different last nodes, and may have
1698 // already removed some of them during the post order traversal.
1699 set_intersect(SavedContextIds, LastNodeContextIds);
1700 ContextNode *PrevNode = LastNode;
1701 bool Skip = false;
1702 // Iterate backwards through the stack Ids, starting after the last Id
1703 // in the list, which was handled once outside for all Calls.
1704 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1705 auto Id = *IdIter;
1706 ContextNode *CurNode = getNodeForStackId(Id);
1707 // We should only have kept stack ids that had nodes and weren't
1708 // recursive.
1709 assert(CurNode);
1710 assert(!CurNode->Recursive);
1711
1712 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1713 if (!Edge) {
1714 Skip = true;
1715 break;
1716 }
1717 PrevNode = CurNode;
1718
1719 // Update the context ids, which is the intersection of the ids along
1720 // all edges in the sequence.
1721 set_intersect(SavedContextIds, Edge->getContextIds());
1722
1723 // If we now have no context ids for clone, skip this call.
1724 if (SavedContextIds.empty()) {
1725 Skip = true;
1726 break;
1727 }
1728 }
1729 if (Skip)
1730 continue;
1731
1732 // Create new context node.
1733 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1734 NonAllocationCallToContextNodeMap[Call] = NewNode;
1735 CreatedNode = true;
1736 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1737
1738 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1739 assert(FirstNode);
1740
1741 // Connect to callees of innermost stack frame in inlined call chain.
1742 // This updates context ids for FirstNode's callees to reflect those
1743 // moved to NewNode.
1744 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1745
1746 // Connect to callers of outermost stack frame in inlined call chain.
1747 // This updates context ids for LastNode's callers to reflect those
1748 // moved to NewNode.
1749 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1750
1751 // Now we need to remove context ids from edges/nodes between First and
1752 // Last Node.
1753 PrevNode = nullptr;
1754 for (auto Id : Ids) {
1755 ContextNode *CurNode = getNodeForStackId(Id);
1756 // We should only have kept stack ids that had nodes.
1757 assert(CurNode);
1758
1759 // Remove the context ids moved to NewNode from CurNode, and the
1760 // edge from the prior node.
1761 if (PrevNode) {
1762 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1763 // If the sequence contained recursion, we might have already removed
1764 // some edges during the connectNewNode calls above.
1765 if (!PrevEdge) {
1766 PrevNode = CurNode;
1767 continue;
1768 }
1769 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1770 if (PrevEdge->getContextIds().empty())
1771 removeEdgeFromGraph(PrevEdge);
1772 }
1773 // Since we update the edges from leaf to tail, only look at the callee
1774 // edges. This isn't an alloc node, so if there are no callee edges, the
1775 // alloc type is None.
1776 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1777 ? (uint8_t)AllocationType::None
1778 : CurNode->computeAllocType();
1779 PrevNode = CurNode;
1780 }
1781 if (VerifyNodes) {
1782 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1783 for (auto Id : Ids) {
1784 ContextNode *CurNode = getNodeForStackId(Id);
1785 // We should only have kept stack ids that had nodes.
1786 assert(CurNode);
1787 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1788 }
1789 }
1790 }
1791}
1792
1793template <typename DerivedCCG, typename FuncTy, typename CallTy>
1794void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1795 // Map of stack id to all calls with that as the last (outermost caller)
1796 // callsite id that has a context node (some might not due to pruning
1797 // performed during matching of the allocation profile contexts).
1798 // The CallContextInfo contains the Call and a list of its stack ids with
1799 // ContextNodes, the function containing Call, and the set of context ids
1800 // the analysis will eventually identify for use in any new node created
1801 // for that callsite.
1802 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1803 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1804 for (auto &Call : CallsWithMetadata) {
1805 // Ignore allocations, already handled.
1806 if (AllocationCallToContextNodeMap.count(Call))
1807 continue;
1808 auto StackIdsWithContextNodes =
1809 getStackIdsWithContextNodesForCall(Call.call());
1810 // If there were no nodes created for MIBs on allocs (maybe this was in
1811 // the unambiguous part of the MIB stack that was pruned), ignore.
1812 if (StackIdsWithContextNodes.empty())
1813 continue;
1814 // Otherwise, record this Call along with the list of ids for the last
1815 // (outermost caller) stack id with a node.
1816 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1817 {Call.call(), StackIdsWithContextNodes, Func, {}});
1818 }
1819 }
1820
1821 // First make a pass through all stack ids that correspond to a call,
1822 // as identified in the above loop. Compute the context ids corresponding to
1823 // each of these calls when they correspond to multiple stack ids due to
1824 // inlining. Perform any duplication of context ids required when
1825 // there is more than one call with the same stack ids. Their (possibly newly
1826 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1827 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1828 // Save a map from each call to any that are found to match it. I.e. located
1829 // in the same function and have the same (possibly pruned) stack ids. We use
1830 // this to avoid creating extra graph nodes as they can be treated the same.
1831 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1832 for (auto &It : StackIdToMatchingCalls) {
1833 auto &Calls = It.getSecond();
1834 // Skip single calls with a single stack id. These don't need a new node.
1835 if (Calls.size() == 1) {
1836 auto &Ids = Calls[0].StackIds;
1837 if (Ids.size() == 1)
1838 continue;
1839 }
1840 // In order to do the best and maximal matching of inlined calls to context
1841 // node sequences we will sort the vectors of stack ids in descending order
1842 // of length, and within each length, lexicographically by stack id. The
1843 // latter is so that we can specially handle calls that have identical stack
1844 // id sequences (either due to cloning or artificially because of the MIB
1845 // context pruning). Those with the same Ids are then sorted by function to
1846 // facilitate efficiently mapping them to the same context node.
1847 // Because the functions are pointers, to ensure a stable sort first assign
1848 // each function pointer to its first index in the Calls array, and then use
1849 // that to sort by.
1850 DenseMap<const FuncTy *, unsigned> FuncToIndex;
1851 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1852 FuncToIndex.insert({CallCtxInfo.Func, Idx});
1853 llvm::stable_sort(
1854 Calls,
1855 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1856 return A.StackIds.size() > B.StackIds.size() ||
1857 (A.StackIds.size() == B.StackIds.size() &&
1858 (A.StackIds < B.StackIds ||
1859 (A.StackIds == B.StackIds &&
1860 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1861 });
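// For example, given hypothetical stack id sequences [5, 9, 12], [7, 12],
// [9, 12], and a second [9, 12] from a different function, the ordering above
// yields [5, 9, 12] first (longest), then [7, 12] before [9, 12]
// (lexicographically smaller), with the two [9, 12] entries adjacent and
// ordered by the first index of their containing function in Calls.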
1862
1863 // Find the node for the last stack id, which should be the same
1864 // across all calls recorded for this id, and is the id for this
1865 // entry in the StackIdToMatchingCalls map.
1866 uint64_t LastId = It.getFirst();
1867 ContextNode *LastNode = getNodeForStackId(LastId);
1868 // We should only have kept stack ids that had nodes.
1869 assert(LastNode);
1870
1871 if (LastNode->Recursive)
1872 continue;
1873
1874 // Initialize the context ids with the last node's. We will subsequently
1875 // refine the context ids by computing the intersection along all edges.
1876 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1877 assert(!LastNodeContextIds.empty());
1878
1879#ifndef NDEBUG
1880 // Save the set of functions seen for a particular set of the same stack
1881 // ids. This is used to ensure that they have been correctly sorted to be
1882 // adjacent in the Calls list, since we rely on that to efficiently place
1883 // all such matching calls onto the same context node.
1884 DenseSet<const FuncTy *> MatchingIdsFuncSet;
1885#endif
1886
1887 for (unsigned I = 0; I < Calls.size(); I++) {
1888 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1889 assert(SavedContextIds.empty());
1890 assert(LastId == Ids.back());
1891
1892#ifndef NDEBUG
1893 // If this call has a different set of ids than the last one, clear the
1894 // set used to ensure they are sorted properly.
1895 if (I > 0 && Ids != Calls[I - 1].StackIds)
1896 MatchingIdsFuncSet.clear();
1897#endif
1898
1899 // First compute the context ids for this stack id sequence (the
1900 // intersection of the context ids of the corresponding nodes).
1901 // Start with the remaining saved ids for the last node.
1902 assert(!LastNodeContextIds.empty());
1903 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1904
1905 ContextNode *PrevNode = LastNode;
1906 ContextNode *CurNode = LastNode;
1907 bool Skip = false;
1908
1909 // Iterate backwards through the stack Ids, starting after the last Id
1910 // in the list, which was handled once outside for all Calls.
1911 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1912 auto Id = *IdIter;
1913 CurNode = getNodeForStackId(Id);
1914 // We should only have kept stack ids that had nodes.
1915 assert(CurNode);
1916
1917 if (CurNode->Recursive) {
1918 Skip = true;
1919 break;
1920 }
1921
1922 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1923 // If there is no edge then the nodes belong to different MIB contexts,
1924 // and we should skip this inlined context sequence. For example, this
1925 // particular inlined context may include stack ids A->B, and we may
1926 // indeed have nodes for both A and B, but it is possible that they were
1927 // never profiled in sequence in a single MIB for any allocation (i.e.
1928 // we might have profiled an allocation that involves the callsite A,
1929 // but through a different one of its callee callsites, and we might
1930 // have profiled an allocation that involves callsite B, but reached
1931 // from a different caller callsite).
1932 if (!Edge) {
1933 Skip = true;
1934 break;
1935 }
1936 PrevNode = CurNode;
1937
1938 // Update the context ids, which is the intersection of the ids along
1939 // all edges in the sequence.
1940 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1941
1942 // If we now have no context ids for clone, skip this call.
1943 if (StackSequenceContextIds.empty()) {
1944 Skip = true;
1945 break;
1946 }
1947 }
1948 if (Skip)
1949 continue;
1950
1951 // If some of this call's stack ids did not have corresponding nodes (due
1952 // to pruning), don't include any context ids for contexts that extend
1953 // beyond these nodes. Otherwise we would be matching part of unrelated /
1954 // not fully matching stack contexts. To do this, subtract any context ids
1955 // found in caller nodes of the last node found above.
1956 if (Ids.back() != getLastStackId(Call)) {
1957 for (const auto &PE : LastNode->CallerEdges) {
1958 set_subtract(StackSequenceContextIds, PE->getContextIds());
1959 if (StackSequenceContextIds.empty())
1960 break;
1961 }
1962 // If we now have no context ids for clone, skip this call.
1963 if (StackSequenceContextIds.empty())
1964 continue;
1965 }
1966
1967#ifndef NDEBUG
1968 // If the prior call had the same stack ids this set would not be empty.
1969 // Check if we already have a call that "matches" because it is located
1970 // in the same function. If the Calls list was sorted properly we should
1971 // not encounter this situation as all such entries should be adjacent
1972 // and processed in bulk further below.
1973 assert(!MatchingIdsFuncSet.contains(Func));
1974
1975 MatchingIdsFuncSet.insert(Func);
1976#endif
1977
1978 // Check if the next set of stack ids is the same (since the Calls vector
1979 // of tuples is sorted by the stack ids we can just look at the next one).
1980 // If so, save them in the CallToMatchingCall map so that they get
1981 // assigned to the same context node, and skip them.
1982 bool DuplicateContextIds = false;
1983 for (unsigned J = I + 1; J < Calls.size(); J++) {
1984 auto &CallCtxInfo = Calls[J];
1985 auto &NextIds = CallCtxInfo.StackIds;
1986 if (NextIds != Ids)
1987 break;
1988 auto *NextFunc = CallCtxInfo.Func;
1989 if (NextFunc != Func) {
1990 // We have another Call with the same ids but that cannot share this
1991 // node, must duplicate ids for it.
1992 DuplicateContextIds = true;
1993 break;
1994 }
1995 auto &NextCall = CallCtxInfo.Call;
1996 CallToMatchingCall[NextCall] = Call;
1997 // Update I so that it gets incremented correctly to skip this call.
1998 I = J;
1999 }
2000
2001 // If we don't have duplicate context ids, then we can assign all the
2002 // context ids computed for the original node sequence to this call.
2003 // If there are duplicate calls with the same stack ids then we synthesize
2004 // new context ids that are duplicates of the originals. These are
2005 // assigned to SavedContextIds, which is a reference into the map entry
2006 // for this call, allowing us to access these ids later on.
2007 OldToNewContextIds.reserve(OldToNewContextIds.size() +
2008 StackSequenceContextIds.size());
2009 SavedContextIds =
2010 DuplicateContextIds
2011 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2012 : StackSequenceContextIds;
2013 assert(!SavedContextIds.empty());
2014
2015 if (!DuplicateContextIds) {
2016 // Update saved last node's context ids to remove those that are
2017 // assigned to other calls, so that it is ready for the next call at
2018 // this stack id.
2019 set_subtract(LastNodeContextIds, StackSequenceContextIds);
2020 if (LastNodeContextIds.empty())
2021 break;
2022 }
2023 }
2024 }
2025
2026 // Propagate the duplicate context ids over the graph.
2027 propagateDuplicateContextIds(OldToNewContextIds);
2028
2029 if (VerifyCCG)
2030 check();
2031
2032 // Now perform a post-order traversal over the graph, starting with the
2033 // allocation nodes, essentially processing nodes from callers to callees.
2034 // For any that contains an id in the map, update the graph to contain new
2035 // nodes representing any inlining at interior callsites. Note we move the
2036 // associated context ids over to the new nodes.
2037 DenseSet<const ContextNode *> Visited;
2038 for (auto &Entry : AllocationCallToContextNodeMap)
2039 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
2040 CallToMatchingCall);
2041 if (VerifyCCG)
2042 check();
2043}
2044
2045uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2046 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2047 Call->getMetadata(LLVMContext::MD_callsite));
2048 return CallsiteContext.back();
2049}
2050
2051uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2052 assert(isa<CallsiteInfo *>(Call));
2053 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2054 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2055 // Need to convert index into stack id.
2056 return Index.getStackIdAtIndex(CallsiteContext.back());
2057}
2058
2059static const std::string MemProfCloneSuffix = ".memprof.";
2060
2061static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2062 // We use CloneNo == 0 to refer to the original version, which doesn't get
2063 // renamed with a suffix.
2064 if (!CloneNo)
2065 return Base.str();
2066 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2067}
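// For example, getMemProfFuncName("foo", 0) returns "foo", while
// getMemProfFuncName("foo", 2) returns "foo.memprof.2".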
2068
2069static bool isMemProfClone(const Function &F) {
2070 return F.getName().contains(MemProfCloneSuffix);
2071}
2072
2073// Return the clone number of the given function by extracting it from the
2074// memprof suffix. Assumes the caller has already confirmed it is a memprof
2075// clone.
2076static unsigned getMemProfCloneNum(const Function &F) {
2077 assert(isMemProfClone(F));
2078 auto Pos = F.getName().find_last_of('.');
2079 assert(Pos > 0);
2080 unsigned CloneNo;
2081 bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
2082 assert(!Err);
2083 (void)Err;
2084 return CloneNo;
2085}
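// For example, for a function named "foo.memprof.3", isMemProfClone returns
// true and getMemProfCloneNum returns 3.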
2086
2087std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2088 const Instruction *Call,
2089 unsigned CloneNo) const {
2090 return (Twine(Call->getFunction()->getName()) + " -> " +
2091 cast<CallBase>(Call)->getCalledFunction()->getName())
2092 .str();
2093}
2094
2095std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2096 const IndexCall &Call,
2097 unsigned CloneNo) const {
2098 auto VI = FSToVIMap.find(Func);
2099 assert(VI != FSToVIMap.end());
2100 std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo);
2101 if (isa<AllocInfo *>(Call))
2102 return CallerName + " -> alloc";
2103 else {
2104 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
2105 return CallerName + " -> " +
2106 getMemProfFuncName(Callsite->Callee.name(),
2107 Callsite->Clones[CloneNo]);
2108 }
2109}
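// A label produced by the summary-based variant above might look like
// "main -> alloc" for an allocation, or "main.memprof.1 -> bar.memprof.2" for
// a callsite in clone 1 of a caller whose recorded callee resolves to clone 2
// of its target (function names here are hypothetical).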
2110
2111std::vector<uint64_t>
2112ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2113 Instruction *Call) {
2114 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2115 Call->getMetadata(LLVMContext::MD_callsite));
2116 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2117 CallsiteContext);
2118}
2119
2120std::vector<uint64_t>
2121IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2122 assert(isa<CallsiteInfo *>(Call));
2123 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2124 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2125 return getStackIdsWithContextNodes<CallsiteInfo,
2126 SmallVector<unsigned>::const_iterator>(
2127 CallsiteContext);
2128}
2129
2130template <typename DerivedCCG, typename FuncTy, typename CallTy>
2131template <class NodeT, class IteratorT>
2132std::vector<uint64_t>
2133CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2134 CallStack<NodeT, IteratorT> &CallsiteContext) {
2135 std::vector<uint64_t> StackIds;
2136 for (auto IdOrIndex : CallsiteContext) {
2137 auto StackId = getStackId(IdOrIndex);
2138 ContextNode *Node = getNodeForStackId(StackId);
2139 if (!Node)
2140 break;
2141 StackIds.push_back(StackId);
2142 }
2143 return StackIds;
2144}
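// For example, with a callsite context of hypothetical stack ids
// [10, 20, 30] (innermost first) where only 10 and 20 have context nodes, the
// helper above returns [10, 20], stopping at the first id without a node.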
2145
2146ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2147 Module &M,
2148 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2149 : Mod(M), OREGetter(OREGetter) {
2150 for (auto &F : M) {
2151 std::vector<CallInfo> CallsWithMetadata;
2152 for (auto &BB : F) {
2153 for (auto &I : BB) {
2154 if (!isa<CallBase>(I))
2155 continue;
2156 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
2157 CallsWithMetadata.push_back(&I);
2158 auto *AllocNode = addAllocNode(&I, &F);
2159 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
2160 assert(CallsiteMD);
2161 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2162 // Add all of the MIBs and their stack nodes.
2163 for (auto &MDOp : MemProfMD->operands()) {
2164 auto *MIBMD = cast<const MDNode>(MDOp);
2165 std::vector<ContextTotalSize> ContextSizeInfo;
2166 // Collect the context size information if it exists.
2167 if (MIBMD->getNumOperands() > 2) {
2168 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2169 MDNode *ContextSizePair =
2170 dyn_cast<MDNode>(MIBMD->getOperand(I));
2171 assert(ContextSizePair->getNumOperands() == 2);
2172 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2173 ContextSizePair->getOperand(0))
2174 ->getZExtValue();
2175 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2176 ContextSizePair->getOperand(1))
2177 ->getZExtValue();
2178 ContextSizeInfo.push_back({FullStackId, TotalSize});
2179 }
2180 }
2181 MDNode *StackNode = getMIBStackNode(MIBMD);
2182 assert(StackNode);
2183 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2184 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2185 AllocNode, StackContext, CallsiteContext,
2186 getMIBAllocType(MIBMD), ContextSizeInfo);
2187 }
2188 // If exporting the graph to dot and an allocation id of interest was
2189 // specified, record all the context ids for this allocation node.
2190 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2191 DotAllocContextIds = AllocNode->getContextIds();
2192 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2193 // Memprof and callsite metadata on memory allocations no longer
2194 // needed.
2195 I.setMetadata(LLVMContext::MD_memprof, nullptr);
2196 I.setMetadata(LLVMContext::MD_callsite, nullptr);
2197 }
2198 // For callsite metadata, add to list for this function for later use.
2199 else if (I.getMetadata(LLVMContext::MD_callsite)) {
2200 CallsWithMetadata.push_back(&I);
2201 }
2202 }
2203 }
2204 if (!CallsWithMetadata.empty())
2205 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2206 }
2207
2208 if (DumpCCG) {
2209 dbgs() << "CCG before updating call stack chains:\n";
2210 dbgs() << *this;
2211 }
2212
2213 if (ExportToDot)
2214 exportToDot("prestackupdate");
2215
2216 updateStackNodes();
2217
2218 if (ExportToDot)
2219 exportToDot("poststackupdate");
2220
2221 handleCallsitesWithMultipleTargets();
2222
2223 markBackedges();
2224
2225 // Strip off remaining callsite metadata, no longer needed.
2226 for (auto &FuncEntry : FuncToCallsWithMetadata)
2227 for (auto &Call : FuncEntry.second)
2228 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2229}
2230
2231IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2232 ModuleSummaryIndex &Index,
2233 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2234 isPrevailing)
2235 : Index(Index), isPrevailing(isPrevailing) {
2236 for (auto &I : Index) {
2237 auto VI = Index.getValueInfo(I);
2238 for (auto &S : VI.getSummaryList()) {
2239 // We should only add the prevailing nodes. Otherwise we may try to clone
2240 // in a weak copy that won't be linked (and may be different than the
2241 // prevailing version).
2242 // We only keep the memprof summary on the prevailing copy now when
2243 // building the combined index, as a space optimization; however, don't
2244 // rely on this optimization. The linker doesn't resolve local linkage
2245 // values, so don't check whether those are prevailing.
2246 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2247 !isPrevailing(VI.getGUID(), S.get()))
2248 continue;
2249 auto *FS = dyn_cast<FunctionSummary>(S.get());
2250 if (!FS)
2251 continue;
2252 std::vector<CallInfo> CallsWithMetadata;
2253 if (!FS->allocs().empty()) {
2254 for (auto &AN : FS->mutableAllocs()) {
2255 // This can happen because of recursion elimination handling that
2256 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2257 // We still added them to the summary because we need to be able to
2258 // correlate properly in applyImport in the backends.
2259 if (AN.MIBs.empty())
2260 continue;
2261 IndexCall AllocCall(&AN);
2262 CallsWithMetadata.push_back(AllocCall);
2263 auto *AllocNode = addAllocNode(AllocCall, FS);
2264 // Pass an empty CallStack to the CallsiteContext (second)
2265 // parameter, since for ThinLTO we already collapsed out the inlined
2266 // stack ids on the allocation call during ModuleSummaryAnalysis.
2267 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2268 EmptyContext;
2269 unsigned I = 0;
2271 AN.ContextSizeInfos.size() == AN.MIBs.size());
2272 // Now add all of the MIBs and their stack nodes.
2273 for (auto &MIB : AN.MIBs) {
2274 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2275 StackContext(&MIB);
2276 std::vector<ContextTotalSize> ContextSizeInfo;
2277 if (!AN.ContextSizeInfos.empty()) {
2278 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2279 ContextSizeInfo.push_back({FullStackId, TotalSize});
2280 }
2281 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2282 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2283 ContextSizeInfo);
2284 I++;
2285 }
2286 // If exporting the graph to dot and an allocation id of interest was
2287 // specified, record all the context ids for this allocation node.
2288 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2289 DotAllocContextIds = AllocNode->getContextIds();
2290 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2291 // Initialize version 0 on the summary alloc node to the current alloc
2292 // type, unless it has both types, in which case make it default, so
2293 // that if we aren't able to clone, the original version always ends up
2294 // with the default allocation behavior.
2295 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2296 }
2297 }
2298 // For callsite metadata, add to list for this function for later use.
2299 if (!FS->callsites().empty())
2300 for (auto &SN : FS->mutableCallsites()) {
2301 IndexCall StackNodeCall(&SN);
2302 CallsWithMetadata.push_back(StackNodeCall);
2303 }
2304
2305 if (!CallsWithMetadata.empty())
2306 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2307
2308 if (!FS->allocs().empty() || !FS->callsites().empty())
2309 FSToVIMap[FS] = VI;
2310 }
2311 }
2312
2313 if (DumpCCG) {
2314 dbgs() << "CCG before updating call stack chains:\n";
2315 dbgs() << *this;
2316 }
2317
2318 if (ExportToDot)
2319 exportToDot("prestackupdate");
2320
2321 updateStackNodes();
2322
2323 if (ExportToDot)
2324 exportToDot("poststackupdate");
2325
2326 handleCallsitesWithMultipleTargets();
2327
2328 markBackedges();
2329}
2330
2331template <typename DerivedCCG, typename FuncTy, typename CallTy>
2332void CallsiteContextGraph<DerivedCCG, FuncTy,
2333 CallTy>::handleCallsitesWithMultipleTargets() {
2334 // Look for and workaround callsites that call multiple functions.
2335 // This can happen for indirect calls, which need better handling, and in
2336 // rarer cases (e.g. macro expansion).
2337 // TODO: To fix this for indirect calls we will want to perform speculative
2338 // devirtualization using either the normal PGO info with ICP, or using the
2339 // information in the profiled MemProf contexts. We can do this prior to
2340 // this transformation for regular LTO, and for ThinLTO we can simulate that
2341 // effect in the summary and perform the actual speculative devirtualization
2342 // while cloning in the ThinLTO backend.
2343
2344 // Keep track of the new nodes synthesized for discovered tail calls missing
2345 // from the profiled contexts.
2346 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2347
2348 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2349 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2350 auto *Node = Entry.second;
2351 assert(Node->Clones.empty());
2352 // Check all node callees and see if in the same function.
2353 // We need to check all of the calls recorded in this Node, because in some
2354 // cases we may have had multiple calls with the same debug info calling
2355 // different callees. This can happen, for example, when an object is
2356 // constructed in the parameter list - the destructor call of the object has
2357 // the same debug info (line/col) as the call the object was passed to.
2358 // Here we will prune any that don't match all callee nodes.
2359 std::vector<CallInfo> AllCalls;
2360 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2361 AllCalls.push_back(Node->Call);
2362 llvm::append_range(AllCalls, Node->MatchingCalls);
2363
2364 // First see if we can partition the calls by callee function, creating new
2365 // nodes to host each set of calls calling the same callees. This is
2366 // necessary to support indirect calls with ThinLTO, for which we
2367 // synthesized CallsiteInfo records for each target. They will all have the
2368 // same callsite stack ids and would be sharing a context node at this
2369 // point. We need to perform separate cloning for each, which will be
2370 // applied along with speculative devirtualization in the ThinLTO backends
2371 // as needed. Note this does not currently support looking through tail
2372 // calls; it is unclear whether we need that for indirect call targets.
2373 // First partition calls by callee func. Map indexed by func, value is
2374 // struct with list of matching calls, assigned node.
2375 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2376 continue;
2377
2378 auto It = AllCalls.begin();
2379 // Iterate through the calls until we find the first that matches.
2380 for (; It != AllCalls.end(); ++It) {
2381 auto ThisCall = *It;
2382 bool Match = true;
2383 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2384 ++EI) {
2385 auto Edge = *EI;
2386 if (!Edge->Callee->hasCall())
2387 continue;
2388 assert(NodeToCallingFunc.count(Edge->Callee));
2389 // Check if the called function matches that of the callee node.
2390 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2391 Match = false;
2392 break;
2393 }
2394 }
2395 // Found a call that matches the callee nodes, we can quit now.
2396 if (Match) {
2397 // If the first match is not the primary call on the Node, update it
2398 // now. We will update the list of matching calls further below.
2399 if (Node->Call != ThisCall) {
2400 Node->setCall(ThisCall);
2401 // We need to update the NonAllocationCallToContextNodeMap, but don't
2402 // want to do this during iteration over that map, so save the calls
2403 // that need updated entries.
2404 NewCallToNode.push_back({ThisCall, Node});
2405 }
2406 break;
2407 }
2408 }
2409 // We will update this list below (or leave it cleared if there was no
2410 // match found above).
2411 Node->MatchingCalls.clear();
2412 // If we hit the end of the AllCalls vector, no call matching the callee
2413 // nodes was found, clear the call information in the node.
2414 if (It == AllCalls.end()) {
2415 RemovedEdgesWithMismatchedCallees++;
2416 // Work around by setting Node to have a null call, so it gets
2417 // skipped during cloning. Otherwise assignFunctions will assert
2418 // because its data structures are not designed to handle this case.
2419 Node->setCall(CallInfo());
2420 continue;
2421 }
2422 // Now add back any matching calls that call the same function as the
2423 // matching primary call on Node.
2424 for (++It; It != AllCalls.end(); ++It) {
2425 auto ThisCall = *It;
2426 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2427 continue;
2428 Node->MatchingCalls.push_back(ThisCall);
2429 }
2430 }
2431
2432 // Remove all mismatched nodes identified in the above loop from the node map
2433 // (checking whether they have a null call which is set above). For a
2434 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2435 // to do the removal via remove_if than by individually erasing entries above.
2436 // Also remove any entries if we updated the node's primary call above.
2437 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2438 return !it.second->hasCall() || it.second->Call != it.first;
2439 });
2440
2441 // Add entries for any new primary calls recorded above.
2442 for (auto &[Call, Node] : NewCallToNode)
2443 NonAllocationCallToContextNodeMap[Call] = Node;
2444
2445 // Add the new nodes after the above loop so that the iteration is not
2446 // invalidated.
2447 for (auto &[Call, Node] : TailCallToContextNodeMap)
2448 NonAllocationCallToContextNodeMap[Call] = Node;
2449}
2450
2451template <typename DerivedCCG, typename FuncTy, typename CallTy>
2452bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2453 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2454 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2455 // Struct to keep track of all the calls having the same callee function,
2456 // along with the context node we eventually assign to this group of
2457 // calls.
2458 struct CallsWithSameCallee {
2459 std::vector<CallInfo> Calls;
2460 ContextNode *Node = nullptr;
2461 };
2462
2463 // First partition calls by callee function. Build map from each function
2464 // to the list of matching calls.
2465 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2466 for (auto ThisCall : AllCalls) {
2467 auto *F = getCalleeFunc(ThisCall.call());
2468 if (F)
2469 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2470 }
2471
2472 // Next, walk through all callee edges. For each callee node, get its
2473 // containing function and see if it was recorded in the above map (meaning we
2474 // have at least one matching call). Build another map from each callee node
2475 // with a matching call to the structure instance created above containing all
2476 // the calls.
2477 DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2478 for (const auto &Edge : Node->CalleeEdges) {
2479 if (!Edge->Callee->hasCall())
2480 continue;
2481 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2482 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2483 CalleeNodeToCallInfo[Edge->Callee] =
2484 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2485 }
2486
2487 // If there are no entries in the second map, then there were no matching
2488 // calls/callees and nothing to do here. Return so we can go to the handling
2489 // that looks through tail calls.
2490 if (CalleeNodeToCallInfo.empty())
2491 return false;
2492
2493 // Walk through all callee edges again. Any and all callee edges that didn't
2494 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2495 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2496 // ignored during cloning. If it is in the map, then we use the node recorded
2497 // in that entry (creating it if needed), and move the callee edge to it.
2498 // The first callee will use the original node instead of creating a new one.
2499 // Note that any of the original calls on this node (in AllCalls) that didn't
2500 // have a callee function automatically get dropped from the node as part of
2501 // this process.
2502 ContextNode *UnmatchedCalleesNode = nullptr;
2503 // Track whether we already assigned original node to a callee.
2504 bool UsedOrigNode = false;
2505 assert(NodeToCallingFunc[Node]);
2506 // Iterate over a copy of Node's callee edges, since we may need to remove
2507 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2508 // makes it less error-prone.
2509 auto CalleeEdges = Node->CalleeEdges;
2510 for (auto &Edge : CalleeEdges) {
2511 if (!Edge->Callee->hasCall())
2512 continue;
2513
2514 // Will be updated below to point to whatever (caller) node this callee edge
2515 // should be moved to.
2516 ContextNode *CallerNodeToUse = nullptr;
2517
2518 // Handle the case where there were no matching calls first. Move this
2519 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2520 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2521 if (!UnmatchedCalleesNode)
2522 UnmatchedCalleesNode =
2523 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2524 CallerNodeToUse = UnmatchedCalleesNode;
2525 } else {
2526 // Look up the information recorded for this callee node, and use the
2527 // recorded caller node (creating it if needed).
2528 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2529 if (!Info->Node) {
2530 // If we haven't assigned any callees to the original node use it.
2531 if (!UsedOrigNode) {
2532 Info->Node = Node;
2533 // Clear the set of matching calls which will be updated below.
2534 Node->MatchingCalls.clear();
2535 UsedOrigNode = true;
2536 } else
2537 Info->Node =
2538 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2539 assert(!Info->Calls.empty());
2540 // The first call becomes the primary call for this caller node, and the
2541 // rest go in the matching calls list.
2542 Info->Node->setCall(Info->Calls.front());
2543 llvm::append_range(Info->Node->MatchingCalls,
2544 llvm::drop_begin(Info->Calls));
2545 // Save the primary call to node correspondence so that we can update
2546 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2547 // caller of this function.
2548 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2549 }
2550 CallerNodeToUse = Info->Node;
2551 }
2552
2553 // Don't need to move the edge if we are using the original node.
2554 if (CallerNodeToUse == Node)
2555 continue;
2556
2557 moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2558 }
2559 // Now that we are done moving edges, clean up any caller edges that ended
2560 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2561 // caller edges from Node are replicated onto the new callers, and it
2562 // simplifies the handling to leave them until we have moved all
2563 // edges/context ids.
2564 for (auto &I : CalleeNodeToCallInfo)
2565 removeNoneTypeCallerEdges(I.second->Node);
2566 if (UnmatchedCalleesNode)
2567 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2568 removeNoneTypeCallerEdges(Node);
2569
2570 return true;
2571}
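// Illustrative sketch with hypothetical calls: if the node's recorded calls
// are {call1 -> F, call2 -> F, call3 -> G}, the partitioning above keeps the
// original node for the F calls (call1 as the primary call, call2 in
// MatchingCalls), creates a new node for call3/G, and moves each callee edge
// to whichever node owns its callee's containing function, or to
// UnmatchedCalleesNode if none match.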
2572
2573uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2574 // In the Module (IR) case this is already the Id.
2575 return IdOrIndex;
2576}
2577
2578uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2579 // In the Index case this is an index into the stack id list in the summary
2580 // index, convert it to an Id.
2581 return Index.getStackIdAtIndex(IdOrIndex);
2582}
2583
2584template <typename DerivedCCG, typename FuncTy, typename CallTy>
2585bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2586 CallTy Call, EdgeIter &EI,
2587 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2588 auto Edge = *EI;
2589 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2590 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2591 // Will be populated in order of callee to caller if we find a chain of tail
2592 // calls between the profiled caller and callee.
2593 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2594 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2595 FoundCalleeChain))
2596 return false;
2597
2598 // The usual case where the profiled callee matches that of the IR/summary.
2599 if (FoundCalleeChain.empty())
2600 return true;
2601
2602 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2603 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2604 // If there is already an edge between these nodes, simply update it and
2605 // return.
2606 if (CurEdge) {
2607 CurEdge->ContextIds.insert_range(Edge->ContextIds);
2608 CurEdge->AllocTypes |= Edge->AllocTypes;
2609 return;
2610 }
2611 // Otherwise, create a new edge and insert it into the caller and callee
2612 // lists.
2613 auto NewEdge = std::make_shared<ContextEdge>(
2614 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2615 Callee->CallerEdges.push_back(NewEdge);
2616 if (Caller == Edge->Caller) {
2617 // If we are inserting the new edge into the current edge's caller, insert
2618 // the new edge before the current iterator position, and then increment
2619 // back to the current edge.
2620 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2621 ++EI;
2622 assert(*EI == Edge &&
2623 "Iterator position not restored after insert and increment");
2624 } else
2625 Caller->CalleeEdges.push_back(NewEdge);
2626 };
2627
2628 // Create new nodes for each found callee and connect in between the profiled
2629 // caller and callee.
2630 auto *CurCalleeNode = Edge->Callee;
2631 for (auto &[NewCall, Func] : FoundCalleeChain) {
2632 ContextNode *NewNode = nullptr;
2633 // First check if we have already synthesized a node for this tail call.
2634 if (TailCallToContextNodeMap.count(NewCall)) {
2635 NewNode = TailCallToContextNodeMap[NewCall];
2636 NewNode->AllocTypes |= Edge->AllocTypes;
2637 } else {
2638 FuncToCallsWithMetadata[Func].push_back({NewCall});
2639 // Create Node and record node info.
2640 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2641 TailCallToContextNodeMap[NewCall] = NewNode;
2642 NewNode->AllocTypes = Edge->AllocTypes;
2643 }
2644
2645 // Hook up node to its callee node
2646 AddEdge(NewNode, CurCalleeNode);
2647
2648 CurCalleeNode = NewNode;
2649 }
2650
2651 // Hook up edge's original caller to new callee node.
2652 AddEdge(Edge->Caller, CurCalleeNode);
2653
2654#ifndef NDEBUG
2655 // Save this because Edge's fields get cleared below when removed.
2656 auto *Caller = Edge->Caller;
2657#endif
2658
2659 // Remove old edge
2660 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2661
2662 // To simplify the increment of EI in the caller, subtract one from EI.
2663 // In the final AddEdge call we would have either added a new callee edge,
2664 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2665 // that there is at least one callee edge.
2666 assert(!Caller->CalleeEdges.empty());
2667 --EI;
2668
2669 return true;
2670}
2671
2672bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2673 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2674 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2675 bool &FoundMultipleCalleeChains) {
2676 // Stop recursive search if we have already explored the maximum specified
2677 // depth.
2678 if (Depth > TailCallSearchDepth)
2679 return false;
2680
2681 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2682 FoundCalleeChain.push_back({Callsite, F});
2683 };
2684
2685 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2686 if (!CalleeFunc) {
2687 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2688 assert(Alias);
2689 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2690 assert(CalleeFunc);
2691 }
2692
2693 // Look for tail calls in this function, and check if they either call the
2694 // profiled callee directly, or indirectly (via a recursive search).
2695 // Only succeed if there is a single unique tail call chain found between the
2696 // profiled caller and callee, otherwise we could perform incorrect cloning.
2697 bool FoundSingleCalleeChain = false;
2698 for (auto &BB : *CalleeFunc) {
2699 for (auto &I : BB) {
2700 auto *CB = dyn_cast<CallBase>(&I);
2701 if (!CB || !CB->isTailCall())
2702 continue;
2703 auto *CalledValue = CB->getCalledOperand();
2704 auto *CalledFunction = CB->getCalledFunction();
2705 if (CalledValue && !CalledFunction) {
2706 CalledValue = CalledValue->stripPointerCasts();
2707 // Stripping pointer casts can reveal a called function.
2708 CalledFunction = dyn_cast<Function>(CalledValue);
2709 }
2710 // Check if this is an alias to a function. If so, get the
2711 // called aliasee for the checks below.
2712 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2713 assert(!CalledFunction &&
2714 "Expected null called function in callsite for alias");
2715 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2716 }
2717 if (!CalledFunction)
2718 continue;
2719 if (CalledFunction == ProfiledCallee) {
2720 if (FoundSingleCalleeChain) {
2721 FoundMultipleCalleeChains = true;
2722 return false;
2723 }
2724 FoundSingleCalleeChain = true;
2725 FoundProfiledCalleeCount++;
2726 FoundProfiledCalleeDepth += Depth;
2727 if (Depth > FoundProfiledCalleeMaxDepth)
2728 FoundProfiledCalleeMaxDepth = Depth;
2729 SaveCallsiteInfo(&I, CalleeFunc);
2730 } else if (findProfiledCalleeThroughTailCalls(
2731 ProfiledCallee, CalledFunction, Depth + 1,
2732 FoundCalleeChain, FoundMultipleCalleeChains)) {
2733 // findProfiledCalleeThroughTailCalls should not have returned
2734 // true if FoundMultipleCalleeChains.
2735 assert(!FoundMultipleCalleeChains);
2736 if (FoundSingleCalleeChain) {
2737 FoundMultipleCalleeChains = true;
2738 return false;
2739 }
2740 FoundSingleCalleeChain = true;
2741 SaveCallsiteInfo(&I, CalleeFunc);
2742 } else if (FoundMultipleCalleeChains)
2743 return false;
2744 }
2745 }
2746
2747 return FoundSingleCalleeChain;
2748}
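// Example of the uniqueness requirement enforced above (hypothetical
// functions): if the starting callee tail calls both B1 and B2, and each of
// those eventually reaches ProfiledCallee, two distinct chains exist;
// FoundMultipleCalleeChains is set and the search gives up rather than
// guessing which chain the profiled context actually took.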
2749
2750const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2751 auto *CB = dyn_cast<CallBase>(Call);
2752 if (!CB->getCalledOperand() || CB->isIndirectCall())
2753 return nullptr;
2754 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2755 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2756 if (Alias)
2757 return dyn_cast<Function>(Alias->getAliasee());
2758 return dyn_cast<Function>(CalleeVal);
2759}
2760
2761bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2762 Instruction *Call, const Function *Func, const Function *CallerFunc,
2763 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2764 auto *CB = dyn_cast<CallBase>(Call);
2765 if (!CB->getCalledOperand() || CB->isIndirectCall())
2766 return false;
2767 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2768 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2769 if (CalleeFunc == Func)
2770 return true;
2771 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2772 if (Alias && Alias->getAliasee() == Func)
2773 return true;
2774
2775 // Recursively search for the profiled callee through tail calls starting with
2776 // the actual Callee. The discovered tail call chain is saved in
2777 // FoundCalleeChain, and we will fix up the graph to include these callsites
2778 // after returning.
2779 // FIXME: We will currently redo the same recursive walk if we find the same
2780 // mismatched callee from another callsite. We can improve this with more
2781 // bookkeeping of the created chain of new nodes for each mismatch.
2782 unsigned Depth = 1;
2783 bool FoundMultipleCalleeChains = false;
2784 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2785 FoundCalleeChain,
2786 FoundMultipleCalleeChains)) {
2787 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2788 << Func->getName() << " from " << CallerFunc->getName()
2789 << " that actually called " << CalleeVal->getName()
2790 << (FoundMultipleCalleeChains
2791 ? " (found multiple possible chains)"
2792 : "")
2793 << "\n");
2794 if (FoundMultipleCalleeChains)
2795 FoundProfiledCalleeNonUniquelyCount++;
2796 return false;
2797 }
2798
2799 return true;
2800}
2801
2802bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2803 Instruction *Call2) {
2804 auto *CB1 = cast<CallBase>(Call1);
2805 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2806 return false;
2807 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2808 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2809 auto *CB2 = cast<CallBase>(Call2);
2810 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2811 return false;
2812 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2813 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2814 return CalleeFunc1 == CalleeFunc2;
2815}
2816
2817bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2818 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2819 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2820 bool &FoundMultipleCalleeChains) {
2821 // Stop recursive search if we have already explored the maximum specified
2822 // depth.
2823 if (Depth > TailCallSearchDepth)
2824 return false;
2825
2826 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2827 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2828 // been synthesized.
2829 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2830 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2831 // StackIds is empty (we don't have debug info available in the index for
2832 // these callsites)
2833 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2834 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2835 CallsiteInfo *NewCallsiteInfo =
2836 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2837 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2838 };
2839
2840 // Look for tail calls in this function, and check if they either call the
2841 // profiled callee directly, or indirectly (via a recursive search).
2842 // Only succeed if there is a single unique tail call chain found between the
2843 // profiled caller and callee, otherwise we could perform incorrect cloning.
2844 bool FoundSingleCalleeChain = false;
2845 for (auto &S : CurCallee.getSummaryList()) {
2846 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2847 !isPrevailing(CurCallee.getGUID(), S.get()))
2848 continue;
2849 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2850 if (!FS)
2851 continue;
2852 auto FSVI = CurCallee;
2853 auto *AS = dyn_cast<AliasSummary>(S.get());
2854 if (AS)
2855 FSVI = AS->getAliaseeVI();
2856 for (auto &CallEdge : FS->calls()) {
2857 if (!CallEdge.second.hasTailCall())
2858 continue;
2859 if (CallEdge.first == ProfiledCallee) {
2860 if (FoundSingleCalleeChain) {
2861 FoundMultipleCalleeChains = true;
2862 return false;
2863 }
2864 FoundSingleCalleeChain = true;
2865 FoundProfiledCalleeCount++;
2866 FoundProfiledCalleeDepth += Depth;
2867 if (Depth > FoundProfiledCalleeMaxDepth)
2868 FoundProfiledCalleeMaxDepth = Depth;
2869 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2870 // Add FS to FSToVIMap in case it isn't already there.
2871 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2872 FSToVIMap[FS] = FSVI;
2873 } else if (findProfiledCalleeThroughTailCalls(
2874 ProfiledCallee, CallEdge.first, Depth + 1,
2875 FoundCalleeChain, FoundMultipleCalleeChains)) {
2876 // findProfiledCalleeThroughTailCalls should not have returned
2877 // true if FoundMultipleCalleeChains.
2878 assert(!FoundMultipleCalleeChains);
2879 if (FoundSingleCalleeChain) {
2880 FoundMultipleCalleeChains = true;
2881 return false;
2882 }
2883 FoundSingleCalleeChain = true;
2884 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2885 // Add FS to FSToVIMap in case it isn't already there.
2886 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2887 FSToVIMap[FS] = FSVI;
2888 } else if (FoundMultipleCalleeChains)
2889 return false;
2890 }
2891 }
2892
2893 return FoundSingleCalleeChain;
2894}
2895
2896const FunctionSummary *
2897IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2898 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2899 if (Callee.getSummaryList().empty())
2900 return nullptr;
2901 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2902}
2903
2904bool IndexCallsiteContextGraph::calleeMatchesFunc(
2905 IndexCall &Call, const FunctionSummary *Func,
2906 const FunctionSummary *CallerFunc,
2907 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2908 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2909 // If there is no summary list then this is a call to an externally defined
2910 // symbol.
2911 AliasSummary *Alias =
2912 Callee.getSummaryList().empty()
2913 ? nullptr
2914 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2915 assert(FSToVIMap.count(Func));
2916 auto FuncVI = FSToVIMap[Func];
2917 if (Callee == FuncVI ||
2918 // If callee is an alias, check the aliasee, since only function
2919 // summary base objects will contain the stack node summaries and thus
2920 // get a context node.
2921 (Alias && Alias->getAliaseeVI() == FuncVI))
2922 return true;
2923
2924 // Recursively search for the profiled callee through tail calls starting with
2925 // the actual Callee. The discovered tail call chain is saved in
2926 // FoundCalleeChain, and we will fix up the graph to include these callsites
2927 // after returning.
2928 // FIXME: We will currently redo the same recursive walk if we find the same
2929 // mismatched callee from another callsite. We can improve this with more
2930 // bookkeeping of the created chain of new nodes for each mismatch.
2931 unsigned Depth = 1;
2932 bool FoundMultipleCalleeChains = false;
2933 if (!findProfiledCalleeThroughTailCalls(
2934 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2935 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2936 << " from " << FSToVIMap[CallerFunc]
2937 << " that actually called " << Callee
2938 << (FoundMultipleCalleeChains
2939 ? " (found multiple possible chains)"
2940 : "")
2941 << "\n");
2942 if (FoundMultipleCalleeChains)
2943 FoundProfiledCalleeNonUniquelyCount++;
2944 return false;
2945 }
2946
2947 return true;
2948}
2949
2950bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2951 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
2952 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
2953 return Callee1 == Callee2;
2954}
2955
2956template <typename DerivedCCG, typename FuncTy, typename CallTy>
2957void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2958 const {
2959 print(dbgs());
2960 dbgs() << "\n";
2961}
2962
2963template <typename DerivedCCG, typename FuncTy, typename CallTy>
2964void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2965 raw_ostream &OS) const {
2966 OS << "Node " << this << "\n";
2967 OS << "\t";
2968 printCall(OS);
2969 if (Recursive)
2970 OS << " (recursive)";
2971 OS << "\n";
2972 if (!MatchingCalls.empty()) {
2973 OS << "\tMatchingCalls:\n";
2974 for (auto &MatchingCall : MatchingCalls) {
2975 OS << "\t";
2976 MatchingCall.print(OS);
2977 OS << "\n";
2978 }
2979 }
2980 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2981 OS << "\tContextIds:";
2982 // Make a copy of the computed context ids that we can sort for stability.
2983 auto ContextIds = getContextIds();
2984 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2985 std::sort(SortedIds.begin(), SortedIds.end());
2986 for (auto Id : SortedIds)
2987 OS << " " << Id;
2988 OS << "\n";
2989 OS << "\tCalleeEdges:\n";
2990 for (auto &Edge : CalleeEdges)
2991 OS << "\t\t" << *Edge << "\n";
2992 OS << "\tCallerEdges:\n";
2993 for (auto &Edge : CallerEdges)
2994 OS << "\t\t" << *Edge << "\n";
2995 if (!Clones.empty()) {
2996 OS << "\tClones: " << llvm::interleaved(Clones) << "\n";
2997 } else if (CloneOf) {
2998 OS << "\tClone of " << CloneOf << "\n";
2999 }
3000}
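// For reference, output from the print routine above looks roughly like the
// following (pointer values, ids, and the call text are illustrative only):
//   Node 0x55d4f2a3c2e0
//        <printCall output>
//        AllocTypes: Cold
//        ContextIds: 1 3 7
//        CalleeEdges:
//                Edge from Callee 0x... to Caller: 0x... AllocTypes: Cold ContextIds: 1 3 7
//        CallerEdges:
//                Edge from Callee 0x... to Caller: 0x... AllocTypes: Cold ContextIds: 1 3 7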
3001
3002template <typename DerivedCCG, typename FuncTy, typename CallTy>
3003void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
3004 const {
3005 print(dbgs());
3006 dbgs() << "\n";
3007}
3008
3009template <typename DerivedCCG, typename FuncTy, typename CallTy>
3010void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
3011 raw_ostream &OS) const {
3012 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
3013 << (IsBackedge ? " (BE)" : "")
3014 << " AllocTypes: " << getAllocTypeString(AllocTypes);
3015 OS << " ContextIds:";
3016 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3017 std::sort(SortedIds.begin(), SortedIds.end());
3018 for (auto Id : SortedIds)
3019 OS << " " << Id;
3020}
3021
3022template <typename DerivedCCG, typename FuncTy, typename CallTy>
3023void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3024 print(dbgs());
3025}
3026
3027template <typename DerivedCCG, typename FuncTy, typename CallTy>
3028void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3029 raw_ostream &OS) const {
3030 OS << "Callsite Context Graph:\n";
3031 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3032 for (const auto Node : nodes<GraphType>(this)) {
3033 if (Node->isRemoved())
3034 continue;
3035 Node->print(OS);
3036 OS << "\n";
3037 }
3038}
3039
3040template <typename DerivedCCG, typename FuncTy, typename CallTy>
3041void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3042 raw_ostream &OS) const {
3043 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3044 for (const auto Node : nodes<GraphType>(this)) {
3045 if (Node->isRemoved())
3046 continue;
3047 if (!Node->IsAllocation)
3048 continue;
3049 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3050 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3051 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3052 std::sort(SortedIds.begin(), SortedIds.end());
3053 for (auto Id : SortedIds) {
3054 auto TypeI = ContextIdToAllocationType.find(Id);
3055 assert(TypeI != ContextIdToAllocationType.end());
3056 auto CSI = ContextIdToContextSizeInfos.find(Id);
3057 if (CSI != ContextIdToContextSizeInfos.end()) {
3058 for (auto &Info : CSI->second) {
3059 OS << "MemProf hinting: "
3060 << getAllocTypeString((uint8_t)TypeI->second)
3061 << " full allocation context " << Info.FullStackId
3062 << " with total size " << Info.TotalSize << " is "
3063 << getAllocTypeString(Node->AllocTypes) << " after cloning";
3064 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3065 OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
3066 << " due to cold byte percent";
3067 // Print the internal context id to aid debugging and visualization.
3068 OS << " (context id " << Id << ")";
3069 OS << "\n";
3070 }
3071 }
3072 }
3073 }
3074}
3075
3076template <typename DerivedCCG, typename FuncTy, typename CallTy>
3077void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3078 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3079 for (const auto Node : nodes<GraphType>(this)) {
3080 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3081 for (auto &Edge : Node->CallerEdges)
3082 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3083 }
3084}
3085
3086template <typename DerivedCCG, typename FuncTy, typename CallTy>
3087struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
3088 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3089 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
3090
3091 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
3092 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
3093
3094 using nodes_iterator =
3095 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
3096 decltype(&getNode)>;
3097
3098 static nodes_iterator nodes_begin(GraphType G) {
3099 return nodes_iterator(G->NodeOwner.begin(), &getNode);
3100 }
3101
3102 static nodes_iterator nodes_end(GraphType G) {
3103 return nodes_iterator(G->NodeOwner.end(), &getNode);
3104 }
3105
3106 static NodeRef getEntryNode(GraphType G) {
3107 return G->NodeOwner.begin()->get();
3108 }
3109
3110 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
3111 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
3112 GetCallee(const EdgePtrTy &P) {
3113 return P->Callee;
3114 }
3115
3116 using ChildIteratorType =
3117 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
3118 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
3119 decltype(&GetCallee)>;
3120
3121 static ChildIteratorType child_begin(NodeRef N) {
3122 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
3123 }
3124
3125 static ChildIteratorType child_end(NodeRef N) {
3126 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
3127 }
3128};
3129
3130template <typename DerivedCCG, typename FuncTy, typename CallTy>
3131struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
3132 : public DefaultDOTGraphTraits {
3133 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
3134 // If the user requested the full graph to be exported, but provided an
3135 // allocation id, or if the user gave a context id and requested more than
3136 // just a specific context to be exported, note that highlighting is
3137 // enabled.
3138 DoHighlight =
3139 (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
3140 (ContextIdForDot.getNumOccurrences() &&
3141 DotGraphScope != DotScope::Context);
3142 }
3143
3144 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3145 using GTraits = GraphTraits<GraphType>;
3146 using NodeRef = typename GTraits::NodeRef;
3147 using ChildIteratorType = typename GTraits::ChildIteratorType;
3148
3149 static std::string getNodeLabel(NodeRef Node, GraphType G) {
3150 std::string LabelString =
3151 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
3152 Twine(Node->OrigStackOrAllocId))
3153 .str();
3154 LabelString += "\n";
3155 if (Node->hasCall()) {
3156 auto Func = G->NodeToCallingFunc.find(Node);
3157 assert(Func != G->NodeToCallingFunc.end());
3158 LabelString +=
3159 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
3160 } else {
3161 LabelString += "null call";
3162 if (Node->Recursive)
3163 LabelString += " (recursive)";
3164 else
3165 LabelString += " (external)";
3166 }
3167 return LabelString;
3168 }
3169
3170 static std::string getNodeAttributes(NodeRef Node, GraphType G) {
3171 auto ContextIds = Node->getContextIds();
3172 // If highlighting is enabled, see if this node contains any of the context ids
3173 // of interest. If so, it will use a different color and a larger fontsize
3174 // (which makes the node larger as well).
3175 bool Highlight = false;
3176 if (DoHighlight) {
3177 assert(ContextIdForDot.getNumOccurrences() ||
3178 AllocIdForDot.getNumOccurrences());
3179 if (ContextIdForDot.getNumOccurrences())
3180 Highlight = ContextIds.contains(ContextIdForDot);
3181 else
3182 Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
3183 }
3184 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
3185 getContextIds(ContextIds) + "\"")
3186 .str();
3187 // Default fontsize is 14
3188 if (Highlight)
3189 AttributeString += ",fontsize=\"30\"";
3190 AttributeString +=
3191 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"")
3192 .str();
3193 if (Node->CloneOf) {
3194 AttributeString += ",color=\"blue\"";
3195 AttributeString += ",style=\"filled,bold,dashed\"";
3196 } else
3197 AttributeString += ",style=\"filled\"";
3198 return AttributeString;
3199 }
3200
3201 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
3202 GraphType G) {
3203 auto &Edge = *(ChildIter.getCurrent());
3204 // If highlighting is enabled, see if this edge contains any of the context ids
3205 // of interest. If so, it will use a different color and a heavier arrow
3206 // size and weight (the larger weight makes the highlighted path
3207 // straighter).
3208 bool Highlight = false;
3209 if (DoHighlight) {
3210 assert(ContextIdForDot.getNumOccurrences() ||
3211 AllocIdForDot.getNumOccurrences());
3212 if (ContextIdForDot.getNumOccurrences())
3213 Highlight = Edge->ContextIds.contains(ContextIdForDot);
3214 else
3215 Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
3216 }
3217 auto Color = getColor(Edge->AllocTypes, Highlight);
3218 std::string AttributeString =
3219 (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
3220 // fillcolor is the arrow head and color is the line
3221 Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
3222 "\"")
3223 .str();
3224 if (Edge->IsBackedge)
3225 AttributeString += ",style=\"dotted\"";
3226 // Default penwidth and weight are both 1.
3227 if (Highlight)
3228 AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
3229 return AttributeString;
3230 }
3231
3232 // Since the NodeOwner list includes nodes that are no longer connected to
3233 // the graph, skip them here.
3234 static bool isNodeHidden(NodeRef Node, GraphType G) {
3235 if (Node->isRemoved())
3236 return true;
3237 // If a scope smaller than the full graph was requested, see if this node
3238 // contains any of the context ids of interest.
3239 if (DotGraphScope == DotScope::Alloc)
3240 return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
3241 if (DotGraphScope == DotScope::Context)
3242 return !Node->getContextIds().contains(ContextIdForDot);
3243 return false;
3244 }
3245
3246private:
3247 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3248 std::string IdString = "ContextIds:";
3249 if (ContextIds.size() < 100) {
3250 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3251 std::sort(SortedIds.begin(), SortedIds.end());
3252 for (auto Id : SortedIds)
3253 IdString += (" " + Twine(Id)).str();
3254 } else {
3255 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3256 }
3257 return IdString;
3258 }
3259
3260 static std::string getColor(uint8_t AllocTypes, bool Highlight) {
3261 // If DoHighlight is not enabled, we want to use the highlight colors for
3262 // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
3263 // both compatible with the color scheme before highlighting was supported,
3264 // and for the NotCold+Cold color the non-highlight color is a bit more
3265 // readable.
3266 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3267 // Color "brown1" actually looks like a lighter red.
3268 return !DoHighlight || Highlight ? "brown1" : "lightpink";
3269 if (AllocTypes == (uint8_t)AllocationType::Cold)
3270 return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
3271 if (AllocTypes ==
3272 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3273 return Highlight ? "magenta" : "mediumorchid1";
3274 return "gray";
3275 }
3276
3277 static std::string getNodeId(NodeRef Node) {
3278 std::stringstream SStream;
3279 SStream << std::hex << "N0x" << (unsigned long long)Node;
3280 std::string Result = SStream.str();
3281 return Result;
3282 }
3283
3284 // True if we should highlight a specific context or allocation's contexts in
3285 // the emitted graph.
3286 static bool DoHighlight;
3287};
3288
3289template <typename DerivedCCG, typename FuncTy, typename CallTy>
3290bool DOTGraphTraits<
3291 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
3292 false;
3293
3294template <typename DerivedCCG, typename FuncTy, typename CallTy>
3295void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3296 std::string Label) const {
3297 WriteGraph(this, "", false, Label,
3298 DotFilePathPrefix + "ccg." + Label + ".dot");
3299}
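// Minimal usage sketch: if DotFilePathPrefix is set to "/tmp/" and this is
// called with a label such as "cloned", the graph above is written to
// /tmp/ccg.cloned.dot and can be rendered with standard Graphviz tools
// (e.g. dot -Tpdf).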
3300
3301template <typename DerivedCCG, typename FuncTy, typename CallTy>
3302typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3303CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3304 const std::shared_ptr<ContextEdge> &Edge,
3305 DenseSet<uint32_t> ContextIdsToMove) {
3306 ContextNode *Node = Edge->Callee;
3307 assert(NodeToCallingFunc.count(Node));
3308 ContextNode *Clone =
3309 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3310 Node->addClone(Clone);
3311 Clone->MatchingCalls = Node->MatchingCalls;
3312 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3313 ContextIdsToMove);
3314 return Clone;
3315}
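// Illustrative sketch of moveEdgeToNewCalleeClone (ids are hypothetical):
// given caller R with an edge into callee N carrying context ids {1,2,3},
// moving {1,2} creates a clone N' of N and leaves
//   R --{1,2}--> N'        R --{3}--> N
// with N' added to N's Clones list and inheriting N's MatchingCalls; the
// actual edge surgery is performed by moveEdgeToExistingCalleeClone below.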
3316
3317template <typename DerivedCCG, typename FuncTy, typename CallTy>
3318void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3319 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3320 ContextNode *NewCallee, bool NewClone,
3321 DenseSet<uint32_t> ContextIdsToMove) {
3322 // NewCallee and Edge's current callee must be clones of the same original
3323 // node (Edge's current callee may be the original node too).
3324 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3325
3326 bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3327
3328 ContextNode *OldCallee = Edge->Callee;
3329
3330 // We might already have an edge to the new callee from earlier cloning for a
3331 // different allocation. If one exists we will reuse it.
3332 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3333
3334 // Callers will pass an empty ContextIdsToMove set when they want to move the
3335 // edge. Copy in Edge's ids for simplicity.
3336 if (ContextIdsToMove.empty())
3337 ContextIdsToMove = Edge->getContextIds();
3338
3339 // If we are moving all of Edge's ids, then just move the whole Edge.
3340 // Otherwise only move the specified subset, to a new edge if needed.
3341 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3342 // First, update the alloc types on New Callee from Edge.
3343 // Do this before we potentially clear Edge's fields below!
3344 NewCallee->AllocTypes |= Edge->AllocTypes;
3345 // Moving the whole Edge.
3346 if (ExistingEdgeToNewCallee) {
3347 // Since we already have an edge to NewCallee, simply move the ids
3348 // onto it, and remove the existing Edge.
3349 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3350 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3351 assert(Edge->ContextIds == ContextIdsToMove);
3352 removeEdgeFromGraph(Edge.get());
3353 } else {
3354 // Otherwise just reconnect Edge to NewCallee.
3355 Edge->Callee = NewCallee;
3356 NewCallee->CallerEdges.push_back(Edge);
3357 // Remove it from callee where it was previously connected.
3358 OldCallee->eraseCallerEdge(Edge.get());
3359 // Don't need to update Edge's context ids since we are simply
3360 // reconnecting it.
3361 }
3362 } else {
3363 // Only moving a subset of Edge's ids.
3364 // Compute the alloc type of the subset of ids being moved.
3365 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3366 if (ExistingEdgeToNewCallee) {
3367 // Since we already have an edge to NewCallee, simply move the ids
3368 // onto it.
3369 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3370 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3371 } else {
3372 // Otherwise, create a new edge to NewCallee for the ids being moved.
3373 auto NewEdge = std::make_shared<ContextEdge>(
3374 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3375 Edge->Caller->CalleeEdges.push_back(NewEdge);
3376 NewCallee->CallerEdges.push_back(NewEdge);
3377 }
3378 // In either case, need to update the alloc types on NewCallee, and remove
3379 // those ids and update the alloc type on the original Edge.
3380 NewCallee->AllocTypes |= CallerEdgeAllocType;
3381 set_subtract(Edge->ContextIds, ContextIdsToMove);
3382 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3383 }
3384 // Now walk the old callee node's callee edges and move Edge's context ids
3385 // over to the corresponding edge into the clone (which is created here if
3386 // this is a newly created clone).
3387 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3388 ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3389 // If this is a direct recursion edge, use NewCallee (the clone) as the
3390 // callee as well, so that any edge updated/created here is also direct
3391 // recursive.
3392 if (CalleeToUse == OldCallee) {
3393 // If this is a recursive edge, see if we already moved a recursive edge
3394 // (which would have to have been this one) - if we were only moving a
3395 // subset of context ids it would still be on OldCallee.
3396 if (EdgeIsRecursive) {
3397 assert(OldCalleeEdge == Edge);
3398 continue;
3399 }
3400 CalleeToUse = NewCallee;
3401 }
3402 // The context ids moving to the new callee are the subset of this edge's
3403 // context ids and the context ids on the caller edge being moved.
3404 DenseSet<uint32_t> EdgeContextIdsToMove =
3405 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3406 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3407 OldCalleeEdge->AllocTypes =
3408 computeAllocType(OldCalleeEdge->getContextIds());
3409 if (!NewClone) {
3410 // Update context ids / alloc type on corresponding edge to NewCallee.
3411 // There is a chance this may not exist if we are reusing an existing
3412 // clone, specifically during function assignment, where we would have
3413 // removed none type edges after creating the clone. If we can't find
3414 // a corresponding edge there, fall through to the cloning below.
3415 if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3416 NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3417 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3418 continue;
3419 }
3420 }
3421 auto NewEdge = std::make_shared<ContextEdge>(
3422 CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
3423 EdgeContextIdsToMove);
3424 NewCallee->CalleeEdges.push_back(NewEdge);
3425 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3426 }
3427 // Recompute the node alloc type now that its callee edges have been
3428 // updated (since we will compute from those edges).
3429 OldCallee->AllocTypes = OldCallee->computeAllocType();
3430 // OldCallee alloc type should be None iff its context id set is now empty.
3431 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3432 OldCallee->emptyContextIds());
3433 if (VerifyCCG) {
3434 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3435 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3436 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3437 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3438 /*CheckEdges=*/false);
3439 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3440 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3441 /*CheckEdges=*/false);
3442 }
3443}
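// Note on the routine above: when only a subset of the edge's context ids is
// moved, the original edge survives with the leftover ids and a recomputed
// alloc type, and the old callee's own callee edges are split so that each
// moved id keeps following the same path below the new clone.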
3444
3445template <typename DerivedCCG, typename FuncTy, typename CallTy>
3446void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3447 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3448 ContextNode *NewCaller) {
3449 auto *OldCallee = Edge->Callee;
3450 auto *NewCallee = OldCallee;
3451 // If this edge was direct recursive, make any new/updated edge also direct
3452 // recursive to NewCaller.
3453 bool Recursive = Edge->Caller == Edge->Callee;
3454 if (Recursive)
3455 NewCallee = NewCaller;
3456
3457 ContextNode *OldCaller = Edge->Caller;
3458 OldCaller->eraseCalleeEdge(Edge.get());
3459
3460 // We might already have an edge to the new caller. If one exists we will
3461 // reuse it.
3462 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3463
3464 if (ExistingEdgeToNewCaller) {
3465 // Since we already have an edge to NewCaller, simply move the ids
3466 // onto it, and remove the existing Edge.
3467 ExistingEdgeToNewCaller->getContextIds().insert_range(
3468 Edge->getContextIds());
3469 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3470 Edge->ContextIds.clear();
3471 Edge->AllocTypes = (uint8_t)AllocationType::None;
3472 OldCallee->eraseCallerEdge(Edge.get());
3473 } else {
3474 // Otherwise just reconnect Edge to NewCaller.
3475 Edge->Caller = NewCaller;
3476 NewCaller->CalleeEdges.push_back(Edge);
3477 if (Recursive) {
3478 assert(NewCallee == NewCaller);
3479 // In the case of (direct) recursive edges, we update the callee as well
3480 // so that it becomes recursive on the new caller.
3481 Edge->Callee = NewCallee;
3482 NewCallee->CallerEdges.push_back(Edge);
3483 OldCallee->eraseCallerEdge(Edge.get());
3484 }
3485 // Don't need to update Edge's context ids since we are simply
3486 // reconnecting it.
3487 }
3488 // In either case, need to update the alloc types on New Caller.
3489 NewCaller->AllocTypes |= Edge->AllocTypes;
3490
3491 // Now walk the old caller node's caller edges and move Edge's context ids
3492 // over to the corresponding edge into the node (which is created here if
3493 // this is a newly created node). We can tell whether this is a newly created
3494 // node by seeing if it has any caller edges yet.
3495#ifndef NDEBUG
3496 bool IsNewNode = NewCaller->CallerEdges.empty();
3497#endif
3498 // If we just moved a direct recursive edge, presumably its context ids should
3499 // also flow out of OldCaller via some other non-recursive callee edge. We
3500 // don't want to remove the recursive context ids from other caller edges yet,
3501 // otherwise the context ids get into an inconsistent state on OldCaller.
3502 // We will update these context ids on the non-recursive caller edge when and
3503 // if they are updated on the non-recursive callee.
3504 if (!Recursive) {
3505 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3506 auto OldCallerCaller = OldCallerEdge->Caller;
3507 // The context ids moving to the new caller are the subset of this edge's
3508 // context ids and the context ids on the callee edge being moved.
3509 DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3510 OldCallerEdge->getContextIds(), Edge->getContextIds());
3511 if (OldCaller == OldCallerCaller) {
3512 OldCallerCaller = NewCaller;
3513 // Don't actually move this one. The caller will move it directly via a
3514 // call to this function with this as the Edge if it is appropriate to
3515 // move to a diff node that has a matching callee (itself).
3516 continue;
3517 }
3518 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3519 OldCallerEdge->AllocTypes =
3520 computeAllocType(OldCallerEdge->getContextIds());
3521 // In this function we expect that any pre-existing node already has edges
3522 // from the same callers as the old node. That should be true in the
3523 // current use case, where we will remove None-type edges after copying
3524 // over all caller edges from the callee.
3525 auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3526 // Since we would have skipped caller edges when moving a direct recursive
3527 // edge, this may not hold true when recursive handling is enabled.
3528 assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3529 if (ExistingCallerEdge) {
3530 ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3531 ExistingCallerEdge->AllocTypes |=
3532 computeAllocType(EdgeContextIdsToMove);
3533 continue;
3534 }
3535 auto NewEdge = std::make_shared<ContextEdge>(
3536 NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
3537 EdgeContextIdsToMove);
3538 NewCaller->CallerEdges.push_back(NewEdge);
3539 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3540 }
3541 }
3542 // Recompute the node alloc type now that its caller edges have been
3543 // updated (since we will compute from those edges).
3544 OldCaller->AllocTypes = OldCaller->computeAllocType();
3545 // OldCaller alloc type should be None iff its context id set is now empty.
3546 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3547 OldCaller->emptyContextIds());
3548 if (VerifyCCG) {
3549 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3550 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3551 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3552 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3553 /*CheckEdges=*/false);
3554 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3555 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3556 /*CheckEdges=*/false);
3557 }
3558}
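// Illustrative sketch of moveCalleeEdgeToNewCaller (names are hypothetical):
// moving the callee edge OldCaller --{ids}--> Callee onto NewCaller yields
// NewCaller --{ids}--> Callee, and the caller edges feeding OldCaller are
// split so that the moved ids now reach NewCaller instead; a direct recursive
// edge is additionally rewired so both of its endpoints become NewCaller.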
3559
3560template <typename DerivedCCG, typename FuncTy, typename CallTy>
3561void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3562 recursivelyRemoveNoneTypeCalleeEdges(
3563 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3564 auto Inserted = Visited.insert(Node);
3565 if (!Inserted.second)
3566 return;
3567
3568 removeNoneTypeCalleeEdges(Node);
3569
3570 for (auto *Clone : Node->Clones)
3571 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3572
3573 // The recursive call may remove some of this Node's caller edges.
3574 // Iterate over a copy and skip any that were removed.
3575 auto CallerEdges = Node->CallerEdges;
3576 for (auto &Edge : CallerEdges) {
3577 // Skip any that have been removed by an earlier recursive call.
3578 if (Edge->isRemoved()) {
3579 assert(!is_contained(Node->CallerEdges, Edge));
3580 continue;
3581 }
3582 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3583 }
3584}
3585
3586// This is the standard DFS based backedge discovery algorithm.
3587template <typename DerivedCCG, typename FuncTy, typename CallTy>
3588void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3589 // If we are cloning recursive contexts, find and mark backedges from all root
3590 // callers, using the typical DFS based backedge analysis.
3591 if (!CloneRecursiveContexts)
3592 return;
3593 DenseSet<const ContextNode *> Visited;
3594 DenseSet<const ContextNode *> CurrentStack;
3595 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3596 auto *Node = Entry.second;
3597 if (Node->isRemoved())
3598 continue;
3599 // It is a root if it doesn't have callers.
3600 if (!Node->CallerEdges.empty())
3601 continue;
3602 markBackedges(Node, Visited, CurrentStack);
3603 assert(CurrentStack.empty());
3604 }
3605}
3606
3607// Recursive helper for above markBackedges method.
3608template <typename DerivedCCG, typename FuncTy, typename CallTy>
3609void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3610 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3611 DenseSet<const ContextNode *> &CurrentStack) {
3612 auto I = Visited.insert(Node);
3613 // We should only call this for unvisited nodes.
3614 assert(I.second);
3615 (void)I;
3616 for (auto &CalleeEdge : Node->CalleeEdges) {
3617 auto *Callee = CalleeEdge->Callee;
3618 if (Visited.count(Callee)) {
3619 // Since this was already visited we need to check if it is currently on
3620 // the recursive stack in which case it is a backedge.
3621 if (CurrentStack.count(Callee))
3622 CalleeEdge->IsBackedge = true;
3623 continue;
3624 }
3625 CurrentStack.insert(Callee);
3626 markBackedges(Callee, Visited, CurrentStack);
3627 CurrentStack.erase(Callee);
3628 }
3629}
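// Illustrative example for the DFS above (hypothetical nodes): with callsite
// nodes where A calls B, B calls C, and C calls back into B, the walk from
// root A places B and C on CurrentStack; when C's callee edge back to B is
// examined, B is still on the stack, so only that edge gets IsBackedge set.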
3630
3631template <typename DerivedCCG, typename FuncTy, typename CallTy>
3632void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3633 DenseSet<const ContextNode *> Visited;
3634 for (auto &Entry : AllocationCallToContextNodeMap) {
3635 Visited.clear();
3636 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3637 }
3638 Visited.clear();
3639 for (auto &Entry : AllocationCallToContextNodeMap)
3640 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3641 if (VerifyCCG)
3642 check();
3643}
3644
3645 // Helper function to check whether an AllocType is cold or notcold or both.
3646bool checkColdOrNotCold(uint8_t AllocType) {
3647 return (AllocType == (uint8_t)AllocationType::Cold) ||
3648 (AllocType == (uint8_t)AllocationType::NotCold) ||
3649 (AllocType ==
3650 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3651}
3652
3653template <typename DerivedCCG, typename FuncTy, typename CallTy>
3654void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3655 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3656 const DenseSet<uint32_t> &AllocContextIds) {
3657 if (VerifyNodes)
3658 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3659 assert(!Node->CloneOf);
3660
3661 // If Node has a null call, then either it wasn't found in the module (regular
3662 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3663 // cloning (e.g. recursion, calls to multiple targets, etc.).
3664 // Do this here so that we don't try to recursively clone callers below, which
3665 // isn't useful at least for this node.
3666 if (!Node->hasCall())
3667 return;
3668
3669 // No need to look at any callers if allocation type already unambiguous.
3670 if (hasSingleAllocType(Node->AllocTypes))
3671 return;
3672
3673#ifndef NDEBUG
3674 auto Insert =
3675#endif
3676 Visited.insert(Node);
3677 // We should not have visited this node yet.
3678 assert(Insert.second);
3679 // The recursive call to identifyClones may delete the current edge from the
3680 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3681 // in an iterator and having recursive call erase from it. Other edges may
3682 // also get removed during the recursion, which will have null Callee and
3683 // Caller pointers (and are deleted later), so we skip those below.
3684 {
3685 auto CallerEdges = Node->CallerEdges;
3686 for (auto &Edge : CallerEdges) {
3687 // Skip any that have been removed by an earlier recursive call.
3688 if (Edge->isRemoved()) {
3689 assert(!is_contained(Node->CallerEdges, Edge));
3690 continue;
3691 }
3692 // Defer backedges. See comments further below where these edges are
3693 // handled during the cloning of this Node.
3694 if (Edge->IsBackedge) {
3695 // We should only mark these if cloning recursive contexts, where we
3696 // need to do this deferral.
3697 assert(CloneRecursiveContexts);
3698 continue;
3699 }
3700 // Ignore any caller we previously visited via another edge.
3701 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3702 identifyClones(Edge->Caller, Visited, AllocContextIds);
3703 }
3704 }
3705 }
3706
3707 // Check if we reached an unambiguous call or have only a single caller.
3708 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3709 return;
3710
3711 // We need to clone.
3712
3713 // Try to keep the original version as alloc type NotCold. This will make
3714 // cases with indirect calls or any other situation with an unknown call to
3715 // the original function get the default behavior. We do this by sorting the
3716 // CallerEdges of the Node we will clone by alloc type.
3717 //
3718 // Give NotCold edge the lowest sort priority so those edges are at the end of
3719 // the caller edges vector, and stay on the original version (since the below
3720 // code clones greedily until it finds all remaining edges have the same type
3721 // and leaves the remaining ones on the original Node).
3722 //
3723 // We shouldn't actually have any None type edges, so the sorting priority for
3724 // that is arbitrary, and we assert in that case below.
3725 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3726 /*Cold*/ 1,
3727 /*NotColdCold*/ 2};
3728 llvm::stable_sort(Node->CallerEdges,
3729 [&](const std::shared_ptr<ContextEdge> &A,
3730 const std::shared_ptr<ContextEdge> &B) {
3731 // Nodes with non-empty context ids should be sorted
3732 // before those with empty context ids.
3733 if (A->ContextIds.empty())
3734 // Either B ContextIds are non-empty (in which case we
3735 // should return false because B < A), or B ContextIds
3736 // are empty, in which case they are equal, and we
3737 // should maintain the original relative ordering.
3738 return false;
3739 if (B->ContextIds.empty())
3740 return true;
3741
3742 if (A->AllocTypes == B->AllocTypes)
3743 // Use the first context id for each edge as a
3744 // tie-breaker.
3745 return *A->ContextIds.begin() < *B->ContextIds.begin();
3746 return AllocTypeCloningPriority[A->AllocTypes] <
3747 AllocTypeCloningPriority[B->AllocTypes];
3748 });
3749
3750 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3751
3752 DenseSet<uint32_t> RecursiveContextIds;
3754 // If we are allowing recursive callsites, but have also disabled recursive
3755 // contexts, look for context ids that show up in multiple caller edges.
3756 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
3757 DenseSet<uint32_t> AllCallerContextIds;
3758 for (auto &CE : Node->CallerEdges) {
3759 // Resize to the largest set of caller context ids, since we know the
3760 // final set will be at least that large.
3761 AllCallerContextIds.reserve(CE->getContextIds().size());
3762 for (auto Id : CE->getContextIds())
3763 if (!AllCallerContextIds.insert(Id).second)
3764 RecursiveContextIds.insert(Id);
3765 }
3766 }
3767
3768 // Iterate until we find no more opportunities for disambiguating the alloc
3769 // types via cloning. In most cases this loop will terminate once the Node
3770 // has a single allocation type, in which case no more cloning is needed.
3771 // Iterate over a copy of Node's caller edges, since we may need to remove
3772 // edges in the moveEdgeTo* methods, and this simplifies the handling and
3773 // makes it less error-prone.
3774 auto CallerEdges = Node->CallerEdges;
3775 for (auto &CallerEdge : CallerEdges) {
3776 // Skip any that have been removed by an earlier recursive call.
3777 if (CallerEdge->isRemoved()) {
3778 assert(!is_contained(Node->CallerEdges, CallerEdge));
3779 continue;
3780 }
3781 assert(CallerEdge->Callee == Node);
3782
3783 // See if cloning the prior caller edge left this node with a single alloc
3784 // type or a single caller. In that case no more cloning of Node is needed.
3785 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3786 break;
3787
3788 // If the caller was not successfully matched to a call in the IR/summary,
3789 // there is no point in trying to clone for it as we can't update that call.
3790 if (!CallerEdge->Caller->hasCall())
3791 continue;
3792
3793 // Only need to process the ids along this edge pertaining to the given
3794 // allocation.
3795 auto CallerEdgeContextsForAlloc =
3796 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3797 if (!RecursiveContextIds.empty())
3798 CallerEdgeContextsForAlloc =
3799 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
3800 if (CallerEdgeContextsForAlloc.empty())
3801 continue;
3802
3803 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3804
3805 // Compute the node callee edge alloc types corresponding to the context ids
3806 // for this caller edge.
3807 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3808 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
3809 for (auto &CalleeEdge : Node->CalleeEdges)
3810 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3811 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3812
3813 // Don't clone if doing so will not disambiguate any alloc types amongst
3814 // caller edges (including the callee edges that would be cloned).
3815 // Otherwise we will simply move all edges to the clone.
3816 //
3817 // First check if by cloning we will disambiguate the caller allocation
3818 // type from node's allocation type. Query allocTypeToUse so that we don't
3819 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3820 // neither of these should be None type.
3821 //
3822 // Then check if by cloning node at least one of the callee edges will be
3823 // disambiguated by splitting out different context ids.
3824 //
3825 // However, always do the cloning if this is a backedge, in which case we
3826 // have not yet cloned along this caller edge.
3827 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3828 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3829 if (!CallerEdge->IsBackedge &&
3830 allocTypeToUse(CallerAllocTypeForAlloc) ==
3831 allocTypeToUse(Node->AllocTypes) &&
3832 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3833 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
3834 continue;
3835 }
3836
3837 if (CallerEdge->IsBackedge) {
3838 // We should only mark these if cloning recursive contexts, where we
3839 // need to do this deferral.
3840 assert(CloneRecursiveContexts);
3841 DeferredBackedges++;
3842 }
3843
3844 // If this is a backedge, we now do recursive cloning starting from its
3845 // caller since we may have moved unambiguous caller contexts to a clone
3846 // of this Node in a previous iteration of the current loop, giving more
3847 // opportunity for cloning through the backedge. Because we sorted the
3848 // caller edges earlier so that cold caller edges are first, we would have
3849 // visited and cloned this node for any unambiguously cold non-recursive
3850 // callers before any ambiguous backedge callers. Note that we don't do this
3851 // if the caller is already cloned or visited during cloning (e.g. via a
3852 // different context path from the allocation).
3853 // TODO: Can we do better in the case where the caller was already visited?
3854 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
3855 !Visited.count(CallerEdge->Caller)) {
3856 const auto OrigIdCount = CallerEdge->getContextIds().size();
3857 // Now do the recursive cloning of this backedge's caller, which was
3858 // deferred earlier.
3859 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
3860 removeNoneTypeCalleeEdges(CallerEdge->Caller);
3861 // See if the recursive call to identifyClones moved the context ids to a
3862 // new edge from this node to a clone of caller, and switch to looking at
3863 // that new edge so that we clone Node for the new caller clone.
3864 bool UpdatedEdge = false;
3865 if (OrigIdCount > CallerEdge->getContextIds().size()) {
3866 for (auto E : Node->CallerEdges) {
3867 // Only interested in clones of the current edge's caller.
3868 if (E->Caller->CloneOf != CallerEdge->Caller)
3869 continue;
3870 // See if this edge contains any of the context ids originally on the
3871 // current caller edge.
3872 auto CallerEdgeContextsForAllocNew =
3873 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
3874 if (CallerEdgeContextsForAllocNew.empty())
3875 continue;
3876 // Make sure we don't pick a previously existing caller edge of this
3877 // Node, which would be processed on a different iteration of the
3878 // outer loop over the saved CallerEdges.
3879 if (llvm::is_contained(CallerEdges, E))
3880 continue;
3881 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
3882 // are updated further below for all cases where we just invoked
3883 // identifyClones recursively.
3884 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
3885 CallerEdge = E;
3886 UpdatedEdge = true;
3887 break;
3888 }
3889 }
3890 // If cloning removed this edge (and we didn't update it to a new edge
3891 // above), we're done with this edge. It's possible we moved all of the
3892 // context ids to an existing clone, in which case there's no need to do
3893 // further processing for them.
3894 if (CallerEdge->isRemoved())
3895 continue;
3896
3897 // Now we need to update the information used for the cloning decisions
3898 // further below, as we may have modified edges and their context ids.
3899
3900 // Note if we changed the CallerEdge above we would have already updated
3901 // the context ids.
3902 if (!UpdatedEdge) {
3903 CallerEdgeContextsForAlloc = set_intersection(
3904 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
3905 if (CallerEdgeContextsForAlloc.empty())
3906 continue;
3907 }
3908 // Update the other information that depends on the edges and on the now
3909 // updated CallerEdgeContextsForAlloc.
3910 CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3911 CalleeEdgeAllocTypesForCallerEdge.clear();
3912 for (auto &CalleeEdge : Node->CalleeEdges) {
3913 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3914 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3915 }
3916 }
3917
3918 // First see if we can use an existing clone. Check each clone and its
3919 // callee edges for matching alloc types.
3920 ContextNode *Clone = nullptr;
3921 for (auto *CurClone : Node->Clones) {
3922 if (allocTypeToUse(CurClone->AllocTypes) !=
3923 allocTypeToUse(CallerAllocTypeForAlloc))
3924 continue;
3925
3926 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3927 hasSingleAllocType(CallerAllocTypeForAlloc);
3928 // The above check should mean that if both have single alloc types that
3929 // they should be equal.
3930 assert(!BothSingleAlloc ||
3931 CurClone->AllocTypes == CallerAllocTypeForAlloc);
3932
3933 // If either both have a single alloc type (which are the same), or if the
3934 // clone's callee edges have the same alloc types as those for the current
3935 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3936 // then we can reuse this clone.
3937 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3938 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3939 Clone = CurClone;
3940 break;
3941 }
3942 }
3943
3944 // The edge iterator is adjusted when we move the CallerEdge to the clone.
3945 if (Clone)
3946 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
3947 CallerEdgeContextsForAlloc);
3948 else
3949 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
3950
3951 // Sanity check that no alloc types on clone or its edges are None.
3952 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3953 }
3954
3955 // We should still have some context ids on the original Node.
3956 assert(!Node->emptyContextIds());
3957
3958 // Sanity check that no alloc types on node or edges are None.
3959 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3960
3961 if (VerifyNodes)
3962 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3963}
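// Illustrative cloning example for the routine above (ids are hypothetical):
// suppose Node has caller edges E1 with context id 1 (Cold) and E2 with
// context id 2 (NotCold), so Node itself is NotCold+Cold and ambiguous. The
// sort places E1 first, the loop moves E1 to a new clone of Node, and E2
// stays on the original, leaving both the clone and the original with a
// single unambiguous allocation type.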
3964
3965void ModuleCallsiteContextGraph::updateAllocationCall(
3966 CallInfo &Call, AllocationType AllocType) {
3967 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
3969 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
3970 "memprof", AllocTypeString);
3971 cast<CallBase>(Call.call())->addFnAttr(A);
3972 OREGetter(Call.call()->getFunction())
3973 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3974 << ore::NV("AllocationCall", Call.call()) << " in clone "
3975 << ore::NV("Caller", Call.call()->getFunction())
3976 << " marked with memprof allocation attribute "
3977 << ore::NV("Attribute", AllocTypeString));
3978}
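// For illustration, after the update above a call hinted as cold simply
// carries the string function attribute "memprof"="cold" ("notcold"
// otherwise); conceptually, in the IR (callee and attribute group are
// illustrative):
//   %p = call ptr @malloc(i64 %n) #1
//   attributes #1 = { ... "memprof"="cold" }
// where the remaining attributes in #1 depend on the original call site.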
3979
3980void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
3981 AllocationType AllocType) {
3982 auto *AI = cast<AllocInfo *>(Call.call());
3983 assert(AI);
3984 assert(AI->Versions.size() > Call.cloneNo());
3985 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
3986}
3987
3988AllocationType
3989ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3990 const auto *CB = cast<CallBase>(Call.call());
3991 if (!CB->getAttributes().hasFnAttr("memprof"))
3992 return AllocationType::None;
3993 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3994 ? AllocationType::Cold
3995 : AllocationType::NotCold;
3996}
3997
3998AllocationType
3999IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4000 const auto *AI = cast<AllocInfo *>(Call.call());
4001 assert(AI->Versions.size() > Call.cloneNo());
4002 return (AllocationType)AI->Versions[Call.cloneNo()];
4003}
4004
4005void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4006 FuncInfo CalleeFunc) {
4007 auto *CurF = getCalleeFunc(CallerCall.call());
4008 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4009 if (isMemProfClone(*CurF)) {
4010 // If we already assigned this callsite to call a specific non-default
4011 // clone (i.e. not the original function which is clone 0), ensure that we
4012 // aren't trying to now update it to call a different clone, which is
4013 // indicative of a bug in the graph or function assignment.
4014 auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
4015 if (CurCalleeCloneNo != NewCalleeCloneNo) {
4016 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4017 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4018 << "\n");
4019 MismatchedCloneAssignments++;
4020 }
4021 }
4022 if (NewCalleeCloneNo > 0)
4023 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
4024 OREGetter(CallerCall.call()->getFunction())
4025 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
4026 << ore::NV("Call", CallerCall.call()) << " in clone "
4027 << ore::NV("Caller", CallerCall.call()->getFunction())
4028 << " assigned to call function clone "
4029 << ore::NV("Callee", CalleeFunc.func()));
4030}
4031
4032void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4033 FuncInfo CalleeFunc) {
4034 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
4035 assert(CI &&
4036 "Caller cannot be an allocation which should not have profiled calls");
4037 assert(CI->Clones.size() > CallerCall.cloneNo());
4038 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4039 auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
4040 // If we already assigned this callsite to call a specific non-default
4041 // clone (i.e. not the original function which is clone 0), ensure that we
4042 // aren't trying to now update it to call a different clone, which is
4043 // indicative of a bug in the graph or function assignment.
4044 if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
4045 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4046 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4047 << "\n");
4048 MismatchedCloneAssignments++;
4049 }
4050 CurCalleeCloneNo = NewCalleeCloneNo;
4051}
4052
4053// Update the debug information attached to NewFunc to use the clone Name. Note
4054// this needs to be done for both any existing DISubprogram for the definition,
4055// as well as any separate declaration DISubprogram.
4056static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
4057 assert(Name == NewFunc->getName());
4058 auto *SP = NewFunc->getSubprogram();
4059 if (!SP)
4060 return;
4061 auto *MDName = MDString::get(NewFunc->getParent()->getContext(), Name);
4062 SP->replaceLinkageName(MDName);
4063 DISubprogram *Decl = SP->getDeclaration();
4064 if (!Decl)
4065 return;
4066 TempDISubprogram NewDecl = Decl->clone();
4067 NewDecl->replaceLinkageName(MDName);
4068 SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl)));
4069}
4070
4071CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4072 Instruction *>::FuncInfo
4073ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4074 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4075 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4076 // Use existing LLVM facilities for cloning and obtaining Call in clone
4077 ValueToValueMapTy VMap;
4078 auto *NewFunc = CloneFunction(Func.func(), VMap);
4079 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4080 assert(!Func.func()->getParent()->getFunction(Name));
4081 NewFunc->setName(Name);
4082 updateSubprogramLinkageName(NewFunc, Name);
4083 for (auto &Inst : CallsWithMetadataInFunc) {
4084 // This map always has the initial version in it.
4085 assert(Inst.cloneNo() == 0);
4086 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4087 }
4088 OREGetter(Func.func())
4089 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4090 << "created clone " << ore::NV("NewFunction", NewFunc));
4091 return {NewFunc, CloneNo};
4092}
4093
4094CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4095 IndexCall>::FuncInfo
4096IndexCallsiteContextGraph::cloneFunctionForCallsite(
4097 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4098 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4099 // Check how many clones we have of Call (and therefore function).
4100 // The next clone number is the current size of versions array.
4101 // Confirm this matches the CloneNo provided by the caller, which is based on
4102 // the number of function clones we have.
4103 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4104 ? cast<AllocInfo *>(Call.call())->Versions.size()
4105 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4106 // Walk all the instructions in this function. Create a new version for
4107 // each (by adding an entry to the Versions/Clones summary array), and copy
4108 // over the version being called for the function clone being cloned here.
4109 // Additionally, add an entry to the CallMap for the new function clone,
4110 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4111 // to the new call clone.
4112 for (auto &Inst : CallsWithMetadataInFunc) {
4113 // This map always has the initial version in it.
4114 assert(Inst.cloneNo() == 0);
4115 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4116 assert(AI->Versions.size() == CloneNo);
4117 // We assign the allocation type later (in updateAllocationCall), just add
4118 // an entry for it here.
4119 AI->Versions.push_back(0);
4120 } else {
4121 auto *CI = cast<CallsiteInfo *>(Inst.call());
4122 assert(CI && CI->Clones.size() == CloneNo);
4123 // We assign the clone number later (in updateCall), just add an entry for
4124 // it here.
4125 CI->Clones.push_back(0);
4126 }
4127 CallMap[Inst] = {Inst.call(), CloneNo};
4128 }
4129 return {Func.func(), CloneNo};
4130}
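// ---- Editorial illustration (not part of the LLVM source) ----
// For the index-based graph, "cloning a function" above copies no IR at all:
// each summary record simply grows its per-clone array by one slot whose
// index is the new clone number, to be filled in later by updateAllocationCall
// or updateCall. A minimal standalone sketch with hypothetical record types:
#include <cassert>
#include <cstdint>
#include <vector>

struct AllocRecord {
  std::vector<uint8_t> Versions; // allocation kind per function clone
};
struct CallsiteRecord {
  std::vector<unsigned> Clones; // callee clone number per function clone
};

// Append one slot per record for function clone number CloneNo.
void addSummaryClone(std::vector<AllocRecord *> &Allocs,
                     std::vector<CallsiteRecord *> &Callsites,
                     unsigned CloneNo) {
  for (AllocRecord *A : Allocs) {
    assert(A->Versions.size() == CloneNo); // clones are numbered sequentially
    A->Versions.push_back(0);              // kind assigned later
  }
  for (CallsiteRecord *C : Callsites) {
    assert(C->Clones.size() == CloneNo);
    C->Clones.push_back(0); // callee clone assigned later
  }
}
// ---- End editorial illustration ----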
4131
4132// We perform cloning for each allocation node separately. However, this
4133// sometimes results in a situation where the same node calls multiple
4134// clones of the same callee, created for different allocations. This
4135// causes issues when assigning functions to these clones, as each node can
4136// in reality only call a single callee clone.
4137//
4138// To address this, before assigning functions, merge callee clone nodes as
4139// needed using a post order traversal from the allocations. We attempt to
4140// use existing clones as the merge node when legal, and to share them
4141// among callers with the same properties (callers calling the same set of
4142// callee clone nodes for the same allocations).
4143//
4144// Without this fix, in some cases incorrect function assignment will lead
4145// to calling the wrong allocation clone.
4146template <typename DerivedCCG, typename FuncTy, typename CallTy>
4147void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4148 if (!MergeClones)
4149 return;
4150
4151 // Generate a map from context id to the associated allocation node for use
4152 // when merging clones.
4153 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4154 for (auto &Entry : AllocationCallToContextNodeMap) {
4155 auto *Node = Entry.second;
4156 for (auto Id : Node->getContextIds())
4157 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4158 for (auto *Clone : Node->Clones) {
4159 for (auto Id : Clone->getContextIds())
4160 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4161 }
4162 }
4163
4164 // Post order traversal starting from allocations to ensure each callsite
4165 // calls a single clone of its callee. Callee nodes that are clones of each
4166 // other are merged (via new merge nodes if needed) to achieve this.
4167 DenseSet<const ContextNode *> Visited;
4168 for (auto &Entry : AllocationCallToContextNodeMap) {
4169 auto *Node = Entry.second;
4170
4171 mergeClones(Node, Visited, ContextIdToAllocationNode);
4172
4173 // Make a copy so the recursive post order traversal that may create new
4174 // clones doesn't mess up iteration. Note that the recursive traversal
4175 // itself does not call mergeClones on any of these nodes, which are all
4176 // (clones of) allocations.
4177 auto Clones = Node->Clones;
4178 for (auto *Clone : Clones)
4179 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4180 }
4181
4182 if (DumpCCG) {
4183 dbgs() << "CCG after merging:\n";
4184 dbgs() << *this;
4185 }
4186 if (ExportToDot)
4187 exportToDot("aftermerge");
4188
4189 if (VerifyCCG) {
4190 check();
4191 }
4192}
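// ---- Editorial illustration (not part of the LLVM source) ----
// mergeClones() above first maps every context id back to the original (base)
// node of the allocation that owns it, so later merging steps can ask "which
// allocations does this edge reach?" cheaply per id. A minimal standalone
// sketch of building that map, with hypothetical node types:
#include <cstdint>
#include <unordered_map>
#include <vector>

struct AllocNode {
  std::vector<uint32_t> ContextIds;
  std::vector<AllocNode *> Clones;
  AllocNode *CloneOf = nullptr;
  AllocNode *origNode() { return CloneOf ? CloneOf : this; }
};

std::unordered_map<uint32_t, AllocNode *>
buildContextIdToAllocMap(const std::vector<AllocNode *> &Allocations) {
  std::unordered_map<uint32_t, AllocNode *> Map;
  for (AllocNode *N : Allocations) {
    for (uint32_t Id : N->ContextIds)
      Map[Id] = N->origNode();
    for (AllocNode *Clone : N->Clones)
      for (uint32_t Id : Clone->ContextIds)
        Map[Id] = Clone->origNode();
  }
  return Map;
}
// ---- End editorial illustration ----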
4193
4194// Recursive helper for above mergeClones method.
4195template <typename DerivedCCG, typename FuncTy, typename CallTy>
4196void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4197 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4198 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4199 auto Inserted = Visited.insert(Node);
4200 if (!Inserted.second)
4201 return;
4202
4203 // Iteratively perform merging on this node to handle new caller nodes created
4204 // during the recursive traversal. We could do something more elegant, such as
4205 // maintaining a worklist, but this simple approach doesn't cause a
4206 // measurable compile time effect, as most nodes don't have many caller
4207 // edges to check.
4208 bool FoundUnvisited = true;
4209 unsigned Iters = 0;
4210 while (FoundUnvisited) {
4211 Iters++;
4212 FoundUnvisited = false;
4213 // Make a copy since the recursive call may move a caller edge to a new
4214 // callee, messing up the iterator.
4215 auto CallerEdges = Node->CallerEdges;
4216 for (auto CallerEdge : CallerEdges) {
4217 // Skip any caller edge moved onto a different callee during recursion.
4218 if (CallerEdge->Callee != Node)
4219 continue;
4220 // If we found an unvisited caller, note that we should check the caller
4221 // edges again as mergeClones may add or change caller nodes.
4222 if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
4223 FoundUnvisited = true;
4224 mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4225 }
4226 }
4227
4228 TotalMergeInvokes++;
4229 TotalMergeIters += Iters;
4230 if (Iters > MaxMergeIters)
4231 MaxMergeIters = Iters;
4232
4233 // Merge for this node after we handle its callers.
4234 mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4235}
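// ---- Editorial illustration (not part of the LLVM source) ----
// The helper above re-scans a node's caller list until a full pass finds no
// unvisited caller, because handling one caller can add or retarget edges on
// this node. A minimal standalone sketch of that fixed-point pattern with a
// hypothetical graph type:
#include <unordered_set>
#include <vector>

struct ToyNode {
  std::vector<ToyNode *> Callers;
};

void visitCallersToFixedPoint(ToyNode *N,
                              std::unordered_set<ToyNode *> &Visited) {
  if (!Visited.insert(N).second)
    return;
  bool FoundUnvisited = true;
  while (FoundUnvisited) {
    FoundUnvisited = false;
    // Copy, since recursing may grow or retarget N->Callers.
    std::vector<ToyNode *> Callers = N->Callers;
    for (ToyNode *Caller : Callers) {
      if (!Visited.count(Caller))
        FoundUnvisited = true;
      visitCallersToFixedPoint(Caller, Visited);
    }
  }
  // Per-node work (merging this node's callee clones) would go here.
}
// ---- End editorial illustration ----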
4236
4237template <typename DerivedCCG, typename FuncTy, typename CallTy>
4238void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4239 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4240 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4241 // Ignore Node if we moved all of its contexts to clones.
4242 if (Node->emptyContextIds())
4243 return;
4244
4245 // First identify groups of clones among Node's callee edges, by building
4246 // a map from each callee base node to the associated callee edges from Node.
4247 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4248 OrigNodeToCloneEdges;
4249 for (const auto &E : Node->CalleeEdges) {
4250 auto *Callee = E->Callee;
4251 if (!Callee->CloneOf && Callee->Clones.empty())
4252 continue;
4253 ContextNode *Base = Callee->getOrigNode();
4254 OrigNodeToCloneEdges[Base].push_back(E);
4255 }
4256
4257 // Helper for callee edge sorting below. Orders edges by their callee's
4258 // caller edge count; ties prefer a clone over the original node, and then
4259 // fall back to each edge's first context id.
4260 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4261 const std::shared_ptr<ContextEdge> &B) {
4262 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4263 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4264 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4265 return true;
4266 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4267 return false;
4268 // Use the first context id for each edge as a
4269 // tie-breaker.
4270 return *A->ContextIds.begin() < *B->ContextIds.begin();
4271 };
4272
4273 // Process each set of callee clones called by Node, performing the needed
4274 // merging.
4275 for (auto Entry : OrigNodeToCloneEdges) {
4276 // CalleeEdges is the set of edges from Node reaching callees that are
4277 // mutual clones of each other.
4278 auto &CalleeEdges = Entry.second;
4279 auto NumCalleeClones = CalleeEdges.size();
4280 // A single edge means there is no merging needed.
4281 if (NumCalleeClones == 1)
4282 continue;
4283 // Sort the CalleeEdges calling this group of clones in ascending order of
4284 // their caller edge counts, putting a clone before the original non-clone
4285 // node in case of a tie (matching the comparator above). This simplifies
4286 // finding an existing node to use as the merge node.
4287 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4288
4289 /// Find other callers of the given set of callee edges that can
4290 /// share the same callee merge node. See the comments at this method
4291 /// definition for details.
4292 DenseSet<ContextNode *> OtherCallersToShareMerge;
4293 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4294 OtherCallersToShareMerge);
4295
4296 // Now do the actual merging. Identify existing or create a new MergeNode
4297 // during the first iteration. Move each callee over, along with edges from
4298 // other callers we've determined above can share the same merge node.
4299 ContextNode *MergeNode = nullptr;
4300 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4301 for (auto CalleeEdge : CalleeEdges) {
4302 auto *OrigCallee = CalleeEdge->Callee;
4303 // If we don't have a MergeNode yet (only happens on the first iteration,
4304 // as a new one will be created when we go to move the first callee edge
4305 // over as needed), see if we can use this callee.
4306 if (!MergeNode) {
4307 // If there are no other callers, simply use this callee.
4308 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4309 MergeNode = OrigCallee;
4310 NonNewMergedNodes++;
4311 continue;
4312 }
4313 // Otherwise, if we have identified other caller nodes that can share
4314 // the merge node with Node, see if all of OrigCallee's callers are
4315 // going to share the same merge node. In that case we can use callee
4316 // (since all of its callers would move to the new merge node).
4317 if (!OtherCallersToShareMerge.empty()) {
4318 bool MoveAllCallerEdges = true;
4319 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4320 if (CalleeCallerE == CalleeEdge)
4321 continue;
4322 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4323 MoveAllCallerEdges = false;
4324 break;
4325 }
4326 }
4327 // If we are going to move all callers over, we can use this callee as
4328 // the MergeNode.
4329 if (MoveAllCallerEdges) {
4330 MergeNode = OrigCallee;
4331 NonNewMergedNodes++;
4332 continue;
4333 }
4334 }
4335 }
4336 // Move this callee edge, creating a new merge node if necessary.
4337 if (MergeNode) {
4338 assert(MergeNode != OrigCallee);
4339 moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
4340 /*NewClone*/ false);
4341 } else {
4342 MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
4343 NewMergedNodes++;
4344 }
4345 // Now move all identified edges from other callers over to the merge node
4346 // as well.
4347 if (!OtherCallersToShareMerge.empty()) {
4348 // Make and iterate over a copy of OrigCallee's caller edges because
4349 // some of these will be moved off of the OrigCallee and that would mess
4350 // up the iteration from OrigCallee.
4351 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4352 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4353 if (CalleeCallerE == CalleeEdge)
4354 continue;
4355 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4356 continue;
4357 CallerToMoveCount[CalleeCallerE->Caller]++;
4358 moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
4359 /*NewClone*/ false);
4360 }
4361 }
4362 removeNoneTypeCalleeEdges(OrigCallee);
4363 removeNoneTypeCalleeEdges(MergeNode);
4364 }
4365 }
4366}
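// ---- Editorial illustration (not part of the LLVM source) ----
// The merging above hinges on two steps: bucket a node's callee edges by the
// callee's original node, then order each bucket so the callee with the
// fewest callers (preferring a clone on ties) is tried first as the merge
// target. A minimal standalone sketch with hypothetical types:
#include <algorithm>
#include <map>
#include <vector>

struct CEdge;
struct CNode {
  CNode *CloneOf = nullptr;
  std::vector<CEdge *> CallerEdges;
  CNode *origNode() { return CloneOf ? CloneOf : this; }
};
struct CEdge {
  CNode *Callee;
  unsigned FirstContextId; // stand-in for *ContextIds.begin()
};

std::map<CNode *, std::vector<CEdge *>>
groupAndSortCalleeClones(const std::vector<CEdge *> &CalleeEdges) {
  std::map<CNode *, std::vector<CEdge *>> Groups;
  for (CEdge *E : CalleeEdges)
    Groups[E->Callee->origNode()].push_back(E);
  for (auto &Entry : Groups) {
    std::vector<CEdge *> &Edges = Entry.second;
    if (Edges.size() < 2)
      continue; // a single edge to this callee needs no merging
    std::stable_sort(Edges.begin(), Edges.end(), [](CEdge *A, CEdge *B) {
      if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
        return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
      if ((A->Callee->CloneOf != nullptr) != (B->Callee->CloneOf != nullptr))
        return A->Callee->CloneOf != nullptr; // clone sorts before original
      return A->FirstContextId < B->FirstContextId;
    });
  }
  return Groups;
}
// ---- End editorial illustration ----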
4367
4368// Look for other nodes that have edges to the same set of callee
4369// clones as the current Node. Those can share the eventual merge node
4370// (reducing cloning and binary size overhead) iff:
4371// - they have edges to the same set of callee clones
4372// - each callee edge reaches a subset of the same allocations as Node's
4373// corresponding edge to the same callee clone.
4374// The second requirement is to ensure that we don't undo any of the
4375// necessary cloning to distinguish contexts with different allocation
4376// behavior.
4377// FIXME: This is somewhat conservative, as we really just need to ensure
4378// that they don't reach the same allocations as contexts on edges from Node
4379// going to any of the *other* callee clones being merged. However, that
4380// requires more tracking and checking to get right.
4381template <typename DerivedCCG, typename FuncTy, typename CallTy>
4382void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
4383 findOtherCallersToShareMerge(
4384 ContextNode *Node,
4385 std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
4386 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
4387 DenseSet<ContextNode *> &OtherCallersToShareMerge) {
4388 auto NumCalleeClones = CalleeEdges.size();
4389 // This map counts how many edges to the same callee clone exist for other
4390 // caller nodes of each callee clone.
4391 DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
4392 // Counts the number of other caller nodes that have edges to all callee
4393 // clones that don't violate the allocation context checking.
4394 unsigned PossibleOtherCallerNodes = 0;
4395
4396 // We only need to look at other Caller nodes if the first callee edge has
4397 // multiple callers (recall they are sorted in ascending order above).
4398 if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
4399 return;
4400
4401 // For each callee edge:
4402 // - Collect the count of other caller nodes calling the same callees.
4403 // - Collect the alloc nodes reached by contexts on each callee edge.
4404 DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
4405 for (auto CalleeEdge : CalleeEdges) {
4406 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4407 // For each other caller of the same callee, increment the count of
4408 // edges reaching the same callee clone.
4409 for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
4410 if (CalleeCallerEdges->Caller == Node) {
4411 assert(CalleeCallerEdges == CalleeEdge);
4412 continue;
4413 }
4414 OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
4415 // If this caller edge now reaches all of the same callee clones,
4416 // increment the count of candidate other caller nodes.
4417 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
4418 NumCalleeClones)
4419 PossibleOtherCallerNodes++;
4420 }
4421 // Collect the alloc nodes reached by contexts on each callee edge, for
4422 // later analysis.
4423 for (auto Id : CalleeEdge->getContextIds()) {
4424 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4425 if (!Alloc) {
4426 // FIXME: it is unclear why this happens occasionally; presumably
4427 // imperfect graph updates, possibly involving recursion.
4428 MissingAllocForContextId++;
4429 continue;
4430 }
4431 CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
4432 }
4433 }
4434
4435 // Now walk the callee edges again, and make sure that for each candidate
4436 // caller node all of its edges to the callees reach the same allocs (or
4437 // a subset) as those along the corresponding callee edge from Node.
4438 for (auto CalleeEdge : CalleeEdges) {
4439 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4440 // Stop if we do not have any (more) candidate other caller nodes.
4441 if (!PossibleOtherCallerNodes)
4442 break;
4443 auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
4444 // Check each other caller of this callee clone.
4445 for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
4446 // Not interested in the callee edge from Node itself.
4447 if (CalleeCallerE == CalleeEdge)
4448 continue;
4449 // Skip any callers that didn't have callee edges to all the same
4450 // callee clones.
4451 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
4452 NumCalleeClones)
4453 continue;
4454 // Make sure that each context along edge from candidate caller node
4455 // reaches an allocation also reached by this callee edge from Node.
4456 for (auto Id : CalleeCallerE->getContextIds()) {
4457 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4458 if (!Alloc)
4459 continue;
4460 // If not, simply reset the map entry to 0 so caller is ignored, and
4461 // reduce the count of candidate other caller nodes.
4462 if (!CurCalleeAllocNodes.contains(Alloc)) {
4463 OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
4464 PossibleOtherCallerNodes--;
4465 break;
4466 }
4467 }
4468 }
4469 }
4470
4471 if (!PossibleOtherCallerNodes)
4472 return;
4473
4474 // Build the set of other caller nodes that can use the same callee merge
4475 // node.
4476 for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
4477 if (Count != NumCalleeClones)
4478 continue;
4479 OtherCallersToShareMerge.insert(OtherCaller);
4480 }
4481}
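// ---- Editorial illustration (not part of the LLVM source) ----
// The test above admits another caller only if it has an edge to every callee
// clone in the group and each such edge reaches a subset of the allocations
// reached by Node's corresponding edge. A minimal standalone subset check,
// pairing the two callers' edges by position, with hypothetical types:
#include <cstddef>
#include <set>
#include <vector>

struct AllocReachSet {
  std::set<int> ReachedAllocs; // allocations reached by this edge's contexts
};

bool candidateCanShareMerge(const std::vector<AllocReachSet> &NodeEdges,
                            const std::vector<AllocReachSet> &CandidateEdges) {
  // Must reach exactly the same set of callee clones.
  if (NodeEdges.size() != CandidateEdges.size())
    return false;
  for (std::size_t I = 0; I < NodeEdges.size(); ++I)
    for (int Alloc : CandidateEdges[I].ReachedAllocs)
      if (!NodeEdges[I].ReachedAllocs.count(Alloc))
        return false; // would undo cloning that separates allocations
  return true;
}
// ---- End editorial illustration ----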
4482
4483// This method assigns cloned callsites to functions, cloning the functions as
4484// needed. The assignment is greedy and proceeds roughly as follows:
4485//
4486// For each function Func:
4487// For each call with graph Node having clones:
4488// Initialize ClonesWorklist to Node and its clones
4489// Initialize NodeCloneCount to 0
4490// While ClonesWorklist is not empty:
4491// Clone = pop front ClonesWorklist
4492// NodeCloneCount++
4493// If Func has been cloned less than NodeCloneCount times:
4494// If NodeCloneCount is 1:
4495// Assign Clone to original Func
4496// Continue
4497// Create a new function clone
4498// If other callers not assigned to call a function clone yet:
4499// Assign them to call new function clone
4500// Continue
4501// Assign any other caller calling the cloned version to new clone
4502//
4503// For each caller of Clone:
4504// If caller is assigned to call a specific function clone:
4505// If we cannot assign Clone to that function clone:
4506// Create new callsite Clone NewClone
4507// Add NewClone to ClonesWorklist
4508// Continue
4509// Assign Clone to existing caller's called function clone
4510// Else:
4511// If Clone not already assigned to a function clone:
4512// Assign to first function clone without assignment
4513// Assign caller to selected function clone
4514template <typename DerivedCCG, typename FuncTy, typename CallTy>
4515bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4516 bool Changed = false;
4517
4518 mergeClones();
4519
4520 // Keep track of the assignment of nodes (callsites) to function clones they
4521 // call.
4522 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4523
4524 // Update caller node to call function version CalleeFunc, by recording the
4525 // assignment in CallsiteToCalleeFuncCloneMap.
4526 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4527 const FuncInfo &CalleeFunc) {
4528 assert(Caller->hasCall());
4529 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4530 };
4531
4532 // Information for a single clone of this Func.
4533 struct FuncCloneInfo {
4534 // The function clone.
4535 FuncInfo FuncClone;
4536 // Remappings of each call of interest (from original uncloned call to the
4537 // corresponding cloned call in this function clone).
4538 DenseMap<CallInfo, CallInfo> CallMap;
4539 };
4540
4541 // Walk all functions for which we saw calls with memprof metadata, and handle
4542 // cloning for each of their calls.
4543 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4544 FuncInfo OrigFunc(Func);
4545 // Map from each clone number of OrigFunc to information about that function
4546 // clone (the function clone FuncInfo and call remappings). The index into
4547 // the vector is the clone number, as function clones are created and
4548 // numbered sequentially.
4549 std::vector<FuncCloneInfo> FuncCloneInfos;
4550 for (auto &Call : CallsWithMetadata) {
4551 ContextNode *Node = getNodeForInst(Call);
4552 // Skip call if we do not have a node for it (all uses of its stack ids
4553 // were either on inlined chains or pruned from the MIBs), or if we did
4554 // not create any clones for it.
4555 if (!Node || Node->Clones.empty())
4556 continue;
4557 assert(Node->hasCall() &&
4558 "Not having a call should have prevented cloning");
4559
4560 // Track the assignment of function clones to clones of the current
4561 // callsite Node being handled.
4562 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4563
4564 // Assign callsite version CallsiteClone to function version FuncClone,
4565 // and also assign (possibly cloned) Call to CallsiteClone.
4566 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4567 CallInfo &Call,
4568 ContextNode *CallsiteClone,
4569 bool IsAlloc) {
4570 // Record the clone of callsite node assigned to this function clone.
4571 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4572
4573 assert(FuncCloneInfos.size() > FuncClone.cloneNo());
4574 DenseMap<CallInfo, CallInfo> &CallMap =
4575 FuncCloneInfos[FuncClone.cloneNo()].CallMap;
4576 CallInfo CallClone(Call);
4577 if (auto It = CallMap.find(Call); It != CallMap.end())
4578 CallClone = It->second;
4579 CallsiteClone->setCall(CallClone);
4580 // Need to do the same for all matching calls.
4581 for (auto &MatchingCall : Node->MatchingCalls) {
4582 CallInfo CallClone(MatchingCall);
4583 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4584 CallClone = It->second;
4585 // Updates the call in the list.
4586 MatchingCall = CallClone;
4587 }
4588 };
4589
4590 // Keep track of the clones of callsite Node that need to be assigned to
4591 // function clones. This list may be expanded in the loop body below if we
4592 // find additional cloning is required.
4593 std::deque<ContextNode *> ClonesWorklist;
4594 // Ignore original Node if we moved all of its contexts to clones.
4595 if (!Node->emptyContextIds())
4596 ClonesWorklist.push_back(Node);
4597 llvm::append_range(ClonesWorklist, Node->Clones);
4598
4599 // Now walk through all of the clones of this callsite Node that we need,
4600 // and determine the assignment to a corresponding clone of the current
4601 // function (creating new function clones as needed).
4602 unsigned NodeCloneCount = 0;
4603 while (!ClonesWorklist.empty()) {
4604 ContextNode *Clone = ClonesWorklist.front();
4605 ClonesWorklist.pop_front();
4606 NodeCloneCount++;
4607 if (VerifyNodes)
4608 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4609
4610 // Need to create a new function clone if we have more callsite clones
4611 // than existing function clones, which would have been assigned to an
4612 // earlier clone in the list (we assign callsite clones to function
4613 // clones greedily).
4614 if (FuncCloneInfos.size() < NodeCloneCount) {
4615 // If this is the first callsite copy, assign to original function.
4616 if (NodeCloneCount == 1) {
4617 // Since FuncCloneInfos is empty in this case, no clones have
4618 // been created for this function yet, and no callers should have
4619 // been assigned a function clone for this callee node yet.
4620 assert(llvm::none_of(
4621 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4622 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4623 }));
4624 // Initialize with empty call map, assign Clone to original function
4625 // and its callers, and skip to the next clone.
4626 FuncCloneInfos.push_back(
4627 {OrigFunc, DenseMap<CallInfo, CallInfo>()});
4628 AssignCallsiteCloneToFuncClone(
4629 OrigFunc, Call, Clone,
4630 AllocationCallToContextNodeMap.count(Call));
4631 for (auto &CE : Clone->CallerEdges) {
4632 // Ignore any caller that does not have a recorded callsite Call.
4633 if (!CE->Caller->hasCall())
4634 continue;
4635 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
4636 }
4637 continue;
4638 }
4639
4640 // First locate which copy of OrigFunc to clone again. If a caller
4641 // of this callsite clone was already assigned to call a particular
4642 // function clone, we need to redirect all of those callers to the
4643 // new function clone, and update their other callees within this
4644 // function.
4645 FuncInfo PreviousAssignedFuncClone;
4646 auto EI = llvm::find_if(
4647 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4648 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4649 });
4650 bool CallerAssignedToCloneOfFunc = false;
4651 if (EI != Clone->CallerEdges.end()) {
4652 const std::shared_ptr<ContextEdge> &Edge = *EI;
4653 PreviousAssignedFuncClone =
4654 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4655 CallerAssignedToCloneOfFunc = true;
4656 }
4657
4658 // Clone function and save it along with the CallInfo map created
4659 // during cloning in the FuncCloneInfos.
4660 DenseMap<CallInfo, CallInfo> NewCallMap;
4661 unsigned CloneNo = FuncCloneInfos.size();
4662 assert(CloneNo > 0 && "Clone 0 is the original function, which "
4663 "should already exist in the map");
4664 FuncInfo NewFuncClone = cloneFunctionForCallsite(
4665 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
4666 FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
4667 FunctionClonesAnalysis++;
4668 Changed = true;
4669
4670 // If no caller callsites were already assigned to a clone of this
4671 // function, we can simply assign this clone to the new func clone
4672 // and update all callers to it, then skip to the next clone.
4673 if (!CallerAssignedToCloneOfFunc) {
4674 AssignCallsiteCloneToFuncClone(
4675 NewFuncClone, Call, Clone,
4676 AllocationCallToContextNodeMap.count(Call));
4677 for (auto &CE : Clone->CallerEdges) {
4678 // Ignore any caller that does not have a recorded callsite Call.
4679 if (!CE->Caller->hasCall())
4680 continue;
4681 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4682 }
4683 continue;
4684 }
4685
4686 // We may need to do additional node cloning in this case.
4687 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
4688 // that were previously assigned to call PreviousAssignedFuncClone,
4689 // to record that they now call NewFuncClone.
4690 // The none type edge removal may remove some of this Clone's caller
4691 // edges, if it is reached via another of its caller's callees.
4692 // Iterate over a copy and skip any that were removed.
4693 auto CallerEdges = Clone->CallerEdges;
4694 for (auto CE : CallerEdges) {
4695 // Skip any that have been removed on an earlier iteration.
4696 if (CE->isRemoved()) {
4697 assert(!is_contained(Clone->CallerEdges, CE));
4698 continue;
4699 }
4700 assert(CE);
4701 // Ignore any caller that does not have a recorded callsite Call.
4702 if (!CE->Caller->hasCall())
4703 continue;
4704
4705 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
4706 // We subsequently fall through to later handling that
4707 // will perform any additional cloning required for
4708 // callers that were calling other function clones.
4709 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
4710 PreviousAssignedFuncClone)
4711 continue;
4712
4713 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4714
4715 // If we are cloning a function that was already assigned to some
4716 // callers, then essentially we are creating new callsite clones
4717 // of the other callsites in that function that are reached by those
4718 // callers. Clone the other callees of the current callsite's caller
4719 // that were already assigned to PreviousAssignedFuncClone
4720 // accordingly. This is important since we subsequently update the
4721 // calls from the nodes in the graph and their assignments to callee
4722 // functions recorded in CallsiteToCalleeFuncCloneMap.
4723 // The none type edge removal may remove some of this caller's
4724 // callee edges, if it is reached via another of its callees.
4725 // Iterate over a copy and skip any that were removed.
4726 auto CalleeEdges = CE->Caller->CalleeEdges;
4727 for (auto CalleeEdge : CalleeEdges) {
4728 // Skip any that have been removed on an earlier iteration when
4729 // cleaning up newly None type callee edges.
4730 if (CalleeEdge->isRemoved()) {
4731 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
4732 continue;
4733 }
4734 assert(CalleeEdge);
4735 ContextNode *Callee = CalleeEdge->Callee;
4736 // Skip the current callsite; we are looking for other
4737 // callsites Caller calls. Also skip any callee that does not have a
4738 // recorded callsite Call.
4739 if (Callee == Clone || !Callee->hasCall())
4740 continue;
4741 // Skip direct recursive calls. We don't need/want to clone the
4742 // caller node again, and this loop will not behave as expected if
4743 // we tried.
4744 if (Callee == CalleeEdge->Caller)
4745 continue;
4746 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
4747 removeNoneTypeCalleeEdges(NewClone);
4748 // Moving the edge may have resulted in some none type
4749 // callee edges on the original Callee.
4750 removeNoneTypeCalleeEdges(Callee);
4751 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4752 // If the Callee node was already assigned to call a specific
4753 // function version, make sure its new clone is assigned to call
4754 // that same function clone.
4755 if (CallsiteToCalleeFuncCloneMap.count(Callee))
4756 RecordCalleeFuncOfCallsite(
4757 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
4758 // Update NewClone with the new Call clone of this callsite's Call
4759 // created for the new function clone created earlier.
4760 // Recall that we have already ensured when building the graph
4761 // that each caller can only call callsites within the same
4762 // function, so we are guaranteed that Callee Call is in the
4763 // current OrigFunc.
4764 // CallMap is set up as indexed by original Call at clone 0.
4765 CallInfo OrigCall(Callee->getOrigNode()->Call);
4766 OrigCall.setCloneNo(0);
4767 DenseMap<CallInfo, CallInfo> &CallMap =
4768 FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
4769 assert(CallMap.count(OrigCall));
4770 CallInfo NewCall(CallMap[OrigCall]);
4771 assert(NewCall);
4772 NewClone->setCall(NewCall);
4773 // Need to do the same for all matching calls.
4774 for (auto &MatchingCall : NewClone->MatchingCalls) {
4775 CallInfo OrigMatchingCall(MatchingCall);
4776 OrigMatchingCall.setCloneNo(0);
4777 assert(CallMap.count(OrigMatchingCall));
4778 CallInfo NewCall(CallMap[OrigMatchingCall]);
4779 assert(NewCall);
4780 // Updates the call in the list.
4781 MatchingCall = NewCall;
4782 }
4783 }
4784 }
4785 // Fall through to handling below to perform the recording of the
4786 // function for this callsite clone. This enables handling of cases
4787 // where the callers were assigned to different clones of a function.
4788 }
4789
4790 auto FindFirstAvailFuncClone = [&]() {
4791 // Find first function in FuncCloneInfos without an assigned
4792 // clone of this callsite Node. We should always have one
4793 // available at this point due to the earlier cloning when the
4794 // FuncCloneInfos size was smaller than the clone number.
4795 for (auto &CF : FuncCloneInfos) {
4796 if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
4797 return CF.FuncClone;
4798 }
4800 "Expected an available func clone for this callsite clone");
4801 };
4802
4803 // See if we can use existing function clone. Walk through
4804 // all caller edges to see if any have already been assigned to
4805 // a clone of this callsite's function. If we can use it, do so. If not,
4806 // because that function clone is already assigned to a different clone
4807 // of this callsite, then we need to clone again.
4808 // Basically, this checking is needed to handle the case where different
4809 // caller functions/callsites may need versions of this function
4810 // containing different mixes of callsite clones across the different
4811 // callsites within the function. If that happens, we need to create
4812 // additional function clones to handle the various combinations.
4813 //
4814 // Keep track of any new clones of this callsite created by the
4815 // following loop, as well as any existing clone that we decided to
4816 // assign this clone to.
4817 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
4818 FuncInfo FuncCloneAssignedToCurCallsiteClone;
4819 // Iterate over a copy of Clone's caller edges, since we may need to
4820 // remove edges in the moveEdgeTo* methods, and this simplifies the
4821 // handling and makes it less error-prone.
4822 auto CloneCallerEdges = Clone->CallerEdges;
4823 for (auto &Edge : CloneCallerEdges) {
4824 // Skip removed edges (due to direct recursive edges updated when
4825 // updating callee edges when moving an edge and subsequently
4826 // removed by call to removeNoneTypeCalleeEdges on the Clone).
4827 if (Edge->isRemoved())
4828 continue;
4829 // Ignore any caller that does not have a recorded callsite Call.
4830 if (!Edge->Caller->hasCall())
4831 continue;
4832 // If this caller already assigned to call a version of OrigFunc, need
4833 // to ensure we can assign this callsite clone to that function clone.
4834 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
4835 FuncInfo FuncCloneCalledByCaller =
4836 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4837 // First we need to confirm that this function clone is available
4838 // for use by this callsite node clone.
4839 //
4840 // While FuncCloneToCurNodeCloneMap is built only for this Node and
4841 // its callsite clones, one of those callsite clones X could have
4842 // been assigned to the same function clone called by Edge's caller
4843 // - if Edge's caller calls another callsite within Node's original
4844 // function, and that callsite has another caller reaching clone X.
4845 // We need to clone Node again in this case.
4846 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
4847 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
4848 Clone) ||
4849 // Detect when we have multiple callers of this callsite that
4850 // have already been assigned to specific, and different, clones
4851 // of OrigFunc (due to other unrelated callsites in Func they
4852 // reach via call contexts). Is this Clone of callsite Node
4853 // assigned to a different clone of OrigFunc? If so, clone Node
4854 // again.
4855 (FuncCloneAssignedToCurCallsiteClone &&
4856 FuncCloneAssignedToCurCallsiteClone !=
4857 FuncCloneCalledByCaller)) {
4858 // We need to use a different newly created callsite clone, in
4859 // order to assign it to another new function clone on a
4860 // subsequent iteration over the Clones array (adjusted below).
4861 // Note we specifically do not reset the
4862 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
4863 // when this new clone is processed later we know which version of
4864 // the function to copy (so that other callsite clones we have
4865 // assigned to that function clone are properly cloned over). See
4866 // comments in the function cloning handling earlier.
4867
4868 // Check if we already have cloned this callsite again while
4869 // walking through caller edges, for a caller calling the same
4870 // function clone. If so, we can move this edge to that new clone
4871 // rather than creating yet another new clone.
4872 if (FuncCloneToNewCallsiteCloneMap.count(
4873 FuncCloneCalledByCaller)) {
4874 ContextNode *NewClone =
4875 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
4876 moveEdgeToExistingCalleeClone(Edge, NewClone);
4877 // Cleanup any none type edges cloned over.
4878 removeNoneTypeCalleeEdges(NewClone);
4879 } else {
4880 // Create a new callsite clone.
4881 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4882 removeNoneTypeCalleeEdges(NewClone);
4883 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
4884 NewClone;
4885 // Add to list of clones and process later.
4886 ClonesWorklist.push_back(NewClone);
4887 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4888 }
4889 // Moving the caller edge may have resulted in some none type
4890 // callee edges.
4891 removeNoneTypeCalleeEdges(Clone);
4892 // We will handle the newly created callsite clone in a subsequent
4893 // iteration over this Node's Clones.
4894 continue;
4895 }
4896
4897 // Otherwise, we can use the function clone already assigned to this
4898 // caller.
4899 if (!FuncCloneAssignedToCurCallsiteClone) {
4900 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
4901 // Assign Clone to FuncCloneCalledByCaller
4902 AssignCallsiteCloneToFuncClone(
4903 FuncCloneCalledByCaller, Call, Clone,
4904 AllocationCallToContextNodeMap.count(Call));
4905 } else
4906 // Don't need to do anything - callsite is already calling this
4907 // function clone.
4908 assert(FuncCloneAssignedToCurCallsiteClone ==
4909 FuncCloneCalledByCaller);
4910
4911 } else {
4912 // We have not already assigned this caller to a version of
4913 // OrigFunc. Do the assignment now.
4914
4915 // First check if we have already assigned this callsite clone to a
4916 // clone of OrigFunc for another caller during this iteration over
4917 // its caller edges.
4918 if (!FuncCloneAssignedToCurCallsiteClone) {
4919 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
4920 assert(FuncCloneAssignedToCurCallsiteClone);
4921 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
4922 AssignCallsiteCloneToFuncClone(
4923 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4924 AllocationCallToContextNodeMap.count(Call));
4925 } else
4926 assert(FuncCloneToCurNodeCloneMap
4927 [FuncCloneAssignedToCurCallsiteClone] == Clone);
4928 // Update callers to record function version called.
4929 RecordCalleeFuncOfCallsite(Edge->Caller,
4930 FuncCloneAssignedToCurCallsiteClone);
4931 }
4932 }
4933 // If we didn't assign a function clone to this callsite clone yet, e.g.
4934 // none of its callers has a non-null call, do the assignment here.
4935 // We want to ensure that every callsite clone is assigned to some
4936 // function clone, so that the call updates below work as expected.
4937 // In particular if this is the original callsite, we want to ensure it
4938 // is assigned to the original function, otherwise the original function
4939 // will appear available for assignment to other callsite clones,
4940 // leading to unintended effects. For one, the unknown and not updated
4941 // callers will call into cloned paths leading to the wrong hints,
4942 // because they still call the original function (clone 0). Also,
4943 // because all callsites start out as being clone 0 by default, we can't
4944 // easily distinguish between callsites explicitly assigned to clone 0
4945 // vs those never assigned, which can lead to multiple updates of the
4946 // calls when invoking updateCall below, with mismatched clone values.
4947 // TODO: Add a flag to the callsite nodes or some other mechanism to
4948 // better distinguish and identify callsite clones that are not getting
4949 // assigned to function clones as expected.
4950 if (!FuncCloneAssignedToCurCallsiteClone) {
4951 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
4952 assert(FuncCloneAssignedToCurCallsiteClone &&
4953 "No available func clone for this callsite clone");
4954 AssignCallsiteCloneToFuncClone(
4955 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4956 /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
4957 }
4958 }
4959 if (VerifyCCG) {
4960 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
4961 for (const auto &PE : Node->CalleeEdges)
4962 checkEdge<DerivedCCG, FuncTy, CallTy>(PE);
4963 for (const auto &CE : Node->CallerEdges)
4964 checkEdge<DerivedCCG, FuncTy, CallTy>(CE);
4965 for (auto *Clone : Node->Clones) {
4966 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4967 for (const auto &PE : Clone->CalleeEdges)
4968 checkEdge<DerivedCCG, FuncTy, CallTy>(PE);
4969 for (const auto &CE : Clone->CallerEdges)
4970 checkEdge<DerivedCCG, FuncTy, CallTy>(CE);
4971 }
4972 }
4973 }
4974 }
4975
4976 uint8_t BothTypes =
4977 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
4978
4979 auto UpdateCalls = [&](ContextNode *Node,
4980 DenseSet<const ContextNode *> &Visited,
4981 auto &&UpdateCalls) {
4982 auto Inserted = Visited.insert(Node);
4983 if (!Inserted.second)
4984 return;
4985
4986 for (auto *Clone : Node->Clones)
4987 UpdateCalls(Clone, Visited, UpdateCalls);
4988
4989 for (auto &Edge : Node->CallerEdges)
4990 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
4991
4992 // Skip if either no call to update, or if we ended up with no context ids
4993 // (we moved all edges onto other clones).
4994 if (!Node->hasCall() || Node->emptyContextIds())
4995 return;
4996
4997 if (Node->IsAllocation) {
4998 auto AT = allocTypeToUse(Node->AllocTypes);
4999 // If the allocation type is ambiguous, and more aggressive hinting
5000 // has been enabled via the MinClonedColdBytePercent flag, see if this
5001 // allocation should be hinted cold anyway because the fraction of bytes
5002 // allocated cold is at least the given threshold.
5003 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
5004 !ContextIdToContextSizeInfos.empty()) {
5005 uint64_t TotalCold = 0;
5006 uint64_t Total = 0;
5007 for (auto Id : Node->getContextIds()) {
5008 auto TypeI = ContextIdToAllocationType.find(Id);
5009 assert(TypeI != ContextIdToAllocationType.end());
5010 auto CSI = ContextIdToContextSizeInfos.find(Id);
5011 if (CSI != ContextIdToContextSizeInfos.end()) {
5012 for (auto &Info : CSI->second) {
5013 Total += Info.TotalSize;
5014 if (TypeI->second == AllocationType::Cold)
5015 TotalCold += Info.TotalSize;
5016 }
5017 }
5018 }
5019 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
5020 AT = AllocationType::Cold;
5021 }
5022 updateAllocationCall(Node->Call, AT);
5023 assert(Node->MatchingCalls.empty());
5024 return;
5025 }
5026
5027 if (!CallsiteToCalleeFuncCloneMap.count(Node))
5028 return;
5029
5030 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
5031 updateCall(Node->Call, CalleeFunc);
5032 // Update all the matching calls as well.
5033 for (auto &Call : Node->MatchingCalls)
5034 updateCall(Call, CalleeFunc);
5035 };
5036
5037 // Performs DFS traversal starting from allocation nodes to update calls to
5038 // reflect cloning decisions recorded earlier. For regular LTO this will
5039 // update the actual calls in the IR to call the appropriate function clone
5040 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
5041 // are recorded in the summary entries.
5042 DenseSet<const ContextNode *> Visited;
5043 for (auto &Entry : AllocationCallToContextNodeMap)
5044 UpdateCalls(Entry.second, Visited, UpdateCalls);
5045
5046 return Changed;
5047}
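// ---- Editorial illustration (not part of the LLVM source) ----
// The greedy core of FindFirstAvailFuncClone above: function clones are
// numbered in creation order, and a callsite clone takes the first function
// clone that has not yet been given a clone of this callsite. A minimal
// standalone sketch with hypothetical bookkeeping:
#include <cassert>
#include <set>

// Returns the number of the first function clone not yet used for this
// callsite; UsedFuncClones records clone numbers already taken.
unsigned firstAvailableFuncClone(unsigned NumFuncClones,
                                 const std::set<unsigned> &UsedFuncClones) {
  for (unsigned CloneNo = 0; CloneNo < NumFuncClones; ++CloneNo)
    if (!UsedFuncClones.count(CloneNo))
      return CloneNo;
  assert(false && "expected an available function clone");
  return 0;
}
// ---- End editorial illustration ----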
5048
5049static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
5050 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
5051 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5052 &FuncToAliasMap) {
5053 // The first "clone" is the original copy, we should only call this if we
5054 // needed to create new clones.
5055 assert(NumClones > 1);
5056 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5057 VMaps.reserve(NumClones - 1);
5058 FunctionsClonedThinBackend++;
5059 for (unsigned I = 1; I < NumClones; I++) {
5060 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
5061 auto *NewF = CloneFunction(&F, *VMaps.back());
5062 FunctionClonesThinBackend++;
5063 // Strip memprof and callsite metadata from clone as they are no longer
5064 // needed.
5065 for (auto &BB : *NewF) {
5066 for (auto &Inst : BB) {
5067 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
5068 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
5069 }
5070 }
5071 std::string Name = getMemProfFuncName(F.getName(), I);
5072 auto *PrevF = M.getFunction(Name);
5073 if (PrevF) {
5074 // We might have created this when adjusting a callsite in another
5075 // function. It should be a declaration.
5076 assert(PrevF->isDeclaration());
5077 NewF->takeName(PrevF);
5078 PrevF->replaceAllUsesWith(NewF);
5079 PrevF->eraseFromParent();
5080 } else
5081 NewF->setName(Name);
5082 updateSubprogramLinkageName(NewF, Name);
5083 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5084 << "created clone " << ore::NV("NewFunction", NewF));
5085
5086 // Now handle aliases to this function, and clone those as well.
5087 if (!FuncToAliasMap.count(&F))
5088 continue;
5089 for (auto *A : FuncToAliasMap[&F]) {
5090 std::string Name = getMemProfFuncName(A->getName(), I);
5091 auto *PrevA = M.getNamedAlias(Name);
5092 auto *NewA = GlobalAlias::create(A->getValueType(),
5093 A->getType()->getPointerAddressSpace(),
5094 A->getLinkage(), Name, NewF);
5095 NewA->copyAttributesFrom(A);
5096 if (PrevA) {
5097 // We might have created this when adjusting a callsite in another
5098 // function. It should be a declaration.
5099 assert(PrevA->isDeclaration());
5100 NewA->takeName(PrevA);
5101 PrevA->replaceAllUsesWith(NewA);
5102 PrevA->eraseFromParent();
5103 }
5104 }
5105 }
5106 return VMaps;
5107}
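// ---- Editorial illustration (not part of the LLVM source) ----
// The backend cloning above derives each clone's name from the original name
// plus the clone number, and replaces any declaration that was created
// earlier (while updating a callsite in another function) with the new
// definition. The ".memprof." suffix below is an assumption for illustration;
// the real name comes from getMemProfFuncName. Registry type is hypothetical:
#include <map>
#include <string>

struct ToyFunc {
  std::string Name;
  bool IsDeclaration = false;
};

std::string cloneName(const std::string &Orig, unsigned CloneNo) {
  return Orig + ".memprof." + std::to_string(CloneNo);
}

void registerClone(std::map<std::string, ToyFunc *> &Module, ToyFunc *NewDef,
                   const std::string &Orig, unsigned CloneNo) {
  std::string Name = cloneName(Orig, CloneNo);
  auto It = Module.find(Name);
  // Take over a previously created declaration slot, otherwise add anew.
  if (It != Module.end() && It->second->IsDeclaration)
    It->second = NewDef;
  else
    Module[Name] = NewDef;
  NewDef->Name = Name;
}
// ---- End editorial illustration ----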
5108
5109// Locate the summary for F. This is complicated by the fact that it might
5110// have been internalized or promoted.
5111static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
5112 const ModuleSummaryIndex *ImportSummary,
5113 const Function *CallingFunc = nullptr) {
5114 // FIXME: Ideally we would retain the original GUID in some fashion on the
5115 // function (e.g. as metadata), but for now do our best to locate the
5116 // summary without that information.
5117 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
5118 if (!TheFnVI)
5119 // See if theFn was internalized, by checking index directly with
5120 // original name (this avoids the name adjustment done by getGUID() for
5121 // internal symbols).
5122 TheFnVI = ImportSummary->getValueInfo(
5124 if (TheFnVI)
5125 return TheFnVI;
5126 // Now query with the original name before any promotion was performed.
5127 StringRef OrigName =
5129 // When this pass is enabled, we always add thinlto_src_file provenance
5130 // metadata to imported function definitions, which allows us to recreate the
5131 // original internal symbol's GUID.
5132 auto SrcFileMD = F.getMetadata("thinlto_src_file");
5133 // If this is a call to an imported/promoted local for which we didn't import
5134 // the definition, the metadata will not exist on the declaration. However,
5135 // since we are doing this early, before any inlining in the LTO backend, we
5136 // can simply look at the metadata on the calling function which must have
5137 // been from the same module if F was an internal symbol originally.
5138 if (!SrcFileMD && F.isDeclaration()) {
5139 // We would only call this for a declaration for a direct callsite, in which
5140 // case the caller would have provided the calling function pointer.
5141 assert(CallingFunc);
5142 SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
5143 // If this is a promoted local (OrigName != F.getName()), since this is a
5144 // declaration, it must be imported from a different module and therefore we
5145 // should always find the metadata on its calling function. Any call to a
5146 // promoted local that came from this module should still be a definition.
5147 assert(SrcFileMD || OrigName == F.getName());
5148 }
5149 StringRef SrcFile = M.getSourceFileName();
5150 if (SrcFileMD)
5151 SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
5152 std::string OrigId = GlobalValue::getGlobalIdentifier(
5153 OrigName, GlobalValue::InternalLinkage, SrcFile);
5154 TheFnVI = ImportSummary->getValueInfo(
5156 // Internal func in original module may have gotten a numbered suffix if we
5157 // imported an external function with the same name. This happens
5158 // automatically during IR linking for naming conflicts. It would have to
5159 // still be internal in that case (otherwise it would have been renamed on
5160 // promotion in which case we wouldn't have a naming conflict).
5161 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5162 F.getName().contains('.')) {
5163 OrigName = F.getName().rsplit('.').first;
5164 OrigId = GlobalValue::getGlobalIdentifier(
5165 OrigName, GlobalValue::InternalLinkage, SrcFile);
5166 TheFnVI = ImportSummary->getValueInfo(
5168 }
5169 // The only way we may not have a VI is if this is a declaration created for
5170 // an imported reference. For distributed ThinLTO we may not have a VI for
5171 // such declarations in the distributed summary.
5172 assert(TheFnVI || F.isDeclaration());
5173 return TheFnVI;
5174}
5175
5176bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5177 Module &M) {
5178 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5179 Symtab = std::make_unique<InstrProfSymtab>();
5180 // Don't add canonical names, to avoid adding multiple functions to the symtab
5181 // when they both have the same root name with "." suffixes stripped.
5182 // If we pick the wrong one then this could lead to incorrect ICP and calling
5183 // a memprof clone that we don't actually create (resulting in linker unsats).
5184 // What this means is that the GUID of the function (or its PGOFuncName
5185 // metadata) *must* match that in the VP metadata to allow promotion.
5186 // In practice this should not be a limitation, since local functions should
5187 // have PGOFuncName metadata and global function names shouldn't need any
5188 // special handling (they should not get the ".llvm.*" suffix that the
5189 // canonicalization handling is attempting to strip).
5190 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5191 std::string SymtabFailure = toString(std::move(E));
5192 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5193 return false;
5194 }
5195 return true;
5196}
5197
5198#ifndef NDEBUG
5199// Sanity check that the MIB stack ids match between the summary and
5200// instruction metadata.
5201static void checkAllocContextIds(
5202 const AllocInfo &AllocNode, const MDNode *MemProfMD,
5203 const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
5204 const ModuleSummaryIndex *ImportSummary) {
5205 auto MIBIter = AllocNode.MIBs.begin();
5206 for (auto &MDOp : MemProfMD->operands()) {
5207 assert(MIBIter != AllocNode.MIBs.end());
5208 auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
5209 auto *MIBMD = cast<const MDNode>(MDOp);
5210 MDNode *StackMDNode = getMIBStackNode(MIBMD);
5211 assert(StackMDNode);
5212 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
5213 auto ContextIterBegin =
5214 StackContext.beginAfterSharedPrefix(CallsiteContext);
5215 // Skip the checking on the first iteration.
5216 uint64_t LastStackContextId =
5217 (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
5218 : 0;
5219 for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
5220 ++ContextIter) {
5221 // If this is a direct recursion, simply skip the duplicate
5222 // entries, to be consistent with how the summary ids were
5223 // generated during ModuleSummaryAnalysis.
5224 if (LastStackContextId == *ContextIter)
5225 continue;
5226 LastStackContextId = *ContextIter;
5227 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
5228 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5229 *ContextIter);
5230 StackIdIndexIter++;
5231 }
5232 MIBIter++;
5233 }
5234}
5235#endif
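// ---- Editorial illustration (not part of the LLVM source) ----
// The sanity check above lines up a metadata call stack with the summary's
// stack-id indices while collapsing immediately repeated frames (direct
// recursion) on the metadata side, mirroring how the summary ids were built.
// A minimal standalone sketch of that walk; 0 is assumed to be an impossible
// stack id here purely for initialization:
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

void checkStacksMatch(const std::vector<uint64_t> &MetadataStack,
                      const std::vector<uint64_t> &SummaryStack) {
  std::size_t SummaryIdx = 0;
  uint64_t LastId = 0;
  for (uint64_t Id : MetadataStack) {
    if (Id == LastId)
      continue; // skip duplicate frames from direct recursion
    LastId = Id;
    assert(SummaryIdx < SummaryStack.size());
    assert(SummaryStack[SummaryIdx] == Id);
    ++SummaryIdx;
  }
}
// ---- End editorial illustration ----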
5236
5237bool MemProfContextDisambiguation::applyImport(Module &M) {
5238 assert(ImportSummary);
5239 bool Changed = false;
5240
5241 // We also need to clone any aliases that reference cloned functions, because
5242 // the modified callsites may invoke via the alias. Keep track of the aliases
5243 // for each function.
5244 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5245 FuncToAliasMap;
5246 for (auto &A : M.aliases()) {
5247 auto *Aliasee = A.getAliaseeObject();
5248 if (auto *F = dyn_cast<Function>(Aliasee))
5249 FuncToAliasMap[F].insert(&A);
5250 }
5251
5252 if (!initializeIndirectCallPromotionInfo(M))
5253 return false;
5254
5255 for (auto &F : M) {
5256 if (F.isDeclaration() || isMemProfClone(F))
5257 continue;
5258
5259 OptimizationRemarkEmitter ORE(&F);
5260
5261 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5262 bool ClonesCreated = false;
5263 unsigned NumClonesCreated = 0;
5264 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
5265 // We should at least have version 0 which is the original copy.
5266 assert(NumClones > 0);
5267 // If only one copy needed use original.
5268 if (NumClones == 1)
5269 return;
5270 // If we already performed cloning of this function, confirm that the
5271 // requested number of clones matches (the thin link should ensure the
5272 // number of clones for each constituent callsite is consistent within
5273 // each function), before returning.
5274 if (ClonesCreated) {
5275 assert(NumClonesCreated == NumClones);
5276 return;
5277 }
5278 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
5279 // The first "clone" is the original copy, which doesn't have a VMap.
5280 assert(VMaps.size() == NumClones - 1);
5281 Changed = true;
5282 ClonesCreated = true;
5283 NumClonesCreated = NumClones;
5284 };
5285
5286 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5287 Function *CalledFunction) {
5288 // Perform cloning if not yet done.
5289 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
5290
5291 assert(!isMemProfClone(*CalledFunction));
5292
5293 // Because we update the cloned calls by calling setCalledOperand (see
5294 // comment below), out of an abundance of caution make sure the called
5295 // function was actually the called operand (or its aliasee). We also
5296 // strip pointer casts when looking for calls (to match behavior during
5297 // summary generation), however, with opaque pointers in theory this
5298 // should not be an issue. Note we still clone the current function
5299 // (containing this call) above, as that could be needed for its callers.
5300 auto *GA = dyn_cast_or_null<GlobalAlias>(CB->getCalledOperand());
5301 if (CalledFunction != CB->getCalledOperand() &&
5302 (!GA || CalledFunction != GA->getAliaseeObject())) {
5303 SkippedCallsCloning++;
5304 return;
5305 }
5306 // Update the calls per the summary info.
5307 // Save orig name since it gets updated in the first iteration
5308 // below.
5309 auto CalleeOrigName = CalledFunction->getName();
5310 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5311 // Do nothing if this version calls the original version of its
5312 // callee.
5313 if (!StackNode.Clones[J])
5314 continue;
5315 auto NewF = M.getOrInsertFunction(
5316 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
5317 CalledFunction->getFunctionType());
5318 CallBase *CBClone;
5319 // Copy 0 is the original function.
5320 if (!J)
5321 CBClone = CB;
5322 else
5323 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5324 // Set the called operand directly instead of calling setCalledFunction,
5325 // as the latter mutates the function type on the call. In rare cases
5326 // we may have a slightly different type on a callee function
5327 // declaration due to it being imported from a different module with
5328 // incomplete types. We really just want to change the name of the
5329 // function to the clone, and not make any type changes.
5330 CBClone->setCalledOperand(NewF.getCallee());
5331 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5332 << ore::NV("Call", CBClone) << " in clone "
5333 << ore::NV("Caller", CBClone->getFunction())
5334 << " assigned to call function clone "
5335 << ore::NV("Callee", NewF.getCallee()));
5336 }
5337 };
5338
5339 // Locate the summary for F.
5340 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
5341 // If not found, this could be an imported local (see comment in
5342 // findValueInfoForFunc). Skip for now as it will be cloned in its original
5343 // module (where it would have been promoted to global scope so should
5344 // satisfy any reference in this module).
5345 if (!TheFnVI)
5346 continue;
5347
5348 auto *GVSummary =
5349 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
5350 if (!GVSummary) {
5351 // Must have been imported, use the summary which matches the definition
5352 // (might be multiple if this was a linkonce_odr).
5353 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
5354 assert(SrcModuleMD &&
5355 "enable-import-metadata is needed to emit thinlto_src_module");
5356 StringRef SrcModule =
5357 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
5358 for (auto &GVS : TheFnVI.getSummaryList()) {
5359 if (GVS->modulePath() == SrcModule) {
5360 GVSummary = GVS.get();
5361 break;
5362 }
5363 }
5364 assert(GVSummary && GVSummary->modulePath() == SrcModule);
5365 }
5366
5367 // If this was an imported alias skip it as we won't have the function
5368 // summary, and it should be cloned in the original module.
5369 if (isa<AliasSummary>(GVSummary))
5370 continue;
5371
5372 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
5373
5374 if (FS->allocs().empty() && FS->callsites().empty())
5375 continue;
5376
5377 auto SI = FS->callsites().begin();
5378 auto AI = FS->allocs().begin();
5379
5380 // To handle callsite infos synthesized for tail calls which have missing
5381 // frames in the profiled context, map callee VI to the synthesized callsite
5382 // info.
5383 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
5384 // Iterate the callsites for this function in reverse, since we place all
5385 // those synthesized for tail calls at the end.
5386 for (auto CallsiteIt = FS->callsites().rbegin();
5387 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
5388 auto &Callsite = *CallsiteIt;
5389 // Stop as soon as we see a non-synthesized callsite info (see comment
5390 // above loop). All the entries added for discovered tail calls have empty
5391 // stack ids.
5392 if (!Callsite.StackIdIndices.empty())
5393 break;
5394 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
5395 }
5396
5397 // Keeps track of needed ICP for the function.
5398 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
5399
5400 // Assume for now that the instructions are in the exact same order
5401 // as when the summary was created, but confirm this is correct by
5402 // matching the stack ids.
5403 for (auto &BB : F) {
5404 for (auto &I : BB) {
5405 auto *CB = dyn_cast<CallBase>(&I);
5406 // Same handling as when creating module summary.
5407 if (!mayHaveMemprofSummary(CB))
5408 continue;
5409
5410 auto *CalledValue = CB->getCalledOperand();
5411 auto *CalledFunction = CB->getCalledFunction();
5412 if (CalledValue && !CalledFunction) {
5413 CalledValue = CalledValue->stripPointerCasts();
5414 // Stripping pointer casts can reveal a called function.
5415 CalledFunction = dyn_cast<Function>(CalledValue);
5416 }
5417 // Check if this is an alias to a function. If so, get the
5418 // called aliasee for the checks below.
5419 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
5420 assert(!CalledFunction &&
5421 "Expected null called function in callsite for alias");
5422 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
5423 }
5424
5425 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
5426 I.getMetadata(LLVMContext::MD_callsite));
5427 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
5428
5429 // Include allocs that were already assigned a memprof function
5430 // attribute in the statistics.
5431 if (CB->getAttributes().hasFnAttr("memprof")) {
5432 assert(!MemProfMD);
5433 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
5434 ? AllocTypeColdThinBackend++
5435 : AllocTypeNotColdThinBackend++;
5436 OrigAllocsThinBackend++;
5437 AllocVersionsThinBackend++;
5438 if (!MaxAllocVersionsThinBackend)
5439 MaxAllocVersionsThinBackend = 1;
5440 continue;
5441 }
5442
5443 if (MemProfMD) {
5444 // Consult the next alloc node.
5445 assert(AI != FS->allocs().end());
5446 auto &AllocNode = *(AI++);
5447
5448#ifndef NDEBUG
5449 checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
5450 ImportSummary);
5451#endif
5452
5453 // Perform cloning if not yet done.
5454 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
5455
5456 OrigAllocsThinBackend++;
5457 AllocVersionsThinBackend += AllocNode.Versions.size();
5458 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
5459 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
5460
5461 // If there is only one version, that means we didn't end up considering
5462 // this function for cloning, and in that case the alloc will still have
5463 // the None type or should have gotten the default NotCold type.
5464 // Skip such allocations, but only after calling the clone helper above,
5465 // since it performs sanity checks confirming we haven't yet decided that
5466 // cloning is needed. Note we might have a single version that is cold
5467 // due to the MinClonedColdBytePercent heuristic; make sure we don't skip
5468 // in that case.
5469 if (AllocNode.Versions.size() == 1 &&
5470 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
5471 assert((AllocationType)AllocNode.Versions[0] ==
5472 AllocationType::NotCold ||
5473 (AllocationType)AllocNode.Versions[0] ==
5474 AllocationType::None);
5475 UnclonableAllocsThinBackend++;
5476 continue;
5477 }
5478
5479 // All versions should have a singular allocation type.
5480 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
5481 return Type == ((uint8_t)AllocationType::NotCold |
5482 (uint8_t)AllocationType::Cold);
5483 }));
5484
5485 // Update the allocation types per the summary info.
5486 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
5487 // Ignore any that didn't get an assigned allocation type.
5488 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
5489 continue;
5490 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
5491 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
5492 : AllocTypeNotColdThinBackend++;
5493 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
5494 auto A = llvm::Attribute::get(F.getContext(), "memprof",
5495 AllocTypeString);
5496 CallBase *CBClone;
5497 // Copy 0 is the original function.
5498 if (!J)
5499 CBClone = CB;
5500 else
5501 // Since VMaps are only created for new clones, we index with
5502 // clone J-1 (J==0 is the original clone and does not have a VMaps
5503 // entry).
5504 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5505 removeAnyExistingAmbiguousAttribute(CBClone);
5506 CBClone->addFnAttr(A);
5507 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
5508 << ore::NV("AllocationCall", CBClone) << " in clone "
5509 << ore::NV("Caller", CBClone->getFunction())
5510 << " marked with memprof allocation attribute "
5511 << ore::NV("Attribute", AllocTypeString));
5512 }
5513 } else if (!CallsiteContext.empty()) {
5514 if (!CalledFunction) {
5515#ifndef NDEBUG
5516 // We should have skipped inline assembly calls.
5517 auto *CI = dyn_cast<CallInst>(CB);
5518 assert(!CI || !CI->isInlineAsm());
5519#endif
5520 // We should have skipped direct calls via a Constant.
5521 assert(CalledValue && !isa<Constant>(CalledValue));
5522
5523 // This is an indirect call, see if we have profile information and
5524 // whether any clones were recorded for the profiled targets (that
5525 // we synthesized CallsiteInfo summary records for when building the
5526 // index).
5527 auto NumClones =
5528 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
5529
5530 // Perform cloning if not yet done. This is done here in case
5531 // we don't need to do ICP, but might need to clone this
5532 // function as it is the target of other cloned calls.
5533 if (NumClones)
5534 CloneFuncIfNeeded(NumClones);
5535 }
5536
5537 else {
5538 // Consult the next callsite node.
5539 assert(SI != FS->callsites().end());
5540 auto &StackNode = *(SI++);
5541
5542#ifndef NDEBUG
5543 // Sanity check that the stack ids match between the summary and
5544 // instruction metadata.
5545 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
5546 for (auto StackId : CallsiteContext) {
5547 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
5548 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5549 StackId);
5550 StackIdIndexIter++;
5551 }
5552#endif
5553
5554 CloneCallsite(StackNode, CB, CalledFunction);
5555 }
5556 } else if (CB->isTailCall() && CalledFunction) {
5557 // Locate the synthesized callsite info for the callee VI, if any was
5558 // created, and use that for cloning.
5559 ValueInfo CalleeVI =
5560 findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
5561 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
5562 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
5563 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
5564 CloneCallsite(Callsite->second, CB, CalledFunction);
5565 }
5566 }
5567 }
5568 }
5569
5570 // Now do any promotion required for cloning.
5571 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
5572 }
5573
5574 // We skip some of the functions and instructions above, so remove all the
5575 // metadata in a single sweep here.
5576 for (auto &F : M) {
5577 // We can skip memprof clones because createFunctionClones already strips
5578 // the metadata from the newly created clones.
5579 if (F.isDeclaration() || isMemProfClone(F))
5580 continue;
5581 for (auto &BB : F) {
5582 for (auto &I : BB) {
5583 if (!isa<CallBase>(I))
5584 continue;
5585 I.setMetadata(LLVMContext::MD_memprof, nullptr);
5586 I.setMetadata(LLVMContext::MD_callsite, nullptr);
5587 }
5588 }
5589 }
5590
5591 return Changed;
5592}
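// Editor's note: the two helpers below are an illustrative sketch added for
// this documentation page and are not part of the original pass. They show how
// downstream code could observe the results applied by applyImport above:
// allocation calls carry a "memprof" function attribute (value produced by
// getAllocTypeAttributeString, e.g. "cold"/"notcold"), and redirected calls
// simply reference a callee whose name carries the memprof clone suffix. The
// helper names are hypothetical; they rely on the headers and using-directives
// already present in this file.
[[maybe_unused]] static bool exampleIsHintedCold(const CallBase &CB) {
  // Same attribute lookup pattern as the statistics code in applyImport above.
  return CB.getAttributes().hasFnAttr("memprof") &&
         CB.getAttributes().getFnAttr("memprof").getValueAsString() == "cold";
}
[[maybe_unused]] static bool exampleCallsMemProfClone(const CallBase &CB) {
  // Assumed naming convention: clones are suffixed ".memprof.<N>" (see
  // MemProfCloneSuffix and getMemProfFuncName used by CloneCallsite above).
  const Function *Callee = CB.getCalledFunction();
  return Callee && Callee->getName().contains(".memprof.");
}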
5593
5594unsigned MemProfContextDisambiguation::recordICPInfo(
5595 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
5596 ArrayRef<CallsiteInfo>::iterator &SI,
5597 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
5598 // First see if we have profile information for this indirect call.
5599 uint32_t NumCandidates;
5600 uint64_t TotalCount;
5601 auto CandidateProfileData =
5602 ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
5603 NumCandidates);
5604 if (CandidateProfileData.empty())
5605 return 0;
5606
5607 // Iterate through all of the candidate profiled targets along with the
5608 // CallsiteInfo summary records synthesized for them when building the index,
5609 // and see if any are cloned and/or refer to clones.
5610 bool ICPNeeded = false;
5611 unsigned NumClones = 0;
5612 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
5613 for (const auto &Candidate : CandidateProfileData) {
5614#ifndef NDEBUG
5615 auto CalleeValueInfo =
5616#endif
5617 ImportSummary->getValueInfo(Candidate.Value);
5618 // We might not have a ValueInfo if this is a distributed
5619 // ThinLTO backend and decided not to import that function.
5620 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
5621 assert(SI != AllCallsites.end());
5622 auto &StackNode = *(SI++);
5623 // See if any of the clones of the indirect callsite for this
5624 // profiled target should call a cloned version of the profiled
5625 // target. We only need to do the ICP here if so.
5626 ICPNeeded |= llvm::any_of(StackNode.Clones,
5627 [](unsigned CloneNo) { return CloneNo != 0; });
5628 // Every callsite in the same function should have been cloned the same
5629 // number of times.
5630 assert(!NumClones || NumClones == StackNode.Clones.size());
5631 NumClones = StackNode.Clones.size();
5632 }
5633 if (!ICPNeeded)
5634 return NumClones;
5635 // Save information for ICP, which is performed later to avoid messing up the
5636 // current function traversal.
5637 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
5638 TotalCount, CallsiteInfoStartIndex});
5639 return NumClones;
5640}
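// Editor's note (added commentary, not in the original source): recordICPInfo
// deliberately returns the per-function clone count even when ICPNeeded is
// false, so that applyImport can still clone the enclosing function (it may be
// the target of calls from other clones). It also advances SI past the
// CallsiteInfo records it consumed, keeping the iterator in sync with the
// instruction walk in applyImport.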
5641
5642void MemProfContextDisambiguation::performICP(
5643 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
5644 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
5645 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
5646 OptimizationRemarkEmitter &ORE) {
5647 // Now do any promotion required for cloning. Specifically, for each
5648 // recorded ICP candidate (which was only recorded because one clone of that
5649 // candidate should call a cloned target), we perform ICP (speculative
5650 // devirtualization) for each clone of the callsite, and update its callee
5651 // to the appropriate clone. Note that the ICP compares against the original
5652 // version of the target, which is what is in the vtable.
5653 for (auto &Info : ICallAnalysisInfo) {
5654 auto *CB = Info.CB;
5655 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
5656 auto TotalCount = Info.TotalCount;
5657 unsigned NumPromoted = 0;
5658 unsigned NumClones = 0;
5659
5660 for (auto &Candidate : Info.CandidateProfileData) {
5661 auto &StackNode = AllCallsites[CallsiteIndex++];
5662
5663 // All calls in the same function must have the same number of clones.
5664 assert(!NumClones || NumClones == StackNode.Clones.size());
5665 NumClones = StackNode.Clones.size();
5666
5667 // See if the target is in the module. If it wasn't imported, it is
5668 // possible that this profile could have been collected on a different
5669 // target (or version of the code), and we need to be conservative
5670 // (similar to what is done in the ICP pass).
5671 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
5672 if (TargetFunction == nullptr ||
5673 // Any ThinLTO global dead symbol removal should have already
5674 // occurred, so it should be safe to promote when the target is a
5675 // declaration.
5676 // TODO: Remove internal option once more fully tested.
5677 (MemProfRequireDefinitionForPromotion &&
5678 TargetFunction->isDeclaration())) {
5679 ORE.emit([&]() {
5680 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
5681 << "Memprof cannot promote indirect call: target with md5sum "
5682 << ore::NV("target md5sum", Candidate.Value) << " not found";
5683 });
5684 // FIXME: See if we can use the new declaration importing support to
5685 // at least get the declarations imported for this case. Hot indirect
5686 // targets should have been imported normally, however.
5687 continue;
5688 }
5689
5690 // Check if legal to promote
5691 const char *Reason = nullptr;
5692 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
5693 ORE.emit([&]() {
5694 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
5695 << "Memprof cannot promote indirect call to "
5696 << ore::NV("TargetFunction", TargetFunction)
5697 << " with count of " << ore::NV("TotalCount", TotalCount)
5698 << ": " << Reason;
5699 });
5700 continue;
5701 }
5702
5703 assert(!isMemProfClone(*TargetFunction));
5704
5705 // Handle each call clone, applying ICP so that each clone directly
5706 // calls the specified callee clone, guarded by the appropriate ICP
5707 // check.
5708 CallBase *CBClone = CB;
5709 for (unsigned J = 0; J < NumClones; J++) {
5710 // Copy 0 is the original function.
5711 if (J > 0)
5712 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5713 // We do the promotion using the original name, so that the comparison
5714 // is against the name in the vtable. Then just below, change the new
5715 // direct call to call the cloned function.
5716 auto &DirectCall =
5717 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
5718 TotalCount, isSamplePGO, &ORE);
5719 auto *TargetToUse = TargetFunction;
5720 // Call original if this version calls the original version of its
5721 // callee.
5722 if (StackNode.Clones[J]) {
5723 TargetToUse =
5724 cast<Function>(M.getOrInsertFunction(
5725 getMemProfFuncName(TargetFunction->getName(),
5726 StackNode.Clones[J]),
5727 TargetFunction->getFunctionType())
5728 .getCallee());
5729 }
5730 DirectCall.setCalledFunction(TargetToUse);
5731 // During matching we generate synthetic VP metadata for indirect calls
5732 // not already having any, from the memprof profile's callee GUIDs. If
5733 // we subsequently promote and inline those callees, we currently lose
5734 // the ability to generate this synthetic VP metadata. Optionally apply
5735 // a noinline attribute to promoted direct calls, where the threshold is
5736 // set to capture synthetic VP metadata targets which get a count of 1.
5737 if (MemProfICPNoInlineThreshold &&
5738 Candidate.Count < MemProfICPNoInlineThreshold)
5739 DirectCall.setIsNoInline();
5740 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5741 << ore::NV("Call", CBClone) << " in clone "
5742 << ore::NV("Caller", CBClone->getFunction())
5743 << " promoted and assigned to call function clone "
5744 << ore::NV("Callee", TargetToUse));
5745 }
5746
5747 // Update TotalCount (all clones should get same count above)
5748 TotalCount -= Candidate.Count;
5749 NumPromoted++;
5750 }
5751 // Adjust the MD.prof metadata for all clones, now that we have the new
5752 // TotalCount and the number promoted.
5753 CallBase *CBClone = CB;
5754 for (unsigned J = 0; J < NumClones; J++) {
5755 // Copy 0 is the original function.
5756 if (J > 0)
5757 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5758 // First delete the old one.
5759 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
5760 // If all promoted, we don't need the MD.prof metadata.
5761 // Otherwise we need to update it with the remaining un-promoted records.
5762 if (TotalCount != 0)
5763 annotateValueSite(
5764 M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
5765 TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
5766 }
5767 }
5768}
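// Editor's note (illustrative, not part of the original file): after
// promoteIndirectCall plus the setCalledFunction update above, each call clone
// conceptually becomes a guarded direct call of roughly this shape:
//
//   if (%callee == @target)          ; compare against the ORIGINAL symbol,
//     call @target.memprof.N(...)    ; but dispatch to the selected clone
//   else
//     call %callee(...)              ; fallback indirect call
//
// where N is StackNode.Clones[J] for that call clone (0 keeps the original),
// and the !prof metadata on the fallback is rebuilt afterward from the
// un-promoted candidates via annotateValueSite.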
5769
5770template <typename DerivedCCG, typename FuncTy, typename CallTy>
5771bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
5772 if (DumpCCG) {
5773 dbgs() << "CCG before cloning:\n";
5774 dbgs() << *this;
5775 }
5776 if (ExportToDot)
5777 exportToDot("postbuild");
5778
5779 if (VerifyCCG) {
5780 check();
5781 }
5782
5783 identifyClones();
5784
5785 if (VerifyCCG) {
5786 check();
5787 }
5788
5789 if (DumpCCG) {
5790 dbgs() << "CCG after cloning:\n";
5791 dbgs() << *this;
5792 }
5793 if (ExportToDot)
5794 exportToDot("cloned");
5795
5796 bool Changed = assignFunctions();
5797
5798 if (DumpCCG) {
5799 dbgs() << "CCG after assigning function clones:\n";
5800 dbgs() << *this;
5801 }
5802 if (ExportToDot)
5803 exportToDot("clonefuncassign");
5804
5806 printTotalSizes(errs());
5807
5808 return Changed;
5809}
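// Editor's note (illustrative, not part of the original file): the exportToDot
// calls above snapshot the context graph after graph construction
// ("postbuild"), after identifyClones ("cloned"), and after assignFunctions
// ("clonefuncassign"). A typical debugging invocation might look roughly like:
//
//   opt -passes=memprof-context-disambiguation \
//       -memprof-export-to-dot -memprof-dot-file-path-prefix=/tmp/ccg. \
//       -memprof-dump-ccg input.bc -o out.bc
//
// (flag names come from the cl::opts defined in this file; treat the exact
// command line as an approximation).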
5810
5811bool MemProfContextDisambiguation::processModule(
5812 Module &M,
5813 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
5814
5815 // If we have an import summary, then the cloning decisions were made during
5816 // the thin link on the index. Apply them and return.
5817 if (ImportSummary)
5818 return applyImport(M);
5819
5820 // TODO: If/when other types of memprof cloning are enabled beyond just for
5821 // hot and cold, we will need to change this to individually control the
5822 // AllocationType passed to addStackNodesForMIB during CCG construction.
5823 // Note that we specifically check this after applying imports above, so that
5824 // the option isn't needed to be passed to distributed ThinLTO backend
5825 // clang processes, which won't necessarily have visibility into the linker
5826 // dependences. Instead the information is communicated from the LTO link to
5827 // the backends via the combined summary index.
5828 if (!SupportsHotColdNew)
5829 return false;
5830
5831 ModuleCallsiteContextGraph CCG(M, OREGetter);
5832 return CCG.process();
5833}
5834
5835 MemProfContextDisambiguation::MemProfContextDisambiguation(
5836 const ModuleSummaryIndex *Summary, bool isSamplePGO)
5837 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
5838 // Check the dot graph printing options once here, to make sure we have valid
5839 // and expected combinations.
5840 if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
5841 llvm::report_fatal_error(
5842 "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
5843 if (DotGraphScope == DotScope::Context &&
5844 !ContextIdForDot.getNumOccurrences())
5845 llvm::report_fatal_error(
5846 "-memprof-dot-scope=context requires -memprof-dot-context-id");
5847 if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
5848 ContextIdForDot.getNumOccurrences())
5849 llvm::report_fatal_error(
5850 "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
5851 "-memprof-dot-context-id");
5852 if (ImportSummary) {
5853 // The MemProfImportSummary should only be used for testing ThinLTO
5854 // distributed backend handling via opt, in which case we don't have a
5855 // summary from the pass pipeline.
5856 assert(MemProfImportSummary.empty());
5857 return;
5858 }
5859 if (MemProfImportSummary.empty())
5860 return;
5861
5862 auto ReadSummaryFile =
5863 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
5864 if (!ReadSummaryFile) {
5865 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
5866 "Error loading file '" + MemProfImportSummary +
5867 "': ");
5868 return;
5869 }
5870 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
5871 if (!ImportSummaryForTestingOrErr) {
5872 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
5873 "Error parsing file '" + MemProfImportSummary +
5874 "': ");
5875 return;
5876 }
5877 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
5878 ImportSummary = ImportSummaryForTesting.get();
5879}
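// Editor's note (illustrative, not part of the original file): the
// MemProfImportSummary path above exists so the distributed ThinLTO backend
// handling can be exercised directly through opt, e.g. (approximate command):
//
//   opt -passes=memprof-context-disambiguation \
//       -memprof-import-summary=module.thinlto.bc module.bc -S -o out.ll
//
// where module.thinlto.bc stands for the distributed index produced by the
// thin link; in a real build the summary is instead provided by the backend
// pipeline.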
5880
5881 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
5882 ModuleAnalysisManager &AM) {
5883 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
5884 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
5885 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
5886 };
5887 if (!processModule(M, OREGetter))
5888 return PreservedAnalyses::all();
5889 return PreservedAnalyses::none();
5890}
5891
5892 void MemProfContextDisambiguation::run(
5893 ModuleSummaryIndex &Index,
5894 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
5895 isPrevailing) {
5896 // TODO: If/when other types of memprof cloning are enabled beyond just for
5897 // hot and cold, we will need to change this to individually control the
5898 // AllocationType passed to addStackNodesForMIB during CCG construction.
5899 // The index was set from the option, so these should be in sync.
5900 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
5901 if (!SupportsHotColdNew)
5902 return;
5903
5904 IndexCallsiteContextGraph CCG(Index, isPrevailing);
5905 CCG.process();
5906}