1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
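//
// Illustrative sketch (hypothetical functions, names and profile, not taken
// from an actual run): suppose B() allocates, and the allocation is cold only
// when reached from Foo():
//
//   void B()   { X = new char[10]; ... }  // MIBs: {Foo,B}=cold, {Bar,B}=notcold
//   void Foo() { B(); }
//   void Bar() { B(); }
//
// Context disambiguation clones B() (e.g. into B.memprof.1()), updates Foo()
// to call the clone, and marks the clone's allocation call with a "memprof"
// string attribute (e.g. "memprof"="cold") so that a later pass can rewrite
// it to use the hot/cold operator new interfaces when available.
//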
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/Module.h"
39#include "llvm/Pass.h"
44#include "llvm/Transforms/IPO.h"
48#include <deque>
49#include <sstream>
50#include <unordered_map>
51#include <vector>
52using namespace llvm;
53using namespace llvm::memprof;
54
55#define DEBUG_TYPE "memprof-context-disambiguation"
56
57STATISTIC(FunctionClonesAnalysis,
58 "Number of function clones created during whole program analysis");
59STATISTIC(FunctionClonesThinBackend,
60 "Number of function clones created during ThinLTO backend");
61STATISTIC(FunctionsClonedThinBackend,
62 "Number of functions that had clones created during ThinLTO backend");
63STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
64 "cloned) during whole program analysis");
65STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
66 "during whole program analysis");
67STATISTIC(AllocTypeNotColdThinBackend,
68 "Number of not cold static allocations (possibly cloned) during "
69 "ThinLTO backend");
70STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
71 "(possibly cloned) during ThinLTO backend");
72STATISTIC(OrigAllocsThinBackend,
73 "Number of original (not cloned) allocations with memprof profiles "
74 "during ThinLTO backend");
75STATISTIC(
76 AllocVersionsThinBackend,
77 "Number of allocation versions (including clones) during ThinLTO backend");
78STATISTIC(MaxAllocVersionsThinBackend,
79 "Maximum number of allocation versions created for an original "
80 "allocation during ThinLTO backend");
81STATISTIC(UnclonableAllocsThinBackend,
82 "Number of unclonable ambigous allocations during ThinLTO backend");
83STATISTIC(RemovedEdgesWithMismatchedCallees,
84 "Number of edges removed due to mismatched callees (profiled vs IR)");
85STATISTIC(FoundProfiledCalleeCount,
86 "Number of profiled callees found via tail calls");
87STATISTIC(FoundProfiledCalleeDepth,
88 "Aggregate depth of profiled callees found via tail calls");
89STATISTIC(FoundProfiledCalleeMaxDepth,
90 "Maximum depth of profiled callees found via tail calls");
91STATISTIC(FoundProfiledCalleeNonUniquelyCount,
92 "Number of profiled callees found via multiple tail call chains");
93STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
94STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
95STATISTIC(NonNewMergedNodes, "Number of non-new nodes used during merging");
96STATISTIC(MissingAllocForContextId,
97 "Number of missing alloc nodes for context ids");
98STATISTIC(SkippedCallsCloning,
99 "Number of calls skipped during cloning due to unexpected operand");
100STATISTIC(MismatchedCloneAssignments,
101 "Number of callsites assigned to call multiple non-matching clones");
102STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
103STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
104STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
105
107 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
108 cl::value_desc("filename"),
109 cl::desc("Specify the path prefix of the MemProf dot files."));
110
111static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
112 cl::Hidden,
113 cl::desc("Export graph to dot files."));
114
115// TODO: Remove this option once new handling is validated more widely.
116static cl::opt<bool> MergeIterate(
117 "memprof-merge-iteration", cl::init(true), cl::Hidden,
118 cl::desc("Iteratively apply merging on a node to catch new callers"));
119
120// How much of the graph to export to dot.
121enum DotScope {
122 All, // The full CCG graph.
123 Alloc, // Only contexts for the specified allocation.
124 Context, // Only the specified context.
125};
126
127static cl::opt<DotScope> DotGraphScope(
128 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
129 cl::Hidden, cl::init(DotScope::All),
130 cl::values(
131 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
132 clEnumValN(DotScope::Alloc, "alloc",
133 "Export only nodes with contexts feeding given "
134 "-memprof-dot-alloc-id"),
135 clEnumValN(DotScope::Context, "context",
136 "Export only nodes with given -memprof-dot-context-id")));
137
138static cl::opt<unsigned>
139 AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
140 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
141 "or to highlight if -memprof-dot-scope=all"));
142
144 "memprof-dot-context-id", cl::init(0), cl::Hidden,
145 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
146 "highlight otherwise"));
147
148static cl::opt<bool>
149 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
150 cl::desc("Dump CallingContextGraph to stdout after each stage."));
151
152static cl::opt<bool>
153 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
154 cl::desc("Perform verification checks on CallingContextGraph."));
155
156static cl::opt<bool>
157 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
158 cl::desc("Perform frequent verification checks on nodes."));
159
161 "memprof-import-summary",
162 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
163 cl::Hidden);
164
165static cl::opt<unsigned>
166 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
167 cl::Hidden,
168 cl::desc("Max depth to recursively search for missing "
169 "frames through tail calls."));
170
171// Optionally enable cloning of callsites involved in recursive cycles.
172static cl::opt<bool> AllowRecursiveCallsites(
173 "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
174 cl::desc("Allow cloning of callsites involved in recursive cycles"));
175
177 "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
178 cl::desc("Allow cloning of contexts through recursive cycles"));
179
180// Generally this is needed for correct assignment of allocation clones to
181// function clones; however, allow it to be disabled for debugging while the
182// functionality is new and being tested more widely.
183static cl::opt<bool>
184 MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
185 cl::desc("Merge clones before assigning functions"));
186
187// When disabled, try to detect and prevent cloning of recursive contexts.
188// This is only necessary until we support cloning through recursive cycles.
189// Leave on by default for now; disabling requires a little bit of compile
190// time overhead and doesn't affect correctness, but it will slightly inflate
191// the cold hinted bytes reported when -memprof-report-hinted-sizes is enabled.
192static cl::opt<bool> AllowRecursiveContexts(
193 "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
194 cl::desc("Allow cloning of contexts having recursive cycles"));
195
196// Set the minimum absolute count threshold for allowing inlining of indirect
197// calls promoted during cloning.
199 "memprof-icp-noinline-threshold", cl::init(2), cl::Hidden,
200 cl::desc("Minimum absolute count for promoted target to be inlinable"));
201
202namespace llvm {
204 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
205 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
206
207// Indicate we are linking with an allocator that supports hot/cold operator
208// new interfaces.
210 "supports-hot-cold-new", cl::init(false), cl::Hidden,
211 cl::desc("Linking with hot/cold operator new interfaces"));
212
214 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
215 cl::desc(
216 "Require target function definition when promoting indirect calls"));
217} // namespace llvm
218
219extern cl::opt<bool> MemProfReportHintedSizes;
220extern cl::opt<unsigned> MinClonedColdBytePercent;
221
222namespace {
223/// CRTP base for graphs built from either IR or ThinLTO summary index.
224///
225/// The graph represents the call contexts in all memprof metadata on allocation
226/// calls, with nodes for the allocations themselves, as well as for the calls
227/// in each context. The graph is initially built from the allocation memprof
228/// metadata (or summary) MIBs. It is then updated to match calls that have
229/// callsite metadata onto those nodes, reflecting any inlining performed on
230/// those calls.
231///
232/// Each MIB (representing an allocation's call context with allocation
233/// behavior) is assigned a unique context id during the graph build. The edges
234/// and nodes in the graph are decorated with the context ids they carry. This
235/// is used to correctly update the graph when cloning is performed so that we
236/// can uniquify the context for a single (possibly cloned) allocation.
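///
/// A minimal sketch (hypothetical stack ids and context ids) of the graph for
/// the two contexts Foo->B and Bar->B of a single allocation in B:
///
///        [Foo]           [Bar]        <- caller (stack) nodes
///          \ {1}          / {2}       <- edges labeled with context ids
///           \            /
///            [alloc in B]             <- allocation node, AllocTypes =
///                                        Cold (id 1) | NotCold (id 2)
///
/// Cloning the allocation node for context id 1 lets the cold context be
/// uniquified and hinted independently of the not-cold one.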
237template <typename DerivedCCG, typename FuncTy, typename CallTy>
238class CallsiteContextGraph {
239public:
240 CallsiteContextGraph() = default;
241 CallsiteContextGraph(const CallsiteContextGraph &) = default;
242 CallsiteContextGraph(CallsiteContextGraph &&) = default;
243
244 /// Main entry point to perform analysis and transformations on graph.
245 bool process();
246
247 /// Perform cloning on the graph necessary to uniquely identify the allocation
248 /// behavior of an allocation based on its context.
249 void identifyClones();
250
251 /// Assign callsite clones to functions, cloning functions as needed to
252 /// accommodate the combinations of their callsite clones reached by callers.
253 /// For regular LTO this clones functions and callsites in the IR, but for
254 /// ThinLTO the cloning decisions are noted in the summaries and later applied
255 /// in applyImport.
256 bool assignFunctions();
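
  /// A rough sketch of the expected ordering of the entry points above
  /// (process() is the driver that performs these steps, along with optional
  /// verification and dot export; see the definitions later in this file):
  ///
  ///   CCG.identifyClones();   // split context nodes so cold contexts are unique
  ///   CCG.assignFunctions();  // materialize (or record, for ThinLTO) the
  ///                           // function clones needed for those callsites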
257
258 void dump() const;
259 void print(raw_ostream &OS) const;
260 void printTotalSizes(raw_ostream &OS) const;
261
262 friend raw_ostream &operator<<(raw_ostream &OS,
263 const CallsiteContextGraph &CCG) {
264 CCG.print(OS);
265 return OS;
266 }
267
268 friend struct GraphTraits<
269 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
270 friend struct DOTGraphTraits<
271 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
272
273 void exportToDot(std::string Label) const;
274
275 /// Represents a function clone via FuncTy pointer and clone number pair.
276 struct FuncInfo final
277 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
278 using Base = std::pair<FuncTy *, unsigned>;
279 FuncInfo(const Base &B) : Base(B) {}
280 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
281 explicit operator bool() const { return this->first != nullptr; }
282 FuncTy *func() const { return this->first; }
283 unsigned cloneNo() const { return this->second; }
284 };
285
286 /// Represents a callsite clone via CallTy and clone number pair.
287 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
288 using Base = std::pair<CallTy, unsigned>;
289 CallInfo(const Base &B) : Base(B) {}
290 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
291 : Base(Call, CloneNo) {}
292 explicit operator bool() const { return (bool)this->first; }
293 CallTy call() const { return this->first; }
294 unsigned cloneNo() const { return this->second; }
295 void setCloneNo(unsigned N) { this->second = N; }
296 void print(raw_ostream &OS) const {
297 if (!operator bool()) {
298 assert(!cloneNo());
299 OS << "null Call";
300 return;
301 }
302 call()->print(OS);
303 OS << "\t(clone " << cloneNo() << ")";
304 }
305 void dump() const {
306 print(dbgs());
307 dbgs() << "\n";
308 }
309 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
310 Call.print(OS);
311 return OS;
312 }
313 };
314
315 struct ContextEdge;
316
317 /// Node in the Callsite Context Graph
318 struct ContextNode {
319 // Assigned to nodes as they are created, useful for debugging.
320 unsigned NodeId = 0;
321
322 // Keep this for now since in the IR case where we have an Instruction* it
323 // is not as immediately discoverable. Used for printing richer information
324 // when dumping graph.
325 bool IsAllocation;
326
327 // Keeps track of when the Call was reset to null because there was
328 // recursion.
329 bool Recursive = false;
330
331 // This will be formed by ORing together the AllocationType enum values
332 // for contexts including this node.
333 uint8_t AllocTypes = 0;
334
335 // The corresponding allocation or interior call. This is the primary call
336 // for which we have created this node.
337 CallInfo Call;
338
339 // List of other calls that can be treated the same as the primary call
340 // through cloning. I.e. located in the same function and have the same
341 // (possibly pruned) stack ids. They will be updated the same way as the
342 // primary call when assigning to function clones.
343 SmallVector<CallInfo, 0> MatchingCalls;
344
345 // For alloc nodes this is a unique id assigned when constructed, and for
346 // callsite stack nodes it is the original stack id when the node is
347 // constructed from the memprof MIB metadata on the alloc nodes. Note that
348 // this is only used when matching callsite metadata onto the stack nodes
349 // created when processing the allocation memprof MIBs, and for labeling
350 // nodes in the dot graph. Therefore we don't bother to assign a value for
351 // clones.
352 uint64_t OrigStackOrAllocId = 0;
353
354 // Edges to all callees in the profiled call stacks.
355 // TODO: Should this be a map (from Callee node) for more efficient lookup?
356 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
357
358 // Edges to all callers in the profiled call stacks.
359 // TODO: Should this be a map (from Caller node) for more efficient lookup?
360 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
361
362 // Returns true if we also need to look at the caller edges when determining
363 // the node context ids and allocation type.
364 bool useCallerEdgesForContextInfo() const {
365 // Typically if the callee edges are empty either the caller edges are
366 // also empty, or this is an allocation (leaf node). However, if we are
367 // allowing recursive callsites and contexts this will be violated for
368 // incompletely cloned recursive cycles.
369 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
370 (AllowRecursiveCallsites && AllowRecursiveContexts));
371 // When cloning for a recursive context, we might be in the midst of
372 // cloning for a recurrence and have moved context ids off of a
373 // caller edge onto the clone but not yet off of the incoming caller
374 // (back) edge. If we don't look at those we miss the fact that this node
375 // still has context ids of interest.
376 return IsAllocation || CloneRecursiveContexts;
377 }
378
379 // Compute the context ids for this node from the union of its edge context
380 // ids.
381 DenseSet<uint32_t> getContextIds() const {
382 unsigned Count = 0;
383 // Compute the number of ids for reserve below. In general we only need to
384 // look at one set of edges, typically the callee edges, since other than
385 // allocations and in some cases during recursion cloning, all the context
386 // ids on the callers should also flow out via callee edges.
387 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
388 Count += Edge->getContextIds().size();
389 DenseSet<uint32_t> ContextIds;
390 ContextIds.reserve(Count);
391 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
392 CalleeEdges, useCallerEdgesForContextInfo()
393 ? CallerEdges
394 : std::vector<std::shared_ptr<ContextEdge>>());
395 for (const auto &Edge : Edges)
396 ContextIds.insert_range(Edge->getContextIds());
397 return ContextIds;
398 }
399
400 // Compute the allocation type for this node from the OR of its edge
401 // allocation types.
402 uint8_t computeAllocType() const {
403 uint8_t BothTypes =
404 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
405 uint8_t AllocType = (uint8_t)AllocationType::None;
406 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
407 CalleeEdges, useCallerEdgesForContextInfo()
408 ? CallerEdges
409 : std::vector<std::shared_ptr<ContextEdge>>());
410 for (const auto &Edge : Edges) {
411 AllocType |= Edge->AllocTypes;
412 // Bail early if alloc type reached both, no further refinement.
413 if (AllocType == BothTypes)
414 return AllocType;
415 }
416 return AllocType;
417 }
418
419 // The context ids set for this node is empty if its edge context ids are
420 // also all empty.
421 bool emptyContextIds() const {
422 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
423 CalleeEdges, useCallerEdgesForContextInfo()
424 ? CallerEdges
425 : std::vector<std::shared_ptr<ContextEdge>>());
426 for (const auto &Edge : Edges) {
427 if (!Edge->getContextIds().empty())
428 return false;
429 }
430 return true;
431 }
432
433 // List of clones of this ContextNode, initially empty.
434 std::vector<ContextNode *> Clones;
435
436 // If a clone, points to the original uncloned node.
437 ContextNode *CloneOf = nullptr;
438
439 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
440
441 ContextNode(bool IsAllocation, CallInfo C)
442 : IsAllocation(IsAllocation), Call(C) {}
443
444 void addClone(ContextNode *Clone) {
445 if (CloneOf) {
446 CloneOf->Clones.push_back(Clone);
447 Clone->CloneOf = CloneOf;
448 } else {
449 Clones.push_back(Clone);
450 assert(!Clone->CloneOf);
451 Clone->CloneOf = this;
452 }
453 }
454
455 ContextNode *getOrigNode() {
456 if (!CloneOf)
457 return this;
458 return CloneOf;
459 }
460
461 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
462 unsigned int ContextId);
463
464 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
465 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
466 void eraseCalleeEdge(const ContextEdge *Edge);
467 void eraseCallerEdge(const ContextEdge *Edge);
468
469 void setCall(CallInfo C) { Call = C; }
470
471 bool hasCall() const { return (bool)Call.call(); }
472
473 void printCall(raw_ostream &OS) const { Call.print(OS); }
474
475 // True if this node was effectively removed from the graph, in which case
476 // it should have an allocation type of None and empty context ids.
477 bool isRemoved() const {
478 // Typically if the callee edges are empty either the caller edges are
479 // also empty, or this is an allocation (leaf node). However, if we are
480 // allowing recursive callsites and contexts this will be violated for
481 // incompletely cloned recursive cycles.
482 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
483 (AllocTypes == (uint8_t)AllocationType::None) ==
484 emptyContextIds());
485 return AllocTypes == (uint8_t)AllocationType::None;
486 }
487
488 void dump() const;
489 void print(raw_ostream &OS) const;
490
491 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
492 Node.print(OS);
493 return OS;
494 }
495 };
496
497 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
498 /// callee.
499 struct ContextEdge {
500 ContextNode *Callee;
501 ContextNode *Caller;
502
503 // This will be formed by ORing together the AllocationType enum values
504 // for contexts including this edge.
505 uint8_t AllocTypes = 0;
506
507 // Set just before initiating cloning when cloning of recursive contexts is
508 // enabled. Used to defer cloning of backedges until we have done cloning of
509 // the callee node for non-backedge caller edges. This exposes cloning
510 // opportunities through the backedge of the cycle.
511 // TODO: Note that this is not updated during cloning, and it is unclear
512 // whether that would be needed.
513 bool IsBackedge = false;
514
515 // The set of IDs for contexts including this edge.
516 DenseSet<uint32_t> ContextIds;
517
518 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
519 DenseSet<uint32_t> ContextIds)
520 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
521 ContextIds(std::move(ContextIds)) {}
522
523 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
524
525 // Helper to clear the fields of this edge when we are removing it from the
526 // graph.
527 inline void clear() {
528 ContextIds.clear();
529 AllocTypes = (uint8_t)AllocationType::None;
530 Caller = nullptr;
531 Callee = nullptr;
532 }
533
534 // Check if edge was removed from the graph. This is useful while iterating
535 // over a copy of edge lists when performing operations that mutate the
536 // graph in ways that might remove one of the edges.
537 inline bool isRemoved() const {
538 if (Callee || Caller)
539 return false;
540 // Any edges that have been removed from the graph but are still in a
541 // shared_ptr somewhere should have all fields null'ed out by clear()
542 // above.
543 assert(AllocTypes == (uint8_t)AllocationType::None);
544 assert(ContextIds.empty());
545 return true;
546 }
547
548 void dump() const;
549 void print(raw_ostream &OS) const;
550
551 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
552 Edge.print(OS);
553 return OS;
554 }
555 };
556
557 /// Helpers to remove edges that have allocation type None (due to not
558 /// carrying any context ids) after transformations.
559 void removeNoneTypeCalleeEdges(ContextNode *Node);
560 void removeNoneTypeCallerEdges(ContextNode *Node);
561 void
562 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
563 DenseSet<const ContextNode *> &Visited);
564
565protected:
566 /// Get a list of nodes corresponding to the stack ids in the given callsite
567 /// context.
568 template <class NodeT, class IteratorT>
569 std::vector<uint64_t>
570 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
571
572 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
573 /// metadata (or summary).
574 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
575
576 /// Adds nodes for the given MIB stack ids.
577 template <class NodeT, class IteratorT>
578 void addStackNodesForMIB(ContextNode *AllocNode,
579 CallStack<NodeT, IteratorT> &StackContext,
580 CallStack<NodeT, IteratorT> &CallsiteContext,
581 AllocationType AllocType,
582 ArrayRef<ContextTotalSize> ContextSizeInfo);
583
584 /// Matches all callsite metadata (or summary) to the nodes created for
585 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
586 /// inlining performed on those callsite instructions.
587 void updateStackNodes();
588
589 /// Update graph to conservatively handle any callsite stack nodes that target
590 /// multiple different callee functions.
591 void handleCallsitesWithMultipleTargets();
592
593 /// Mark backedges via the standard DFS based backedge algorithm.
594 void markBackedges();
595
596 /// Merge clones generated during cloning for different allocations but that
597 /// are called by the same caller node, to ensure proper function assignment.
598 void mergeClones();
599
600 // Try to partition calls on the given node (already placed into the AllCalls
601 // array) by callee function, creating new copies of Node as needed to hold
602 // calls with different callees, and moving the callee edges appropriately.
603 // Returns true if partitioning was successful.
604 bool partitionCallsByCallee(
605 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
606 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
607
608 /// Save lists of calls with MemProf metadata in each function, for faster
609 /// iteration.
610 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
611
612 /// Map from callsite node to the enclosing caller function.
613 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
614
615 // When exporting to dot, and an allocation id is specified, contains the
616 // context ids on that allocation.
617 DenseSet<uint32_t> DotAllocContextIds;
618
619private:
620 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
621
622 // Structure to keep track of information for each call as we are matching
623 // non-allocation callsites onto context nodes created from the allocation
624 // call metadata / summary contexts.
625 struct CallContextInfo {
626 // The callsite we're trying to match.
627 CallTy Call;
628 // The callsites stack ids that have a context node in the graph.
629 std::vector<uint64_t> StackIds;
630 // The function containing this callsite.
631 const FuncTy *Func;
632 // Initially empty, if needed this will be updated to contain the context
633 // ids for use in a new context node created for this callsite.
634 DenseSet<uint32_t> ContextIds;
635 };
636
637 /// Helper to remove edge from graph, updating edge iterator if it is provided
638 /// (in which case CalleeIter indicates which edge list is being iterated).
639 /// This will also perform the necessary clearing of the ContextEdge members
640 /// to enable later checking if the edge has been removed (since we may have
641 /// other copies of the shared_ptr in existence, and in fact rely on this to
642 /// enable removal while iterating over a copy of a node's edge list).
643 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
644 bool CalleeIter = true);
645
646 /// Assigns the given Node to calls at or inlined into the location with
647 /// the Node's stack id, after post order traversing and processing its
648 /// caller nodes. Uses the call information recorded in the given
649 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
650 /// as needed. Called by updateStackNodes which sets up the given
651 /// StackIdToMatchingCalls map.
652 void assignStackNodesPostOrder(
653 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
654 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
655 DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
656
657 /// Duplicates the given set of context ids, updating the provided
658 /// map from each original id with the newly generated context ids,
659 /// and returning the new duplicated id set.
660 DenseSet<uint32_t> duplicateContextIds(
661 const DenseSet<uint32_t> &StackSequenceContextIds,
662 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
663
664 /// Propagates all duplicated context ids across the graph.
665 void propagateDuplicateContextIds(
666 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
667
668 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
669 /// else to its callers. Also updates OrigNode's edges to remove any context
670 /// ids moved to the newly created edge.
671 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
672 bool TowardsCallee,
673 DenseSet<uint32_t> RemainingContextIds);
674
675 /// Get the stack id corresponding to the given Id or Index (for IR this will
676 /// return itself, for a summary index this will return the id recorded in the
677 /// index for that stack id index value).
678 uint64_t getStackId(uint64_t IdOrIndex) const {
679 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
680 }
681
682 /// Returns true if the given call targets the callee of the given edge, or if
683 /// we were able to identify the call chain through intermediate tail calls.
684 /// In the latter case new context nodes are added to the graph for the
685 /// identified tail calls, and their synthesized nodes are added to
686 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
687 /// the updated edges and to prepare it for an increment in the caller.
688 bool
689 calleesMatch(CallTy Call, EdgeIter &EI,
690 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
691
692 // Return the callee function of the given call, or nullptr if it can't be
693 // determined.
694 const FuncTy *getCalleeFunc(CallTy Call) {
695 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
696 }
697
698 /// Returns true if the given call targets the given function, or if we were
699 /// able to identify the call chain through intermediate tail calls (in which
700 /// case FoundCalleeChain will be populated).
701 bool calleeMatchesFunc(
702 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
703 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
704 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
705 Call, Func, CallerFunc, FoundCalleeChain);
706 }
707
708 /// Returns true if both call instructions have the same callee.
709 bool sameCallee(CallTy Call1, CallTy Call2) {
710 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
711 }
712
713 /// Get a list of nodes corresponding to the stack ids in the given
714 /// callsite's context.
715 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
716 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
717 Call);
718 }
719
720 /// Get the last stack id in the context for callsite.
721 uint64_t getLastStackId(CallTy Call) {
722 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
723 }
724
725 /// Update the allocation call to record type of allocated memory.
726 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
727 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
728 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
729 }
730
731 /// Get the AllocationType assigned to the given allocation instruction clone.
732 AllocationType getAllocationCallType(const CallInfo &Call) const {
733 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
734 }
735
736 /// Update non-allocation call to invoke (possibly cloned) function
737 /// CalleeFunc.
738 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
739 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
740 }
741
742 /// Clone the given function for the given callsite, recording mapping of all
743 /// of the functions tracked calls to their new versions in the CallMap.
744 /// Assigns new clones to clone number CloneNo.
745 FuncInfo cloneFunctionForCallsite(
746 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
747 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
748 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
749 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
750 }
751
752 /// Gets a label to use in the dot graph for the given call clone in the given
753 /// function.
754 std::string getLabel(const FuncTy *Func, const CallTy Call,
755 unsigned CloneNo) const {
756 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
757 }
758
759 // Create and return a new ContextNode.
760 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
761 CallInfo C = CallInfo()) {
762 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
763 auto *NewNode = NodeOwner.back().get();
764 if (F)
765 NodeToCallingFunc[NewNode] = F;
766 NewNode->NodeId = NodeOwner.size();
767 return NewNode;
768 }
769
770 /// Helpers to find the node corresponding to the given call or stackid.
771 ContextNode *getNodeForInst(const CallInfo &C);
772 ContextNode *getNodeForAlloc(const CallInfo &C);
773 ContextNode *getNodeForStackId(uint64_t StackId);
774
775 /// Computes the alloc type corresponding to the given context ids, by
776 /// unioning their recorded alloc types.
777 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
778
779 /// Returns the allocation type of the intersection of the contexts of two
780 /// nodes (based on their provided context id sets), optimized for the case
781 /// when Node1Ids is smaller than Node2Ids.
782 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
783 const DenseSet<uint32_t> &Node2Ids) const;
784
785 /// Returns the allocation type of the intersection of the contexts of two
786 /// nodes (based on their provided context id sets).
787 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
788 const DenseSet<uint32_t> &Node2Ids) const;
789
790 /// Create a clone of Edge's callee and move Edge to that new callee node,
791 /// performing the necessary context id and allocation type updates.
792 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
793 /// moved to an edge to the new callee.
794 ContextNode *
795 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
796 DenseSet<uint32_t> ContextIdsToMove = {});
797
798 /// Change the callee of Edge to existing callee clone NewCallee, performing
799 /// the necessary context id and allocation type updates.
800 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
801 /// moved to an edge to the new callee.
802 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
803 ContextNode *NewCallee,
804 bool NewClone = false,
805 DenseSet<uint32_t> ContextIdsToMove = {});
806
807 /// Change the caller of the edge at the given callee edge iterator to be
808 /// NewCaller, performing the necessary context id and allocation type
809 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
810 /// a simplified version of it as we always move the given edge and all of its
811 /// context ids.
812 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
813 ContextNode *NewCaller);
814
815 /// Recursive helper for marking backedges via DFS.
816 void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
817 DenseSet<const ContextNode *> &CurrentStack);
818
819 /// Recursive helper for merging clones.
820 void
821 mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
822 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
823 /// Main worker for merging callee clones for a given node.
824 void mergeNodeCalleeClones(
825 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
826 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
827 /// Helper to find other callers of the given set of callee edges that can
828 /// share the same callee merge node.
829 void findOtherCallersToShareMerge(
830 ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
831 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
832 DenseSet<ContextNode *> &OtherCallersToShareMerge);
833
834 /// Recursively perform cloning on the graph for the given Node and its
835 /// callers, in order to uniquely identify the allocation behavior of an
836 /// allocation given its context. The context ids of the allocation being
837 /// processed are given in AllocContextIds.
838 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
839 const DenseSet<uint32_t> &AllocContextIds);
840
841 /// Map from each context ID to the AllocationType assigned to that context.
842 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
843
844 /// Map from each contextID to the profiled full contexts and their total
845 /// sizes (there may be more than one due to context trimming),
846 /// optionally populated when requested (via MemProfReportHintedSizes or
847 /// MinClonedColdBytePercent).
848 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
849
850 /// Identifies the context node created for a stack id when adding the MIB
851 /// contexts to the graph. This is used to locate the context nodes when
852 /// trying to assign the corresponding callsites with those stack ids to these
853 /// nodes.
854 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
855
856 /// Maps to track the calls to their corresponding nodes in the graph.
857 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
858 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
859
860 /// Owner of all ContextNode unique_ptrs.
861 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
862
863 /// Perform sanity checks on graph when requested.
864 void check() const;
865
866 /// Keeps track of the last unique context id assigned.
867 unsigned int LastContextId = 0;
868};
869
870template <typename DerivedCCG, typename FuncTy, typename CallTy>
871using ContextNode =
872 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
873template <typename DerivedCCG, typename FuncTy, typename CallTy>
874using ContextEdge =
875 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
876template <typename DerivedCCG, typename FuncTy, typename CallTy>
877using FuncInfo =
878 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
879template <typename DerivedCCG, typename FuncTy, typename CallTy>
880using CallInfo =
881 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
882
883/// CRTP derived class for graphs built from IR (regular LTO).
884class ModuleCallsiteContextGraph
885 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
886 Instruction *> {
887public:
888 ModuleCallsiteContextGraph(
889 Module &M,
890 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
891
892private:
893 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
894 Instruction *>;
895
896 uint64_t getStackId(uint64_t IdOrIndex) const;
897 const Function *getCalleeFunc(Instruction *Call);
898 bool calleeMatchesFunc(
899 Instruction *Call, const Function *Func, const Function *CallerFunc,
900 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
901 bool sameCallee(Instruction *Call1, Instruction *Call2);
902 bool findProfiledCalleeThroughTailCalls(
903 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
904 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
905 bool &FoundMultipleCalleeChains);
906 uint64_t getLastStackId(Instruction *Call);
907 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
908 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
909 AllocationType getAllocationCallType(const CallInfo &Call) const;
910 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
911 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
912 Instruction *>::FuncInfo
913 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
914 DenseMap<CallInfo, CallInfo> &CallMap,
915 std::vector<CallInfo> &CallsWithMetadataInFunc,
916 unsigned CloneNo);
917 std::string getLabel(const Function *Func, const Instruction *Call,
918 unsigned CloneNo) const;
919
920 const Module &Mod;
921 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
922};
923
924/// Represents a call in the summary index graph, which can either be an
925/// allocation or an interior callsite node in an allocation's context.
926/// Holds a pointer to the corresponding data structure in the index.
927struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
928 IndexCall() : PointerUnion() {}
929 IndexCall(std::nullptr_t) : IndexCall() {}
930 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
931 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
932 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
933
934 IndexCall *operator->() { return this; }
935
936 void print(raw_ostream &OS) const {
937 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
938 if (auto *AI = dyn_cast_if_present<AllocInfo *>(Base)) {
939 OS << *AI;
940 } else {
941 auto *CI = dyn_cast_if_present<CallsiteInfo *>(Base);
942 assert(CI);
943 OS << *CI;
944 }
945 }
946};
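
// For illustration only (hypothetical usage, relying just on the PointerUnion
// behavior above): an IndexCall can be built from either summary record type
// and queried via the usual cast machinery, e.g.
//
//   AllocInfo *AI = ...;
//   IndexCall Call(AI);
//   if (auto *A = dyn_cast_if_present<AllocInfo *>(Call)) { /* allocation */ }
//
// The simplify_type specializations below are what allow dyn_cast and friends
// to look through the IndexCall wrapper to the underlying PointerUnion.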
947} // namespace
948
949namespace llvm {
950template <> struct simplify_type<IndexCall> {
951 using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
952 static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
953};
954template <> struct simplify_type<const IndexCall> {
955 using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
956 static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
957};
958} // namespace llvm
959
960namespace {
961/// CRTP derived class for graphs built from summary index (ThinLTO).
962class IndexCallsiteContextGraph
963 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
964 IndexCall> {
965public:
966 IndexCallsiteContextGraph(
967 ModuleSummaryIndex &Index,
968 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
969 isPrevailing);
970
971 ~IndexCallsiteContextGraph() {
972 // Now that we are done with the graph it is safe to add the new
973 // CallsiteInfo structs to the function summary vectors. The graph nodes
974 // point into locations within these vectors, so we don't want to add them
975 // any earlier.
976 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
977 auto *FS = I.first;
978 for (auto &Callsite : I.second)
979 FS->addCallsite(*Callsite.second);
980 }
981 }
982
983private:
984 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
985 IndexCall>;
986
987 uint64_t getStackId(uint64_t IdOrIndex) const;
988 const FunctionSummary *getCalleeFunc(IndexCall &Call);
989 bool calleeMatchesFunc(
990 IndexCall &Call, const FunctionSummary *Func,
991 const FunctionSummary *CallerFunc,
992 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
993 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
994 bool findProfiledCalleeThroughTailCalls(
995 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
996 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
997 bool &FoundMultipleCalleeChains);
998 uint64_t getLastStackId(IndexCall &Call);
999 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
1000 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
1001 AllocationType getAllocationCallType(const CallInfo &Call) const;
1002 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
1003 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1004 IndexCall>::FuncInfo
1005 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
1006 DenseMap<CallInfo, CallInfo> &CallMap,
1007 std::vector<CallInfo> &CallsWithMetadataInFunc,
1008 unsigned CloneNo);
1009 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
1010 unsigned CloneNo) const;
1011
1012 // Saves mapping from function summaries containing memprof records back to
1013 // their ValueInfo, for use in checking and debugging.
1014 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1015
1016 const ModuleSummaryIndex &Index;
1017 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1018 isPrevailing;
1019
1020 // Saves/owns the callsite info structures synthesized for missing tail call
1021 // frames that we discover while building the graph.
1022 // It maps from the summary of the function making the tail call, to a map
1023 // of callee ValueInfo to corresponding synthesized callsite info.
1024 std::unordered_map<FunctionSummary *,
1025 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1026 FunctionCalleesToSynthesizedCallsiteInfos;
1027};
1028} // namespace
1029
1030namespace llvm {
1031template <>
1032struct DenseMapInfo<typename CallsiteContextGraph<
1033 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
1034 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
1035template <>
1036struct DenseMapInfo<typename CallsiteContextGraph<
1037 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
1038 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
1039template <>
1040struct DenseMapInfo<IndexCall>
1041 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1042} // end namespace llvm
1043
1044namespace {
1045
1046// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1047// type we should actually use on the corresponding allocation.
1048// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1049// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1050// from NotCold.
1051AllocationType allocTypeToUse(uint8_t AllocTypes) {
1052 assert(AllocTypes != (uint8_t)AllocationType::None);
1053 if (AllocTypes ==
1054 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1055 return AllocationType::NotCold;
1056 else
1057 return (AllocationType)AllocTypes;
1058}
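
// For example (restating the mapping above): allocTypeToUse(Cold) is Cold,
// allocTypeToUse(NotCold) is NotCold, and allocTypeToUse(NotCold|Cold) is
// NotCold, since an ambiguous allocation that we give up on cloning is
// treated as not cold.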
1059
1060// Helper to check if the alloc types for all edges recorded in the
1061// InAllocTypes vector match the alloc types for all edges in the Edges
1062// vector.
1063template <typename DerivedCCG, typename FuncTy, typename CallTy>
1064bool allocTypesMatch(
1065 const std::vector<uint8_t> &InAllocTypes,
1066 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1067 &Edges) {
1068 // This should be called only when the InAllocTypes vector was computed for
1069 // this set of Edges. Make sure the sizes are the same.
1070 assert(InAllocTypes.size() == Edges.size());
1071 return std::equal(
1072 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1073 [](const uint8_t &l,
1074 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1075 // Can share if one of the edges is None type - don't
1076 // care about the type along that edge as it doesn't
1077 // exist for those context ids.
1078 if (l == (uint8_t)AllocationType::None ||
1079 r->AllocTypes == (uint8_t)AllocationType::None)
1080 return true;
1081 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
1082 });
1083}
1084
1085// Helper to check if the alloc types for all edges recorded in the
1086// InAllocTypes vector match the alloc types for callee edges in the given
1087// clone. Because the InAllocTypes were computed from the original node's callee
1088// edges, and other cloning could have happened after this clone was created, we
1089// need to find the matching clone callee edge, which may or may not exist.
1090template <typename DerivedCCG, typename FuncTy, typename CallTy>
1091bool allocTypesMatchClone(
1092 const std::vector<uint8_t> &InAllocTypes,
1093 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1094 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1095 assert(Node);
1096 // InAllocTypes should have been computed for the original node's callee
1097 // edges.
1098 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1099 // First create a map of the clone callee edge callees to the edge alloc type.
1100 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
1101 EdgeCalleeMap;
1102 for (const auto &E : Clone->CalleeEdges) {
1103 assert(!EdgeCalleeMap.contains(E->Callee));
1104 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1105 }
1106 // Next, walk the original node's callees, and look for the corresponding
1107 // clone edge to that callee.
1108 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1109 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1110 // Not found is ok, we will simply add an edge if we use this clone.
1111 if (Iter == EdgeCalleeMap.end())
1112 continue;
1113 // Can share if one of the edges is None type - don't
1114 // care about the type along that edge as it doesn't
1115 // exist for those context ids.
1116 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1117 Iter->second == (uint8_t)AllocationType::None)
1118 continue;
1119 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1120 return false;
1121 }
1122 return true;
1123}
1124
1125} // end anonymous namespace
1126
1127template <typename DerivedCCG, typename FuncTy, typename CallTy>
1128typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1129CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1130 const CallInfo &C) {
1131 ContextNode *Node = getNodeForAlloc(C);
1132 if (Node)
1133 return Node;
1134
1135 return NonAllocationCallToContextNodeMap.lookup(C);
1136}
1137
1138template <typename DerivedCCG, typename FuncTy, typename CallTy>
1139typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1140CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1141 const CallInfo &C) {
1142 return AllocationCallToContextNodeMap.lookup(C);
1143}
1144
1145template <typename DerivedCCG, typename FuncTy, typename CallTy>
1146typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1147CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1148 uint64_t StackId) {
1149 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1150 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1151 return StackEntryNode->second;
1152 return nullptr;
1153}
1154
1155template <typename DerivedCCG, typename FuncTy, typename CallTy>
1156void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1157 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1158 unsigned int ContextId) {
1159 for (auto &Edge : CallerEdges) {
1160 if (Edge->Caller == Caller) {
1161 Edge->AllocTypes |= (uint8_t)AllocType;
1162 Edge->getContextIds().insert(ContextId);
1163 return;
1164 }
1165 }
1166 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1167 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1168 CallerEdges.push_back(Edge);
1169 Caller->CalleeEdges.push_back(Edge);
1170}
1171
1172template <typename DerivedCCG, typename FuncTy, typename CallTy>
1173void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1174 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1175 assert(!EI || (*EI)->get() == Edge);
1176 assert(!Edge->isRemoved());
1177 // Save the Caller and Callee pointers so we can erase Edge from their edge
1178 // lists after clearing Edge below. We do the clearing first in case it is
1179 // destructed after removing from the edge lists (if those were the last
1180 // shared_ptr references to Edge).
1181 auto *Callee = Edge->Callee;
1182 auto *Caller = Edge->Caller;
1183
1184 // Make sure the edge fields are cleared out so we can properly detect
1185 // removed edges if Edge is not destructed because there is still a shared_ptr
1186 // reference.
1187 Edge->clear();
1188
1189#ifndef NDEBUG
1190 auto CalleeCallerCount = Callee->CallerEdges.size();
1191 auto CallerCalleeCount = Caller->CalleeEdges.size();
1192#endif
1193 if (!EI) {
1194 Callee->eraseCallerEdge(Edge);
1195 Caller->eraseCalleeEdge(Edge);
1196 } else if (CalleeIter) {
1197 Callee->eraseCallerEdge(Edge);
1198 *EI = Caller->CalleeEdges.erase(*EI);
1199 } else {
1200 Caller->eraseCalleeEdge(Edge);
1201 *EI = Callee->CallerEdges.erase(*EI);
1202 }
1203 assert(Callee->CallerEdges.size() < CalleeCallerCount);
1204 assert(Caller->CalleeEdges.size() < CallerCalleeCount);
1205}
1206
1207template <typename DerivedCCG, typename FuncTy, typename CallTy>
1208void CallsiteContextGraph<
1209 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1210 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1211 auto Edge = *EI;
1212 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1213 assert(Edge->ContextIds.empty());
1214 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1215 } else
1216 ++EI;
1217 }
1218}
1219
1220template <typename DerivedCCG, typename FuncTy, typename CallTy>
1221void CallsiteContextGraph<
1222 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1223 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1224 auto Edge = *EI;
1225 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1226 assert(Edge->ContextIds.empty());
1227 Edge->Caller->eraseCalleeEdge(Edge.get());
1228 EI = Node->CallerEdges.erase(EI);
1229 } else
1230 ++EI;
1231 }
1232}
1233
1234template <typename DerivedCCG, typename FuncTy, typename CallTy>
1235typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1236CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1237 findEdgeFromCallee(const ContextNode *Callee) {
1238 for (const auto &Edge : CalleeEdges)
1239 if (Edge->Callee == Callee)
1240 return Edge.get();
1241 return nullptr;
1242}
1243
1244template <typename DerivedCCG, typename FuncTy, typename CallTy>
1245typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1246CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1247 findEdgeFromCaller(const ContextNode *Caller) {
1248 for (const auto &Edge : CallerEdges)
1249 if (Edge->Caller == Caller)
1250 return Edge.get();
1251 return nullptr;
1252}
1253
1254template <typename DerivedCCG, typename FuncTy, typename CallTy>
1255void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1256 eraseCalleeEdge(const ContextEdge *Edge) {
1257 auto EI = llvm::find_if(
1258 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1259 return CalleeEdge.get() == Edge;
1260 });
1261 assert(EI != CalleeEdges.end());
1262 CalleeEdges.erase(EI);
1263}
1264
1265template <typename DerivedCCG, typename FuncTy, typename CallTy>
1266void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1267 eraseCallerEdge(const ContextEdge *Edge) {
1268 auto EI = llvm::find_if(
1269 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1270 return CallerEdge.get() == Edge;
1271 });
1272 assert(EI != CallerEdges.end());
1273 CallerEdges.erase(EI);
1274}
1275
1276template <typename DerivedCCG, typename FuncTy, typename CallTy>
1277uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1278 DenseSet<uint32_t> &ContextIds) const {
1279 uint8_t BothTypes =
1280 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1281 uint8_t AllocType = (uint8_t)AllocationType::None;
1282 for (auto Id : ContextIds) {
1283 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1284 // Bail early if alloc type reached both, no further refinement.
1285 if (AllocType == BothTypes)
1286 return AllocType;
1287 }
1288 return AllocType;
1289}
1290
1291template <typename DerivedCCG, typename FuncTy, typename CallTy>
1292uint8_t
1293CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1294 const DenseSet<uint32_t> &Node1Ids,
1295 const DenseSet<uint32_t> &Node2Ids) const {
1296 uint8_t BothTypes =
1297 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1298 uint8_t AllocType = (uint8_t)AllocationType::None;
1299 for (auto Id : Node1Ids) {
1300 if (!Node2Ids.count(Id))
1301 continue;
1302 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1303 // Bail early if alloc type reached both, no further refinement.
1304 if (AllocType == BothTypes)
1305 return AllocType;
1306 }
1307 return AllocType;
1308}
1309
1310template <typename DerivedCCG, typename FuncTy, typename CallTy>
1311uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1312 const DenseSet<uint32_t> &Node1Ids,
1313 const DenseSet<uint32_t> &Node2Ids) const {
1314 if (Node1Ids.size() < Node2Ids.size())
1315 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1316 else
1317 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1318}
1319
1320template <typename DerivedCCG, typename FuncTy, typename CallTy>
1321typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1322CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1323 CallInfo Call, const FuncTy *F) {
1324 assert(!getNodeForAlloc(Call));
1325 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1326 AllocationCallToContextNodeMap[Call] = AllocNode;
1327 // Use LastContextId as a unique id for MIB allocation nodes.
1328 AllocNode->OrigStackOrAllocId = LastContextId;
1329 // Alloc type should be updated as we add in the MIBs. We should assert
1330 // afterwards that it is not still None.
1331 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1332
1333 return AllocNode;
1334}
1335
1336static std::string getAllocTypeString(uint8_t AllocTypes) {
1337 if (!AllocTypes)
1338 return "None";
1339 std::string Str;
1340 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1341 Str += "NotCold";
1342 if (AllocTypes & (uint8_t)AllocationType::Cold)
1343 Str += "Cold";
1344 return Str;
1345}
1346
1347template <typename DerivedCCG, typename FuncTy, typename CallTy>
1348template <class NodeT, class IteratorT>
1349void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1350 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1351 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1352 ArrayRef<ContextTotalSize> ContextSizeInfo) {
1353 // Treating the hot alloc type as NotCold before the disambiguation for "hot"
1354 // is done.
1355 if (AllocType == AllocationType::Hot)
1356 AllocType = AllocationType::NotCold;
1357
1358 ContextIdToAllocationType[++LastContextId] = AllocType;
1359
1360 if (!ContextSizeInfo.empty()) {
1361 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1362 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1363 }
1364
1365 // Update alloc type and context ids for this MIB.
1366 AllocNode->AllocTypes |= (uint8_t)AllocType;
1367
1368 // Now add or update nodes for each stack id in alloc's context.
1369 // Later when processing the stack ids on non-alloc callsites we will adjust
1370 // for any inlining in the context.
1371 ContextNode *PrevNode = AllocNode;
1372 // Look for recursion (direct recursion should have been collapsed by
1373 // module summary analysis, here we should just be detecting mutual
1374 // recursion). Mark these nodes so we don't try to clone.
1375 SmallSet<uint64_t, 8> StackIdSet;
1376 // Skip any stack ids already on the allocation call itself (due to inlining).
1377 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1378 ContextIter != StackContext.end(); ++ContextIter) {
1379 auto StackId = getStackId(*ContextIter);
1380 ContextNode *StackNode = getNodeForStackId(StackId);
1381 if (!StackNode) {
1382 StackNode = createNewNode(/*IsAllocation=*/false);
1383 StackEntryIdToContextNodeMap[StackId] = StackNode;
1384 StackNode->OrigStackOrAllocId = StackId;
1385 }
1386 // Marking a node recursive will prevent its cloning completely, even for
1387 // non-recursive contexts flowing through it.
1388 if (!AllowRecursiveCallsites) {
1389 auto Ins = StackIdSet.insert(StackId);
1390 if (!Ins.second)
1391 StackNode->Recursive = true;
1392 }
1393 StackNode->AllocTypes |= (uint8_t)AllocType;
1394 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1395 PrevNode = StackNode;
1396 }
1397}
1398
1399template <typename DerivedCCG, typename FuncTy, typename CallTy>
1400DenseSet<uint32_t>
1401CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1402 const DenseSet<uint32_t> &StackSequenceContextIds,
1403 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1404 DenseSet<uint32_t> NewContextIds;
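 // For each original id, allocate a fresh id, record the old->new mapping so
 // it can later be propagated along caller edges, and copy the allocation type.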
1405 for (auto OldId : StackSequenceContextIds) {
1406 NewContextIds.insert(++LastContextId);
1407 OldToNewContextIds[OldId].insert(LastContextId);
1408 assert(ContextIdToAllocationType.count(OldId));
1409 // The new context has the same allocation type as the original.
1410 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1411 if (DotAllocContextIds.contains(OldId))
1412 DotAllocContextIds.insert(LastContextId);
1413 }
1414 return NewContextIds;
1415}
1416
1417template <typename DerivedCCG, typename FuncTy, typename CallTy>
1418void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1419 propagateDuplicateContextIds(
1420 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1421 // Build a set of duplicated context ids corresponding to the input id set.
1422 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1423 DenseSet<uint32_t> NewIds;
1424 for (auto Id : ContextIds)
1425 if (auto NewId = OldToNewContextIds.find(Id);
1426 NewId != OldToNewContextIds.end())
1427 NewIds.insert_range(NewId->second);
1428 return NewIds;
1429 };
1430
1431 // Recursively update context ids sets along caller edges.
1432 auto UpdateCallers = [&](ContextNode *Node,
1433 DenseSet<const ContextEdge *> &Visited,
1434 auto &&UpdateCallers) -> void {
1435 for (const auto &Edge : Node->CallerEdges) {
1436 auto Inserted = Visited.insert(Edge.get());
1437 if (!Inserted.second)
1438 continue;
1439 ContextNode *NextNode = Edge->Caller;
1440 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1441 // Only need to recursively iterate to NextNode via this caller edge if
1442 // it resulted in any added ids to NextNode.
1443 if (!NewIdsToAdd.empty()) {
1444 Edge->getContextIds().insert_range(NewIdsToAdd);
1445 UpdateCallers(NextNode, Visited, UpdateCallers);
1446 }
1447 }
1448 };
1449
1450 DenseSet<const ContextEdge *> Visited;
1451 for (auto &Entry : AllocationCallToContextNodeMap) {
1452 auto *Node = Entry.second;
1453 UpdateCallers(Node, Visited, UpdateCallers);
1454 }
1455}
1456
1457template <typename DerivedCCG, typename FuncTy, typename CallTy>
1458void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1459 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1460 // This must be passed by value to make a copy since it will be adjusted
1461 // as ids are moved.
1462 DenseSet<uint32_t> RemainingContextIds) {
1463 auto &OrigEdges =
1464 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1465 DenseSet<uint32_t> RecursiveContextIds;
1466 DenseSet<uint32_t> AllCallerContextIds;
1467 if (AllowRecursiveCallsites) {
1468 // Identify which context ids are recursive, which is needed to properly
1469 // update the RemainingContextIds set. The relevant recursive context ids
1470 // are those that appear in multiple edges.
1471 for (auto &CE : OrigEdges) {
1472 AllCallerContextIds.reserve(CE->getContextIds().size());
1473 for (auto Id : CE->getContextIds())
1474 if (!AllCallerContextIds.insert(Id).second)
1475 RecursiveContextIds.insert(Id);
1476 }
1477 }
1478 // Increment iterator in loop so that we can remove edges as needed.
1479 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1480 auto Edge = *EI;
1481 DenseSet<uint32_t> NewEdgeContextIds;
1482 DenseSet<uint32_t> NotFoundContextIds;
1483 // Remove any matching context ids from Edge, returning the set that were
1484 // found and removed; these become the new edge's context ids. Also update
1485 // the remaining (not found) ids.
1486 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1487 NotFoundContextIds);
1488 // Update the remaining context ids set for the later edges. This is a
1489 // compile time optimization.
1490 if (RecursiveContextIds.empty()) {
1491 // No recursive ids, so all of the previously remaining context ids that
1492 // were not seen on this edge are the new remaining set.
1493 RemainingContextIds.swap(NotFoundContextIds);
1494 } else {
1495 // Keep the recursive ids in the remaining set as we expect to see those
1496 // on another edge. We can remove the non-recursive remaining ids that
1497 // were seen on this edge, however. We already have the set of remaining
1498 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1499 // non-recursive and only remove those. Note that despite the higher
1500 // overhead of updating the remaining context ids set when recursion
1501 // handling is enabled, it was found to be at worst performance neutral
1502 // and in one case a clear win.
1503 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1504 set_difference(NewEdgeContextIds, RecursiveContextIds);
1505 set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
1506 }
1507 // If no matching context ids for this edge, skip it.
1508 if (NewEdgeContextIds.empty()) {
1509 ++EI;
1510 continue;
1511 }
1512 if (TowardsCallee) {
1513 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1514 auto NewEdge = std::make_shared<ContextEdge>(
1515 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1516 NewNode->CalleeEdges.push_back(NewEdge);
1517 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1518 } else {
1519 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1520 auto NewEdge = std::make_shared<ContextEdge>(
1521 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1522 NewNode->CallerEdges.push_back(NewEdge);
1523 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1524 }
1525 // Remove old edge if context ids empty.
1526 if (Edge->getContextIds().empty()) {
1527 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1528 continue;
1529 }
1530 ++EI;
1531 }
1532}
1533
1534template <typename DerivedCCG, typename FuncTy, typename CallTy>
1535static void checkEdge(
1536 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1537 // Confirm that alloc type is not None and that we have at least one context
1538 // id.
1539 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1540 assert(!Edge->ContextIds.empty());
1541}
1542
1543template <typename DerivedCCG, typename FuncTy, typename CallTy>
1544static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1545 bool CheckEdges = true) {
1546 if (Node->isRemoved())
1547 return;
1548#ifndef NDEBUG
1549 // Compute node's context ids once for use in asserts.
1550 auto NodeContextIds = Node->getContextIds();
1551#endif
1552 // Node's context ids should be the union of both its callee and caller edge
1553 // context ids.
1554 if (Node->CallerEdges.size()) {
1555 DenseSet<uint32_t> CallerEdgeContextIds(
1556 Node->CallerEdges.front()->ContextIds);
1557 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1558 if (CheckEdges)
1559 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1560 set_union(CallerEdgeContextIds, Edge->ContextIds);
1561 }
1562 // Node can have more context ids than callers if some contexts terminate at
1563 // node and some are longer. If we are allowing recursive callsites and
1564 // contexts this will be violated for incompletely cloned recursive cycles,
1565 // so skip the checking in that case.
1566 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1567 NodeContextIds == CallerEdgeContextIds ||
1568 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1569 }
1570 if (Node->CalleeEdges.size()) {
1571 DenseSet<uint32_t> CalleeEdgeContextIds(
1572 Node->CalleeEdges.front()->ContextIds);
1573 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1574 if (CheckEdges)
1575 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1576 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1577 }
1578 // If we are allowing recursive callsites and contexts this will be violated
1579 // for incompletely cloned recursive cycles, so skip the checking in that
1580 // case.
1581 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1582 NodeContextIds == CalleeEdgeContextIds);
1583 }
1584 // FIXME: Since this checking is only invoked under an option, we should
1585 // change the error checking from using assert to something that will trigger
1586 // an error on a release build.
1587#ifndef NDEBUG
1588 // Make sure we don't end up with duplicate edges between the same caller and
1589 // callee.
1590 DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1591 for (const auto &E : Node->CalleeEdges)
1592 NodeSet.insert(E->Callee);
1593 assert(NodeSet.size() == Node->CalleeEdges.size());
1594#endif
1595}
1596
1597template <typename DerivedCCG, typename FuncTy, typename CallTy>
1598void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1599 assignStackNodesPostOrder(
1600 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1601 DenseMap<uint64_t, std::vector<CallContextInfo>>
1602 &StackIdToMatchingCalls,
1603 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1604 auto Inserted = Visited.insert(Node);
1605 if (!Inserted.second)
1606 return;
1607 // Post order traversal. Iterate over a copy since we may add nodes and
1608 // therefore new callers during the recursive call, invalidating any
1609 // iterator over the original edge vector. We don't need to process these
1610 // new nodes as they were already processed on creation.
1611 auto CallerEdges = Node->CallerEdges;
1612 for (auto &Edge : CallerEdges) {
1613 // Skip any that have been removed during the recursion.
1614 if (Edge->isRemoved()) {
1615 assert(!is_contained(Node->CallerEdges, Edge));
1616 continue;
1617 }
1618 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1619 CallToMatchingCall);
1620 }
1621
1622 // If this node's stack id is in the map, update the graph to contain new
1623 // nodes representing any inlining at interior callsites. Note we move the
1624 // associated context ids over to the new nodes.
1625
1626 // Ignore this node if it is for an allocation or we didn't record any
1627 // stack id lists ending at it.
1628 if (Node->IsAllocation ||
1629 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1630 return;
1631
1632 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1633 // Handle the simple case first. A single call with a single stack id.
1634 // In this case there is no need to create any new context nodes, simply
1635 // assign the context node for stack id to this Call.
1636 if (Calls.size() == 1) {
1637 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1638 if (Ids.size() == 1) {
1639 assert(SavedContextIds.empty());
1640 // It should be this Node
1641 assert(Node == getNodeForStackId(Ids[0]));
1642 if (Node->Recursive)
1643 return;
1644 Node->setCall(Call);
1645 NonAllocationCallToContextNodeMap[Call] = Node;
1646 NodeToCallingFunc[Node] = Func;
1647 return;
1648 }
1649 }
1650
1651#ifndef NDEBUG
1652 // Find the node for the last stack id, which should be the same
1653 // across all calls recorded for this id, and is this node's id.
1654 uint64_t LastId = Node->OrigStackOrAllocId;
1655 ContextNode *LastNode = getNodeForStackId(LastId);
1656 // We should only have kept stack ids that had nodes.
1657 assert(LastNode);
1658 assert(LastNode == Node);
1659#else
1660 ContextNode *LastNode = Node;
1661#endif
1662
1663 // Compute the last node's context ids once, as it is shared by all calls in
1664 // this entry.
1665 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1666
1667 [[maybe_unused]] bool PrevIterCreatedNode = false;
1668 bool CreatedNode = false;
1669 for (unsigned I = 0; I < Calls.size();
1670 I++, PrevIterCreatedNode = CreatedNode) {
1671 CreatedNode = false;
1672 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1673 // Skip any for which we didn't assign any ids, these don't get a node in
1674 // the graph.
1675 if (SavedContextIds.empty()) {
1676 // If this call has a matching call (located in the same function and
1677 // having the same stack ids), simply add it to the context node created
1678 // for its matching call earlier. These can be treated the same through
1679 // cloning and get updated at the same time.
1680 if (!CallToMatchingCall.contains(Call))
1681 continue;
1682 auto MatchingCall = CallToMatchingCall[Call];
1683 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1684 // This should only happen if we had a prior iteration, and it didn't
1685 // create a node because of the below recomputation of context ids
1686 // finding none remaining and continuing early.
1687 assert(I > 0 && !PrevIterCreatedNode);
1688 continue;
1689 }
1690 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1691 Call);
1692 continue;
1693 }
1694
1695 assert(LastId == Ids.back());
1696
1697 // Recompute the context ids for this stack id sequence (the
1698 // intersection of the context ids of the corresponding nodes).
1699 // Start with the ids we saved in the map for this call, which could be
1700 // duplicated context ids. We have to recompute because the saved context
1701 // ids for different last nodes may overlap, and some may already have been
1702 // removed during the post order traversal.
1703 set_intersect(SavedContextIds, LastNodeContextIds);
1704 ContextNode *PrevNode = LastNode;
1705 bool Skip = false;
1706 // Iterate backwards through the stack Ids, starting after the last Id
1707 // in the list, which was handled once outside for all Calls.
1708 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1709 auto Id = *IdIter;
1710 ContextNode *CurNode = getNodeForStackId(Id);
1711 // We should only have kept stack ids that had nodes and weren't
1712 // recursive.
1713 assert(CurNode);
1714 assert(!CurNode->Recursive);
1715
1716 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1717 if (!Edge) {
1718 Skip = true;
1719 break;
1720 }
1721 PrevNode = CurNode;
1722
1723 // Update the context ids, which is the intersection of the ids along
1724 // all edges in the sequence.
1725 set_intersect(SavedContextIds, Edge->getContextIds());
1726
1727 // If we now have no context ids for clone, skip this call.
1728 if (SavedContextIds.empty()) {
1729 Skip = true;
1730 break;
1731 }
1732 }
1733 if (Skip)
1734 continue;
1735
1736 // Create new context node.
1737 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1738 NonAllocationCallToContextNodeMap[Call] = NewNode;
1739 CreatedNode = true;
1740 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1741
1742 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1743 assert(FirstNode);
1744
1745 // Connect to callees of innermost stack frame in inlined call chain.
1746 // This updates context ids for FirstNode's callees to reflect those
1747 // moved to NewNode.
1748 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1749
1750 // Connect to callers of outermost stack frame in inlined call chain.
1751 // This updates context ids for LastNode's callers to reflect those
1752 // moved to NewNode.
1753 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1754
1755 // Now we need to remove context ids from edges/nodes between First and
1756 // Last Node.
1757 PrevNode = nullptr;
1758 for (auto Id : Ids) {
1759 ContextNode *CurNode = getNodeForStackId(Id);
1760 // We should only have kept stack ids that had nodes.
1761 assert(CurNode);
1762
1763 // Remove the context ids moved to NewNode from CurNode, and the
1764 // edge from the prior node.
1765 if (PrevNode) {
1766 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1767 // If the sequence contained recursion, we might have already removed
1768 // some edges during the connectNewNode calls above.
1769 if (!PrevEdge) {
1770 PrevNode = CurNode;
1771 continue;
1772 }
1773 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1774 if (PrevEdge->getContextIds().empty())
1775 removeEdgeFromGraph(PrevEdge);
1776 }
1777 // Since we update the edges from leaf to tail, only look at the callee
1778 // edges. This isn't an alloc node, so if there are no callee edges, the
1779 // alloc type is None.
1780 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1781 ? (uint8_t)AllocationType::None
1782 : CurNode->computeAllocType();
1783 PrevNode = CurNode;
1784 }
1785 if (VerifyNodes) {
1786 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1787 for (auto Id : Ids) {
1788 ContextNode *CurNode = getNodeForStackId(Id);
1789 // We should only have kept stack ids that had nodes.
1790 assert(CurNode);
1791 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1792 }
1793 }
1794 }
1795}
1796
1797template <typename DerivedCCG, typename FuncTy, typename CallTy>
1798void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1799 // Map of stack id to all calls with that as the last (outermost caller)
1800 // callsite id that has a context node (some might not due to pruning
1801 // performed during matching of the allocation profile contexts).
1802 // The CallContextInfo contains the Call and a list of its stack ids with
1803 // ContextNodes, the function containing Call, and the set of context ids
1804 // the analysis will eventually identify for use in any new node created
1805 // for that callsite.
1806 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1807 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1808 for (auto &Call : CallsWithMetadata) {
1809 // Ignore allocations, already handled.
1810 if (AllocationCallToContextNodeMap.count(Call))
1811 continue;
1812 auto StackIdsWithContextNodes =
1813 getStackIdsWithContextNodesForCall(Call.call());
1814 // If there were no nodes created for MIBs on allocs (maybe this was in
1815 // the unambiguous part of the MIB stack that was pruned), ignore.
1816 if (StackIdsWithContextNodes.empty())
1817 continue;
1818 // Otherwise, record this Call along with the list of ids for the last
1819 // (outermost caller) stack id with a node.
1820 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1821 {Call.call(), StackIdsWithContextNodes, Func, {}});
1822 }
1823 }
1824
1825 // First make a pass through all stack ids that correspond to a call,
1826 // as identified in the above loop. Compute the context ids corresponding to
1827 // each of these calls when they correspond to multiple stack ids due to
1828 // inlining. Perform any duplication of context ids required when
1829 // there is more than one call with the same stack ids. Their (possibly newly
1830 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1831 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1832 // Save a map from each call to any that are found to match it. I.e. located
1833 // in the same function and have the same (possibly pruned) stack ids. We use
1834 // this to avoid creating extra graph nodes as they can be treated the same.
1835 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1836 for (auto &It : StackIdToMatchingCalls) {
1837 auto &Calls = It.getSecond();
1838 // Skip single calls with a single stack id. These don't need a new node.
1839 if (Calls.size() == 1) {
1840 auto &Ids = Calls[0].StackIds;
1841 if (Ids.size() == 1)
1842 continue;
1843 }
1844 // In order to do the best and maximal matching of inlined calls to context
1845 // node sequences we will sort the vectors of stack ids in descending order
1846 // of length, and within each length, lexicographically by stack id. The
1847 // latter is so that we can specially handle calls that have identical stack
1848 // id sequences (either due to cloning or artificially because of the MIB
1849 // context pruning). Those with the same Ids are then sorted by function to
1850 // facilitate efficiently mapping them to the same context node.
1851 // Because the functions are pointers, to ensure a stable sort first assign
1852 // each function pointer to its first index in the Calls array, and then use
1853 // that to sort by.
1854 DenseMap<const FuncTy *, unsigned> FuncToIndex;
1855 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1856 FuncToIndex.insert({CallCtxInfo.Func, Idx});
1857 llvm::stable_sort(
1858 Calls,
1859 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1860 return A.StackIds.size() > B.StackIds.size() ||
1861 (A.StackIds.size() == B.StackIds.size() &&
1862 (A.StackIds < B.StackIds ||
1863 (A.StackIds == B.StackIds &&
1864 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1865 });
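 // For example, the stack id sequence {10, 20, 30} sorts before {10, 20}
 // (longer sequences first), {10, 20} sorts before {10, 30} (lexicographic
 // within the same length), and calls with identical sequences end up
 // adjacent, ordered by the first index of their containing function in Calls.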
1866
1867 // Find the node for the last stack id, which should be the same
1868 // across all calls recorded for this id, and is the id for this
1869 // entry in the StackIdToMatchingCalls map.
1870 uint64_t LastId = It.getFirst();
1871 ContextNode *LastNode = getNodeForStackId(LastId);
1872 // We should only have kept stack ids that had nodes.
1873 assert(LastNode);
1874
1875 if (LastNode->Recursive)
1876 continue;
1877
1878 // Initialize the context ids with the last node's. We will subsequently
1879 // refine the context ids by computing the intersection along all edges.
1880 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1881 assert(!LastNodeContextIds.empty());
1882
1883#ifndef NDEBUG
1884 // Save the set of functions seen for a particular set of the same stack
1885 // ids. This is used to ensure that they have been correctly sorted to be
1886 // adjacent in the Calls list, since we rely on that to efficiently place
1887 // all such matching calls onto the same context node.
1888 DenseSet<const FuncTy *> MatchingIdsFuncSet;
1889#endif
1890
1891 for (unsigned I = 0; I < Calls.size(); I++) {
1892 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1893 assert(SavedContextIds.empty());
1894 assert(LastId == Ids.back());
1895
1896#ifndef NDEBUG
1897 // If this call has a different set of ids than the last one, clear the
1898 // set used to ensure they are sorted properly.
1899 if (I > 0 && Ids != Calls[I - 1].StackIds)
1900 MatchingIdsFuncSet.clear();
1901#endif
1902
1903 // First compute the context ids for this stack id sequence (the
1904 // intersection of the context ids of the corresponding nodes).
1905 // Start with the remaining saved ids for the last node.
1906 assert(!LastNodeContextIds.empty());
1907 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1908
1909 ContextNode *PrevNode = LastNode;
1910 ContextNode *CurNode = LastNode;
1911 bool Skip = false;
1912
1913 // Iterate backwards through the stack Ids, starting after the last Id
1914 // in the list, which was handled once outside for all Calls.
1915 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1916 auto Id = *IdIter;
1917 CurNode = getNodeForStackId(Id);
1918 // We should only have kept stack ids that had nodes.
1919 assert(CurNode);
1920
1921 if (CurNode->Recursive) {
1922 Skip = true;
1923 break;
1924 }
1925
1926 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1927 // If there is no edge then the nodes belong to different MIB contexts,
1928 // and we should skip this inlined context sequence. For example, this
1929 // particular inlined context may include stack ids A->B, and we may
1930 // indeed have nodes for both A and B, but it is possible that they were
1931 // never profiled in sequence in a single MIB for any allocation (i.e.
1932 // we might have profiled an allocation that involves the callsite A,
1933 // but through a different one of its callee callsites, and we might
1934 // have profiled an allocation that involves callsite B, but reached
1935 // from a different caller callsite).
1936 if (!Edge) {
1937 Skip = true;
1938 break;
1939 }
1940 PrevNode = CurNode;
1941
1942 // Update the context ids, which is the intersection of the ids along
1943 // all edges in the sequence.
1944 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1945
1946 // If we now have no context ids for clone, skip this call.
1947 if (StackSequenceContextIds.empty()) {
1948 Skip = true;
1949 break;
1950 }
1951 }
1952 if (Skip)
1953 continue;
1954
1955 // If some of this call's stack ids did not have corresponding nodes (due
1956 // to pruning), don't include any context ids for contexts that extend
1957 // beyond these nodes. Otherwise we would be matching part of unrelated /
1958 // not fully matching stack contexts. To do this, subtract any context ids
1959 // found in caller nodes of the last node found above.
1960 if (Ids.back() != getLastStackId(Call)) {
1961 for (const auto &PE : LastNode->CallerEdges) {
1962 set_subtract(StackSequenceContextIds, PE->getContextIds());
1963 if (StackSequenceContextIds.empty())
1964 break;
1965 }
1966 // If we now have no context ids for clone, skip this call.
1967 if (StackSequenceContextIds.empty())
1968 continue;
1969 }
1970
1971#ifndef NDEBUG
1972 // If the prior call had the same stack ids this set would not be empty.
1973 // Check if we already have a call that "matches" because it is located
1974 // in the same function. If the Calls list was sorted properly we should
1975 // not encounter this situation as all such entries should be adjacent
1976 // and processed in bulk further below.
1977 assert(!MatchingIdsFuncSet.contains(Func));
1978
1979 MatchingIdsFuncSet.insert(Func);
1980#endif
1981
1982 // Check if the next set of stack ids is the same (since the Calls vector
1983 // of tuples is sorted by the stack ids we can just look at the next one).
1984 // If so, save them in the CallToMatchingCall map so that they get
1985 // assigned to the same context node, and skip them.
1986 bool DuplicateContextIds = false;
1987 for (unsigned J = I + 1; J < Calls.size(); J++) {
1988 auto &CallCtxInfo = Calls[J];
1989 auto &NextIds = CallCtxInfo.StackIds;
1990 if (NextIds != Ids)
1991 break;
1992 auto *NextFunc = CallCtxInfo.Func;
1993 if (NextFunc != Func) {
1994 // We have another Call with the same ids but that cannot share this
1995 // node, must duplicate ids for it.
1996 DuplicateContextIds = true;
1997 break;
1998 }
1999 auto &NextCall = CallCtxInfo.Call;
2000 CallToMatchingCall[NextCall] = Call;
2001 // Update I so that it gets incremented correctly to skip this call.
2002 I = J;
2003 }
2004
2005 // If we don't have duplicate context ids, then we can assign all the
2006 // context ids computed for the original node sequence to this call.
2007 // If there are duplicate calls with the same stack ids then we synthesize
2008 // new context ids that are duplicates of the originals. These are
2009 // assigned to SavedContextIds, which is a reference into the map entry
2010 // for this call, allowing us to access these ids later on.
2011 OldToNewContextIds.reserve(OldToNewContextIds.size() +
2012 StackSequenceContextIds.size());
2013 SavedContextIds =
2014 DuplicateContextIds
2015 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2016 : StackSequenceContextIds;
2017 assert(!SavedContextIds.empty());
2018
2019 if (!DuplicateContextIds) {
2020 // Update saved last node's context ids to remove those that are
2021 // assigned to other calls, so that it is ready for the next call at
2022 // this stack id.
2023 set_subtract(LastNodeContextIds, StackSequenceContextIds);
2024 if (LastNodeContextIds.empty())
2025 break;
2026 }
2027 }
2028 }
2029
2030 // Propagate the duplicate context ids over the graph.
2031 propagateDuplicateContextIds(OldToNewContextIds);
2032
2033 if (VerifyCCG)
2034 check();
2035
2036 // Now perform a post-order traversal over the graph, starting with the
2037 // allocation nodes, essentially processing nodes from callers to callees.
2038 // For any that contains an id in the map, update the graph to contain new
2039 // nodes representing any inlining at interior callsites. Note we move the
2040 // associated context ids over to the new nodes.
2041 DenseSet<const ContextNode *> Visited;
2042 for (auto &Entry : AllocationCallToContextNodeMap)
2043 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
2044 CallToMatchingCall);
2045 if (VerifyCCG)
2046 check();
2047}
2048
2049uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2050 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2051 Call->getMetadata(LLVMContext::MD_callsite));
2052 return CallsiteContext.back();
2053}
2054
2055uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2057 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2058 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2059 // Need to convert index into stack id.
2060 return Index.getStackIdAtIndex(CallsiteContext.back());
2061}
2062
2063static const std::string MemProfCloneSuffix = ".memprof.";
2064
2065static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2066 // We use CloneNo == 0 to refer to the original version, which doesn't get
2067 // renamed with a suffix.
2068 if (!CloneNo)
2069 return Base.str();
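 // Otherwise append the suffix and the clone number, e.g. "foo" with
 // CloneNo == 2 becomes "foo.memprof.2".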
2070 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2071}
2072
2073static bool isMemProfClone(const Function &F) {
2074 return F.getName().contains(MemProfCloneSuffix);
2075}
2076
2077// Return the clone number of the given function by extracting it from the
2078// memprof suffix. Assumes the caller has already confirmed it is a memprof
2079// clone.
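// For example, "foo.memprof.2" yields 2.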
2080static unsigned getMemProfCloneNum(const Function &F) {
2081 assert(isMemProfClone(F));
2082 auto Pos = F.getName().find_last_of('.');
2083 assert(Pos > 0);
2084 unsigned CloneNo;
2085 bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
2086 assert(!Err);
2087 (void)Err;
2088 return CloneNo;
2089}
2090
2091std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2092 const Instruction *Call,
2093 unsigned CloneNo) const {
2094 return (Twine(Call->getFunction()->getName()) + " -> " +
2095 cast<CallBase>(Call)->getCalledFunction()->getName())
2096 .str();
2097}
2098
2099std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2100 const IndexCall &Call,
2101 unsigned CloneNo) const {
2102 auto VI = FSToVIMap.find(Func);
2103 assert(VI != FSToVIMap.end());
2104 std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo);
2105 if (isa<AllocInfo *>(Call))
2106 return CallerName + " -> alloc";
2107 else {
2108 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
2109 return CallerName + " -> " +
2110 getMemProfFuncName(Callsite->Callee.name(),
2111 Callsite->Clones[CloneNo]);
2112 }
2113}
2114
2115std::vector<uint64_t>
2116ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2117 Instruction *Call) {
2118 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2119 Call->getMetadata(LLVMContext::MD_callsite));
2120 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2121 CallsiteContext);
2122}
2123
2124std::vector<uint64_t>
2125IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2127 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2128 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2129 return getStackIdsWithContextNodes<CallsiteInfo,
2130 SmallVector<unsigned>::const_iterator>(
2131 CallsiteContext);
2132}
2133
2134template <typename DerivedCCG, typename FuncTy, typename CallTy>
2135template <class NodeT, class IteratorT>
2136std::vector<uint64_t>
2137CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2138 CallStack<NodeT, IteratorT> &CallsiteContext) {
2139 std::vector<uint64_t> StackIds;
2140 for (auto IdOrIndex : CallsiteContext) {
2141 auto StackId = getStackId(IdOrIndex);
2142 ContextNode *Node = getNodeForStackId(StackId);
2143 if (!Node)
2144 break;
2145 StackIds.push_back(StackId);
2146 }
2147 return StackIds;
2148}
2149
2150ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2151 Module &M,
2152 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2153 : Mod(M), OREGetter(OREGetter) {
2154 for (auto &F : M) {
2155 std::vector<CallInfo> CallsWithMetadata;
2156 for (auto &BB : F) {
2157 for (auto &I : BB) {
2158 if (!isa<CallBase>(I))
2159 continue;
2160 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
2161 CallsWithMetadata.push_back(&I);
2162 auto *AllocNode = addAllocNode(&I, &F);
2163 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
2164 assert(CallsiteMD);
2165 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2166 // Add all of the MIBs and their stack nodes.
2167 for (auto &MDOp : MemProfMD->operands()) {
2168 auto *MIBMD = cast<const MDNode>(MDOp);
2169 std::vector<ContextTotalSize> ContextSizeInfo;
2170 // Collect the context size information if it exists.
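 // Each extra operand is expected to be a two-operand MDNode pairing the
 // context's full stack id with its total size.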
2171 if (MIBMD->getNumOperands() > 2) {
2172 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2173 MDNode *ContextSizePair =
2174 dyn_cast<MDNode>(MIBMD->getOperand(I));
2175 assert(ContextSizePair->getNumOperands() == 2);
2176 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2177 ContextSizePair->getOperand(0))
2178 ->getZExtValue();
2179 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2180 ContextSizePair->getOperand(1))
2181 ->getZExtValue();
2182 ContextSizeInfo.push_back({FullStackId, TotalSize});
2183 }
2184 }
2185 MDNode *StackNode = getMIBStackNode(MIBMD);
2186 assert(StackNode);
2187 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2188 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2189 AllocNode, StackContext, CallsiteContext,
2190 getMIBAllocType(MIBMD), ContextSizeInfo);
2191 }
2192 // If exporting the graph to dot and an allocation id of interest was
2193 // specified, record all the context ids for this allocation node.
2194 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2195 DotAllocContextIds = AllocNode->getContextIds();
2196 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2197 // Memprof and callsite metadata on memory allocations are no longer
2198 // needed.
2199 I.setMetadata(LLVMContext::MD_memprof, nullptr);
2200 I.setMetadata(LLVMContext::MD_callsite, nullptr);
2201 }
2202 // For callsite metadata, add to list for this function for later use.
2203 else if (I.getMetadata(LLVMContext::MD_callsite)) {
2204 CallsWithMetadata.push_back(&I);
2205 }
2206 }
2207 }
2208 if (!CallsWithMetadata.empty())
2209 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2210 }
2211
2212 if (DumpCCG) {
2213 dbgs() << "CCG before updating call stack chains:\n";
2214 dbgs() << *this;
2215 }
2216
2217 if (ExportToDot)
2218 exportToDot("prestackupdate");
2219
2220 updateStackNodes();
2221
2222 if (ExportToDot)
2223 exportToDot("poststackupdate");
2224
2225 handleCallsitesWithMultipleTargets();
2226
2227 markBackedges();
2228
2229 // Strip off remaining callsite metadata, no longer needed.
2230 for (auto &FuncEntry : FuncToCallsWithMetadata)
2231 for (auto &Call : FuncEntry.second)
2232 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2233}
2234
2235IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2236 ModuleSummaryIndex &Index,
2237 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2238 isPrevailing)
2239 : Index(Index), isPrevailing(isPrevailing) {
2240 for (auto &I : Index) {
2241 auto VI = Index.getValueInfo(I);
2242 for (auto &S : VI.getSummaryList()) {
2243 // We should only add the prevailing nodes. Otherwise we may try to clone
2244 // in a weak copy that won't be linked (and may be different from the
2245 // prevailing version).
2246 // We only keep the memprof summary on the prevailing copy now when
2247 // building the combined index, as a space optimization; however, don't
2248 // rely on this optimization. The linker doesn't resolve local linkage
2249 // values so don't check whether those are prevailing.
2250 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2251 !isPrevailing(VI.getGUID(), S.get()))
2252 continue;
2253 auto *FS = dyn_cast<FunctionSummary>(S.get());
2254 if (!FS)
2255 continue;
2256 std::vector<CallInfo> CallsWithMetadata;
2257 if (!FS->allocs().empty()) {
2258 for (auto &AN : FS->mutableAllocs()) {
2259 // This can happen because of recursion elimination handling that
2260 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2261 // We still added them to the summary because we need to be able to
2262 // correlate properly in applyImport in the backends.
2263 if (AN.MIBs.empty())
2264 continue;
2265 IndexCall AllocCall(&AN);
2266 CallsWithMetadata.push_back(AllocCall);
2267 auto *AllocNode = addAllocNode(AllocCall, FS);
2268 // Pass an empty CallStack to the CallsiteContext (second)
2269 // parameter, since for ThinLTO we already collapsed out the inlined
2270 // stack ids on the allocation call during ModuleSummaryAnalysis.
2271 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2272 EmptyContext;
2273 unsigned I = 0;
2274 assert(AN.ContextSizeInfos.empty() ||
2275 AN.ContextSizeInfos.size() == AN.MIBs.size());
2276 // Now add all of the MIBs and their stack nodes.
2277 for (auto &MIB : AN.MIBs) {
2278 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2279 StackContext(&MIB);
2280 std::vector<ContextTotalSize> ContextSizeInfo;
2281 if (!AN.ContextSizeInfos.empty()) {
2282 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2283 ContextSizeInfo.push_back({FullStackId, TotalSize});
2284 }
2285 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2286 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2287 ContextSizeInfo);
2288 I++;
2289 }
2290 // If exporting the graph to dot and an allocation id of interest was
2291 // specified, record all the context ids for this allocation node.
2292 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2293 DotAllocContextIds = AllocNode->getContextIds();
2294 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2295 // Initialize version 0 on the summary alloc node to the current alloc
2296 // type, unless it has both types, in which case make it default so that,
2297 // in the case where we aren't able to clone, the original version always
2298 // ends up with the default allocation behavior.
2299 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2300 }
2301 }
2302 // For callsite metadata, add to list for this function for later use.
2303 if (!FS->callsites().empty())
2304 for (auto &SN : FS->mutableCallsites()) {
2305 IndexCall StackNodeCall(&SN);
2306 CallsWithMetadata.push_back(StackNodeCall);
2307 }
2308
2309 if (!CallsWithMetadata.empty())
2310 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2311
2312 if (!FS->allocs().empty() || !FS->callsites().empty())
2313 FSToVIMap[FS] = VI;
2314 }
2315 }
2316
2317 if (DumpCCG) {
2318 dbgs() << "CCG before updating call stack chains:\n";
2319 dbgs() << *this;
2320 }
2321
2322 if (ExportToDot)
2323 exportToDot("prestackupdate");
2324
2325 updateStackNodes();
2326
2327 if (ExportToDot)
2328 exportToDot("poststackupdate");
2329
2330 handleCallsitesWithMultipleTargets();
2331
2332 markBackedges();
2333}
2334
2335template <typename DerivedCCG, typename FuncTy, typename CallTy>
2336void CallsiteContextGraph<DerivedCCG, FuncTy,
2337 CallTy>::handleCallsitesWithMultipleTargets() {
2338 // Look for and workaround callsites that call multiple functions.
2339 // This can happen for indirect calls, which need better handling, and in
2340 // rarer cases (e.g. macro expansion).
2341 // TODO: To fix this for indirect calls we will want to perform speculative
2342 // devirtualization using either the normal PGO info with ICP, or using the
2343 // information in the profiled MemProf contexts. We can do this prior to
2344 // this transformation for regular LTO, and for ThinLTO we can simulate that
2345 // effect in the summary and perform the actual speculative devirtualization
2346 // while cloning in the ThinLTO backend.
2347
2348 // Keep track of the new nodes synthesized for discovered tail calls missing
2349 // from the profiled contexts.
2350 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2351
2352 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2353 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2354 auto *Node = Entry.second;
2355 assert(Node->Clones.empty());
2356 // Check all node callees and see if in the same function.
2357 // We need to check all of the calls recorded in this Node, because in some
2358 // cases we may have had multiple calls with the same debug info calling
2359 // different callees. This can happen, for example, when an object is
2360 // constructed in the parameter list - the destructor call of the object has
2361 // the same debug info (line/col) as the call the object was passed to.
2362 // Here we will prune any that don't match all callee nodes.
2363 std::vector<CallInfo> AllCalls;
2364 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2365 AllCalls.push_back(Node->Call);
2366 llvm::append_range(AllCalls, Node->MatchingCalls);
2367
2368 // First see if we can partition the calls by callee function, creating new
2369 // nodes to host each set of calls calling the same callees. This is
2370 // necessary to support indirect calls with ThinLTO, for which we
2371 // synthesized CallsiteInfo records for each target. They will all have the
2372 // same callsite stack ids and would be sharing a context node at this
2373 // point. We need to perform separate cloning for each, which will be
2374 // applied along with speculative devirtualization in the ThinLTO backends
2375 // as needed. Note this does not currently support looking through tail
2376 // calls; it is unclear if we need that for indirect call targets.
2377 // First partition calls by callee func. Map indexed by func, value is
2378 // struct with list of matching calls, assigned node.
2379 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2380 continue;
2381
2382 auto It = AllCalls.begin();
2383 // Iterate through the calls until we find the first that matches.
2384 for (; It != AllCalls.end(); ++It) {
2385 auto ThisCall = *It;
2386 bool Match = true;
2387 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2388 ++EI) {
2389 auto Edge = *EI;
2390 if (!Edge->Callee->hasCall())
2391 continue;
2392 assert(NodeToCallingFunc.count(Edge->Callee));
2393 // Check if the called function matches that of the callee node.
2394 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2395 Match = false;
2396 break;
2397 }
2398 }
2399 // Found a call that matches the callee nodes, we can quit now.
2400 if (Match) {
2401 // If the first match is not the primary call on the Node, update it
2402 // now. We will update the list of matching calls further below.
2403 if (Node->Call != ThisCall) {
2404 Node->setCall(ThisCall);
2405 // We need to update the NonAllocationCallToContextNodeMap, but don't
2406 // want to do this during iteration over that map, so save the calls
2407 // that need updated entries.
2408 NewCallToNode.push_back({ThisCall, Node});
2409 }
2410 break;
2411 }
2412 }
2413 // We will update this list below (or leave it cleared if there was no
2414 // match found above).
2415 Node->MatchingCalls.clear();
2416 // If we hit the end of the AllCalls vector, no call matching the callee
2417 // nodes was found, clear the call information in the node.
2418 if (It == AllCalls.end()) {
2419 RemovedEdgesWithMismatchedCallees++;
2420 // Work around by setting Node to have a null call, so it gets
2421 // skipped during cloning. Otherwise assignFunctions will assert
2422 // because its data structures are not designed to handle this case.
2423 Node->setCall(CallInfo());
2424 continue;
2425 }
2426 // Now add back any matching calls that call the same function as the
2427 // matching primary call on Node.
2428 for (++It; It != AllCalls.end(); ++It) {
2429 auto ThisCall = *It;
2430 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2431 continue;
2432 Node->MatchingCalls.push_back(ThisCall);
2433 }
2434 }
2435
2436 // Remove all mismatched nodes identified in the above loop from the node map
2437 // (checking whether they have a null call which is set above). For a
2438 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2439 // to do the removal via remove_if than by individually erasing entries above.
2440 // Also remove any entries if we updated the node's primary call above.
2441 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2442 return !it.second->hasCall() || it.second->Call != it.first;
2443 });
2444
2445 // Add entries for any new primary calls recorded above.
2446 for (auto &[Call, Node] : NewCallToNode)
2447 NonAllocationCallToContextNodeMap[Call] = Node;
2448
2449 // Add the new nodes after the above loop so that the iteration is not
2450 // invalidated.
2451 for (auto &[Call, Node] : TailCallToContextNodeMap)
2452 NonAllocationCallToContextNodeMap[Call] = Node;
2453}
2454
2455template <typename DerivedCCG, typename FuncTy, typename CallTy>
2456bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2457 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2458 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2459 // Struct to keep track of all the calls having the same callee function,
2460 // and the node we eventually assign to them. Eventually we will record the
2461 // context node assigned to this group of calls.
2462 struct CallsWithSameCallee {
2463 std::vector<CallInfo> Calls;
2464 ContextNode *Node = nullptr;
2465 };
2466
2467 // First partition calls by callee function. Build map from each function
2468 // to the list of matching calls.
2469 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2470 for (auto ThisCall : AllCalls) {
2471 auto *F = getCalleeFunc(ThisCall.call());
2472 if (F)
2473 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2474 }
2475
2476 // Next, walk through all callee edges. For each callee node, get its
2477 // containing function and see if it was recorded in the above map (meaning we
2478 // have at least one matching call). Build another map from each callee node
2479 // with a matching call to the structure instance created above containing all
2480 // the calls.
2481 DenseMap<const ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2482 for (const auto &Edge : Node->CalleeEdges) {
2483 if (!Edge->Callee->hasCall())
2484 continue;
2485 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2486 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2487 CalleeNodeToCallInfo[Edge->Callee] =
2488 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2489 }
2490
2491 // If there are no entries in the second map, then there were no matching
2492 // calls/callees; nothing to do here. Return so we can go to the handling that
2493 // looks through tail calls.
2494 if (CalleeNodeToCallInfo.empty())
2495 return false;
2496
2497 // Walk through all callee edges again. Any and all callee edges that didn't
2498 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2499 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2500 // ignored during cloning. If it is in the map, then we use the node recorded
2501 // in that entry (creating it if needed), and move the callee edge to it.
2502 // The first callee will use the original node instead of creating a new one.
2503 // Note that any of the original calls on this node (in AllCalls) that didn't
2504 // have a callee function automatically get dropped from the node as part of
2505 // this process.
2506 ContextNode *UnmatchedCalleesNode = nullptr;
2507 // Track whether we already assigned original node to a callee.
2508 bool UsedOrigNode = false;
2509 assert(NodeToCallingFunc[Node]);
2510 // Iterate over a copy of Node's callee edges, since we may need to remove
2511 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2512 // makes it less error-prone.
2513 auto CalleeEdges = Node->CalleeEdges;
2514 for (auto &Edge : CalleeEdges) {
2515 if (!Edge->Callee->hasCall())
2516 continue;
2517
2518 // Will be updated below to point to whatever (caller) node this callee edge
2519 // should be moved to.
2520 ContextNode *CallerNodeToUse = nullptr;
2521
2522 // Handle the case where there were no matching calls first. Move this
2523 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2524 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2525 if (!UnmatchedCalleesNode)
2526 UnmatchedCalleesNode =
2527 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2528 CallerNodeToUse = UnmatchedCalleesNode;
2529 } else {
2530 // Look up the information recorded for this callee node, and use the
2531 // recorded caller node (creating it if needed).
2532 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2533 if (!Info->Node) {
2534 // If we haven't assigned any callees to the original node use it.
2535 if (!UsedOrigNode) {
2536 Info->Node = Node;
2537 // Clear the set of matching calls which will be updated below.
2538 Node->MatchingCalls.clear();
2539 UsedOrigNode = true;
2540 } else
2541 Info->Node =
2542 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2543 assert(!Info->Calls.empty());
2544 // The first call becomes the primary call for this caller node, and the
2545 // rest go in the matching calls list.
2546 Info->Node->setCall(Info->Calls.front());
2547 llvm::append_range(Info->Node->MatchingCalls,
2548 llvm::drop_begin(Info->Calls));
2549 // Save the primary call to node correspondence so that we can update
2550 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2551 // caller of this function.
2552 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2553 }
2554 CallerNodeToUse = Info->Node;
2555 }
2556
2557 // Don't need to move the edge if we are using the original node.
2558 if (CallerNodeToUse == Node)
2559 continue;
2560
2561 moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2562 }
2563 // Now that we are done moving edges, clean up any caller edges that ended
2564 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2565 // caller edges from Node are replicated onto the new callers, and it
2566 // simplifies the handling to leave them until we have moved all
2567 // edges/context ids.
2568 for (auto &I : CalleeNodeToCallInfo)
2569 removeNoneTypeCallerEdges(I.second->Node);
2570 if (UnmatchedCalleesNode)
2571 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2572 removeNoneTypeCallerEdges(Node);
2573
2574 return true;
2575}
2576
2577uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2578 // In the Module (IR) case this is already the Id.
2579 return IdOrIndex;
2580}
2581
2582uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2583 // In the Index case this is an index into the stack id list in the summary
2584 // index, convert it to an Id.
2585 return Index.getStackIdAtIndex(IdOrIndex);
2586}
2587
2588template <typename DerivedCCG, typename FuncTy, typename CallTy>
2589bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2590 CallTy Call, EdgeIter &EI,
2591 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2592 auto Edge = *EI;
2593 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2594 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2595 // Will be populated in order of callee to caller if we find a chain of tail
2596 // calls between the profiled caller and callee.
2597 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2598 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2599 FoundCalleeChain))
2600 return false;
2601
2602 // The usual case where the profiled callee matches that of the IR/summary.
2603 if (FoundCalleeChain.empty())
2604 return true;
2605
2606 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2607 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2608 // If there is already an edge between these nodes, simply update it and
2609 // return.
2610 if (CurEdge) {
2611 CurEdge->ContextIds.insert_range(Edge->ContextIds);
2612 CurEdge->AllocTypes |= Edge->AllocTypes;
2613 return;
2614 }
2615 // Otherwise, create a new edge and insert it into the caller and callee
2616 // lists.
2617 auto NewEdge = std::make_shared<ContextEdge>(
2618 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2619 Callee->CallerEdges.push_back(NewEdge);
2620 if (Caller == Edge->Caller) {
2621 // If we are inserting the new edge into the current edge's caller, insert
2622 // the new edge before the current iterator position, and then increment
2623 // back to the current edge.
2624 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2625 ++EI;
2626 assert(*EI == Edge &&
2627 "Iterator position not restored after insert and increment");
2628 } else
2629 Caller->CalleeEdges.push_back(NewEdge);
2630 };
2631
2632 // Create new nodes for each found callee and connect in between the profiled
2633 // caller and callee.
2634 auto *CurCalleeNode = Edge->Callee;
2635 for (auto &[NewCall, Func] : FoundCalleeChain) {
2636 ContextNode *NewNode = nullptr;
2637 // First check if we have already synthesized a node for this tail call.
2638 if (TailCallToContextNodeMap.count(NewCall)) {
2639 NewNode = TailCallToContextNodeMap[NewCall];
2640 NewNode->AllocTypes |= Edge->AllocTypes;
2641 } else {
2642 FuncToCallsWithMetadata[Func].push_back({NewCall});
2643 // Create Node and record node info.
2644 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2645 TailCallToContextNodeMap[NewCall] = NewNode;
2646 NewNode->AllocTypes = Edge->AllocTypes;
2647 }
2648
2649 // Hook up node to its callee node
2650 AddEdge(NewNode, CurCalleeNode);
2651
2652 CurCalleeNode = NewNode;
2653 }
2654
2655 // Hook up edge's original caller to new callee node.
2656 AddEdge(Edge->Caller, CurCalleeNode);
2657
2658#ifndef NDEBUG
2659 // Save this because Edge's fields get cleared below when removed.
2660 auto *Caller = Edge->Caller;
2661#endif
2662
2663 // Remove old edge
2664 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2665
2666 // To simplify the increment of EI in the caller, subtract one from EI.
2667 // In the final AddEdge call we would have either added a new callee edge
2668 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2669 // that there is at least one callee edge.
2670 assert(!Caller->CalleeEdges.empty());
2671 --EI;
2672
2673 return true;
2674}
2675
2676bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2677 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2678 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2679 bool &FoundMultipleCalleeChains) {
2680 // Stop recursive search if we have already explored the maximum specified
2681 // depth.
2682 if (Depth > TailCallSearchDepth)
2683 return false;
2684
2685 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2686 FoundCalleeChain.push_back({Callsite, F});
2687 };
2688
2689 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2690 if (!CalleeFunc) {
2691 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2692 assert(Alias);
2693 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2694 assert(CalleeFunc);
2695 }
2696
2697 // Look for tail calls in this function, and check if they either call the
2698 // profiled callee directly, or indirectly (via a recursive search).
2699 // Only succeed if there is a single unique tail call chain found between the
2700 // profiled caller and callee, otherwise we could perform incorrect cloning.
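 // For example, if the profile recorded ProfiledCallee as reached from this
 // callsite, but in the IR the callsite calls CurCallee which only tail-calls
 // ProfiledCallee, the intermediate call is recorded in FoundCalleeChain so
 // that the caller can synthesize a node for it.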
2701 bool FoundSingleCalleeChain = false;
2702 for (auto &BB : *CalleeFunc) {
2703 for (auto &I : BB) {
2704 auto *CB = dyn_cast<CallBase>(&I);
2705 if (!CB || !CB->isTailCall())
2706 continue;
2707 auto *CalledValue = CB->getCalledOperand();
2708 auto *CalledFunction = CB->getCalledFunction();
2709 if (CalledValue && !CalledFunction) {
2710 CalledValue = CalledValue->stripPointerCasts();
2711 // Stripping pointer casts can reveal a called function.
2712 CalledFunction = dyn_cast<Function>(CalledValue);
2713 }
2714 // Check if this is an alias to a function. If so, get the
2715 // called aliasee for the checks below.
2716 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2717 assert(!CalledFunction &&
2718 "Expected null called function in callsite for alias");
2719 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2720 }
2721 if (!CalledFunction)
2722 continue;
2723 if (CalledFunction == ProfiledCallee) {
2724 if (FoundSingleCalleeChain) {
2725 FoundMultipleCalleeChains = true;
2726 return false;
2727 }
2728 FoundSingleCalleeChain = true;
2729 FoundProfiledCalleeCount++;
2730 FoundProfiledCalleeDepth += Depth;
2731 if (Depth > FoundProfiledCalleeMaxDepth)
2732 FoundProfiledCalleeMaxDepth = Depth;
2733 SaveCallsiteInfo(&I, CalleeFunc);
2734 } else if (findProfiledCalleeThroughTailCalls(
2735 ProfiledCallee, CalledFunction, Depth + 1,
2736 FoundCalleeChain, FoundMultipleCalleeChains)) {
2737 // findProfiledCalleeThroughTailCalls should not have returned
2738 // true if FoundMultipleCalleeChains.
2739 assert(!FoundMultipleCalleeChains);
2740 if (FoundSingleCalleeChain) {
2741 FoundMultipleCalleeChains = true;
2742 return false;
2743 }
2744 FoundSingleCalleeChain = true;
2745 SaveCallsiteInfo(&I, CalleeFunc);
2746 } else if (FoundMultipleCalleeChains)
2747 return false;
2748 }
2749 }
2750
2751 return FoundSingleCalleeChain;
2752}
2753
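// The two findProfiledCalleeThroughTailCalls implementations in this file only
// succeed when exactly one tail-call chain connects the caller to the profiled
// callee. Below is a minimal, hypothetical sketch of that "unique chain or
// bail out" discipline on a toy adjacency-list call graph; it is not part of
// this pass, all names are illustrative, and it uses only std::vector, which
// this file already includes.
static bool findUniqueChainSketch(
    const std::vector<std::vector<unsigned>> &CallGraph, unsigned Cur,
    unsigned Target, unsigned Depth, unsigned MaxDepth,
    std::vector<unsigned> &Chain, bool &FoundMultiple) {
  // Bound the recursion, like the depth check in the real search above.
  if (Depth > MaxDepth)
    return false;
  bool FoundSingle = false;
  for (unsigned Callee : CallGraph[Cur]) {
    bool Reached = Callee == Target ||
                   findUniqueChainSketch(CallGraph, Callee, Target, Depth + 1,
                                         MaxDepth, Chain, FoundMultiple);
    if (FoundMultiple)
      return false;
    if (!Reached)
      continue;
    if (FoundSingle) {
      // A second distinct chain reaches Target: reject, as the code above does.
      FoundMultiple = true;
      return false;
    }
    FoundSingle = true;
    Chain.push_back(Callee);
  }
  return FoundSingle;
}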
2754const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2755 auto *CB = dyn_cast<CallBase>(Call);
2756 if (!CB->getCalledOperand() || CB->isIndirectCall())
2757 return nullptr;
2758 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2759 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2760 if (Alias)
2761 return dyn_cast<Function>(Alias->getAliasee());
2762 return dyn_cast<Function>(CalleeVal);
2763}
2764
2765bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2766 Instruction *Call, const Function *Func, const Function *CallerFunc,
2767 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2768 auto *CB = dyn_cast<CallBase>(Call);
2769 if (!CB->getCalledOperand() || CB->isIndirectCall())
2770 return false;
2771 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2772 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2773 if (CalleeFunc == Func)
2774 return true;
2775 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2776 if (Alias && Alias->getAliasee() == Func)
2777 return true;
2778
2779 // Recursively search for the profiled callee through tail calls starting with
2780 // the actual Callee. The discovered tail call chain is saved in
2781 // FoundCalleeChain, and we will fixup the graph to include these callsites
2782 // after returning.
2783 // FIXME: We will currently redo the same recursive walk if we find the same
2784 // mismatched callee from another callsite. We can improve this with more
2785 // bookkeeping of the created chain of new nodes for each mismatch.
2786 unsigned Depth = 1;
2787 bool FoundMultipleCalleeChains = false;
2788 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2789 FoundCalleeChain,
2790 FoundMultipleCalleeChains)) {
2791 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2792 << Func->getName() << " from " << CallerFunc->getName()
2793 << " that actually called " << CalleeVal->getName()
2794 << (FoundMultipleCalleeChains
2795 ? " (found multiple possible chains)"
2796 : "")
2797 << "\n");
2798 if (FoundMultipleCalleeChains)
2799 FoundProfiledCalleeNonUniquelyCount++;
2800 return false;
2801 }
2802
2803 return true;
2804}
2805
2806bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2807 Instruction *Call2) {
2808 auto *CB1 = cast<CallBase>(Call1);
2809 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2810 return false;
2811 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2812 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2813 auto *CB2 = cast<CallBase>(Call2);
2814 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2815 return false;
2816 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2817 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2818 return CalleeFunc1 == CalleeFunc2;
2819}
2820
2821bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2822 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2823 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2824 bool &FoundMultipleCalleeChains) {
2825 // Stop recursive search if we have already explored the maximum specified
2826 // depth.
2827 if (Depth > TailCallSearchDepth)
2828 return false;
2829
2830 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2831 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2832 // been synthesized.
2833 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2834 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2835 // StackIds is empty (we don't have debug info available in the index for
2836 // these callsites)
2837 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2838 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2839 CallsiteInfo *NewCallsiteInfo =
2840 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2841 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2842 };
2843
2844 // Look for tail calls in this function, and check if they either call the
2845 // profiled callee directly, or indirectly (via a recursive search).
2846 // Only succeed if there is a single unique tail call chain found between the
2847 // profiled caller and callee, otherwise we could perform incorrect cloning.
2848 bool FoundSingleCalleeChain = false;
2849 for (auto &S : CurCallee.getSummaryList()) {
2850 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2851 !isPrevailing(CurCallee.getGUID(), S.get()))
2852 continue;
2853 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2854 if (!FS)
2855 continue;
2856 auto FSVI = CurCallee;
2857 auto *AS = dyn_cast<AliasSummary>(S.get());
2858 if (AS)
2859 FSVI = AS->getAliaseeVI();
2860 for (auto &CallEdge : FS->calls()) {
2861 if (!CallEdge.second.hasTailCall())
2862 continue;
2863 if (CallEdge.first == ProfiledCallee) {
2864 if (FoundSingleCalleeChain) {
2865 FoundMultipleCalleeChains = true;
2866 return false;
2867 }
2868 FoundSingleCalleeChain = true;
2869 FoundProfiledCalleeCount++;
2870 FoundProfiledCalleeDepth += Depth;
2871 if (Depth > FoundProfiledCalleeMaxDepth)
2872 FoundProfiledCalleeMaxDepth = Depth;
2873 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2874 // Add FS to FSToVIMap in case it isn't already there.
2875 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2876 FSToVIMap[FS] = FSVI;
2877 } else if (findProfiledCalleeThroughTailCalls(
2878 ProfiledCallee, CallEdge.first, Depth + 1,
2879 FoundCalleeChain, FoundMultipleCalleeChains)) {
2880 // findProfiledCalleeThroughTailCalls should not have returned
2881 // true if FoundMultipleCalleeChains.
2882 assert(!FoundMultipleCalleeChains);
2883 if (FoundSingleCalleeChain) {
2884 FoundMultipleCalleeChains = true;
2885 return false;
2886 }
2887 FoundSingleCalleeChain = true;
2888 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2889 // Add FS to FSToVIMap in case it isn't already there.
2890 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2891 FSToVIMap[FS] = FSVI;
2892 } else if (FoundMultipleCalleeChains)
2893 return false;
2894 }
2895 }
2896
2897 return FoundSingleCalleeChain;
2898}
2899
2900const FunctionSummary *
2901IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2902 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2903 if (Callee.getSummaryList().empty())
2904 return nullptr;
2905 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2906}
2907
2908bool IndexCallsiteContextGraph::calleeMatchesFunc(
2909 IndexCall &Call, const FunctionSummary *Func,
2910 const FunctionSummary *CallerFunc,
2911 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2912 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2913 // If there is no summary list then this is a call to an externally defined
2914 // symbol.
2915 AliasSummary *Alias =
2916 Callee.getSummaryList().empty()
2917 ? nullptr
2918 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2919 assert(FSToVIMap.count(Func));
2920 auto FuncVI = FSToVIMap[Func];
2921 if (Callee == FuncVI ||
2922 // If callee is an alias, check the aliasee, since only function
2923 // summary base objects will contain the stack node summaries and thus
2924 // get a context node.
2925 (Alias && Alias->getAliaseeVI() == FuncVI))
2926 return true;
2927
2928 // Recursively search for the profiled callee through tail calls starting with
2929 // the actual Callee. The discovered tail call chain is saved in
2930 // FoundCalleeChain, and we will fixup the graph to include these callsites
2931 // after returning.
2932 // FIXME: We will currently redo the same recursive walk if we find the same
2933 // mismatched callee from another callsite. We can improve this with more
2934 // bookkeeping of the created chain of new nodes for each mismatch.
2935 unsigned Depth = 1;
2936 bool FoundMultipleCalleeChains = false;
2937 if (!findProfiledCalleeThroughTailCalls(
2938 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2939 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2940 << " from " << FSToVIMap[CallerFunc]
2941 << " that actually called " << Callee
2942 << (FoundMultipleCalleeChains
2943 ? " (found multiple possible chains)"
2944 : "")
2945 << "\n");
2946 if (FoundMultipleCalleeChains)
2947 FoundProfiledCalleeNonUniquelyCount++;
2948 return false;
2949 }
2950
2951 return true;
2952}
2953
2954bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2955 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
2956 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
2957 return Callee1 == Callee2;
2958}
2959
2960template <typename DerivedCCG, typename FuncTy, typename CallTy>
2961void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2962 const {
2963 print(dbgs());
2964 dbgs() << "\n";
2965}
2966
2967template <typename DerivedCCG, typename FuncTy, typename CallTy>
2968void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2969 raw_ostream &OS) const {
2970 OS << "Node " << this << "\n";
2971 OS << "\t";
2972 printCall(OS);
2973 if (Recursive)
2974 OS << " (recursive)";
2975 OS << "\n";
2976 if (!MatchingCalls.empty()) {
2977 OS << "\tMatchingCalls:\n";
2978 for (auto &MatchingCall : MatchingCalls) {
2979 OS << "\t";
2980 MatchingCall.print(OS);
2981 OS << "\n";
2982 }
2983 }
2984 OS << "\tNodeId: " << NodeId << "\n";
2985 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2986 OS << "\tContextIds:";
2987 // Make a copy of the computed context ids that we can sort for stability.
2988 auto ContextIds = getContextIds();
2989 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2990 std::sort(SortedIds.begin(), SortedIds.end());
2991 for (auto Id : SortedIds)
2992 OS << " " << Id;
2993 OS << "\n";
2994 OS << "\tCalleeEdges:\n";
2995 for (auto &Edge : CalleeEdges)
2996 OS << "\t\t" << *Edge << " (Callee NodeId: " << Edge->Callee->NodeId
2997 << ")\n";
2998 OS << "\tCallerEdges:\n";
2999 for (auto &Edge : CallerEdges)
3000 OS << "\t\t" << *Edge << " (Caller NodeId: " << Edge->Caller->NodeId
3001 << ")\n";
3002 if (!Clones.empty()) {
3003 OS << "\tClones: ";
3004 bool First = true;
3005 for (auto *C : Clones) {
3006 if (!First)
3007 OS << ", ";
3008 First = false;
3009 OS << C << " NodeId: " << C->NodeId;
3010 }
3011 OS << "\n";
3012 } else if (CloneOf) {
3013 OS << "\tClone of " << CloneOf << " NodeId: " << CloneOf->NodeId << "\n";
3014 }
3015}
3016
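// The print methods above and below copy each unordered context id set into a
// std::vector and sort it before emitting, so debug output is stable from run
// to run despite DenseSet's nondeterministic iteration order. A minimal,
// hypothetical sketch of that pattern (not part of this pass; assumes the
// std::sort and <vector> facilities the surrounding code already uses):
static void printSortedIdsSketch(const DenseSet<uint32_t> &Ids,
                                 raw_ostream &OS) {
  std::vector<uint32_t> SortedIds(Ids.begin(), Ids.end());
  std::sort(SortedIds.begin(), SortedIds.end());
  for (uint32_t Id : SortedIds)
    OS << " " << Id;
  OS << "\n";
}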
3017template <typename DerivedCCG, typename FuncTy, typename CallTy>
3018void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
3019 const {
3020 print(dbgs());
3021 dbgs() << "\n";
3022}
3023
3024template <typename DerivedCCG, typename FuncTy, typename CallTy>
3025void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
3026 raw_ostream &OS) const {
3027 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
3028 << (IsBackedge ? " (BE)" : "")
3029 << " AllocTypes: " << getAllocTypeString(AllocTypes);
3030 OS << " ContextIds:";
3031 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3032 std::sort(SortedIds.begin(), SortedIds.end());
3033 for (auto Id : SortedIds)
3034 OS << " " << Id;
3035}
3036
3037template <typename DerivedCCG, typename FuncTy, typename CallTy>
3038void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3039 print(dbgs());
3040}
3041
3042template <typename DerivedCCG, typename FuncTy, typename CallTy>
3043void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3044 raw_ostream &OS) const {
3045 OS << "Callsite Context Graph:\n";
3046 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3047 for (const auto Node : nodes<GraphType>(this)) {
3048 if (Node->isRemoved())
3049 continue;
3050 Node->print(OS);
3051 OS << "\n";
3052 }
3053}
3054
3055template <typename DerivedCCG, typename FuncTy, typename CallTy>
3056void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3057 raw_ostream &OS) const {
3058 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3059 for (const auto Node : nodes<GraphType>(this)) {
3060 if (Node->isRemoved())
3061 continue;
3062 if (!Node->IsAllocation)
3063 continue;
3064 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3065 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3066 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3067 std::sort(SortedIds.begin(), SortedIds.end());
3068 for (auto Id : SortedIds) {
3069 auto TypeI = ContextIdToAllocationType.find(Id);
3070 assert(TypeI != ContextIdToAllocationType.end());
3071 auto CSI = ContextIdToContextSizeInfos.find(Id);
3072 if (CSI != ContextIdToContextSizeInfos.end()) {
3073 for (auto &Info : CSI->second) {
3074 OS << "MemProf hinting: "
3075 << getAllocTypeString((uint8_t)TypeI->second)
3076 << " full allocation context " << Info.FullStackId
3077 << " with total size " << Info.TotalSize << " is "
3078 << getAllocTypeString(Node->AllocTypes) << " after cloning";
3079 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3080 OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
3081 << " due to cold byte percent";
3082 // Print the internal context id to aid debugging and visualization.
3083 OS << " (context id " << Id << ")";
3084 OS << "\n";
3085 }
3086 }
3087 }
3088 }
3089}
3090
3091template <typename DerivedCCG, typename FuncTy, typename CallTy>
3092void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3093 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3094 for (const auto Node : nodes<GraphType>(this)) {
3095 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3096 for (auto &Edge : Node->CallerEdges)
3097 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3098 }
3099}
3100
3101template <typename DerivedCCG, typename FuncTy, typename CallTy>
3102struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
3103 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3104 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
3105
3106 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
3107 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
3108
3109 using nodes_iterator =
3110 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
3111 decltype(&getNode)>;
3112
3113 static nodes_iterator nodes_begin(GraphType G) {
3114 return nodes_iterator(G->NodeOwner.begin(), &getNode);
3115 }
3116
3117 static nodes_iterator nodes_end(GraphType G) {
3118 return nodes_iterator(G->NodeOwner.end(), &getNode);
3119 }
3120
3121 static NodeRef getEntryNode(GraphType G) {
3122 return G->NodeOwner.begin()->get();
3123 }
3124
3125 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
3126 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
3127 GetCallee(const EdgePtrTy &P) {
3128 return P->Callee;
3129 }
3130
3131 using ChildIteratorType =
3132 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
3133 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
3134 decltype(&GetCallee)>;
3135
3136 static ChildIteratorType child_begin(NodeRef N) {
3137 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
3138 }
3139
3140 static ChildIteratorType child_end(NodeRef N) {
3141 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
3142 }
3143};
3144
3145template <typename DerivedCCG, typename FuncTy, typename CallTy>
3146struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
3147 : public DefaultDOTGraphTraits {
3148 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
3149 // If the user requested the full graph to be exported, but provided an
3150 // allocation id, or if the user gave a context id and requested more than
3151 // just a specific context to be exported, note that highlighting is
3152 // enabled.
3153 DoHighlight =
3154 (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
3155 (ContextIdForDot.getNumOccurrences() &&
3156 DotGraphScope != DotScope::Context);
3157 }
3158
3159 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3160 using GTraits = GraphTraits<GraphType>;
3161 using NodeRef = typename GTraits::NodeRef;
3162 using ChildIteratorType = typename GTraits::ChildIteratorType;
3163
3164 static std::string getNodeLabel(NodeRef Node, GraphType G) {
3165 std::string LabelString =
3166 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
3167 Twine(Node->OrigStackOrAllocId) + " NodeId: " + Twine(Node->NodeId))
3168 .str();
3169 LabelString += "\n";
3170 if (Node->hasCall()) {
3171 auto Func = G->NodeToCallingFunc.find(Node);
3172 assert(Func != G->NodeToCallingFunc.end());
3173 LabelString +=
3174 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
3175 } else {
3176 LabelString += "null call";
3177 if (Node->Recursive)
3178 LabelString += " (recursive)";
3179 else
3180 LabelString += " (external)";
3181 }
3182 return LabelString;
3183 }
3184
3185 static std::string getNodeAttributes(NodeRef Node, GraphType G) {
3186 auto ContextIds = Node->getContextIds();
3187 // If highlighting enabled, see if this node contains any of the context ids
3188 // of interest. If so, it will use a different color and a larger fontsize
3189 // (which makes the node larger as well).
3190 bool Highlight = false;
3191 if (DoHighlight) {
3192 assert(ContextIdForDot.getNumOccurrences() ||
3193 AllocIdForDot.getNumOccurrences());
3194 if (ContextIdForDot.getNumOccurrences())
3195 Highlight = ContextIds.contains(ContextIdForDot);
3196 else
3197 Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
3198 }
3199 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
3200 getContextIds(ContextIds) + "\"")
3201 .str();
3202 // Default fontsize is 14
3203 if (Highlight)
3204 AttributeString += ",fontsize=\"30\"";
3205 AttributeString +=
3206 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"")
3207 .str();
3208 if (Node->CloneOf) {
3209 AttributeString += ",color=\"blue\"";
3210 AttributeString += ",style=\"filled,bold,dashed\"";
3211 } else
3212 AttributeString += ",style=\"filled\"";
3213 return AttributeString;
3214 }
3215
3216 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
3217 GraphType G) {
3218 auto &Edge = *(ChildIter.getCurrent());
3219 // If highlighting enabled, see if this edge contains any of the context ids
3220 // of interest. If so, it will use a different color and a heavier arrow
3221 // size and weight (the larger weight makes the highlighted path
3222 // straighter).
3223 bool Highlight = false;
3224 if (DoHighlight) {
3225 assert(ContextIdForDot.getNumOccurrences() ||
3226 AllocIdForDot.getNumOccurrences());
3227 if (ContextIdForDot.getNumOccurrences())
3228 Highlight = Edge->ContextIds.contains(ContextIdForDot);
3229 else
3230 Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
3231 }
3232 auto Color = getColor(Edge->AllocTypes, Highlight);
3233 std::string AttributeString =
3234 (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
3235 // fillcolor is the arrow head and color is the line
3236 Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
3237 "\"")
3238 .str();
3239 if (Edge->IsBackedge)
3240 AttributeString += ",style=\"dotted\"";
3241 // Default penwidth and weight are both 1.
3242 if (Highlight)
3243 AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
3244 return AttributeString;
3245 }
3246
3247 // Since the NodeOwners list includes nodes that are no longer connected to
3248 // the graph, skip them here.
3249 static bool isNodeHidden(NodeRef Node, GraphType G) {
3250 if (Node->isRemoved())
3251 return true;
3252 // If a scope smaller than the full graph was requested, see if this node
3253 // contains any of the context ids of interest.
3254 if (DotGraphScope == DotScope::Alloc)
3255 return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
3256 if (DotGraphScope == DotScope::Context)
3257 return !Node->getContextIds().contains(ContextIdForDot);
3258 return false;
3259 }
3260
3261private:
3262 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3263 std::string IdString = "ContextIds:";
3264 if (ContextIds.size() < 100) {
3265 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3266 std::sort(SortedIds.begin(), SortedIds.end());
3267 for (auto Id : SortedIds)
3268 IdString += (" " + Twine(Id)).str();
3269 } else {
3270 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3271 }
3272 return IdString;
3273 }
3274
3275 static std::string getColor(uint8_t AllocTypes, bool Highlight) {
3276 // If DoHighlight is not enabled, we want to use the highlight colors for
3277 // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
3278 // both compatible with the color scheme before highlighting was supported,
3279 // and for the NotCold+Cold color the non-highlight color is a bit more
3280 // readable.
3281 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3282 // Color "brown1" actually looks like a lighter red.
3283 return !DoHighlight || Highlight ? "brown1" : "lightpink";
3284 if (AllocTypes == (uint8_t)AllocationType::Cold)
3285 return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
3286 if (AllocTypes ==
3287 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3288 return Highlight ? "magenta" : "mediumorchid1";
3289 return "gray";
3290 }
3291
3292 static std::string getNodeId(NodeRef Node) {
3293 std::stringstream SStream;
3294 SStream << std::hex << "N0x" << (unsigned long long)Node;
3295 std::string Result = SStream.str();
3296 return Result;
3297 }
3298
3299 // True if we should highlight a specific context or allocation's contexts in
3300 // the emitted graph.
3301 static bool DoHighlight;
3302};
3303
3304template <typename DerivedCCG, typename FuncTy, typename CallTy>
3305bool DOTGraphTraits<
3306 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
3307 false;
3308
3309template <typename DerivedCCG, typename FuncTy, typename CallTy>
3310void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3311 std::string Label) const {
3312 WriteGraph(this, "", false, Label,
3313 DotFilePathPrefix + "ccg." + Label + ".dot");
3314}
3315
3316template <typename DerivedCCG, typename FuncTy, typename CallTy>
3317typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3318CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3319 const std::shared_ptr<ContextEdge> &Edge,
3320 DenseSet<uint32_t> ContextIdsToMove) {
3321 ContextNode *Node = Edge->Callee;
3322 assert(NodeToCallingFunc.count(Node));
3323 ContextNode *Clone =
3324 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3325 Node->addClone(Clone);
3326 Clone->MatchingCalls = Node->MatchingCalls;
3327 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3328 ContextIdsToMove);
3329 return Clone;
3330}
3331
3332template <typename DerivedCCG, typename FuncTy, typename CallTy>
3333void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3334 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3335 ContextNode *NewCallee, bool NewClone,
3336 DenseSet<uint32_t> ContextIdsToMove) {
3337 // NewCallee and Edge's current callee must be clones of the same original
3338 // node (Edge's current callee may be the original node too).
3339 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3340
3341 bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3342
3343 ContextNode *OldCallee = Edge->Callee;
3344
3345 // We might already have an edge to the new callee from earlier cloning for a
3346 // different allocation. If one exists we will reuse it.
3347 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3348
3349 // Callers will pass an empty ContextIdsToMove set when they want to move the
3350 // edge. Copy in Edge's ids for simplicity.
3351 if (ContextIdsToMove.empty())
3352 ContextIdsToMove = Edge->getContextIds();
3353
3354 // If we are moving all of Edge's ids, then just move the whole Edge.
3355 // Otherwise only move the specified subset, to a new edge if needed.
3356 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3357 // First, update the alloc types on New Callee from Edge.
3358 // Do this before we potentially clear Edge's fields below!
3359 NewCallee->AllocTypes |= Edge->AllocTypes;
3360 // Moving the whole Edge.
3361 if (ExistingEdgeToNewCallee) {
3362 // Since we already have an edge to NewCallee, simply move the ids
3363 // onto it, and remove the existing Edge.
3364 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3365 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3366 assert(Edge->ContextIds == ContextIdsToMove);
3367 removeEdgeFromGraph(Edge.get());
3368 } else {
3369 // Otherwise just reconnect Edge to NewCallee.
3370 Edge->Callee = NewCallee;
3371 NewCallee->CallerEdges.push_back(Edge);
3372 // Remove it from callee where it was previously connected.
3373 OldCallee->eraseCallerEdge(Edge.get());
3374 // Don't need to update Edge's context ids since we are simply
3375 // reconnecting it.
3376 }
3377 } else {
3378 // Only moving a subset of Edge's ids.
3379 // Compute the alloc type of the subset of ids being moved.
3380 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3381 if (ExistingEdgeToNewCallee) {
3382 // Since we already have an edge to NewCallee, simply move the ids
3383 // onto it.
3384 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3385 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3386 } else {
3387 // Otherwise, create a new edge to NewCallee for the ids being moved.
3388 auto NewEdge = std::make_shared<ContextEdge>(
3389 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3390 Edge->Caller->CalleeEdges.push_back(NewEdge);
3391 NewCallee->CallerEdges.push_back(NewEdge);
3392 }
3393 // In either case, need to update the alloc types on NewCallee, and remove
3394 // those ids and update the alloc type on the original Edge.
3395 NewCallee->AllocTypes |= CallerEdgeAllocType;
3396 set_subtract(Edge->ContextIds, ContextIdsToMove);
3397 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3398 }
3399 // Now walk the old callee node's callee edges and move Edge's context ids
3400 // over to the corresponding edge into the clone (which is created here if
3401 // this is a newly created clone).
3402 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3403 ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3404 // If this is a direct recursion edge, use NewCallee (the clone) as the
3405 // callee as well, so that any edge updated/created here is also direct
3406 // recursive.
3407 if (CalleeToUse == OldCallee) {
3408 // If this is a recursive edge, see if we already moved a recursive edge
3409 // (which would have to have been this one) - if we were only moving a
3410 // subset of context ids it would still be on OldCallee.
3411 if (EdgeIsRecursive) {
3412 assert(OldCalleeEdge == Edge);
3413 continue;
3414 }
3415 CalleeToUse = NewCallee;
3416 }
3417 // The context ids moving to the new callee are the subset of this edge's
3418 // context ids and the context ids on the caller edge being moved.
3419 DenseSet<uint32_t> EdgeContextIdsToMove =
3420 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3421 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3422 OldCalleeEdge->AllocTypes =
3423 computeAllocType(OldCalleeEdge->getContextIds());
3424 if (!NewClone) {
3425 // Update context ids / alloc type on corresponding edge to NewCallee.
3426 // There is a chance this may not exist if we are reusing an existing
3427 // clone, specifically during function assignment, where we would have
3428 // removed none type edges after creating the clone. If we can't find
3429 // a corresponding edge there, fall through to the cloning below.
3430 if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3431 NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3432 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3433 continue;
3434 }
3435 }
3436 auto NewEdge = std::make_shared<ContextEdge>(
3437 CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
3438 EdgeContextIdsToMove);
3439 NewCallee->CalleeEdges.push_back(NewEdge);
3440 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3441 }
3442 // Recompute the node alloc type now that its callee edges have been
3443 // updated (since we will compute from those edges).
3444 OldCallee->AllocTypes = OldCallee->computeAllocType();
3445 // OldCallee alloc type should be None iff its context id set is now empty.
3446 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3447 OldCallee->emptyContextIds());
3448 if (VerifyCCG) {
3449 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3450 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3451 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3452 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3453 /*CheckEdges=*/false);
3454 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3455 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3456 /*CheckEdges=*/false);
3457 }
3458}
3459
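// moveEdgeToExistingCalleeClone above repeatedly applies one small piece of
// bookkeeping: ids moved onto a destination edge must also be removed from the
// source edge, via set_intersection/insert_range/set_subtract. A minimal,
// hypothetical sketch of just that invariant on two bare id sets (not part of
// this pass; relies only on the DenseSet and SetOperations helpers the
// surrounding code already uses):
static void moveIdSubsetSketch(DenseSet<uint32_t> &SrcEdgeIds,
                               DenseSet<uint32_t> &DstEdgeIds,
                               const DenseSet<uint32_t> &IdsToMove) {
  // Only ids actually present on the source edge can move.
  DenseSet<uint32_t> Moving = set_intersection(SrcEdgeIds, IdsToMove);
  // Add to the destination, then remove from the source, so no id is counted
  // on both edges once the move completes.
  DstEdgeIds.insert_range(Moving);
  set_subtract(SrcEdgeIds, Moving);
}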
3460template <typename DerivedCCG, typename FuncTy, typename CallTy>
3461void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3462 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3463 ContextNode *NewCaller) {
3464 auto *OldCallee = Edge->Callee;
3465 auto *NewCallee = OldCallee;
3466 // If this edge was direct recursive, make any new/updated edge also direct
3467 // recursive to NewCaller.
3468 bool Recursive = Edge->Caller == Edge->Callee;
3469 if (Recursive)
3470 NewCallee = NewCaller;
3471
3472 ContextNode *OldCaller = Edge->Caller;
3473 OldCaller->eraseCalleeEdge(Edge.get());
3474
3475 // We might already have an edge to the new caller. If one exists we will
3476 // reuse it.
3477 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3478
3479 if (ExistingEdgeToNewCaller) {
3480 // Since we already have an edge to NewCaller, simply move the ids
3481 // onto it, and remove the existing Edge.
3482 ExistingEdgeToNewCaller->getContextIds().insert_range(
3483 Edge->getContextIds());
3484 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3485 Edge->ContextIds.clear();
3486 Edge->AllocTypes = (uint8_t)AllocationType::None;
3487 OldCallee->eraseCallerEdge(Edge.get());
3488 } else {
3489 // Otherwise just reconnect Edge to NewCaller.
3490 Edge->Caller = NewCaller;
3491 NewCaller->CalleeEdges.push_back(Edge);
3492 if (Recursive) {
3493 assert(NewCallee == NewCaller);
3494 // In the case of (direct) recursive edges, we update the callee as well
3495 // so that it becomes recursive on the new caller.
3496 Edge->Callee = NewCallee;
3497 NewCallee->CallerEdges.push_back(Edge);
3498 OldCallee->eraseCallerEdge(Edge.get());
3499 }
3500 // Don't need to update Edge's context ids since we are simply
3501 // reconnecting it.
3502 }
3503 // In either case, need to update the alloc types on New Caller.
3504 NewCaller->AllocTypes |= Edge->AllocTypes;
3505
3506 // Now walk the old caller node's caller edges and move Edge's context ids
3507 // over to the corresponding edge into the node (which is created here if
3508 // this is a newly created node). We can tell whether this is a newly created
3509 // node by seeing if it has any caller edges yet.
3510#ifndef NDEBUG
3511 bool IsNewNode = NewCaller->CallerEdges.empty();
3512#endif
3513 // If we just moved a direct recursive edge, presumably its context ids should
3514 // also flow out of OldCaller via some other non-recursive callee edge. We
3515 // don't want to remove the recursive context ids from other caller edges yet,
3516 // otherwise the context ids get into an inconsistent state on OldCaller.
3517 // We will update these context ids on the non-recursive caller edge when and
3518 // if they are updated on the non-recursive callee.
3519 if (!Recursive) {
3520 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3521 auto OldCallerCaller = OldCallerEdge->Caller;
3522 // The context ids moving to the new caller are the subset of this edge's
3523 // context ids and the context ids on the callee edge being moved.
3524 DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3525 OldCallerEdge->getContextIds(), Edge->getContextIds());
3526 if (OldCaller == OldCallerCaller) {
3527 OldCallerCaller = NewCaller;
3528 // Don't actually move this one. The caller will move it directly via a
3529 // call to this function with this as the Edge if it is appropriate to
3530 // move to a diff node that has a matching callee (itself).
3531 continue;
3532 }
3533 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3534 OldCallerEdge->AllocTypes =
3535 computeAllocType(OldCallerEdge->getContextIds());
3536 // In this function we expect that any pre-existing node already has edges
3537 // from the same callers as the old node. That should be true in the
3538 // current use case, where we will remove None-type edges after copying
3539 // over all caller edges from the callee.
3540 auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3541 // Since we would have skipped caller edges when moving a direct recursive
3542 // edge, this may not hold true when recursive handling enabled.
3543 assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3544 if (ExistingCallerEdge) {
3545 ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3546 ExistingCallerEdge->AllocTypes |=
3547 computeAllocType(EdgeContextIdsToMove);
3548 continue;
3549 }
3550 auto NewEdge = std::make_shared<ContextEdge>(
3551 NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
3552 EdgeContextIdsToMove);
3553 NewCaller->CallerEdges.push_back(NewEdge);
3554 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3555 }
3556 }
3557 // Recompute the node alloc type now that its caller edges have been
3558 // updated (since we will compute from those edges).
3559 OldCaller->AllocTypes = OldCaller->computeAllocType();
3560 // OldCaller alloc type should be None iff its context id set is now empty.
3561 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3562 OldCaller->emptyContextIds());
3563 if (VerifyCCG) {
3564 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3565 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3566 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3567 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3568 /*CheckEdges=*/false);
3569 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3570 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3571 /*CheckEdges=*/false);
3572 }
3573}
3574
3575template <typename DerivedCCG, typename FuncTy, typename CallTy>
3576void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3577 recursivelyRemoveNoneTypeCalleeEdges(
3578 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3579 auto Inserted = Visited.insert(Node);
3580 if (!Inserted.second)
3581 return;
3582
3583 removeNoneTypeCalleeEdges(Node);
3584
3585 for (auto *Clone : Node->Clones)
3586 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3587
3588 // The recursive call may remove some of this Node's caller edges.
3589 // Iterate over a copy and skip any that were removed.
3590 auto CallerEdges = Node->CallerEdges;
3591 for (auto &Edge : CallerEdges) {
3592 // Skip any that have been removed by an earlier recursive call.
3593 if (Edge->isRemoved()) {
3594 assert(!is_contained(Node->CallerEdges, Edge));
3595 continue;
3596 }
3597 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3598 }
3599}
3600
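// recursivelyRemoveNoneTypeCalleeEdges above iterates over a copy of
// CallerEdges because the recursive calls can erase edges from the live
// vector, which would invalidate an iterator into it. A minimal, hypothetical
// sketch of that snapshot-then-mutate pattern (not part of this pass; assumes
// the std::remove/<algorithm> facilities implied by the file's std::sort use):
static void eraseWhileIteratingSketch(std::vector<int> &Values) {
  // Snapshot first; erasing from Values below cannot invalidate this copy.
  std::vector<int> Snapshot = Values;
  for (int V : Snapshot) {
    if (V >= 0)
      continue;
    // Safe: the loop walks Snapshot, not the container being modified.
    Values.erase(std::remove(Values.begin(), Values.end(), V), Values.end());
  }
}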
3601// This is the standard DFS based backedge discovery algorithm.
3602template <typename DerivedCCG, typename FuncTy, typename CallTy>
3603void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3604 // If we are cloning recursive contexts, find and mark backedges from all root
3605 // callers, using the typical DFS based backedge analysis.
3606 if (!CloneRecursiveContexts)
3607 return;
3608 DenseSet<const ContextNode *> Visited;
3609 DenseSet<const ContextNode *> CurrentStack;
3610 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3611 auto *Node = Entry.second;
3612 if (Node->isRemoved())
3613 continue;
3614 // It is a root if it doesn't have callers.
3615 if (!Node->CallerEdges.empty())
3616 continue;
3617 markBackedges(Node, Visited, CurrentStack);
3618 assert(CurrentStack.empty());
3619 }
3620}
3621
3622// Recursive helper for above markBackedges method.
3623template <typename DerivedCCG, typename FuncTy, typename CallTy>
3624void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3625 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3626 DenseSet<const ContextNode *> &CurrentStack) {
3627 auto I = Visited.insert(Node);
3628 // We should only call this for unvisited nodes.
3629 assert(I.second);
3630 (void)I;
3631 for (auto &CalleeEdge : Node->CalleeEdges) {
3632 auto *Callee = CalleeEdge->Callee;
3633 if (Visited.count(Callee)) {
3634 // Since this was already visited we need to check if it is currently on
3635 // the recursive stack in which case it is a backedge.
3636 if (CurrentStack.count(Callee))
3637 CalleeEdge->IsBackedge = true;
3638 continue;
3639 }
3640 CurrentStack.insert(Callee);
3641 markBackedges(Callee, Visited, CurrentStack);
3642 CurrentStack.erase(Callee);
3643 }
3644}
3645
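// The markBackedges pair above is the classic DFS backedge discovery: an edge
// is a backedge exactly when its target is still on the current DFS stack. A
// minimal, hypothetical sketch on a toy adjacency-list graph (not part of this
// pass; uses only std::vector, already included here):
static void markBackedgesSketch(
    const std::vector<std::vector<unsigned>> &Succs, unsigned Node,
    std::vector<bool> &Visited, std::vector<bool> &OnStack,
    std::vector<std::pair<unsigned, unsigned>> &Backedges) {
  Visited[Node] = true;
  OnStack[Node] = true;
  for (unsigned Succ : Succs[Node]) {
    if (Visited[Succ]) {
      // Already visited: it is a backedge only if Succ is still on the stack.
      if (OnStack[Succ])
        Backedges.push_back({Node, Succ});
      continue;
    }
    markBackedgesSketch(Succs, Succ, Visited, OnStack, Backedges);
  }
  OnStack[Node] = false;
}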
3646template <typename DerivedCCG, typename FuncTy, typename CallTy>
3647void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3648 DenseSet<const ContextNode *> Visited;
3649 for (auto &Entry : AllocationCallToContextNodeMap) {
3650 Visited.clear();
3651 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3652 }
3653 Visited.clear();
3654 for (auto &Entry : AllocationCallToContextNodeMap)
3655 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3656 if (VerifyCCG)
3657 check();
3658}
3659
3660// helper function to check an AllocType is cold or notcold or both.
3661 bool checkColdOrNotCold(uint8_t AllocType) {
3662 return (AllocType == (uint8_t)AllocationType::Cold) ||
3663 (AllocType == (uint8_t)AllocationType::NotCold) ||
3664 (AllocType ==
3665 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3666 }
3667
3668template <typename DerivedCCG, typename FuncTy, typename CallTy>
3669void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3670 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3671 const DenseSet<uint32_t> &AllocContextIds) {
3672 if (VerifyNodes)
3673 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3674 assert(!Node->CloneOf);
3675
3676 // If Node has a null call, then either it wasn't found in the module (regular
3677 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3678 // cloning (e.g. recursion, calls multiple targets, etc.).
3679 // Do this here so that we don't try to recursively clone callers below, which
3680 // isn't useful at least for this node.
3681 if (!Node->hasCall())
3682 return;
3683
3684 // No need to look at any callers if allocation type already unambiguous.
3685 if (hasSingleAllocType(Node->AllocTypes))
3686 return;
3687
3688#ifndef NDEBUG
3689 auto Insert =
3690#endif
3691 Visited.insert(Node);
3692 // We should not have visited this node yet.
3693 assert(Insert.second);
3694 // The recursive call to identifyClones may delete the current edge from the
3695 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3696 // in an iterator and having recursive call erase from it. Other edges may
3697 // also get removed during the recursion, which will have null Callee and
3698 // Caller pointers (and are deleted later), so we skip those below.
3699 {
3700 auto CallerEdges = Node->CallerEdges;
3701 for (auto &Edge : CallerEdges) {
3702 // Skip any that have been removed by an earlier recursive call.
3703 if (Edge->isRemoved()) {
3704 assert(!is_contained(Node->CallerEdges, Edge));
3705 continue;
3706 }
3707 // Defer backedges. See comments further below where these edges are
3708 // handled during the cloning of this Node.
3709 if (Edge->IsBackedge) {
3710 // We should only mark these if cloning recursive contexts, where we
3711 // need to do this deferral.
3712 assert(CloneRecursiveContexts);
3713 continue;
3714 }
3715 // Ignore any caller we previously visited via another edge.
3716 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3717 identifyClones(Edge->Caller, Visited, AllocContextIds);
3718 }
3719 }
3720 }
3721
3722 // Check if we reached an unambiguous call or have only a single caller.
3723 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3724 return;
3725
3726 // We need to clone.
3727
3728 // Try to keep the original version as alloc type NotCold. This will make
3729 // cases with indirect calls or any other situation with an unknown call to
3730 // the original function get the default behavior. We do this by sorting the
3731 // CallerEdges of the Node we will clone by alloc type.
3732 //
3733 // Give NotCold edge the lowest sort priority so those edges are at the end of
3734 // the caller edges vector, and stay on the original version (since the below
3735 // code clones greedily until it finds all remaining edges have the same type
3736 // and leaves the remaining ones on the original Node).
3737 //
3738 // We shouldn't actually have any None type edges, so the sorting priority for
3739 // that is arbitrary, and we assert in that case below.
3740 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3741 /*Cold*/ 1,
3742 /*NotColdCold*/ 2};
3743 llvm::stable_sort(Node->CallerEdges,
3744 [&](const std::shared_ptr<ContextEdge> &A,
3745 const std::shared_ptr<ContextEdge> &B) {
3746 // Nodes with non-empty context ids should be sorted
3747 // before those with empty context ids.
3748 if (A->ContextIds.empty())
3749 // Either B ContextIds are non-empty (in which case we
3750 // should return false because B < A), or B ContextIds
3751 // are empty, in which case they are equal, and we
3752 // should maintain the original relative ordering.
3753 return false;
3754 if (B->ContextIds.empty())
3755 return true;
3756
3757 if (A->AllocTypes == B->AllocTypes)
3758 // Use the first context id for each edge as a
3759 // tie-breaker.
3760 return *A->ContextIds.begin() < *B->ContextIds.begin();
3761 return AllocTypeCloningPriority[A->AllocTypes] <
3762 AllocTypeCloningPriority[B->AllocTypes];
3763 });
3764
3765 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3766
3767 DenseSet<uint32_t> RecursiveContextIds;
3769 // If we are allowing recursive callsites, but have also disabled recursive
3770 // contexts, look for context ids that show up in multiple caller edges.
3771 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
3772 DenseSet<uint32_t> AllCallerContextIds;
3773 for (auto &CE : Node->CallerEdges) {
3774 // Resize to the largest set of caller context ids, since we know the
3775 // final set will be at least that large.
3776 AllCallerContextIds.reserve(CE->getContextIds().size());
3777 for (auto Id : CE->getContextIds())
3778 if (!AllCallerContextIds.insert(Id).second)
3779 RecursiveContextIds.insert(Id);
3780 }
3781 }
3782
3783 // Iterate until we find no more opportunities for disambiguating the alloc
3784 // types via cloning. In most cases this loop will terminate once the Node
3785 // has a single allocation type, in which case no more cloning is needed.
3786 // Iterate over a copy of Node's caller edges, since we may need to remove
3787 // edges in the moveEdgeTo* methods, and this simplifies the handling and
3788 // makes it less error-prone.
3789 auto CallerEdges = Node->CallerEdges;
3790 for (auto &CallerEdge : CallerEdges) {
3791 // Skip any that have been removed by an earlier recursive call.
3792 if (CallerEdge->isRemoved()) {
3793 assert(!is_contained(Node->CallerEdges, CallerEdge));
3794 continue;
3795 }
3796 assert(CallerEdge->Callee == Node);
3797
3798 // See if cloning the prior caller edge left this node with a single alloc
3799 // type or a single caller. In that case no more cloning of Node is needed.
3800 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3801 break;
3802
3803 // If the caller was not successfully matched to a call in the IR/summary,
3804 // there is no point in trying to clone for it as we can't update that call.
3805 if (!CallerEdge->Caller->hasCall())
3806 continue;
3807
3808 // Only need to process the ids along this edge pertaining to the given
3809 // allocation.
3810 auto CallerEdgeContextsForAlloc =
3811 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3812 if (!RecursiveContextIds.empty())
3813 CallerEdgeContextsForAlloc =
3814 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
3815 if (CallerEdgeContextsForAlloc.empty())
3816 continue;
3817
3818 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3819
3820 // Compute the node callee edge alloc types corresponding to the context ids
3821 // for this caller edge.
3822 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3823 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
3824 for (auto &CalleeEdge : Node->CalleeEdges)
3825 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3826 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3827
3828 // Don't clone if doing so will not disambiguate any alloc types amongst
3829 // caller edges (including the callee edges that would be cloned).
3830 // Otherwise we will simply move all edges to the clone.
3831 //
3832 // First check if by cloning we will disambiguate the caller allocation
3833 // type from node's allocation type. Query allocTypeToUse so that we don't
3834 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3835 // neither of these should be None type.
3836 //
3837 // Then check if by cloning node at least one of the callee edges will be
3838 // disambiguated by splitting out different context ids.
3839 //
3840 // However, always do the cloning if this is a backedge, in which case we
3841 // have not yet cloned along this caller edge.
3842 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3843 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3844 if (!CallerEdge->IsBackedge &&
3845 allocTypeToUse(CallerAllocTypeForAlloc) ==
3846 allocTypeToUse(Node->AllocTypes) &&
3847 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3848 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
3849 continue;
3850 }
3851
3852 if (CallerEdge->IsBackedge) {
3853 // We should only mark these if cloning recursive contexts, where we
3854 // need to do this deferral.
3855 assert(CloneRecursiveContexts);
3856 DeferredBackedges++;
3857 }
3858
3859 // If this is a backedge, we now do recursive cloning starting from its
3860 // caller since we may have moved unambiguous caller contexts to a clone
3861 // of this Node in a previous iteration of the current loop, giving more
3862 // opportunity for cloning through the backedge. Because we sorted the
3863 // caller edges earlier so that cold caller edges are first, we would have
3864 // visited and cloned this node for any unambiguously cold non-recursive
3865 // callers before any ambiguous backedge callers. Note that we don't do this
3866 // if the caller is already cloned or visited during cloning (e.g. via a
3867 // different context path from the allocation).
3868 // TODO: Can we do better in the case where the caller was already visited?
3869 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
3870 !Visited.count(CallerEdge->Caller)) {
3871 const auto OrigIdCount = CallerEdge->getContextIds().size();
3872 // Now do the recursive cloning of this backedge's caller, which was
3873 // deferred earlier.
3874 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
3875 removeNoneTypeCalleeEdges(CallerEdge->Caller);
3876 // See if the recursive call to identifyClones moved the context ids to a
3877 // new edge from this node to a clone of caller, and switch to looking at
3878 // that new edge so that we clone Node for the new caller clone.
3879 bool UpdatedEdge = false;
3880 if (OrigIdCount > CallerEdge->getContextIds().size()) {
3881 for (auto E : Node->CallerEdges) {
3882 // Only interested in clones of the current edges caller.
3883 if (E->Caller->CloneOf != CallerEdge->Caller)
3884 continue;
3885 // See if this edge contains any of the context ids originally on the
3886 // current caller edge.
3887 auto CallerEdgeContextsForAllocNew =
3888 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
3889 if (CallerEdgeContextsForAllocNew.empty())
3890 continue;
3891 // Make sure we don't pick a previously existing caller edge of this
3892 // Node, which would be processed on a different iteration of the
3893 // outer loop over the saved CallerEdges.
3894 if (llvm::is_contained(CallerEdges, E))
3895 continue;
3896 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
3897 // are updated further below for all cases where we just invoked
3898 // identifyClones recursively.
3899 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
3900 CallerEdge = E;
3901 UpdatedEdge = true;
3902 break;
3903 }
3904 }
3905 // If cloning removed this edge (and we didn't update it to a new edge
3906 // above), we're done with this edge. It's possible we moved all of the
3907 // context ids to an existing clone, in which case there's no need to do
3908 // further processing for them.
3909 if (CallerEdge->isRemoved())
3910 continue;
3911
3912 // Now we need to update the information used for the cloning decisions
3913 // further below, as we may have modified edges and their context ids.
3914
3915 // Note if we changed the CallerEdge above we would have already updated
3916 // the context ids.
3917 if (!UpdatedEdge) {
3918 CallerEdgeContextsForAlloc = set_intersection(
3919 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
3920 if (CallerEdgeContextsForAlloc.empty())
3921 continue;
3922 }
3923 // Update the other information that depends on the edges and on the now
3924 // updated CallerEdgeContextsForAlloc.
3925 CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3926 CalleeEdgeAllocTypesForCallerEdge.clear();
3927 for (auto &CalleeEdge : Node->CalleeEdges) {
3928 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3929 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3930 }
3931 }
3932
3933 // First see if we can use an existing clone. Check each clone and its
3934 // callee edges for matching alloc types.
3935 ContextNode *Clone = nullptr;
3936 for (auto *CurClone : Node->Clones) {
3937 if (allocTypeToUse(CurClone->AllocTypes) !=
3938 allocTypeToUse(CallerAllocTypeForAlloc))
3939 continue;
3940
3941 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3942 hasSingleAllocType(CallerAllocTypeForAlloc);
3943 // The above check should mean that if both have single alloc types that
3944 // they should be equal.
3945 assert(!BothSingleAlloc ||
3946 CurClone->AllocTypes == CallerAllocTypeForAlloc);
3947
3948 // If either both have a single alloc type (which are the same), or if the
3949 // clone's callee edges have the same alloc types as those for the current
3950 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3951 // then we can reuse this clone.
3952 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3953 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3954 Clone = CurClone;
3955 break;
3956 }
3957 }
3958
3959 // The edge iterator is adjusted when we move the CallerEdge to the clone.
3960 if (Clone)
3961 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
3962 CallerEdgeContextsForAlloc);
3963 else
3964 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
3965
3966 // Sanity check that no alloc types on clone or its edges are None.
3967 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3968 }
3969
3970 // We should still have some context ids on the original Node.
3971 assert(!Node->emptyContextIds());
3972
3973 // Sanity check that no alloc types on node or edges are None.
3974 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3975
3976 if (VerifyNodes)
3977 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3978}
3979
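// One detail of identifyClones above worth isolating: caller edges are ordered
// with llvm::stable_sort by a fixed alloc-type priority so that Cold edges are
// cloned first and NotCold edges sort last (and thus stay on the original
// node). A minimal, hypothetical sketch of that ordering on bare alloc-type
// values (not part of this pass):
static void sortByCloningPrioritySketch(std::vector<uint8_t> &EdgeAllocTypes) {
  // Indexed by the alloc type bits: None=0, NotCold=1, Cold=2, NotCold|Cold=3.
  static const unsigned Priority[] = {/*None*/ 3, /*NotCold*/ 4, /*Cold*/ 1,
                                      /*NotColdCold*/ 2};
  llvm::stable_sort(EdgeAllocTypes, [](uint8_t A, uint8_t B) {
    return Priority[A] < Priority[B];
  });
}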
3980void ModuleCallsiteContextGraph::updateAllocationCall(
3981 CallInfo &Call, AllocationType AllocType) {
3982 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
3984 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
3985 "memprof", AllocTypeString);
3986 cast<CallBase>(Call.call())->addFnAttr(A);
3987 OREGetter(Call.call()->getFunction())
3988 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3989 << ore::NV("AllocationCall", Call.call()) << " in clone "
3990 << ore::NV("Caller", Call.call()->getFunction())
3991 << " marked with memprof allocation attribute "
3992 << ore::NV("Attribute", AllocTypeString));
3993}
3994
3995void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
3996 AllocationType AllocType) {
3997 auto *AI = cast<AllocInfo *>(Call.call());
3998 assert(AI);
3999 assert(AI->Versions.size() > Call.cloneNo());
4000 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
4001}
4002
4003 AllocationType
4004 ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4005 const auto *CB = cast<CallBase>(Call.call());
4006 if (!CB->getAttributes().hasFnAttr("memprof"))
4007 return AllocationType::None;
4008 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4009 ? AllocationType::Cold
4010 : AllocationType::NotCold;
4011}
4012
4013 AllocationType
4014 IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4015 const auto *AI = cast<AllocInfo *>(Call.call());
4016 assert(AI->Versions.size() > Call.cloneNo());
4017 return (AllocationType)AI->Versions[Call.cloneNo()];
4018}
4019
4020void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4021 FuncInfo CalleeFunc) {
4022 auto *CurF = getCalleeFunc(CallerCall.call());
4023 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4024 if (isMemProfClone(*CurF)) {
4025 // If we already assigned this callsite to call a specific non-default
4026 // clone (i.e. not the original function which is clone 0), ensure that we
4027 // aren't trying to now update it to call a different clone, which is
4028 // indicative of a bug in the graph or function assignment.
4029 auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
4030 if (CurCalleeCloneNo != NewCalleeCloneNo) {
4031 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4032 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4033 << "\n");
4034 MismatchedCloneAssignments++;
4035 }
4036 }
4037 if (NewCalleeCloneNo > 0)
4038 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
4039 OREGetter(CallerCall.call()->getFunction())
4040 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
4041 << ore::NV("Call", CallerCall.call()) << " in clone "
4042 << ore::NV("Caller", CallerCall.call()->getFunction())
4043 << " assigned to call function clone "
4044 << ore::NV("Callee", CalleeFunc.func()));
4045}
4046
4047void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4048 FuncInfo CalleeFunc) {
4049 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
4050 assert(CI &&
4051 "Caller cannot be an allocation which should not have profiled calls");
4052 assert(CI->Clones.size() > CallerCall.cloneNo());
4053 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4054 auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
4055 // If we already assigned this callsite to call a specific non-default
4056 // clone (i.e. not the original function which is clone 0), ensure that we
4057 // aren't trying to now update it to call a different clone, which is
4058 // indicative of a bug in the graph or function assignment.
4059 if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
4060 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4061 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4062 << "\n");
4063 MismatchedCloneAssignments++;
4064 }
4065 CurCalleeCloneNo = NewCalleeCloneNo;
4066}
4067
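// Both updateCall overloads above apply essentially the same guard: a callsite
// may go from the default clone (0) to a specific clone, but an attempt to
// change an already-assigned non-default clone number indicates a bug and is
// counted. A minimal, hypothetical sketch of that guard on a bare slot (not
// part of this pass):
static void assignCalleeCloneNoSketch(unsigned &AssignedCloneNo,
                                      unsigned NewCloneNo,
                                      unsigned &MismatchCount) {
  if (AssignedCloneNo != 0 && AssignedCloneNo != NewCloneNo)
    ++MismatchCount; // Mirrors the MismatchedCloneAssignments statistic.
  AssignedCloneNo = NewCloneNo;
}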
4068// Update the debug information attached to NewFunc to use the clone Name. Note
4069// this needs to be done for both any existing DISubprogram for the definition,
4070// as well as any separate declaration DISubprogram.
4071static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
4072 assert(Name == NewFunc->getName());
4073 auto *SP = NewFunc->getSubprogram();
4074 if (!SP)
4075 return;
4076 auto *MDName = MDString::get(NewFunc->getParent()->getContext(), Name);
4077 SP->replaceLinkageName(MDName);
4078 DISubprogram *Decl = SP->getDeclaration();
4079 if (!Decl)
4080 return;
4081 TempDISubprogram NewDecl = Decl->clone();
4082 NewDecl->replaceLinkageName(MDName);
4083 SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl)));
4084}
4085
4086CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4087 Instruction *>::FuncInfo
4088ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4089 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4090 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4091 // Use existing LLVM facilities for cloning and obtaining Call in clone
4092 ValueToValueMapTy VMap;
4093 auto *NewFunc = CloneFunction(Func.func(), VMap);
4094 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4095 assert(!Func.func()->getParent()->getFunction(Name));
4096 NewFunc->setName(Name);
4097 updateSubprogramLinkageName(NewFunc, Name);
4098 for (auto &Inst : CallsWithMetadataInFunc) {
4099 // This map always has the initial version in it.
4100 assert(Inst.cloneNo() == 0);
4101 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4102 }
4103 OREGetter(Func.func())
4104 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4105 << "created clone " << ore::NV("NewFunction", NewFunc));
4106 return {NewFunc, CloneNo};
4107}
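// Note (illustrative): getMemProfFuncName produces clone names of the form
// "<original-name>.memprof.<CloneNo>", e.g. clone 2 of "foo" becomes
// "foo.memprof.2". Clone number 0 always refers to the original function.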
4108
4109CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4110 IndexCall>::FuncInfo
4111IndexCallsiteContextGraph::cloneFunctionForCallsite(
4112 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4113 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4114 // Check how many clones we have of Call (and therefore function).
4115 // The next clone number is the current size of versions array.
4116 // Confirm this matches the CloneNo provided by the caller, which is based on
4117 // the number of function clones we have.
4118 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4119 ? cast<AllocInfo *>(Call.call())->Versions.size()
4120 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4121 // Walk all the instructions in this function. Create a new version for
4122 // each (by adding an entry to the Versions/Clones summary array), and copy
4123 // over the version being called for the function clone being cloned here.
4124 // Additionally, add an entry to the CallMap for the new function clone,
4125 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4126 // to the new call clone.
4127 for (auto &Inst : CallsWithMetadataInFunc) {
4128 // This map always has the initial version in it.
4129 assert(Inst.cloneNo() == 0);
4130 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4131 assert(AI->Versions.size() == CloneNo);
4132 // We assign the allocation type later (in updateAllocationCall), just add
4133 // an entry for it here.
4134 AI->Versions.push_back(0);
4135 } else {
4136 auto *CI = cast<CallsiteInfo *>(Inst.call());
4137 assert(CI && CI->Clones.size() == CloneNo);
4138 // We assign the clone number later (in updateCall), just add an entry for
4139 // it here.
4140 CI->Clones.push_back(0);
4141 }
4142 CallMap[Inst] = {Inst.call(), CloneNo};
4143 }
4144 return {Func.func(), CloneNo};
4145}
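// Illustrative effect on the summary: if a function currently has N recorded
// versions, creating clone N appends one entry (initialized to 0 here) to each
// AllocInfo::Versions and CallsiteInfo::Clones array in the function; the real
// values are filled in later by updateAllocationCall and updateCall. No IR is
// cloned on the index path; that happens in the ThinLTO backend via
// createFunctionClones below.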
4146
4147// We perform cloning for each allocation node separately. However, this
4148// sometimes results in a situation where the same node calls multiple
4149// clones of the same callee, created for different allocations. This
4150// causes issues when assigning functions to these clones, as each node can
4151// in reality only call a single callee clone.
4152//
4153// To address this, before assigning functions, merge callee clone nodes as
4154// needed using a post order traversal from the allocations. We attempt to
4155// use existing clones as the merge node when legal, and to share them
4156// among callers with the same properties (callers calling the same set of
4157// callee clone nodes for the same allocations).
4158//
4159// Without this fix, in some cases incorrect function assignment will lead
4160// to calling the wrong allocation clone.
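// For example (illustrative): if callsite node C ends up with callee edges to
// two clones of the same original callee B, created while cloning for two
// different allocations, the single call recorded on C can only be redirected
// to one function clone. Merging gives C a single callee (an existing clone
// when legal, otherwise a new merge node) so that function assignment below
// remains consistent.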
4161template <typename DerivedCCG, typename FuncTy, typename CallTy>
4162void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4163 if (!MergeClones)
4164 return;
4165
4166 // Generate a map from context id to the associated allocation node for use
4167 // when merging clones.
4168 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4169 for (auto &Entry : AllocationCallToContextNodeMap) {
4170 auto *Node = Entry.second;
4171 for (auto Id : Node->getContextIds())
4172 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4173 for (auto *Clone : Node->Clones) {
4174 for (auto Id : Clone->getContextIds())
4175 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4176 }
4177 }
4178
4179 // Post order traversal starting from allocations to ensure each callsite
4180 // calls a single clone of its callee. Callee nodes that are clones of each
4181 // other are merged (via new merge nodes if needed) to achieve this.
4182 DenseSet<const ContextNode *> Visited;
4183 for (auto &Entry : AllocationCallToContextNodeMap) {
4184 auto *Node = Entry.second;
4185
4186 mergeClones(Node, Visited, ContextIdToAllocationNode);
4187
4188 // Make a copy so the recursive post order traversal that may create new
4189 // clones doesn't mess up iteration. Note that the recursive traversal
4190 // itself does not call mergeClones on any of these nodes, which are all
4191 // (clones of) allocations.
4192 auto Clones = Node->Clones;
4193 for (auto *Clone : Clones)
4194 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4195 }
4196
4197 if (DumpCCG) {
4198 dbgs() << "CCG after merging:\n";
4199 dbgs() << *this;
4200 }
4201 if (ExportToDot)
4202 exportToDot("aftermerge");
4203
4204 if (VerifyCCG) {
4205 check();
4206 }
4207}
4208
4209// Recursive helper for above mergeClones method.
4210template <typename DerivedCCG, typename FuncTy, typename CallTy>
4211void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4212 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4213 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4214 auto Inserted = Visited.insert(Node);
4215 if (!Inserted.second)
4216 return;
4217
4218 // Iteratively perform merging on this node to handle new caller nodes created
4219 // during the recursive traversal. We could do something more elegant such as
4220 // maintaining a worklist, but this is a simple approach that doesn't cause a
4221 // measurable compile time effect, as most nodes don't have many caller
4222 // edges to check.
4223 bool FoundUnvisited = true;
4224 unsigned Iters = 0;
4225 while (FoundUnvisited) {
4226 Iters++;
4227 FoundUnvisited = false;
4228 // Make a copy since the recursive call may move a caller edge to a new
4229 // callee, messing up the iterator.
4230 auto CallerEdges = Node->CallerEdges;
4231 for (auto CallerEdge : CallerEdges) {
4232 // Skip any caller edge moved onto a different callee during recursion.
4233 if (CallerEdge->Callee != Node)
4234 continue;
4235 // If we found an unvisited caller, note that we should check the caller
4236 // edges again as mergeClones may add or change caller nodes.
4237 if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
4238 FoundUnvisited = true;
4239 mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4240 }
4241 }
4242
4243 TotalMergeInvokes++;
4244 TotalMergeIters += Iters;
4245 if (Iters > MaxMergeIters)
4246 MaxMergeIters = Iters;
4247
4248 // Merge for this node after we handle its callers.
4249 mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4250}
4251
4252template <typename DerivedCCG, typename FuncTy, typename CallTy>
4253void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4254 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4255 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4256 // Ignore Node if we moved all of its contexts to clones.
4257 if (Node->emptyContextIds())
4258 return;
4259
4260 // First identify groups of clones among Node's callee edges, by building
4261 // a map from each callee base node to the associated callee edges from Node.
4262 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4263 OrigNodeToCloneEdges;
4264 for (const auto &E : Node->CalleeEdges) {
4265 auto *Callee = E->Callee;
4266 if (!Callee->CloneOf && Callee->Clones.empty())
4267 continue;
4268 ContextNode *Base = Callee->getOrigNode();
4269 OrigNodeToCloneEdges[Base].push_back(E);
4270 }
4271
4272 // Helper for callee edge sorting below. Returns true if A's callee has fewer
4273 // caller edges than B's; ties are broken by ordering clones before non-clones,
4274 // and then by the smaller first context id.
4275 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4276 const std::shared_ptr<ContextEdge> &B) {
4277 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4278 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4279 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4280 return true;
4281 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4282 return false;
4283 // Use the first context id for each edge as a
4284 // tie-breaker.
4285 return *A->ContextIds.begin() < *B->ContextIds.begin();
4286 };
4287
4288 // Process each set of callee clones called by Node, performing the needed
4289 // merging.
4290 for (auto Entry : OrigNodeToCloneEdges) {
4291 // CalleeEdges is the set of edges from Node reaching callees that are
4292 // mutual clones of each other.
4293 auto &CalleeEdges = Entry.second;
4294 auto NumCalleeClones = CalleeEdges.size();
4295 // A single edge means there is no merging needed.
4296 if (NumCalleeClones == 1)
4297 continue;
4298 // Sort the CalleeEdges calling this group of clones in ascending order of
4299 // their caller edge counts, breaking ties by ordering callee clones before
4300 // the original non-clone node. This simplifies finding an existing node to
4301 // use as the merge node.
4302 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4303
4304 /// Find other callers of the given set of callee edges that can
4305 /// share the same callee merge node. See the comments at this method
4306 /// definition for details.
4307 DenseSet<ContextNode *> OtherCallersToShareMerge;
4308 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4309 OtherCallersToShareMerge);
4310
4311 // Now do the actual merging. Identify existing or create a new MergeNode
4312 // during the first iteration. Move each callee over, along with edges from
4313 // other callers we've determined above can share the same merge node.
4314 ContextNode *MergeNode = nullptr;
4315 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4316 for (auto CalleeEdge : CalleeEdges) {
4317 auto *OrigCallee = CalleeEdge->Callee;
4318 // If we don't have a MergeNode yet (only happens on the first iteration,
4319 // as a new one will be created when we go to move the first callee edge
4320 // over as needed), see if we can use this callee.
4321 if (!MergeNode) {
4322 // If there are no other callers, simply use this callee.
4323 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4324 MergeNode = OrigCallee;
4325 NonNewMergedNodes++;
4326 continue;
4327 }
4328 // Otherwise, if we have identified other caller nodes that can share
4329 // the merge node with Node, see if all of OrigCallee's callers are
4330 // going to share the same merge node. In that case we can use callee
4331 // (since all of its callers would move to the new merge node).
4332 if (!OtherCallersToShareMerge.empty()) {
4333 bool MoveAllCallerEdges = true;
4334 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4335 if (CalleeCallerE == CalleeEdge)
4336 continue;
4337 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4338 MoveAllCallerEdges = false;
4339 break;
4340 }
4341 }
4342 // If we are going to move all callers over, we can use this callee as
4343 // the MergeNode.
4344 if (MoveAllCallerEdges) {
4345 MergeNode = OrigCallee;
4346 NonNewMergedNodes++;
4347 continue;
4348 }
4349 }
4350 }
4351 // Move this callee edge, creating a new merge node if necessary.
4352 if (MergeNode) {
4353 assert(MergeNode != OrigCallee);
4354 moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
4355 /*NewClone*/ false);
4356 } else {
4357 MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
4358 NewMergedNodes++;
4359 }
4360 // Now move all identified edges from other callers over to the merge node
4361 // as well.
4362 if (!OtherCallersToShareMerge.empty()) {
4363 // Make and iterate over a copy of OrigCallee's caller edges because
4364 // some of these will be moved off of the OrigCallee and that would mess
4365 // up the iteration from OrigCallee.
4366 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4367 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4368 if (CalleeCallerE == CalleeEdge)
4369 continue;
4370 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4371 continue;
4372 CallerToMoveCount[CalleeCallerE->Caller]++;
4373 moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
4374 /*NewClone*/ false);
4375 }
4376 }
4377 removeNoneTypeCalleeEdges(OrigCallee);
4378 removeNoneTypeCalleeEdges(MergeNode);
4379 }
4380 }
4381}
4382
4383// Look for other nodes that have edges to the same set of callee
4384// clones as the current Node. Those can share the eventual merge node
4385// (reducing cloning and binary size overhead) iff:
4386// - they have edges to the same set of callee clones
4387// - each callee edge reaches a subset of the same allocations as Node's
4388// corresponding edge to the same callee clone.
4389// The second requirement is to ensure that we don't undo any of the
4390// necessary cloning to distinguish contexts with different allocation
4391// behavior.
4392// FIXME: This is somewhat conservative, as we really just need to ensure
4393// that they don't reach the same allocations as contexts on edges from Node
4394// going to any of the *other* callee clones being merged. However, that
4395// requires more tracking and checking to get right.
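// For example (illustrative): if caller nodes X (the current Node) and Y both
// have edges to the same two callee clones B1 and B2, and every context on
// Y's edge to B1 (respectively B2) reaches an allocation also reached via X's
// edge to B1 (respectively B2), then Y can reuse the merge node built for X's
// edges instead of forcing additional cloning later.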
4396template <typename DerivedCCG, typename FuncTy, typename CallTy>
4397void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
4398 findOtherCallersToShareMerge(
4399 ContextNode *Node,
4400 std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
4401 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
4402 DenseSet<ContextNode *> &OtherCallersToShareMerge) {
4403 auto NumCalleeClones = CalleeEdges.size();
4404 // For each other caller node, this map counts how many of the callee clones
4405 // in CalleeEdges it also has an edge to.
4406 DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
4407 // Counts the number of other caller nodes that have edges to all callee
4408 // clones that don't violate the allocation context checking.
4409 unsigned PossibleOtherCallerNodes = 0;
4410
4411 // We only need to look at other Caller nodes if the first callee edge has
4412 // multiple callers (recall they are sorted in ascending order above).
4413 if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
4414 return;
4415
4416 // For each callee edge:
4417 // - Collect the count of other caller nodes calling the same callees.
4418 // - Collect the alloc nodes reached by contexts on each callee edge.
4419 DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
4420 for (auto CalleeEdge : CalleeEdges) {
4421 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4422 // For each other caller of the same callee, increment the count of
4423 // edges reaching the same callee clone.
4424 for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
4425 if (CalleeCallerEdges->Caller == Node) {
4426 assert(CalleeCallerEdges == CalleeEdge);
4427 continue;
4428 }
4429 OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
4430 // If this caller edge now reaches all of the same callee clones,
4431 // increment the count of candidate other caller nodes.
4432 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
4433 NumCalleeClones)
4434 PossibleOtherCallerNodes++;
4435 }
4436 // Collect the alloc nodes reached by contexts on each callee edge, for
4437 // later analysis.
4438 for (auto Id : CalleeEdge->getContextIds()) {
4439 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4440 if (!Alloc) {
4441 // FIXME: unclear why this happens occasionally, presumably
4442 // imperfect graph updates possibly with recursion.
4443 MissingAllocForContextId++;
4444 continue;
4445 }
4446 CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
4447 }
4448 }
4449
4450 // Now walk the callee edges again, and make sure that for each candidate
4451 // caller node all of its edges to the callees reach the same allocs (or
4452 // a subset) as those along the corresponding callee edge from Node.
4453 for (auto CalleeEdge : CalleeEdges) {
4454 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4455 // Stop if we do not have any (more) candidate other caller nodes.
4456 if (!PossibleOtherCallerNodes)
4457 break;
4458 auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
4459 // Check each other caller of this callee clone.
4460 for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
4461 // Not interested in the callee edge from Node itself.
4462 if (CalleeCallerE == CalleeEdge)
4463 continue;
4464 // Skip any callers that didn't have callee edges to all the same
4465 // callee clones.
4466 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
4467 NumCalleeClones)
4468 continue;
4469 // Make sure that each context along edge from candidate caller node
4470 // reaches an allocation also reached by this callee edge from Node.
4471 for (auto Id : CalleeCallerE->getContextIds()) {
4472 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4473 if (!Alloc)
4474 continue;
4475 // If not, simply reset the map entry to 0 so caller is ignored, and
4476 // reduce the count of candidate other caller nodes.
4477 if (!CurCalleeAllocNodes.contains(Alloc)) {
4478 OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
4479 PossibleOtherCallerNodes--;
4480 break;
4481 }
4482 }
4483 }
4484 }
4485
4486 if (!PossibleOtherCallerNodes)
4487 return;
4488
4489 // Build the set of other caller nodes that can use the same callee merge
4490 // node.
4491 for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
4492 if (Count != NumCalleeClones)
4493 continue;
4494 OtherCallersToShareMerge.insert(OtherCaller);
4495 }
4496}
4497
4498// This method assigns cloned callsites to functions, cloning the functions as
4499// needed. The assignment is greedy and proceeds roughly as follows:
4500//
4501// For each function Func:
4502// For each call with graph Node having clones:
4503// Initialize ClonesWorklist to Node and its clones
4504// Initialize NodeCloneCount to 0
4505// While ClonesWorklist is not empty:
4506// Clone = pop front ClonesWorklist
4507// NodeCloneCount++
4508// If Func has been cloned less than NodeCloneCount times:
4509// If NodeCloneCount is 1:
4510// Assign Clone to original Func
4511// Continue
4512// Create a new function clone
4513// If other callers not assigned to call a function clone yet:
4514// Assign them to call new function clone
4515// Continue
4516// Assign any other caller calling the cloned version to new clone
4517//
4518// For each caller of Clone:
4519// If caller is assigned to call a specific function clone:
4520// If we cannot assign Clone to that function clone:
4521// Create new callsite Clone NewClone
4522// Add NewClone to ClonesWorklist
4523// Continue
4524// Assign Clone to existing caller's called function clone
4525// Else:
4526// If Clone not already assigned to a function clone:
4527// Assign to first function clone without assignment
4528// Assign caller to selected function clone
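// In other words (illustrative): each function clone ends up holding a
// consistent combination of callsite clones, and additional function clones
// are created only when two callers require different combinations of
// callsite clones within the same function.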
4529template <typename DerivedCCG, typename FuncTy, typename CallTy>
4530bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4531 bool Changed = false;
4532
4533 mergeClones();
4534
4535 // Keep track of the assignment of nodes (callsites) to function clones they
4536 // call.
4537 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4538
4539 // Update caller node to call function version CalleeFunc, by recording the
4540 // assignment in CallsiteToCalleeFuncCloneMap.
4541 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4542 const FuncInfo &CalleeFunc) {
4543 assert(Caller->hasCall());
4544 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4545 };
4546
4547 // Information for a single clone of this Func.
4548 struct FuncCloneInfo {
4549 // The function clone.
4550 FuncInfo FuncClone;
4551 // Remappings of each call of interest (from original uncloned call to the
4552 // corresponding cloned call in this function clone).
4553 DenseMap<CallInfo, CallInfo> CallMap;
4554 };
4555
4556 // Walk all functions for which we saw calls with memprof metadata, and handle
4557 // cloning for each of their calls.
4558 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4559 FuncInfo OrigFunc(Func);
4560 // Map from each clone number of OrigFunc to information about that function
4561 // clone (the function clone FuncInfo and call remappings). The index into
4562 // the vector is the clone number, as function clones are created and
4563 // numbered sequentially.
4564 std::vector<FuncCloneInfo> FuncCloneInfos;
4565 for (auto &Call : CallsWithMetadata) {
4566 ContextNode *Node = getNodeForInst(Call);
4567 // Skip call if we do not have a node for it (all uses of its stack ids
4568 // were either on inlined chains or pruned from the MIBs), or if we did
4569 // not create any clones for it.
4570 if (!Node || Node->Clones.empty())
4571 continue;
4572 assert(Node->hasCall() &&
4573 "Not having a call should have prevented cloning");
4574
4575 // Track the assignment of function clones to clones of the current
4576 // callsite Node being handled.
4577 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4578
4579 // Assign callsite version CallsiteClone to function version FuncClone,
4580 // and also assign (possibly cloned) Call to CallsiteClone.
4581 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4582 CallInfo &Call,
4583 ContextNode *CallsiteClone,
4584 bool IsAlloc) {
4585 // Record the clone of callsite node assigned to this function clone.
4586 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4587
4588 assert(FuncCloneInfos.size() > FuncClone.cloneNo());
4589 DenseMap<CallInfo, CallInfo> &CallMap =
4590 FuncCloneInfos[FuncClone.cloneNo()].CallMap;
4591 CallInfo CallClone(Call);
4592 if (auto It = CallMap.find(Call); It != CallMap.end())
4593 CallClone = It->second;
4594 CallsiteClone->setCall(CallClone);
4595 // Need to do the same for all matching calls.
4596 for (auto &MatchingCall : Node->MatchingCalls) {
4597 CallInfo CallClone(MatchingCall);
4598 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4599 CallClone = It->second;
4600 // Updates the call in the list.
4601 MatchingCall = CallClone;
4602 }
4603 };
4604
4605 // Keep track of the clones of callsite Node that need to be assigned to
4606 // function clones. This list may be expanded in the loop body below if we
4607 // find additional cloning is required.
4608 std::deque<ContextNode *> ClonesWorklist;
4609 // Ignore original Node if we moved all of its contexts to clones.
4610 if (!Node->emptyContextIds())
4611 ClonesWorklist.push_back(Node);
4612 llvm::append_range(ClonesWorklist, Node->Clones);
4613
4614 // Now walk through all of the clones of this callsite Node that we need,
4615 // and determine the assignment to a corresponding clone of the current
4616 // function (creating new function clones as needed).
4617 unsigned NodeCloneCount = 0;
4618 while (!ClonesWorklist.empty()) {
4619 ContextNode *Clone = ClonesWorklist.front();
4620 ClonesWorklist.pop_front();
4621 NodeCloneCount++;
4622 if (VerifyNodes)
4624
4625 // Need to create a new function clone if we have more callsite clones
4626 // than existing function clones, which would have been assigned to an
4627 // earlier clone in the list (we assign callsite clones to function
4628 // clones greedily).
4629 if (FuncCloneInfos.size() < NodeCloneCount) {
4630 // If this is the first callsite copy, assign to original function.
4631 if (NodeCloneCount == 1) {
4632 // Since FuncCloneInfos is empty in this case, no clones have
4633 // been created for this function yet, and no callers should have
4634 // been assigned a function clone for this callee node yet.
4635 assert(llvm::none_of(
4636 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4637 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4638 }));
4639 // Initialize with empty call map, assign Clone to original function
4640 // and its callers, and skip to the next clone.
4641 FuncCloneInfos.push_back(
4642 {OrigFunc, DenseMap<CallInfo, CallInfo>()});
4643 AssignCallsiteCloneToFuncClone(
4644 OrigFunc, Call, Clone,
4645 AllocationCallToContextNodeMap.count(Call));
4646 for (auto &CE : Clone->CallerEdges) {
4647 // Ignore any caller that does not have a recorded callsite Call.
4648 if (!CE->Caller->hasCall())
4649 continue;
4650 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
4651 }
4652 continue;
4653 }
4654
4655 // First locate which copy of OrigFunc to clone again. If a caller
4656 // of this callsite clone was already assigned to call a particular
4657 // function clone, we need to redirect all of those callers to the
4658 // new function clone, and update their other callees within this
4659 // function.
4660 FuncInfo PreviousAssignedFuncClone;
4661 auto EI = llvm::find_if(
4662 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4663 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4664 });
4665 bool CallerAssignedToCloneOfFunc = false;
4666 if (EI != Clone->CallerEdges.end()) {
4667 const std::shared_ptr<ContextEdge> &Edge = *EI;
4668 PreviousAssignedFuncClone =
4669 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4670 CallerAssignedToCloneOfFunc = true;
4671 }
4672
4673 // Clone function and save it along with the CallInfo map created
4674 // during cloning in the FuncCloneInfos.
4675 DenseMap<CallInfo, CallInfo> NewCallMap;
4676 unsigned CloneNo = FuncCloneInfos.size();
4677 assert(CloneNo > 0 && "Clone 0 is the original function, which "
4678 "should already exist in the map");
4679 FuncInfo NewFuncClone = cloneFunctionForCallsite(
4680 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
4681 FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
4682 FunctionClonesAnalysis++;
4683 Changed = true;
4684
4685 // If no caller callsites were already assigned to a clone of this
4686 // function, we can simply assign this clone to the new func clone
4687 // and update all callers to it, then skip to the next clone.
4688 if (!CallerAssignedToCloneOfFunc) {
4689 AssignCallsiteCloneToFuncClone(
4690 NewFuncClone, Call, Clone,
4691 AllocationCallToContextNodeMap.count(Call));
4692 for (auto &CE : Clone->CallerEdges) {
4693 // Ignore any caller that does not have a recorded callsite Call.
4694 if (!CE->Caller->hasCall())
4695 continue;
4696 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4697 }
4698 continue;
4699 }
4700
4701 // We may need to do additional node cloning in this case.
4702 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
4703 // that were previously assigned to call PreviousAssignedFuncClone,
4704 // to record that they now call NewFuncClone.
4705 // The none type edge removal may remove some of this Clone's caller
4706 // edges, if it is reached via another of its caller's callees.
4707 // Iterate over a copy and skip any that were removed.
4708 auto CallerEdges = Clone->CallerEdges;
4709 for (auto CE : CallerEdges) {
4710 // Skip any that have been removed on an earlier iteration.
4711 if (CE->isRemoved()) {
4712 assert(!is_contained(Clone->CallerEdges, CE));
4713 continue;
4714 }
4715 assert(CE);
4716 // Ignore any caller that does not have a recorded callsite Call.
4717 if (!CE->Caller->hasCall())
4718 continue;
4719
4720 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
4721 // We subsequently fall through to later handling that
4722 // will perform any additional cloning required for
4723 // callers that were calling other function clones.
4724 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
4725 PreviousAssignedFuncClone)
4726 continue;
4727
4728 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4729
4730 // If we are cloning a function that was already assigned to some
4731 // callers, then essentially we are creating new callsite clones
4732 // of the other callsites in that function that are reached by those
4733 // callers. Clone the other callees of the current callsite's caller
4734 // that were already assigned to PreviousAssignedFuncClone
4735 // accordingly. This is important since we subsequently update the
4736 // calls from the nodes in the graph and their assignments to callee
4737 // functions recorded in CallsiteToCalleeFuncCloneMap.
4738 // The none type edge removal may remove some of this caller's
4739 // callee edges, if it is reached via another of its callees.
4740 // Iterate over a copy and skip any that were removed.
4741 auto CalleeEdges = CE->Caller->CalleeEdges;
4742 for (auto CalleeEdge : CalleeEdges) {
4743 // Skip any that have been removed on an earlier iteration when
4744 // cleaning up newly None type callee edges.
4745 if (CalleeEdge->isRemoved()) {
4746 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
4747 continue;
4748 }
4749 assert(CalleeEdge);
4750 ContextNode *Callee = CalleeEdge->Callee;
4751 // Skip the current callsite; we are looking for other
4752 // callsites Caller calls, as well as any that do not have a
4753 // recorded callsite Call.
4754 if (Callee == Clone || !Callee->hasCall())
4755 continue;
4756 // Skip direct recursive calls. We don't need/want to clone the
4757 // caller node again, and this loop will not behave as expected if
4758 // we tried.
4759 if (Callee == CalleeEdge->Caller)
4760 continue;
4761 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
4762 removeNoneTypeCalleeEdges(NewClone);
4763 // Moving the edge may have resulted in some none type
4764 // callee edges on the original Callee.
4765 removeNoneTypeCalleeEdges(Callee);
4766 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4767 // If the Callee node was already assigned to call a specific
4768 // function version, make sure its new clone is assigned to call
4769 // that same function clone.
4770 if (CallsiteToCalleeFuncCloneMap.count(Callee))
4771 RecordCalleeFuncOfCallsite(
4772 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
4773 // Update NewClone with the new Call clone of this callsite's Call
4774 // created for the new function clone created earlier.
4775 // Recall that we have already ensured when building the graph
4776 // that each caller can only call callsites within the same
4777 // function, so we are guaranteed that Callee Call is in the
4778 // current OrigFunc.
4779 // CallMap is set up as indexed by original Call at clone 0.
4780 CallInfo OrigCall(Callee->getOrigNode()->Call);
4781 OrigCall.setCloneNo(0);
4782 DenseMap<CallInfo, CallInfo> &CallMap =
4783 FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
4784 assert(CallMap.count(OrigCall));
4785 CallInfo NewCall(CallMap[OrigCall]);
4786 assert(NewCall);
4787 NewClone->setCall(NewCall);
4788 // Need to do the same for all matching calls.
4789 for (auto &MatchingCall : NewClone->MatchingCalls) {
4790 CallInfo OrigMatchingCall(MatchingCall);
4791 OrigMatchingCall.setCloneNo(0);
4792 assert(CallMap.count(OrigMatchingCall));
4793 CallInfo NewCall(CallMap[OrigMatchingCall]);
4794 assert(NewCall);
4795 // Updates the call in the list.
4796 MatchingCall = NewCall;
4797 }
4798 }
4799 }
4800 // Fall through to handling below to perform the recording of the
4801 // function for this callsite clone. This enables handling of cases
4802 // where the callers were assigned to different clones of a function.
4803 }
4804
4805 auto FindFirstAvailFuncClone = [&]() {
4806 // Find first function in FuncCloneInfos without an assigned
4807 // clone of this callsite Node. We should always have one
4808 // available at this point due to the earlier cloning when the
4809 // FuncCloneInfos size was smaller than the clone number.
4810 for (auto &CF : FuncCloneInfos) {
4811 if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
4812 return CF.FuncClone;
4813 }
4815 "Expected an available func clone for this callsite clone");
4816 };
4817
4818 // See if we can use existing function clone. Walk through
4819 // all caller edges to see if any have already been assigned to
4820 // a clone of this callsite's function. If we can use it, do so. If not,
4821 // because that function clone is already assigned to a different clone
4822 // of this callsite, then we need to clone again.
4823 // Basically, this checking is needed to handle the case where different
4824 // caller functions/callsites may need versions of this function
4825 // containing different mixes of callsite clones across the different
4826 // callsites within the function. If that happens, we need to create
4827 // additional function clones to handle the various combinations.
4828 //
4829 // Keep track of any new clones of this callsite created by the
4830 // following loop, as well as any existing clone that we decided to
4831 // assign this clone to.
4832 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
4833 FuncInfo FuncCloneAssignedToCurCallsiteClone;
4834 // Iterate over a copy of Clone's caller edges, since we may need to
4835 // remove edges in the moveEdgeTo* methods, and this simplifies the
4836 // handling and makes it less error-prone.
4837 auto CloneCallerEdges = Clone->CallerEdges;
4838 for (auto &Edge : CloneCallerEdges) {
4839 // Skip removed edges (due to direct recursive edges updated when
4840 // updating callee edges when moving an edge and subsequently
4841 // removed by call to removeNoneTypeCalleeEdges on the Clone).
4842 if (Edge->isRemoved())
4843 continue;
4844 // Ignore any caller that does not have a recorded callsite Call.
4845 if (!Edge->Caller->hasCall())
4846 continue;
4847 // If this caller already assigned to call a version of OrigFunc, need
4848 // to ensure we can assign this callsite clone to that function clone.
4849 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
4850 FuncInfo FuncCloneCalledByCaller =
4851 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4852 // First we need to confirm that this function clone is available
4853 // for use by this callsite node clone.
4854 //
4855 // While FuncCloneToCurNodeCloneMap is built only for this Node and
4856 // its callsite clones, one of those callsite clones X could have
4857 // been assigned to the same function clone called by Edge's caller
4858 // - if Edge's caller calls another callsite within Node's original
4859 // function, and that callsite has another caller reaching clone X.
4860 // We need to clone Node again in this case.
4861 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
4862 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
4863 Clone) ||
4864 // Detect when we have multiple callers of this callsite that
4865 // have already been assigned to specific, and different, clones
4866 // of OrigFunc (due to other unrelated callsites in Func they
4867 // reach via call contexts). Is this Clone of callsite Node
4868 // assigned to a different clone of OrigFunc? If so, clone Node
4869 // again.
4870 (FuncCloneAssignedToCurCallsiteClone &&
4871 FuncCloneAssignedToCurCallsiteClone !=
4872 FuncCloneCalledByCaller)) {
4873 // We need to use a different newly created callsite clone, in
4874 // order to assign it to another new function clone on a
4875 // subsequent iteration over the Clones array (adjusted below).
4876 // Note we specifically do not reset the
4877 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
4878 // when this new clone is processed later we know which version of
4879 // the function to copy (so that other callsite clones we have
4880 // assigned to that function clone are properly cloned over). See
4881 // comments in the function cloning handling earlier.
4882
4883 // Check if we already have cloned this callsite again while
4884 // walking through caller edges, for a caller calling the same
4885 // function clone. If so, we can move this edge to that new clone
4886 // rather than creating yet another new clone.
4887 if (FuncCloneToNewCallsiteCloneMap.count(
4888 FuncCloneCalledByCaller)) {
4889 ContextNode *NewClone =
4890 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
4891 moveEdgeToExistingCalleeClone(Edge, NewClone);
4892 // Cleanup any none type edges cloned over.
4893 removeNoneTypeCalleeEdges(NewClone);
4894 } else {
4895 // Create a new callsite clone.
4896 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4897 removeNoneTypeCalleeEdges(NewClone);
4898 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
4899 NewClone;
4900 // Add to list of clones and process later.
4901 ClonesWorklist.push_back(NewClone);
4902 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4903 }
4904 // Moving the caller edge may have resulted in some none type
4905 // callee edges.
4906 removeNoneTypeCalleeEdges(Clone);
4907 // We will handle the newly created callsite clone in a subsequent
4908 // iteration over this Node's Clones.
4909 continue;
4910 }
4911
4912 // Otherwise, we can use the function clone already assigned to this
4913 // caller.
4914 if (!FuncCloneAssignedToCurCallsiteClone) {
4915 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
4916 // Assign Clone to FuncCloneCalledByCaller
4917 AssignCallsiteCloneToFuncClone(
4918 FuncCloneCalledByCaller, Call, Clone,
4919 AllocationCallToContextNodeMap.count(Call));
4920 } else
4921 // Don't need to do anything - callsite is already calling this
4922 // function clone.
4923 assert(FuncCloneAssignedToCurCallsiteClone ==
4924 FuncCloneCalledByCaller);
4925
4926 } else {
4927 // We have not already assigned this caller to a version of
4928 // OrigFunc. Do the assignment now.
4929
4930 // First check if we have already assigned this callsite clone to a
4931 // clone of OrigFunc for another caller during this iteration over
4932 // its caller edges.
4933 if (!FuncCloneAssignedToCurCallsiteClone) {
4934 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
4935 assert(FuncCloneAssignedToCurCallsiteClone);
4936 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
4937 AssignCallsiteCloneToFuncClone(
4938 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4939 AllocationCallToContextNodeMap.count(Call));
4940 } else
4941 assert(FuncCloneToCurNodeCloneMap
4942 [FuncCloneAssignedToCurCallsiteClone] == Clone);
4943 // Update callers to record function version called.
4944 RecordCalleeFuncOfCallsite(Edge->Caller,
4945 FuncCloneAssignedToCurCallsiteClone);
4946 }
4947 }
4948 // If we didn't assign a function clone to this callsite clone yet, e.g.
4949 // none of its callers has a non-null call, do the assignment here.
4950 // We want to ensure that every callsite clone is assigned to some
4951 // function clone, so that the call updates below work as expected.
4952 // In particular if this is the original callsite, we want to ensure it
4953 // is assigned to the original function, otherwise the original function
4954 // will appear available for assignment to other callsite clones,
4955 // leading to unintended effects. For one, the unknown and not updated
4956 // callers will call into cloned paths leading to the wrong hints,
4957 // because they still call the original function (clone 0). Also,
4958 // because all callsites start out as being clone 0 by default, we can't
4959 // easily distinguish between callsites explicitly assigned to clone 0
4960 // vs those never assigned, which can lead to multiple updates of the
4961 // calls when invoking updateCall below, with mismatched clone values.
4962 // TODO: Add a flag to the callsite nodes or some other mechanism to
4963 // better distinguish and identify callsite clones that are not getting
4964 // assigned to function clones as expected.
4965 if (!FuncCloneAssignedToCurCallsiteClone) {
4966 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
4967 assert(FuncCloneAssignedToCurCallsiteClone &&
4968 "No available func clone for this callsite clone");
4969 AssignCallsiteCloneToFuncClone(
4970 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4971 /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
4972 }
4973 }
4974 if (VerifyCCG) {
4976 for (const auto &PE : Node->CalleeEdges)
4978 for (const auto &CE : Node->CallerEdges)
4980 for (auto *Clone : Node->Clones) {
4982 for (const auto &PE : Clone->CalleeEdges)
4984 for (const auto &CE : Clone->CallerEdges)
4986 }
4987 }
4988 }
4989 }
4990
4991 uint8_t BothTypes =
4992 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
4993
4994 auto UpdateCalls = [&](ContextNode *Node,
4995 DenseSet<const ContextNode *> &Visited,
4996 auto &&UpdateCalls) {
4997 auto Inserted = Visited.insert(Node);
4998 if (!Inserted.second)
4999 return;
5000
5001 for (auto *Clone : Node->Clones)
5002 UpdateCalls(Clone, Visited, UpdateCalls);
5003
5004 for (auto &Edge : Node->CallerEdges)
5005 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
5006
5007 // Skip if either no call to update, or if we ended up with no context ids
5008 // (we moved all edges onto other clones).
5009 if (!Node->hasCall() || Node->emptyContextIds())
5010 return;
5011
5012 if (Node->IsAllocation) {
5013 auto AT = allocTypeToUse(Node->AllocTypes);
5014 // If the allocation type is ambiguous, and more aggressive hinting
5015 // has been enabled via the MinClonedColdBytePercent flag, see if this
5016 // allocation should be hinted cold anyway because its fraction cold bytes
5017 // allocated is at least the given threshold.
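 // For example (illustrative): with MinClonedColdBytePercent set to 80, an
 // ambiguous (cold and not-cold) allocation whose contexts account for 850
 // cold bytes out of 1000 total bytes is still hinted cold, since
 // 850 * 100 >= 1000 * 80.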
5018 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
5019 !ContextIdToContextSizeInfos.empty()) {
5020 uint64_t TotalCold = 0;
5021 uint64_t Total = 0;
5022 for (auto Id : Node->getContextIds()) {
5023 auto TypeI = ContextIdToAllocationType.find(Id);
5024 assert(TypeI != ContextIdToAllocationType.end());
5025 auto CSI = ContextIdToContextSizeInfos.find(Id);
5026 if (CSI != ContextIdToContextSizeInfos.end()) {
5027 for (auto &Info : CSI->second) {
5028 Total += Info.TotalSize;
5029 if (TypeI->second == AllocationType::Cold)
5030 TotalCold += Info.TotalSize;
5031 }
5032 }
5033 }
5034 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
5035 AT = AllocationType::Cold;
5036 }
5037 updateAllocationCall(Node->Call, AT);
5038 assert(Node->MatchingCalls.empty());
5039 return;
5040 }
5041
5042 if (!CallsiteToCalleeFuncCloneMap.count(Node))
5043 return;
5044
5045 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
5046 updateCall(Node->Call, CalleeFunc);
5047 // Update all the matching calls as well.
5048 for (auto &Call : Node->MatchingCalls)
5049 updateCall(Call, CalleeFunc);
5050 };
5051
5052 // Performs DFS traversal starting from allocation nodes to update calls to
5053 // reflect cloning decisions recorded earlier. For regular LTO this will
5054 // update the actual calls in the IR to call the appropriate function clone
5055 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
5056 // are recorded in the summary entries.
5057 DenseSet<const ContextNode *> Visited;
5058 for (auto &Entry : AllocationCallToContextNodeMap)
5059 UpdateCalls(Entry.second, Visited, UpdateCalls);
5060
5061 return Changed;
5062}
5063
5064static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
5065 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
5066 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5067 &FuncToAliasMap) {
5068 // The first "clone" is the original copy, we should only call this if we
5069 // needed to create new clones.
5070 assert(NumClones > 1);
5071 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5072 VMaps.reserve(NumClones - 1);
5073 FunctionsClonedThinBackend++;
5074 for (unsigned I = 1; I < NumClones; I++) {
5075 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
5076 auto *NewF = CloneFunction(&F, *VMaps.back());
5077 FunctionClonesThinBackend++;
5078 // Strip memprof and callsite metadata from clone as they are no longer
5079 // needed.
5080 for (auto &BB : *NewF) {
5081 for (auto &Inst : BB) {
5082 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
5083 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
5084 }
5085 }
5086 std::string Name = getMemProfFuncName(F.getName(), I);
5087 auto *PrevF = M.getFunction(Name);
5088 if (PrevF) {
5089 // We might have created this when adjusting a callsite in another
5090 // function. It should be a declaration.
5091 assert(PrevF->isDeclaration());
5092 NewF->takeName(PrevF);
5093 PrevF->replaceAllUsesWith(NewF);
5094 PrevF->eraseFromParent();
5095 } else
5096 NewF->setName(Name);
5097 updateSubprogramLinkageName(NewF, Name);
5098 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5099 << "created clone " << ore::NV("NewFunction", NewF));
5100
5101 // Now handle aliases to this function, and clone those as well.
5102 if (!FuncToAliasMap.count(&F))
5103 continue;
5104 for (auto *A : FuncToAliasMap[&F]) {
5105 std::string Name = getMemProfFuncName(A->getName(), I);
5106 auto *PrevA = M.getNamedAlias(Name);
5107 auto *NewA = GlobalAlias::create(A->getValueType(),
5108 A->getType()->getPointerAddressSpace(),
5109 A->getLinkage(), Name, NewF);
5110 NewA->copyAttributesFrom(A);
5111 if (PrevA) {
5112 // We might have created this when adjusting a callsite in another
5113 // function. It should be a declaration.
5114 assert(PrevA->isDeclaration());
5115 NewA->takeName(PrevA);
5116 PrevA->replaceAllUsesWith(NewA);
5117 PrevA->eraseFromParent();
5118 }
5119 }
5120 }
5121 return VMaps;
5122}
5123
5124// Locate the summary for F. This is complicated by the fact that it might
5125// have been internalized or promoted.
5126static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
5127 const ModuleSummaryIndex *ImportSummary,
5128 const Function *CallingFunc = nullptr) {
5129 // FIXME: Ideally we would retain the original GUID in some fashion on the
5130 // function (e.g. as metadata), but for now do our best to locate the
5131 // summary without that information.
5132 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
5133 if (!TheFnVI)
5134 // See if the function was internalized, by checking the index directly with
5135 // the original name (this avoids the name adjustment done by getGUID() for
5136 // internal symbols).
5137 TheFnVI = ImportSummary->getValueInfo(
5139 if (TheFnVI)
5140 return TheFnVI;
5141 // Now query with the original name before any promotion was performed.
5142 StringRef OrigName =
5144 // When this pass is enabled, we always add thinlto_src_file provenance
5145 // metadata to imported function definitions, which allows us to recreate the
5146 // original internal symbol's GUID.
5147 auto SrcFileMD = F.getMetadata("thinlto_src_file");
5148 // If this is a call to an imported/promoted local for which we didn't import
5149 // the definition, the metadata will not exist on the declaration. However,
5150 // since we are doing this early, before any inlining in the LTO backend, we
5151 // can simply look at the metadata on the calling function which must have
5152 // been from the same module if F was an internal symbol originally.
5153 if (!SrcFileMD && F.isDeclaration()) {
5154 // We would only call this for a declaration for a direct callsite, in which
5155 // case the caller would have provided the calling function pointer.
5156 assert(CallingFunc);
5157 SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
5158 // If this is a promoted local (OrigName != F.getName()), since this is a
5159 // declaration, it must be imported from a different module and therefore we
5160 // should always find the metadata on its calling function. Any call to a
5161 // promoted local that came from this module should still be a definition.
5162 assert(SrcFileMD || OrigName == F.getName());
5163 }
5164 StringRef SrcFile = M.getSourceFileName();
5165 if (SrcFileMD)
5166 SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
5167 std::string OrigId = GlobalValue::getGlobalIdentifier(
5168 OrigName, GlobalValue::InternalLinkage, SrcFile);
5169 TheFnVI = ImportSummary->getValueInfo(
5171 // Internal func in original module may have gotten a numbered suffix if we
5172 // imported an external function with the same name. This happens
5173 // automatically during IR linking for naming conflicts. It would have to
5174 // still be internal in that case (otherwise it would have been renamed on
5175 // promotion in which case we wouldn't have a naming conflict).
5176 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5177 F.getName().contains('.')) {
5178 OrigName = F.getName().rsplit('.').first;
5179 OrigId = GlobalValue::getGlobalIdentifier(
5180 OrigName, GlobalValue::InternalLinkage, SrcFile);
5181 TheFnVI = ImportSummary->getValueInfo(
5183 }
5184 // The only way we may not have a VI is if this is a declaration created for
5185 // an imported reference. For distributed ThinLTO we may not have a VI for
5186 // such declarations in the distributed summary.
5187 assert(TheFnVI || F.isDeclaration());
5188 return TheFnVI;
5189}
5190
5191bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5192 Module &M) {
5193 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5194 Symtab = std::make_unique<InstrProfSymtab>();
5195 // Don't add canonical names, to avoid mapping multiple functions to the
5196 // symtab when they have the same root name once "." suffixes are stripped.
5197 // If we pick the wrong one then this could lead to incorrect ICP and calling
5198 // a memprof clone that we don't actually create (resulting in linker unsats).
5199 // What this means is that the GUID of the function (or its PGOFuncName
5200 // metadata) *must* match that in the VP metadata to allow promotion.
5201 // In practice this should not be a limitation, since local functions should
5202 // have PGOFuncName metadata and global function names shouldn't need any
5203 // special handling (they should not get the ".llvm.*" suffix that the
5204 // canonicalization handling is attempting to strip).
5205 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5206 std::string SymtabFailure = toString(std::move(E));
5207 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5208 return false;
5209 }
5210 return true;
5211}
5212
5213#ifndef NDEBUG
5214// Sanity check that the MIB stack ids match between the summary and
5215// instruction metadata.
5217 const AllocInfo &AllocNode, const MDNode *MemProfMD,
5218 const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
5219 const ModuleSummaryIndex *ImportSummary) {
5220 auto MIBIter = AllocNode.MIBs.begin();
5221 for (auto &MDOp : MemProfMD->operands()) {
5222 assert(MIBIter != AllocNode.MIBs.end());
5223 auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
5224 auto *MIBMD = cast<const MDNode>(MDOp);
5225 MDNode *StackMDNode = getMIBStackNode(MIBMD);
5226 assert(StackMDNode);
5227 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
5228 auto ContextIterBegin =
5229 StackContext.beginAfterSharedPrefix(CallsiteContext);
5230 // Skip the checking on the first iteration.
5231 uint64_t LastStackContextId =
5232 (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
5233 : 0;
5234 for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
5235 ++ContextIter) {
5236 // If this is a direct recursion, simply skip the duplicate
5237 // entries, to be consistent with how the summary ids were
5238 // generated during ModuleSummaryAnalysis.
5239 if (LastStackContextId == *ContextIter)
5240 continue;
5241 LastStackContextId = *ContextIter;
5242 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
5243 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5244 *ContextIter);
5245 StackIdIndexIter++;
5246 }
5247 MIBIter++;
5248 }
5249}
5250#endif
5251
5252bool MemProfContextDisambiguation::applyImport(Module &M) {
5253 assert(ImportSummary);
5254 bool Changed = false;
5255
5256 // We also need to clone any aliases that reference cloned functions, because
5257 // the modified callsites may invoke via the alias. Keep track of the aliases
5258 // for each function.
5259 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5260 FuncToAliasMap;
5261 for (auto &A : M.aliases()) {
5262 auto *Aliasee = A.getAliaseeObject();
5263 if (auto *F = dyn_cast<Function>(Aliasee))
5264 FuncToAliasMap[F].insert(&A);
5265 }
5266
5267 if (!initializeIndirectCallPromotionInfo(M))
5268 return false;
5269
5270 for (auto &F : M) {
5271 if (F.isDeclaration() || isMemProfClone(F))
5272 continue;
5273
5274 OptimizationRemarkEmitter ORE(&F);
5275
5276 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5277 bool ClonesCreated = false;
5278 unsigned NumClonesCreated = 0;
5279 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
5280 // We should at least have version 0 which is the original copy.
5281 assert(NumClones > 0);
5282 // If only one copy needed use original.
5283 if (NumClones == 1)
5284 return;
5285 // If we already performed cloning of this function, confirm that the
5286 // requested number of clones matches (the thin link should ensure the
5287 // number of clones for each constituent callsite is consistent within
5288 // each function), before returning.
5289 if (ClonesCreated) {
5290 assert(NumClonesCreated == NumClones);
5291 return;
5292 }
5293 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
5294 // The first "clone" is the original copy, which doesn't have a VMap.
5295 assert(VMaps.size() == NumClones - 1);
5296 Changed = true;
5297 ClonesCreated = true;
5298 NumClonesCreated = NumClones;
5299 };
5300
5301 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5302 Function *CalledFunction) {
5303 // Perform cloning if not yet done.
5304 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
5305
5306 assert(!isMemProfClone(*CalledFunction));
5307
5308 // Because we update the cloned calls by calling setCalledOperand (see
5309 // comment below), out of an abundance of caution make sure the called
5310 // function was actually the called operand (or its aliasee). We also
5311 // strip pointer casts when looking for calls (to match behavior during
5312 // summary generation), however, with opaque pointers in theory this
5313 // should not be an issue. Note we still clone the current function
5314 // (containing this call) above, as that could be needed for its callers.
5315 auto *GA = dyn_cast_or_null<GlobalAlias>(CB->getCalledOperand());
5316 if (CalledFunction != CB->getCalledOperand() &&
5317 (!GA || CalledFunction != GA->getAliaseeObject())) {
5318 SkippedCallsCloning++;
5319 return;
5320 }
5321 // Update the calls per the summary info.
5322 // Save orig name since it gets updated in the first iteration
5323 // below.
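 // (Illustrative: StackNode.Clones[J] == K means function copy J of this
 // caller should call clone K of the callee, i.e. the symbol produced by
 // getMemProfFuncName(CalleeOrigName, K); K == 0 means the original callee,
 // which is why such entries are skipped below.)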
5324 auto CalleeOrigName = CalledFunction->getName();
5325 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5326 // Do nothing if this version calls the original version of its
5327 // callee.
5328 if (!StackNode.Clones[J])
5329 continue;
5330 auto NewF = M.getOrInsertFunction(
5331 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
5332 CalledFunction->getFunctionType());
5333 CallBase *CBClone;
5334 // Copy 0 is the original function.
5335 if (!J)
5336 CBClone = CB;
5337 else
5338 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5339 // Set the called operand directly instead of calling setCalledFunction,
5340 // as the latter mutates the function type on the call. In rare cases
5341 // we may have a slightly different type on a callee function
5342 // declaration due to it being imported from a different module with
5343 // incomplete types. We really just want to change the name of the
5344 // function to the clone, and not make any type changes.
5345 CBClone->setCalledOperand(NewF.getCallee());
5346 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5347 << ore::NV("Call", CBClone) << " in clone "
5348 << ore::NV("Caller", CBClone->getFunction())
5349 << " assigned to call function clone "
5350 << ore::NV("Callee", NewF.getCallee()));
5351 }
5352 };
5353
5354 // Locate the summary for F.
5355 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
5356 // If not found, this could be an imported local (see comment in
5357 // findValueInfoForFunc). Skip for now as it will be cloned in its original
5358 // module (where it would have been promoted to global scope so should
5359 // satisfy any reference in this module).
5360 if (!TheFnVI)
5361 continue;
5362
5363 auto *GVSummary =
5364 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
5365 if (!GVSummary) {
5366 // Must have been imported, use the summary which matches the definition
5367 // (might be multiple if this was a linkonce_odr).
5368 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
5369 assert(SrcModuleMD &&
5370 "enable-import-metadata is needed to emit thinlto_src_module");
5371 StringRef SrcModule =
5372 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
5373 for (auto &GVS : TheFnVI.getSummaryList()) {
5374 if (GVS->modulePath() == SrcModule) {
5375 GVSummary = GVS.get();
5376 break;
5377 }
5378 }
5379 assert(GVSummary && GVSummary->modulePath() == SrcModule);
5380 }
5381
5382 // If this was an imported alias skip it as we won't have the function
5383 // summary, and it should be cloned in the original module.
5384 if (isa<AliasSummary>(GVSummary))
5385 continue;
5386
5387 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
5388
5389 if (FS->allocs().empty() && FS->callsites().empty())
5390 continue;
5391
5392 auto SI = FS->callsites().begin();
5393 auto AI = FS->allocs().begin();
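// SI and AI are cursors into the summary's callsite and alloc records; they
// are advanced in step with the matching calls visited in the IR below.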
5394
5395 // To handle callsite infos synthesized for tail calls which have missing
5396 // frames in the profiled context, map callee VI to the synthesized callsite
5397 // info.
5398 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
5399 // Iterate the callsites for this function in reverse, since we place all
5400 // those synthesized for tail calls at the end.
5401 for (auto CallsiteIt = FS->callsites().rbegin();
5402 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
5403 auto &Callsite = *CallsiteIt;
5404 // Stop as soon as we see a non-synthesized callsite info (see comment
5405 // above loop). All the entries added for discovered tail calls have empty
5406 // stack ids.
5407 if (!Callsite.StackIdIndices.empty())
5408 break;
5409 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
5410 }
5411
5412 // Keeps track of needed ICP for the function.
5413 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
5414
5415 // Assume for now that the instructions are in the exact same order
5416 // as when the summary was created, but confirm this is correct by
5417 // matching the stack ids.
5418 for (auto &BB : F) {
5419 for (auto &I : BB) {
5420 auto *CB = dyn_cast<CallBase>(&I);
5421 // Same handling as when creating module summary.
5422 if (!mayHaveMemprofSummary(CB))
5423 continue;
5424
5425 auto *CalledValue = CB->getCalledOperand();
5426 auto *CalledFunction = CB->getCalledFunction();
5427 if (CalledValue && !CalledFunction) {
5428 CalledValue = CalledValue->stripPointerCasts();
5429 // Stripping pointer casts can reveal a called function.
5430 CalledFunction = dyn_cast<Function>(CalledValue);
5431 }
5432 // Check if this is an alias to a function. If so, get the
5433 // called aliasee for the checks below.
5434 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
5435 assert(!CalledFunction &&
5436 "Expected null called function in callsite for alias");
5437 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
5438 }
5439
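// Wrap the call's !callsite metadata (if any); it determines whether this is
// a profiled callsite and is cross-checked against the summary's stack id
// indices in the NDEBUG asserts below.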
5440 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
5441 I.getMetadata(LLVMContext::MD_callsite));
5442 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
5443
5444 // Include allocs that were already assigned a memprof function
5445 // attribute in the statistics.
5446 if (CB->getAttributes().hasFnAttr("memprof")) {
5447 assert(!MemProfMD);
5448 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
5449 ? AllocTypeColdThinBackend++
5450 : AllocTypeNotColdThinBackend++;
5451 OrigAllocsThinBackend++;
5452 AllocVersionsThinBackend++;
5453 if (!MaxAllocVersionsThinBackend)
5454 MaxAllocVersionsThinBackend = 1;
5455 continue;
5456 }
5457
5458 if (MemProfMD) {
5459 // Consult the next alloc node.
5460 assert(AI != FS->allocs().end());
5461 auto &AllocNode = *(AI++);
5462
5463#ifndef NDEBUG
5464 checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
5465 ImportSummary);
5466#endif
5467
5468 // Perform cloning if not yet done.
5469 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
5470
5471 OrigAllocsThinBackend++;
5472 AllocVersionsThinBackend += AllocNode.Versions.size();
5473 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
5474 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
5475
5476 // If there is only one version that means we didn't end up
5477 // considering this function for cloning, and in that case the alloc
5478 // will still be none type or should have gotten the default NotCold.
5479 // Skip that after calling clone helper since that does some sanity
5480 // checks that confirm we haven't decided yet that we need cloning.
5481 // We might have a single version that is cold due to the
5482 // MinClonedColdBytePercent heuristic, make sure we don't skip in that
5483 // case.
5484 if (AllocNode.Versions.size() == 1 &&
5485 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
5486 assert((AllocationType)AllocNode.Versions[0] ==
5487 AllocationType::NotCold ||
5488 (AllocationType)AllocNode.Versions[0] ==
5489 AllocationType::None);
5490 UnclonableAllocsThinBackend++;
5491 continue;
5492 }
5493
5494 // All versions should have a singular allocation type.
5495 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
5496 return Type == ((uint8_t)AllocationType::NotCold |
5497 (uint8_t)AllocationType::Cold);
5498 }));
5499
5500 // Update the allocation types per the summary info.
5501 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
5502 // Ignore any that didn't get an assigned allocation type.
5503 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
5504 continue;
5505 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
5506 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
5507 : AllocTypeNotColdThinBackend++;
5508 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
5509 auto A = llvm::Attribute::get(F.getContext(), "memprof",
5510 AllocTypeString);
5511 CallBase *CBClone;
5512 // Copy 0 is the original function.
5513 if (!J)
5514 CBClone = CB;
5515 else
5516 // Since VMaps are only created for new clones, we index with
5517 // clone J-1 (J==0 is the original clone and does not have a VMaps
5518 // entry).
5519 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5520 removeAnyExistingAmbiguousAttribute(CBClone);
5521 CBClone->addFnAttr(A);
5522 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
5523 << ore::NV("AllocationCall", CBClone) << " in clone "
5524 << ore::NV("Caller", CBClone->getFunction())
5525 << " marked with memprof allocation attribute "
5526 << ore::NV("Attribute", AllocTypeString));
5527 }
5528 } else if (!CallsiteContext.empty()) {
5529 if (!CalledFunction) {
5530#ifndef NDEBUG
5531 // We should have skipped inline assembly calls.
5532 auto *CI = dyn_cast<CallInst>(CB);
5533 assert(!CI || !CI->isInlineAsm());
5534#endif
5535 // We should have skipped direct calls via a Constant.
5536 assert(CalledValue && !isa<Constant>(CalledValue));
5537
5538 // This is an indirect call, see if we have profile information and
5539 // whether any clones were recorded for the profiled targets (that
5540 // we synthesized CallsiteInfo summary records for when building the
5541 // index).
5542 auto NumClones =
5543 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
5544
5545 // Perform cloning if not yet done. This is done here in case
5546 // we don't need to do ICP, but might need to clone this
5547 // function as it is the target of other cloned calls.
5548 if (NumClones)
5549 CloneFuncIfNeeded(NumClones);
5550 }
5551
5552 else {
5553 // Consult the next callsite node.
5554 assert(SI != FS->callsites().end());
5555 auto &StackNode = *(SI++);
5556
5557#ifndef NDEBUG
5558 // Sanity check that the stack ids match between the summary and
5559 // instruction metadata.
5560 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
5561 for (auto StackId : CallsiteContext) {
5562 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
5563 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5564 StackId);
5565 StackIdIndexIter++;
5566 }
5567#endif
5568
5569 CloneCallsite(StackNode, CB, CalledFunction);
5570 }
5571 } else if (CB->isTailCall() && CalledFunction) {
5572 // Locate the synthesized callsite info for the callee VI, if any was
5573 // created, and use that for cloning.
5574 ValueInfo CalleeVI =
5575 findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
5576 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
5577 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
5578 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
5579 CloneCallsite(Callsite->second, CB, CalledFunction);
5580 }
5581 }
5582 }
5583 }
5584
5585 // Now do any promotion required for cloning.
5586 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
5587 }
5588
5589 // We skip some of the functions and instructions above, so remove all the
5590 // metadata in a single sweep here.
5591 for (auto &F : M) {
5592 // We can skip memprof clones because createFunctionClones already strips
5593 // the metadata from the newly created clones.
5594 if (F.isDeclaration() || isMemProfClone(F))
5595 continue;
5596 for (auto &BB : F) {
5597 for (auto &I : BB) {
5598 if (!isa<CallBase>(I))
5599 continue;
5600 I.setMetadata(LLVMContext::MD_memprof, nullptr);
5601 I.setMetadata(LLVMContext::MD_callsite, nullptr);
5602 }
5603 }
5604 }
5605
5606 return Changed;
5607}
5608
5609unsigned MemProfContextDisambiguation::recordICPInfo(
5610 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
5611 ArrayRef<CallsiteInfo>::iterator &SI,
5612 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
5613 // First see if we have profile information for this indirect call.
5614 uint32_t NumCandidates;
5615 uint64_t TotalCount;
5616 auto CandidateProfileData =
5617 ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
5618 NumCandidates);
5619 if (CandidateProfileData.empty())
5620 return 0;
5621
5622 // Iterate through all of the candidate profiled targets along with the
5623 // CallsiteInfo summary records synthesized for them when building the index,
5624 // and see if any are cloned and/or refer to clones.
5625 bool ICPNeeded = false;
5626 unsigned NumClones = 0;
5627 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
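// Remember where this call's run of synthesized CallsiteInfo records starts,
// so performICP can walk the same records when promotion is applied later.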
5628 for (const auto &Candidate : CandidateProfileData) {
5629#ifndef NDEBUG
5630 auto CalleeValueInfo =
5631#endif
5632 ImportSummary->getValueInfo(Candidate.Value);
5633 // We might not have a ValueInfo if this is a distributed
5634 // ThinLTO backend and decided not to import that function.
5635 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
5636 assert(SI != AllCallsites.end());
5637 auto &StackNode = *(SI++);
5638 // See if any of the clones of the indirect callsite for this
5639 // profiled target should call a cloned version of the profiled
5640 // target. We only need to do the ICP here if so.
5641 ICPNeeded |= llvm::any_of(StackNode.Clones,
5642 [](unsigned CloneNo) { return CloneNo != 0; });
5643 // Every callsite in the same function should have been cloned the same
5644 // number of times.
5645 assert(!NumClones || NumClones == StackNode.Clones.size());
5646 NumClones = StackNode.Clones.size();
5647 }
5648 if (!ICPNeeded)
5649 return NumClones;
5650 // Save information for ICP, which is performed later to avoid messing up the
5651 // current function traversal.
5652 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
5653 TotalCount, CallsiteInfoStartIndex});
5654 return NumClones;
5655}
5656
5657void MemProfContextDisambiguation::performICP(
5658 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
5659 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
5660 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
5661 OptimizationRemarkEmitter &ORE) {
5662 // Now do any promotion required for cloning. Specifically, for each
5663 // recorded ICP candidate (which was only recorded because one clone of that
5664 // candidate should call a cloned target), we perform ICP (speculative
5665 // devirtualization) for each clone of the callsite, and update its callee
5666 // to the appropriate clone. Note that the ICP compares against the original
5667 // version of the target, which is what is in the vtable.
5668 for (auto &Info : ICallAnalysisInfo) {
5669 auto *CB = Info.CB;
5670 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
5671 auto TotalCount = Info.TotalCount;
5672 unsigned NumPromoted = 0;
5673 unsigned NumClones = 0;
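// NumPromoted counts candidates actually promoted at this callsite, and
// NumClones is the (uniform) clone count for the function; both are used
// below when fixing up the !prof metadata on each clone.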
5674
5675 for (auto &Candidate : Info.CandidateProfileData) {
5676 auto &StackNode = AllCallsites[CallsiteIndex++];
5677
5678 // All calls in the same function must have the same number of clones.
5679 assert(!NumClones || NumClones == StackNode.Clones.size());
5680 NumClones = StackNode.Clones.size();
5681
5682 // See if the target is in the module. If it wasn't imported, it is
5683 // possible that this profile could have been collected on a different
5684 // target (or version of the code), and we need to be conservative
5685 // (similar to what is done in the ICP pass).
5686 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
5687 if (TargetFunction == nullptr ||
5688 // Any ThinLTO global dead symbol removal should have already
5689 // occurred, so it should be safe to promote when the target is a
5690 // declaration.
5691 // TODO: Remove internal option once more fully tested.
5692 (MemProfRequireDefinitionForPromotion &&
5693 TargetFunction->isDeclaration())) {
5694 ORE.emit([&]() {
5695 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
5696 << "Memprof cannot promote indirect call: target with md5sum "
5697 << ore::NV("target md5sum", Candidate.Value) << " not found";
5698 });
5699 // FIXME: See if we can use the new declaration importing support to
5700 // at least get the declarations imported for this case. Hot indirect
5701 // targets should have been imported normally, however.
5702 continue;
5703 }
5704
5705 // Check if legal to promote
5706 const char *Reason = nullptr;
5707 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
5708 ORE.emit([&]() {
5709 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
5710 << "Memprof cannot promote indirect call to "
5711 << ore::NV("TargetFunction", TargetFunction)
5712 << " with count of " << ore::NV("TotalCount", TotalCount)
5713 << ": " << Reason;
5714 });
5715 continue;
5716 }
5717
5718 assert(!isMemProfClone(*TargetFunction));
5719
5720 // Handle each call clone, applying ICP so that each clone directly
5721 // calls the specified callee clone, guarded by the appropriate ICP
5722 // check.
5723 CallBase *CBClone = CB;
5724 for (unsigned J = 0; J < NumClones; J++) {
5725 // Copy 0 is the original function.
5726 if (J > 0)
5727 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5728 // We do the promotion using the original name, so that the comparison
5729 // is against the name in the vtable. Then just below, change the new
5730 // direct call to call the cloned function.
5731 auto &DirectCall =
5732 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
5733 TotalCount, isSamplePGO, &ORE);
5734 auto *TargetToUse = TargetFunction;
5735 // Call original if this version calls the original version of its
5736 // callee.
5737 if (StackNode.Clones[J]) {
5738 TargetToUse =
5739 cast<Function>(M.getOrInsertFunction(
5740 getMemProfFuncName(TargetFunction->getName(),
5741 StackNode.Clones[J]),
5742 TargetFunction->getFunctionType())
5743 .getCallee());
5744 }
5745 DirectCall.setCalledFunction(TargetToUse);
5746 // During matching we generate synthetic VP metadata for indirect calls
5747 // not already having any, from the memprof profile's callee GUIDs. If
5748 // we subsequently promote and inline those callees, we currently lose
5749 // the ability to generate this synthetic VP metadata. Optionally apply
5750 // a noinline attribute to promoted direct calls, where the threshold is
5751 // set to capture synthetic VP metadata targets which get a count of 1.
5752 if (MemProfICPNoInlineThreshold &&
5753 Candidate.Count < MemProfICPNoInlineThreshold)
5754 DirectCall.setIsNoInline();
5755 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5756 << ore::NV("Call", CBClone) << " in clone "
5757 << ore::NV("Caller", CBClone->getFunction())
5758 << " promoted and assigned to call function clone "
5759 << ore::NV("Callee", TargetToUse));
5760 }
5761
5762 // Update TotalCount (all clones should get same count above)
5763 TotalCount -= Candidate.Count;
5764 NumPromoted++;
5765 }
5766 // Adjust the MD.prof metadata for all clones, now that we have the new
5767 // TotalCount and the number promoted.
5768 CallBase *CBClone = CB;
5769 for (unsigned J = 0; J < NumClones; J++) {
5770 // Copy 0 is the original function.
5771 if (J > 0)
5772 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5773 // First delete the old one.
5774 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
5775 // If all promoted, we don't need the MD.prof metadata.
5776 // Otherwise we need to update it with the remaining un-promoted records.
5777 if (TotalCount != 0)
5778 annotateValueSite(
5779 M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
5780 TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
5781 }
5782 }
5783}
5784
5785template <typename DerivedCCG, typename FuncTy, typename CallTy>
5786bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
5787 if (DumpCCG) {
5788 dbgs() << "CCG before cloning:\n";
5789 dbgs() << *this;
5790 }
5791 if (ExportToDot)
5792 exportToDot("postbuild");
5793
5794 if (VerifyCCG) {
5795 check();
5796 }
5797
5798 identifyClones();
5799
5800 if (VerifyCCG) {
5801 check();
5802 }
5803
5804 if (DumpCCG) {
5805 dbgs() << "CCG after cloning:\n";
5806 dbgs() << *this;
5807 }
5808 if (ExportToDot)
5809 exportToDot("cloned");
5810
5811 bool Changed = assignFunctions();
5812
5813 if (DumpCCG) {
5814 dbgs() << "CCG after assigning function clones:\n";
5815 dbgs() << *this;
5816 }
5817 if (ExportToDot)
5818 exportToDot("clonefuncassign");
5819
5819
5820 if (MemProfReportHintedSizes)
5821 printTotalSizes(errs());
5822
5823 return Changed;
5824}
5825
5826bool MemProfContextDisambiguation::processModule(
5827 Module &M,
5828 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
5829
5830 // If we have an import summary, then the cloning decisions were made during
5831 // the thin link on the index. Apply them and return.
5832 if (ImportSummary)
5833 return applyImport(M);
5834
5835 // TODO: If/when other types of memprof cloning are enabled beyond just for
5836 // hot and cold, we will need to change this to individually control the
5837 // AllocationType passed to addStackNodesForMIB during CCG construction.
5838 // Note that we specifically check this after applying imports above, so that
5839 // the option isn't needed to be passed to distributed ThinLTO backend
5840 // clang processes, which won't necessarily have visibility into the linker
5841 // dependences. Instead the information is communicated from the LTO link to
5842 // the backends via the combined summary index.
5843 if (!SupportsHotColdNew)
5844 return false;
5845
5846 ModuleCallsiteContextGraph CCG(M, OREGetter);
5847 return CCG.process();
5848}
5849
5850 MemProfContextDisambiguation::MemProfContextDisambiguation(
5851 const ModuleSummaryIndex *Summary, bool isSamplePGO)
5852 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
5853 // Check the dot graph printing options once here, to make sure we have valid
5854 // and expected combinations.
5855 if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
5857 "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
5859 !ContextIdForDot.getNumOccurrences())
5861 "-memprof-dot-scope=context requires -memprof-dot-context-id");
5862 if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
5863 ContextIdForDot.getNumOccurrences())
5865 "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
5866 "-memprof-dot-context-id");
5867 if (ImportSummary) {
5868 // The MemProfImportSummary should only be used for testing ThinLTO
5869 // distributed backend handling via opt, in which case we don't have a
5870 // summary from the pass pipeline.
5871 assert(MemProfImportSummary.empty());
5872 return;
5873 }
5874 if (MemProfImportSummary.empty())
5875 return;
5876
5877 auto ReadSummaryFile =
5878 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
5879 if (!ReadSummaryFile) {
5880 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
5881 "Error loading file '" + MemProfImportSummary +
5882 "': ");
5883 return;
5884 }
5885 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
5886 if (!ImportSummaryForTestingOrErr) {
5887 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
5888 "Error parsing file '" + MemProfImportSummary +
5889 "': ");
5890 return;
5891 }
5892 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
5893 ImportSummary = ImportSummaryForTesting.get();
5894}
5895
5896 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
5897 ModuleAnalysisManager &AM) {
5898 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
5899 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
5900 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
5901 };
5902 if (!processModule(M, OREGetter))
5903 return PreservedAnalyses::all();
5904 return PreservedAnalyses::none();
5905}
5906
5907 void MemProfContextDisambiguation::run(
5908 ModuleSummaryIndex &Index,
5909 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
5910 isPrevailing) {
5911 // TODO: If/when other types of memprof cloning are enabled beyond just for
5912 // hot and cold, we will need to change this to individually control the
5913 // AllocationType passed to addStackNodesForMIB during CCG construction.
5914 // The index was set from the option, so these should be in sync.
5915 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
5916 if (!SupportsHotColdNew)
5917 return;
5918
5919 IndexCallsiteContextGraph CCG(Index, isPrevailing);
5920 CCG.process();
5921}
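A minimal usage sketch (not part of MemProfContextDisambiguation.cpp itself; it assumes the public header llvm/Transforms/IPO/MemProfContextDisambiguation.h and a caller that already owns a ModulePassManager) showing how the module pass defined above could be scheduled for the IR-level regular-LTO path:

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"

using namespace llvm;

// Add the pass with no import summary (regular LTO / IR path); the default
// constructor arguments are spelled out for clarity.
static void addMemProfCloning(ModulePassManager &MPM) {
  MPM.addPass(MemProfContextDisambiguation(/*Summary=*/nullptr,
                                           /*isSamplePGO=*/false));
}

The ThinLTO thin-link path is instead driven through the run(ModuleSummaryIndex &, isPrevailing) overload above, and the distributed-backend path can be exercised for testing via opt with -memprof-import-summary, as handled in the constructor.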